From 0cd30ee02ffa7cc99107e294c5cb66db7d2928af Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 00:06:17 +0000 Subject: [PATCH 001/118] initialized Structure --- src/transformers/models/__init__.py | 1 + src/transformers/models/hiera/__init__.py | 82 +++ src/transformers/models/hiera/benchmarking.py | 77 +++ .../models/hiera/configuration_hiera.py | 128 +++++ .../models/hiera/convert_hiera_to_pytorch.py | 27 + src/transformers/models/hiera/hiera.py | 535 ++++++++++++++++++ src/transformers/models/hiera/hiera_mae.py | 398 +++++++++++++ src/transformers/models/hiera/hiera_utils.py | 287 ++++++++++ 8 files changed, 1535 insertions(+) create mode 100644 src/transformers/models/hiera/__init__.py create mode 100644 src/transformers/models/hiera/benchmarking.py create mode 100644 src/transformers/models/hiera/configuration_hiera.py create mode 100644 src/transformers/models/hiera/convert_hiera_to_pytorch.py create mode 100644 src/transformers/models/hiera/hiera.py create mode 100644 src/transformers/models/hiera/hiera_mae.py create mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 5686cf516c49..0ef69742dc18 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -105,6 +105,7 @@ graphormer, groupvit, herbert, + hiera, hubert, ibert, idefics, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py new file mode 100644 index 000000000000..bfd200e9dcb9 --- /dev/null +++ b/src/transformers/models/hiera/__init__.py @@ -0,0 +1,82 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
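+# NOTE: the lazy-import scaffolding below still registers the ViTMAE configuration and
+# modeling symbols as a placeholder for the initial package structure; no Hiera classes
+# are exposed here yet.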
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_tf_available, + is_torch_available, +) + + +_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_vit_mae"] = [ + "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTMAEForPreTraining", + "ViTMAELayer", + "ViTMAEModel", + "ViTMAEPreTrainedModel", + ] + +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_vit_mae"] = [ + "TFViTMAEForPreTraining", + "TFViTMAEModel", + "TFViTMAEPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_vit_mae import ( + VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTMAEForPreTraining, + ViTMAELayer, + ViTMAEModel, + ViTMAEPreTrainedModel, + ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py new file mode 100644 index 000000000000..33166028977a --- /dev/null +++ b/src/transformers/models/hiera/benchmarking.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- + +import time +from typing import List, Tuple, Union + +import torch +from tqdm import tqdm + +# From https://github.com/facebookresearch/ToMe/ +def benchmark( + model: torch.nn.Module, + device: torch.device = 0, + input_size: Tuple[int] = (3, 224, 224), + batch_size: int = 64, + runs: int = 40, + throw_out: float = 0.25, + use_fp16: bool = False, + verbose: bool = False, +) -> float: + """ + Benchmark the given model with random inputs at the given batch size. 
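+    A warm-up fraction of the runs (throw_out) is discarded before timing starts, and CUDA is
+    synchronized before and after the timed region, so the reported images/second reflect
+    steady-state throughput.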
+ + Args: + - model: the module to benchmark + - device: the device to use for benchmarking + - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) + - batch_size: the batch size to use for evaluation + - runs: the number of total runs to do + - throw_out: the percentage of runs to throw out at the start of testing + - use_fp16: whether or not to benchmark with float16 and autocast + - verbose: whether or not to use tqdm to print progress / print throughput at end + + Returns: + - the throughput measured in images / second + """ + if not isinstance(device, torch.device): + device = torch.device(device) + is_cuda = torch.device(device).type == "cuda" + + model = model.eval().to(device) + input = torch.rand(batch_size, *input_size, device=device) + if use_fp16: + input = input.half() + + warm_up = int(runs * throw_out) + total = 0 + start = time.time() + + with torch.autocast(device.type, enabled=use_fp16): + with torch.no_grad(): + for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): + if i == warm_up: + if is_cuda: + torch.cuda.synchronize() + total = 0 + start = time.time() + + model(input) + total += batch_size + + if is_cuda: + torch.cuda.synchronize() + + end = time.time() + elapsed = end - start + + throughput = total / elapsed + + if verbose: + print(f"Throughput: {throughput:.2f} im/s") + + return throughput diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py new file mode 100644 index 000000000000..de5de9e7d9e9 --- /dev/null +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -0,0 +1,128 @@ +""" hiera model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", + # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +} + + +class ViTMAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT + MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the ViT + [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + decoder_num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the decoder. + decoder_num_hidden_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the decoder. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. + mask_ratio (`float`, *optional*, defaults to 0.75): + The ratio of the number of masked tokens in the input sequence. + norm_pix_loss (`bool`, *optional*, defaults to `False`): + Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved + representation quality in the experiments of the authors. + + Example: + + ```python + >>> from transformers import ViTMAEConfig, ViTMAEModel + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = ViTMAEConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = ViTMAEModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vit_mae" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + decoder_num_attention_heads=16, + decoder_hidden_size=512, + decoder_num_hidden_layers=8, + decoder_intermediate_size=2048, + mask_ratio=0.75, + norm_pix_loss=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_hidden_size = decoder_hidden_size + self.decoder_num_hidden_layers = decoder_num_hidden_layers + self.decoder_intermediate_size = decoder_intermediate_size + self.mask_ratio = mask_ratio + self.norm_pix_loss 
= norm_pix_loss diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py new file mode 100644 index 000000000000..506507e4e66e --- /dev/null +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -0,0 +1,27 @@ +import argparse + +import requests +import torch +from PIL import Image + + + +def rename_key(name): + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "patch_embed.projection") + return name + + +def e(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + pass + else: + new_name = rename_key(key) + orig_state_dict[new_name] = val + + return orig_state_dict + + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py new file mode 100644 index 000000000000..35e8c93e160b --- /dev/null +++ b/src/transformers/models/hiera/hiera.py @@ -0,0 +1,535 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from functools import partial +from typing import List, Tuple, Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.models.layers import DropPath, Mlp + +from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll + + + +class MaskUnitAttention(nn.Module): + """ + Computes either Mask Unit or Global Attention. Also is able to perform q pooling. + + Note: this assumes the tokens have already been flattened and unrolled into mask units. + See `Unroll` for more details. + """ + + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + """ + Args: + - dim, dim_out: The input and output feature dimensions. + - heads: The number of attention heads. + - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). + - window_size: The current (flattened) size of a mask unit *after* pooling (if any). + - use_mask_unit_attn: Use Mask Unit or Global Attention. + """ + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.heads = heads + self.q_stride = q_stride + + self.head_dim = dim_out // heads + self.scale = (self.head_dim) ** -0.5 + + self.qkv = nn.Linear(dim, 3 * dim_out) + self.proj = nn.Linear(dim_out, dim_out) + + self.window_size = window_size + self.use_mask_unit_attn = use_mask_unit_attn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Input should be of shape [batch, tokens, channels]. 
""" + B, N, _ = x.shape + num_windows = ( + (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + ) + + qkv = ( + self.qkv(x) + .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .permute(3, 0, 4, 2, 1, 5) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + + if self.q_stride > 1: + # Refer to Unroll to see how this performs a maxpool-Nd + q = ( + q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + .max(dim=3) + .values + ) + + if hasattr(F, "scaled_dot_product_attention"): + # Note: the original paper did *not* use SDPA, it's a free boost! + x = F.scaled_dot_product_attention(q, k, v) + else: + attn = (q * self.scale) @ k.transpose(-1, -2) + attn = attn.softmax(dim=-1) + x = (attn @ v) + + x = x.transpose(1, 3).reshape(B, -1, self.dim_out) + x = self.proj(x) + return x + + +class HieraBlock(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + act_layer: nn.Module = nn.GELU, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + + self.norm1 = norm_layer(dim) + self.attn = MaskUnitAttention( + dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + ) + + self.norm2 = norm_layer(dim_out) + self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + + self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Attention + Q Pooling + x_norm = self.norm1(x) + if self.dim != self.dim_out: + x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) + x = x + self.drop_path(self.attn(x_norm)) + + # MLP + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Head(nn.Module): + def __init__( + self, + dim: int, + num_classes: int, + dropout_rate: float = 0.0, + act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), + ): + super().__init__() + self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() + self.projection = nn.Linear(dim, num_classes) + # act_fun for eval and testing only + self.act_func = act_func + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.dropout(x) + x = self.projection(x) + if not self.training: + x = self.act_func(x) + return x + + +class PatchEmbed(nn.Module): + """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" + + def __init__( + self, + dim_in: int, + dim_out: int, + kernel: Tuple[int, ...], + stride: Tuple[int, ...], + padding: Tuple[int, ...], + ): + super().__init__() + + # Support any number of spatial dimensions + self.spatial_dims = len(kernel) + self.proj = conv_nd(self.spatial_dims)( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward( + self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + x = do_masked_conv(x, self.proj, mask) + x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) + return x + + +class Hiera(nn.Module): + def __init__( + self, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embed_dim: int = 96, # initial embed dim + num_heads: int = 1, # initial number of heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] 
= (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_pos_embed: bool = False, + ): + super().__init__() + + depth = sum(stages) + self.patch_stride = patch_stride + self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + num_tokens = math.prod(self.tokens_spatial_shape) + flat_mu_size = math.prod(mask_unit_size) + flat_q_stride = math.prod(q_stride) + + assert q_pool < len(stages) + self.q_pool, self.q_stride = q_pool, q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + self.mask_spatial_shape = [ + i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) + ] + self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + + self.patch_embed = PatchEmbed( + in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + ) + + self.sep_pos_embed = sep_pos_embed + if sep_pos_embed: + self.pos_embed_spatial = nn.Parameter( + torch.zeros( + 1, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + embed_dim, + ) + ) + self.pos_embed_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + ) + else: + self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + + # Setup roll and reroll modules + self.unroll = Unroll( + input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + ) + self.reroll = Reroll( + input_size, + patch_stride, + [q_stride] * len(self.stage_ends[:-1]), + self.stage_ends, + q_pool, + ) + # q_pool locations + q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + + # Transformer blocks + cur_stage = 0 + self.blocks = nn.ModuleList() + + for i in range(depth): + dim_out = embed_dim + # Mask unit or global attention. 
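+            # (Stages flagged True in mask_unit_attn restrict attention to windows of one mask
+            # unit; the remaining stages use global attention.)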
+ # Lag by 1 block, so that global attention, + # applied post pooling on lower resolution + use_mask_unit_attn = mask_unit_attn[cur_stage] + + if i - 1 in self.stage_ends: + dim_out = int(embed_dim * dim_mul) + num_heads = int(num_heads * head_mul) + cur_stage += 1 + if i in q_pool_blocks: + flat_mu_size //= flat_q_stride + + block = HieraBlock( + dim=embed_dim, + dim_out=dim_out, + heads=num_heads, + mlp_ratio=mlp_ratio, + drop_path=dpr[i], + norm_layer=norm_layer, + q_stride=(flat_q_stride if i in q_pool_blocks else 1), + window_size=flat_mu_size, + use_mask_unit_attn=use_mask_unit_attn, + ) + + embed_dim = dim_out + self.blocks.append(block) + + self.norm = norm_layer(embed_dim) + self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + + # Initialize everything + if sep_pos_embed: + nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) + nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + else: + nn.init.trunc_normal_(self.pos_embed, std=0.02) + self.apply(partial(self._init_weights)) + self.head.projection.weight.data.mul_(head_init_scale) + self.head.projection.bias.data.mul_(head_init_scale) + + def _init_weights(self, m, init_bias=0.02): + if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, init_bias) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, init_bias) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + if self.sep_pos_embed: + return ["pos_embed_spatial", "pos_embed_temporal"] + else: + return ["pos_embed"] + + def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: + """ + Generates a random mask, mask_ratio fraction are dropped. + 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. + """ + B = x.shape[0] + # Tokens selected for masking at mask unit level + num_windows = math.prod(self.mask_spatial_shape) # num_mask_units + len_keep = int(num_windows * (1 - mask_ratio)) + noise = torch.rand(B, num_windows, device=x.device) + + # Sort noise for each sample + ids_shuffle = torch.argsort( + noise, dim=1 + ) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # Generate the binary mask: 1 is *keep*, 0 is *remove* + # Note this is opposite to original MAE + mask = torch.zeros([B, num_windows], device=x.device) + mask[:, :len_keep] = 1 + # Unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return mask.bool() + + def get_pos_embed(self) -> torch.Tensor: + if self.sep_pos_embed: + return self.pos_embed_spatial.repeat( + 1, self.tokens_spatial_shape[0], 1 + ) + torch.repeat_interleave( + self.pos_embed_temporal, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + dim=1, + ) + else: + return self.pos_embed + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor = None, + return_intermediates: bool = False, + ) -> torch.Tensor: + """ + mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
+ """ + # Slowfast training passes in a list + if isinstance(x, list): + x = x[0] + intermediates = [] + + x = self.patch_embed( + x, + mask=mask.view( + x.shape[0], 1, *self.mask_spatial_shape + ) # B, C, *mask_spatial_shape + if mask is not None + else None, + ) + x = x + self.get_pos_embed() + x = self.unroll(x) + + # Discard masked tokens + if mask is not None: + x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( + x.shape[0], -1, x.shape[-1] + ) + + for i, blk in enumerate(self.blocks): + x = blk(x) + + if return_intermediates and i in self.stage_ends: + intermediates.append(self.reroll(x, i, mask=mask)) + + if mask is None: + x = x.mean(dim=1) + x = self.norm(x) + x = self.head(x) + + # x may not always be in spatial order here. + # e.g. if q_pool = 2, mask_unit_size = (8, 8), and + # q_stride = (2, 2), not all unrolls were consumed, + # intermediates[-1] is x in spatial order + if return_intermediates: + return x, intermediates + + return x + + +# Image models + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_tiny_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_small_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_plus_224(**kwdargs): + return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_large_224(**kwdargs): + return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_huge_224(**kwdargs): + return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + + +# Video models + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_16x224(num_classes: int = 400, **kwdargs): + return Hiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_plus_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_large_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_huge_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py new file mode 100644 index 000000000000..64c69cc89d71 --- /dev/null +++ b/src/transformers/models/hiera/hiera_mae.py @@ -0,0 +1,398 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# mae: https://github.com/facebookresearch/mae +# slowfast: https://github.com/facebookresearch/SlowFast +# -------------------------------------------------------- + + +from functools import partial +from typing import Tuple, Optional + +import math +import torch +import torch.nn as nn + +from .hiera import Hiera, HieraBlock +from .hiera_utils import pretrained_model, undo_windowing, conv_nd + + +def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: + if isinstance(head, nn.Identity): + return x + + B, num_mask_units = x.shape[0:2] + # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) + x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + + # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + permute = [0] + list(range(2, len(x.shape))) + [1] + x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + return x + + +class MaskedAutoencoderHiera(Hiera): + """Masked Autoencoder with Hiera backbone""" + + def __init__( + self, + in_chans: int = 3, + patch_stride: Tuple[int, ...] 
= (4, 4), + mlp_ratio: float = 4.0, + decoder_embed_dim: int = 512, + decoder_depth: int = 8, + decoder_num_heads: int = 16, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + **kwdargs, + ): + super().__init__( + in_chans=in_chans, + patch_stride=patch_stride, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + **kwdargs, + ) + + del self.norm, self.head + encoder_dim_out = self.blocks[-1].dim_out + self.encoder_norm = norm_layer(encoder_dim_out) + self.mask_unit_spatial_shape_final = [ + i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) + ] + self.tokens_spatial_shape_final = [ + i // s ** (self.q_pool) + for i, s in zip(self.tokens_spatial_shape, self.q_stride) + ] + # -------------------------------------------------------------------------- + # Multi-scale fusion heads + curr_mu_size = self.mask_unit_size + self.multi_scale_fusion_heads = nn.ModuleList() + + for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool + kernel = [ + i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) + ] + curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] + self.multi_scale_fusion_heads.append( + conv_nd(len(self.q_stride))( + self.blocks[i].dim_out, + encoder_dim_out, + kernel_size=kernel, + stride=kernel, + ) + ) + self.multi_scale_fusion_heads.append(nn.Identity()) # final stage, no transform + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = nn.Linear(encoder_dim_out, decoder_embed_dim) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) + + self.decoder_pos_embed = nn.Parameter( + torch.zeros( + 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim + ) + ) + + self.decoder_blocks = nn.ModuleList( + [ + HieraBlock( + dim=decoder_embed_dim, + dim_out=decoder_embed_dim, + heads=decoder_num_heads, + norm_layer=norm_layer, + mlp_ratio=mlp_ratio, + ) + for i in range(decoder_depth) + ] + ) + self.decoder_norm = norm_layer(decoder_embed_dim) + + self.pred_stride = patch_stride[-1] * ( + self.q_stride[-1] ** self.q_pool + ) # patch stride of prediction + + self.decoder_pred = nn.Linear( + decoder_embed_dim, + (self.pred_stride ** min(2, len(self.q_stride))) * in_chans, + ) # predictor + # -------------------------------------------------------------------------- + + self.initialize_weights() + + def initialize_weights(self): + nn.init.trunc_normal_(self.mask_token, std=0.02) + nn.init.trunc_normal_(self.decoder_pos_embed, std=0.02) + self.apply(self._mae_init_weights) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + def _mae_init_weights(self, m: nn.Module): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_pixel_label_2d( + self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + input_img = input_img.permute(0, 2, 3, 1) + + size = self.pred_stride + label = input_img.unfold(1, size, size).unfold(2, size, size) + label = label.flatten(1, 2).flatten(2) + label = label[mask] + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + 
return label + + def get_pixel_label_3d( + self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + + # We use time strided loss, only take the first frame from each token + input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + + size = self.pred_stride + label = input_vid.unfold(3, size, size).unfold(4, size, size) + label = label.permute(0, 2, 3, 4, 5, 6, 1) # Different from 2d, mistake during training lol + label = label.flatten(1, 3).flatten(2) + label = label[mask] + + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + return label + + + def forward_encoder( + self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if mask is None: + mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + + # Get multi-scale representations from encoder + _, intermediates = super().forward(x, mask, return_intermediates=True) + # Resolution unchanged after q_pool stages, so skip those features + intermediates = intermediates[: self.q_pool] + intermediates[-1:] + + # Multi-scale fusion + x = 0.0 + for head, interm_x in zip(self.multi_scale_fusion_heads, intermediates): + x += apply_fusion_head(head, interm_x) + + x = self.encoder_norm(x) + + return x, mask + + def forward_decoder( + self, x: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Embed tokens + x = self.decoder_embed(x) + + # Combine visible and mask tokens + + # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [B, #MUs_all] + x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) + mask_tokens = self.mask_token.view( + (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) + ) + mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) + mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() + x_dec[mask] = x.flatten() + x_dec = ~mask * mask_tokens + mask * x_dec + + # Get back spatial order + x = undo_windowing( + x_dec, + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + mask = undo_windowing( + mask[..., 0:1], + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + + # Flatten + x = x.reshape(x.shape[0], -1, x.shape[-1]) + mask = mask.view(x.shape[0], -1) + + # Add pos embed + x = x + self.decoder_pos_embed + + # Apply decoder blocks + for blk in self.decoder_blocks: + x = blk(x) + x = self.decoder_norm(x) + + # Predictor projection + x = self.decoder_pred(x) + + return x, mask + + def forward_loss( + self, x: torch.Tensor, pred: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Note: in mask, 0 is *visible*, 1 is *masked* + + x: e.g. 
[B, 3, H, W] + pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + """ + if len(self.q_stride) == 2: + label = self.get_pixel_label_2d(x, mask) + elif len(self.q_stride) == 3: + label = self.get_pixel_label_3d(x, mask) + else: + raise NotImplementedError + + pred = pred[mask] + loss = (pred - label) ** 2 + + return loss.mean(), pred, label + + def forward( + self, + x: torch.Tensor, + mask_ratio: float = 0.6, + mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) + pred, pred_mask = self.forward_decoder( + latent, mask + ) # pred_mask is mask at resolution of *prediction* + + # Toggle mask, to generate labels for *masked* tokens + return *self.forward_loss(x, pred, ~pred_mask), mask + + + + +# Image Models + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k") +def mae_hiera_tiny_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k") +def mae_hiera_small_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k") +def mae_hiera_base_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k") +def mae_hiera_base_plus_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k") +def mae_hiera_large_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k") +def mae_hiera_huge_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + + +# Video Models + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400") +def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): + return MaskedAutoencoderHiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + q_pool=2, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_base_plus_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_large_16x224(**kwdargs): + return 
mae_hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400") +def mae_hiera_huge_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py new file mode 100644 index 000000000000..992c03e08079 --- /dev/null +++ b/src/transformers/models/hiera/hiera_utils.py @@ -0,0 +1,287 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from typing import List, Tuple, Optional, Type, Callable, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from .convert_hiera_to_pytorch import e + +def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: + """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + + def inner(model_func: Callable) -> Callable: + def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + # state_dict["model_state"] = e(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if "num_classes" not in kwdargs: + kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = model_func(**kwdargs) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + return model + + return model_def + + return inner + + + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. 
+ E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. + self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. 
+ """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x \ No newline at end of file From ec4111f144ae048c842d96f729dbcaacc1faf053 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 02:17:36 +0000 Subject: [PATCH 002/118] Updated variable names --- .../models/hiera/convert_hiera_to_pytorch.py | 30 +-- src/transformers/models/hiera/hiera.py | 200 +++++++++--------- src/transformers/models/hiera/hiera_mae.py | 42 ++-- src/transformers/models/hiera/hiera_utils.py | 6 +- 4 files changed, 141 insertions(+), 137 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 506507e4e66e..f1d0c4135796 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -7,21 +7,25 @@ def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "patch_embed.projection") + # if "patch_embed.proj" in name: + # name = name.replace("patch_embed.proj", "patch_embed.projection") + # # elif "block.proj" in name: + # # name = name.replace("block.proj", "block.projection") + # elif "attn.proj" in name: + # name = name.replace("attn.proj", "attn.projection") + if ".proj." in name: + name = name.replace(".proj.", ".projection.") + if "attn" in name: + name = name.replace("attn", "attention") + if "pos_embed" in name: + name = name.replace("pos_embed", "position_embeddings") + if "patch_embed" in name: + name = name.replace("patch_embed", "patch_embedding") return name -def e(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - pass - else: - new_name = rename_key(key) - orig_state_dict[new_name] = val - - return orig_state_dict +def convert_state_dict(orig_state_dict, config): + updated_model_state = {rename_key(k): v for k, v in orig_state_dict.items()} + return updated_model_state diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 35e8c93e160b..fcb04f68934e 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -42,47 +42,47 @@ class MaskUnitAttention(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): """ Args: - - dim, dim_out: The input and output feature dimensions. - - heads: The number of attention heads. 
+ - input_dim, output_dim: The input and output feature dimensions. + - number_of_heads: The number of attention number_of_heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - - use_mask_unit_attn: Use Mask Unit or Global Attention. + - use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() - self.dim = dim - self.dim_out = dim_out - self.heads = heads + self.input_dim = input_dim + self.output_dim = output_dim + self.number_of_heads = number_of_heads self.q_stride = q_stride - self.head_dim = dim_out // heads + self.head_dim = output_dim // number_of_heads self.scale = (self.head_dim) ** -0.5 - self.qkv = nn.Linear(dim, 3 * dim_out) - self.proj = nn.Linear(dim_out, dim_out) + self.qkv = nn.Linear(input_dim, 3 * output_dim) + self.projection = nn.Linear(output_dim, output_dim) self.window_size = window_size - self.use_mask_unit_attn = use_mask_unit_attn + self.use_mask_unit_attention = use_mask_unit_attention def forward(self, x: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. """ - B, N, _ = x.shape + batch_size , num_channels , _ = x.shape num_windows = ( - (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( self.qkv(x) - .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -90,7 +90,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -99,52 +99,52 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Note: the original paper did *not* use SDPA, it's a free boost! 
x = F.scaled_dot_product_attention(q, k, v) else: - attn = (q * self.scale) @ k.transpose(-1, -2) - attn = attn.softmax(dim=-1) - x = (attn @ v) + attention = (q * self.scale) @ k.transpose(-1, -2) + attention = attention.softmax(dim=-1) + x = (attention @ v) - x = x.transpose(1, 3).reshape(B, -1, self.dim_out) - x = self.proj(x) + x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + x = self.projection(x) return x class HieraBlock(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, mlp_ratio: float = 4.0, drop_path: float = 0.0, norm_layer: nn.Module = nn.LayerNorm, act_layer: nn.Module = nn.GELU, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): super().__init__() - self.dim = dim - self.dim_out = dim_out + self.input_dim = input_dim + self.output_dim = output_dim - self.norm1 = norm_layer(dim) - self.attn = MaskUnitAttention( - dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + self.norm1 = norm_layer(input_dim) + self.attention = MaskUnitAttention( + input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention ) - self.norm2 = norm_layer(dim_out) - self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + self.norm2 = norm_layer(output_dim) + self.mlp = Mlp(output_dim, int(output_dim * mlp_ratio), act_layer=act_layer) self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() - if dim != dim_out: - self.proj = nn.Linear(dim, dim_out) + if input_dim != output_dim: + self.projection = nn.Linear(input_dim, output_dim) def forward(self, x: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - x_norm = self.norm1(x) - if self.dim != self.dim_out: - x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) - x = x + self.drop_path(self.attn(x_norm)) + normalized_input = self.norm1(x) + if self.input_dim != self.output_dim: + x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) + x = x + self.drop_path(self.attention(normalized_input)) # MLP x = x + self.drop_path(self.mlp(self.norm2(x))) @@ -154,14 +154,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Head(nn.Module): def __init__( self, - dim: int, + input_dim: int, num_classes: int, dropout_rate: float = 0.0, act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), ): super().__init__() self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - self.projection = nn.Linear(dim, num_classes) + self.projection = nn.Linear(input_dim, num_classes) # act_fun for eval and testing only self.act_func = act_func @@ -173,13 +173,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchEmbed(nn.Module): - """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" +class PatchEmbedding(nn.Module): + """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" def __init__( self, dim_in: int, - dim_out: int, + output_dim: int, kernel: Tuple[int, ...], stride: Tuple[int, ...], padding: Tuple[int, ...], @@ -188,9 +188,9 @@ def __init__( # Support any number of spatial dimensions self.spatial_dims = len(kernel) - self.proj = conv_nd(self.spatial_dims)( + self.projection = conv_nd(self.spatial_dims)( dim_in, - dim_out, + output_dim, kernel_size=kernel, stride=stride, padding=padding, @@ -199,7 +199,7 @@ def __init__( def forward( self, x: torch.Tensor, mask: 
Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.proj, mask) + x = do_masked_conv(x, self.projection, mask) x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) return x @@ -209,8 +209,8 @@ def __init__( self, input_size: Tuple[int, ...] = (224, 224), in_chans: int = 3, - embed_dim: int = 96, # initial embed dim - num_heads: int = 1, # initial number of heads + embedding_dimention: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads num_classes: int = 1000, stages: Tuple[int, ...] = (2, 3, 16, 3), q_pool: int = 3, # number of q_pool stages @@ -228,7 +228,7 @@ def __init__( norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), head_dropout: float = 0.0, head_init_scale: float = 0.001, - sep_pos_embed: bool = False, + sep_position_embeddings: bool = False, ): super().__init__() @@ -247,24 +247,24 @@ def __init__( ] self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] - self.patch_embed = PatchEmbed( - in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + self.patch_embedding = PatchEmbedding( + in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding ) - self.sep_pos_embed = sep_pos_embed - if sep_pos_embed: - self.pos_embed_spatial = nn.Parameter( + self.sep_position_embeddings = sep_position_embeddings + if sep_position_embeddings: + self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embed_dim, + embedding_dimention, ) ) - self.pos_embed_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + self.position_embeddings_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) ) else: - self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) # Setup roll and reroll modules self.unroll = Unroll( @@ -287,43 +287,43 @@ def __init__( self.blocks = nn.ModuleList() for i in range(depth): - dim_out = embed_dim + output_dim = embedding_dimention # Mask unit or global attention. 
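The stage bookkeeping above is easiest to check with concrete numbers. A standalone sketch using the hiera_base_224 defaults from this file (stages=(2, 3, 16, 3), q_pool=3); the two comprehensions are copied from the constructor:

stages = (2, 3, 16, 3)  # block counts per stage for hiera_base_224
q_pool = 3              # number of q-pooling stages

stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
q_pool_blocks = [x + 1 for x in stage_ends[:q_pool]]

print(stage_ends)     # [1, 4, 20, 23] -> index of the last block in each stage
print(q_pool_blocks)  # [2, 5, 21]     -> blocks that pool q and widen the embedding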
# Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attn = mask_unit_attn[cur_stage] + use_mask_unit_attention = mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - dim_out = int(embed_dim * dim_mul) - num_heads = int(num_heads * head_mul) + output_dim = int(embedding_dimention * dim_mul) + number_of_heads = int(number_of_heads * head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride block = HieraBlock( - dim=embed_dim, - dim_out=dim_out, - heads=num_heads, + input_dim=embedding_dimention, + output_dim=output_dim, + number_of_heads=number_of_heads, mlp_ratio=mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), window_size=flat_mu_size, - use_mask_unit_attn=use_mask_unit_attn, + use_mask_unit_attention=use_mask_unit_attention, ) - embed_dim = dim_out + embedding_dimention = output_dim self.blocks.append(block) - self.norm = norm_layer(embed_dim) - self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(embedding_dimention) + self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) # Initialize everything - if sep_pos_embed: - nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) - nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + if sep_position_embeddings: + nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) + nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: - nn.init.trunc_normal_(self.pos_embed, std=0.02) + nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) self.head.projection.weight.data.mul_(head_init_scale) self.head.projection.bias.data.mul_(head_init_scale) @@ -339,21 +339,21 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_pos_embed: - return ["pos_embed_spatial", "pos_embed_temporal"] + if self.sep_position_embeddings: + return ["position_embeddings_spatial", "position_embeddings_temporal"] else: - return ["pos_embed"] + return ["position_embeddings"] def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: """ Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. 
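The method body (continued in the diff below) is the standard MAE-style shuffle, only applied at mask-unit granularity and with inverted semantics (1 keeps a unit, 0 drops it). A self-contained sketch of the same logic; num_windows is the number of mask units, e.g. 49 for the default 224x224 setup:

import torch

batch_size, num_windows, mask_ratio = 2, 49, 0.6
len_keep = int(num_windows * (1 - mask_ratio))

noise = torch.rand(batch_size, num_windows)
ids_shuffle = torch.argsort(noise, dim=1)
ids_restore = torch.argsort(ids_shuffle, dim=1)

mask = torch.zeros(batch_size, num_windows)
mask[:, :len_keep] = 1                                  # 1 = *keep*, 0 = *remove*
mask = torch.gather(mask, dim=1, index=ids_restore).bool()

assert mask.sum(dim=-1).eq(len_keep).all()              # same number kept in every sample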
""" - B = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(B, num_windows, device=x.device) + noise = torch.rand(batch_size , num_windows, device=x.device) # Sort noise for each sample ids_shuffle = torch.argsort( @@ -363,24 +363,24 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([B, num_windows], device=x.device) + mask = torch.zeros([batch_size , num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) return mask.bool() - def get_pos_embed(self) -> torch.Tensor: - if self.sep_pos_embed: - return self.pos_embed_spatial.repeat( + def get_position_embeddings(self) -> torch.Tensor: + if self.sep_position_embeddings: + return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( - self.pos_embed_temporal, + self.position_embeddings_temporal, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], dim=1, ) else: - return self.pos_embed + return self.position_embeddings def forward( self, @@ -389,7 +389,7 @@ def forward( return_intermediates: bool = False, ) -> torch.Tensor: """ - mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list @@ -397,15 +397,15 @@ def forward( x = x[0] intermediates = [] - x = self.patch_embed( + x = self.patch_embedding( x, mask=mask.view( x.shape[0], 1, *self.mask_spatial_shape - ) # B, C, *mask_spatial_shape + ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_pos_embed() + x = x + self.get_position_embeddings() x = self.unroll(x) # Discard masked tokens @@ -442,7 +442,7 @@ def forward( "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") def hiera_tiny_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) @pretrained_model({ @@ -450,7 +450,7 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ @@ -458,7 +458,7 @@ def hiera_small_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -466,7 +466,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +474,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +482,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -500,7 +500,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): patch_kernel=(3, 7, 7), patch_stride=(2, 4, 4), patch_padding=(1, 3, 3), - sep_pos_embed=True, + sep_position_embeddings=True, **kwdargs ) @@ -511,7 +511,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +521,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +531,5 @@ def 
hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 64c69cc89d71..a0504997350b 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -25,14 +25,14 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - B, num_mask_units = x.shape[0:2] - # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + batch_size , num_mask_units = x.shape[0:2] + # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) - # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -132,7 +132,7 @@ def initialize_weights(self): self.apply(self._mae_init_weights) # initialize patch_embed like nn.Linear (instead of nn.Conv2d) - w = self.patch_embed.proj.weight.data + w = self.patch_embed.projection.weight.data nn.init.xavier_uniform_(w.view([w.shape[0], -1])) def _mae_init_weights(self, m: nn.Module): @@ -188,7 +188,7 @@ def forward_encoder( ) -> Tuple[torch.Tensor, torch.Tensor]: if mask is None: - mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] # Get multi-scale representations from encoder _, intermediates = super().forward(x, mask, return_intermediates=True) @@ -212,8 +212,8 @@ def forward_decoder( # Combine visible and mask tokens - # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] - # mask: [B, #MUs_all] + # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) mask_tokens = self.mask_token.view( (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) @@ -258,9 +258,9 @@ def forward_loss( """ Note: in mask, 0 is *visible*, 1 is *masked* - x: e.g. [B, 3, H, W] - pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + x: e.g. 
[batch_size , 3, H, W] + pred: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] """ if len(self.q_stride) == 2: label = self.get_pixel_label_2d(x, mask) @@ -299,7 +299,7 @@ def forward( }, default="mae_in1k") def mae_hiera_tiny_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, ) @@ -308,7 +308,7 @@ def mae_hiera_tiny_224(**kwargs): }, default="mae_in1k") def mae_hiera_small_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, ) @@ -317,7 +317,7 @@ def mae_hiera_small_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -326,7 +326,7 @@ def mae_hiera_base_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_plus_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -335,7 +335,7 @@ def mae_hiera_base_plus_224(**kwargs): }, default="mae_in1k") def mae_hiera_large_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -344,7 +344,7 @@ def mae_hiera_large_224(**kwargs): }, default="mae_in1k") def mae_hiera_huge_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -375,7 +375,7 @@ def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): @pretrained_model(None) def mae_hiera_base_plus_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -385,7 +385,7 @@ def mae_hiera_base_plus_16x224(**kwdargs): @pretrained_model(None) def mae_hiera_large_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -394,5 +394,5 @@ def mae_hiera_large_16x224(**kwdargs): }, default="mae_k400") def mae_hiera_huge_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index 992c03e08079..c96c63cbfaf9 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -24,7 +24,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .convert_hiera_to_pytorch import e +from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
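A usage sketch for the constructors this decorator wraps in hiera.py and hiera_mae.py; the checkpoint keys are the ones registered earlier in this patch, the weights are downloaded on demand, and the import paths assume this patch series is installed:

from transformers.models.hiera.hiera import hiera_base_224
from transformers.models.hiera.hiera_mae import mae_hiera_base_224

model = hiera_base_224(pretrained=True, checkpoint="mae_in1k_ft_in1k")
mae_model = mae_hiera_base_224(pretrained=True, checkpoint="mae_in1k")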
""" @@ -40,7 +40,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - # state_dict["model_state"] = e(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if "num_classes" not in kwdargs: @@ -53,7 +53,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool model = model_func(**kwdargs) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False model.load_state_dict(state_dict["model_state"], strict=strict) From 126de187bdaf7628d04f70f7d788581fdb45be2c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 08:10:34 +0000 Subject: [PATCH 003/118] Added Config class, basic HF setup, convert_to_hf --- src/transformers/__init__.py | 6 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/hiera/__init__.py | 157 ++++++++----- .../models/hiera/configuration_hiera.py | 193 +++++++--------- .../models/hiera/convert_hiera_to_pytorch.py | 212 ++++++++++++++++++ src/transformers/models/hiera/hiera.py | 129 +++++------ 6 files changed, 470 insertions(+), 230 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 84a664580227..aa1d07603390 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -496,6 +496,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], + "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -5247,6 +5248,7 @@ GroupViTVisionConfig, ) from .models.herbert import HerbertTokenizer + from .models.hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig from .models.idefics import ( @@ -6941,6 +6943,10 @@ HubertModel, HubertPreTrainedModel, ) + from .models.hiera import ( + Hiera, + HieraBlock + ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 682241ea4a84..97ca773d1113 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -115,6 +115,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), + ("hiera","HieraConfig") ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -347,6 +348,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") 
("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -579,6 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), + ("hiera","Hiera") ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index bfd200e9dcb9..3ea6efb0056a 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -1,28 +1,18 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import TYPE_CHECKING from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, - is_flax_available, - is_tf_available, is_torch_available, ) -_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} +_import_structure = { + "configuration_hiera": [ + "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HireaConfig", + ], +} try: if not is_torch_available(): @@ -30,28 +20,20 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_vit_mae"] = [ - "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", - "ViTMAEForPreTraining", - "ViTMAELayer", - "ViTMAEModel", - "ViTMAEPreTrainedModel", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_vit_mae"] = [ - "TFViTMAEForPreTraining", - "TFViTMAEModel", - "TFViTMAEPreTrainedModel", + _import_structure["hirea"] = [ + "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hirea", + "Head", + "HieraBlock", + "MaskUnitAttention" + "" ] if TYPE_CHECKING: - from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + from .configuration_hiera import ( + HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, + HieraConfig, + ) try: if not is_torch_available(): @@ -59,24 +41,99 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_vit_mae import ( - VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, - ViTMAEForPreTraining, - ViTMAELayer, - ViTMAEModel, - ViTMAEPreTrainedModel, + from .hiera import ( + Hiera, + Head, + HieraBlock, + MaskUnitAttention, ) - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - else: import sys sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + +####### PREV: + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# from typing import TYPE_CHECKING + +# from ...utils import ( +# OptionalDependencyNotAvailable, +# _LazyModule, +# is_flax_available, +# is_tf_available, +# is_torch_available, +# ) + + +# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_vit_mae"] = [ +# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", +# "ViTMAEForPreTraining", +# "ViTMAELayer", +# "ViTMAEModel", +# "ViTMAEPreTrainedModel", +# ] + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_tf_vit_mae"] = [ +# "TFViTMAEForPreTraining", +# "TFViTMAEModel", +# "TFViTMAEPreTrainedModel", +# ] + +# if TYPE_CHECKING: +# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_vit_mae import ( +# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, +# ViTMAEForPreTraining, +# ViTMAELayer, +# ViTMAEModel, +# ViTMAEPreTrainedModel, +# ) + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +# else: +# import sys + +# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index de5de9e7d9e9..c7dfaeaeedfb 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -2,127 +2,108 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging - +from typing import Tuple logger = logging.get_logger(__name__) -VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", - # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + } -class ViTMAEConfig(PretrainedConfig): +class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT - MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the ViT - [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with + the defaults will yield a similar configuration to that of the Hiera + [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. - decoder_num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the decoder. - decoder_hidden_size (`int`, *optional*, defaults to 512): - Dimensionality of the decoder. - decoder_num_hidden_layers (`int`, *optional*, defaults to 8): - Number of hidden layers in the decoder. - decoder_intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. - mask_ratio (`float`, *optional*, defaults to 0.75): - The ratio of the number of masked tokens in the input sequence. - norm_pix_loss (`bool`, *optional*, defaults to `False`): - Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved - representation quality in the experiments of the authors. - - Example: - - ```python - >>> from transformers import ViTMAEConfig, ViTMAEModel - - >>> # Initializing a ViT MAE vit-mae-base style configuration - >>> configuration = ViTMAEConfig() - - >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = ViTMAEModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "vit_mae" - + input_size (Tuple[int, ...], optional): Dimensions of the input image (height, width). Defaults to (224, 224). + in_chans (int, optional): Number of input channels. Defaults to 3. 
+ embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. + number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. + num_classes (int, optional): Number of output classes. Defaults to 1000. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. + q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). + mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). + dim_mul (float, optional): Factor for increasing the dimensionality through the network. Defaults to 2.0. + head_mul (float, optional): Factor for increasing the number of heads through the network. Defaults to 2.0. + patch_kernel (Tuple[int, ...], optional): Kernel size for patch embedding. Defaults to (7, 7). + patch_stride (Tuple[int, ...], optional): Stride for patch embedding. Defaults to (4, 4). + patch_padding (Tuple[int, ...], optional): Padding for patch embedding. Defaults to (3, 3). + mlp_ratio (float, optional): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. + drop_path_rate (float, optional): Dropout rate for stochastic depth. Defaults to 0.0. + head_dropout (float, optional): Dropout rate for attention heads. Defaults to 0.0. + head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. + sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. + + + Example: + ```python + >>> from transformers import HieraConfig, Hiera + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = HieraConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = Hiera(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "hiera" def __init__( self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - decoder_num_attention_heads=16, - decoder_hidden_size=512, - decoder_num_hidden_layers=8, - decoder_intermediate_size=2048, - mask_ratio=0.75, - norm_pix_loss=False, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] 
= (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, **kwargs, + ): super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_hidden_size = decoder_hidden_size - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.decoder_intermediate_size = decoder_intermediate_size - self.mask_ratio = mask_ratio - self.norm_pix_loss = norm_pix_loss + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index f1d0c4135796..77556120bcb4 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,6 +3,12 @@ import requests import torch from PIL import Image +# from .configuration_hiera import HieraConfig +# from .hiera import Hiera +# from transformers import HieraConfig, Hiera +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -29,3 +35,209 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state + +class HieraImageProcessor: + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm + + + +def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): + pretrained_models_links = { + "hiera_tiny_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", + }, + "hiera_small_224": { + 
"mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", + }, + "hiera_base_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", + }, + "hiera_base_plus_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", + }, + "hiera_large_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", + }, + "hiera_huge_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", + }, + "hiera_base_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", + }, + "hiera_base_plus_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", + }, + "hiera_large_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", + }, + "hiera_huge_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", + } + } + + + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + checkpoints = pretrained_models_links["hiera_tiny_224"] + checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + checkpoints = pretrained_models_links["hiera_small_224"] + checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_224"] + checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_plus_224"] + checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + checkpoints = pretrained_models_links["hiera_large_224"] + checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + checkpoints = pretrained_models_links["hiera_huge_224"] + checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere + input_size=(16, 224, 224), + 
q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + checkpoints = pretrained_models_links["hiera_base_16x224"] + checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + checkpoints = pretrained_models_links["hiera_base_plus_16x224"] + checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + checkpoints = pretrained_models_links["hiera_large_16x224"] + checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + checkpoints = pretrained_models_links["hiera_huge_16x224"] + checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] + + + pretrained = True + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if config.num_classes is None: + config.num_classes = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif config.num_classes != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = Hiera(config) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + + + + url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" + + image = Image.open(requests.get(url, stream=True).raw) + + + image_processor = HieraImageProcessor(size=config.image_size) + inputs = image_processor.process_image(images=image, return_tensors="pt") + + # forward pass + out = model(inputs[None, ...]) + + # 207: golden retriever (imagenet-1k) + out.argmax(dim=-1).item() + + + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" + convert_Hiera_checkpoint(checkpoint_url, 
pytorch_dump_folder_path="~/") + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index fcb04f68934e..7e42d5914d44 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -21,7 +21,7 @@ import math from functools import partial from typing import List, Tuple, Callable, Optional - +from .configuration_hiera import HieraConfig import torch import torch.nn as nn import torch.nn.functional as F @@ -205,106 +205,85 @@ def forward( class Hiera(nn.Module): - def __init__( - self, - input_size: Tuple[int, ...] = (224, 224), - in_chans: int = 3, - embedding_dimention: int = 96, # initial embedding input_dim - number_of_heads: int = 1, # initial number of number_of_heads - num_classes: int = 1000, - stages: Tuple[int, ...] = (2, 3, 16, 3), - q_pool: int = 3, # number of q_pool stages - q_stride: Tuple[int, ...] = (2, 2), - mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) - # mask_unit_attn: which stages use mask unit attention? - mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), - dim_mul: float = 2.0, - head_mul: float = 2.0, - patch_kernel: Tuple[int, ...] = (7, 7), - patch_stride: Tuple[int, ...] = (4, 4), - patch_padding: Tuple[int, ...] = (3, 3), - mlp_ratio: float = 4.0, - drop_path_rate: float = 0.0, - norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), - head_dropout: float = 0.0, - head_init_scale: float = 0.001, - sep_position_embeddings: bool = False, - ): + def __init__(self, config: HieraConfig): super().__init__() - - depth = sum(stages) - self.patch_stride = patch_stride - self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + self.config = config + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed + self.config = config + depth = sum(self.config.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(mask_unit_size) - flat_q_stride = math.prod(q_stride) + flat_mu_size = math.prod(self.config.mask_unit_size) + flat_q_stride = math.prod(self.config.q_stride) - assert q_pool < len(stages) - self.q_pool, self.q_stride = q_pool, q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + assert self.config.q_pool < len(self.config.stages) + self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] self.patch_embedding = PatchEmbedding( - in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding + self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding ) - self.sep_position_embeddings = sep_position_embeddings - if sep_position_embeddings: + if self.config.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embedding_dimention, + self.config.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) + 
torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.config.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - input_size, - patch_stride, - [q_stride] * len(self.stage_ends[:-1]), + self.config.input_size, + self.config.patch_stride, + [self.config.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - q_pool, + self.config.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = embedding_dimention + output_dim = self.config.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = mask_unit_attn[cur_stage] + use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(embedding_dimention * dim_mul) - number_of_heads = int(number_of_heads * head_mul) + output_dim = int(self.config.embedding_dimension * self.config.dim_mul) + number_of_heads = int(self.config.number_of_heads * self.config.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride + else: + number_of_heads = self.config.number_of_heads block = HieraBlock( - input_dim=embedding_dimention, + input_dim=self.config.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, + mlp_ratio=self.config.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -312,21 +291,21 @@ def __init__( use_mask_unit_attention=use_mask_unit_attention, ) - embedding_dimention = output_dim + self.config.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(embedding_dimention) - self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(self.config.embedding_dimension) + self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) # Initialize everything - if sep_position_embeddings: + if self.config.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(head_init_scale) - self.head.projection.bias.data.mul_(head_init_scale) + self.head.projection.weight.data.mul_(self.config.head_init_scale) + self.head.projection.bias.data.mul_(self.config.head_init_scale) def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -339,7 +318,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_position_embeddings: + if 
self.config.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -371,7 +350,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.sep_position_embeddings: + if self.config.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -441,8 +420,9 @@ def forward( "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) +def hiera_tiny_224(**kwargs): + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) + return Hiera(config) @pretrained_model({ @@ -450,15 +430,16 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) +def hiera_base_224(**kwargs): + config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + return Hiera(config) @pretrained_model({ @@ -466,7 +447,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +455,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +463,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -511,7 +492,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +502,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return 
hiera_base_16x224( - embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +512,5 @@ def hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) From 75a34406ccf2afb2f0c80b634007f320da23e5f6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 21:48:20 +0000 Subject: [PATCH 004/118] Fixed Convert function, added hiera to HF files, Initilized test files --- src/transformers/__init__.py | 7 + .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 3 + .../models/hiera/convert_hiera_to_pytorch.py | 56 ++-- src/transformers/models/hiera/hiera.py | 242 +++++++----------- .../models/hiera/hiera_image_processor.py | 56 ++++ tests/models/hiera/__init__.py | 0 tests/models/hiera/test_modeling_vit_mae.py | 44 ++++ 9 files changed, 226 insertions(+), 189 deletions(-) create mode 100644 src/transformers/models/hiera/hiera_image_processor.py create mode 100644 tests/models/hiera/__init__.py create mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aa1d07603390..d8018bfba4c3 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4117,6 +4117,13 @@ "TFGroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hiera", + + ] + ) _import_structure["models.hubert"].extend( [ "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0875b5b4faa4..520399067ec7 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -115,7 +115,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig") + ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -348,7 +348,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -581,7 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera") + ("hiera","Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1de0249831db..fde580b54580 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -114,6 +114,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", "GroupViTModel"), + ("hiera", "Hiera"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git 
a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 3ea6efb0056a..f88e32d03c98 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -47,6 +47,9 @@ HieraBlock, MaskUnitAttention, ) + from .hiera_image_processor import ( + HieraImageProcessor + ) else: import sys diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 77556120bcb4..d1b6e8a4ad30 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,8 +3,9 @@ import requests import torch from PIL import Image -# from .configuration_hiera import HieraConfig -# from .hiera import Hiera +from transformers.models.hiera.configuration_hiera import HieraConfig +from transformers.models.hiera.hiera import Hiera +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, Hiera from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -35,33 +36,8 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state - -class HieraImageProcessor: - def __init__(self, size): - self.size = size - self.transform_list = [ - transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) - ] - self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - - def process_image(self, image_url): - # Load the image - img = Image.open(requests.get(image_url, stream=True).raw) - - # Apply transformations - img_vis = self.transform_vis(img) - img_norm = self.transform_norm(img) - - return img_norm - - - -def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): + strict = True pretrained_models_links = { "hiera_tiny_224": { "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", @@ -121,9 +97,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(2, 3, 16, 3),) + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + checkpoints = pretrained_models_links["hiera_base_224"] checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] @@ -180,7 +155,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): stages=(2, 6, 36, 4) ) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] - + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") pretrained = True if pretrained: @@ -188,10 +164,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): raise RuntimeError("This model currently doesn't have pretrained weights available.") elif checkpoint is None: raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it @@ -202,24 +176,24 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config) + model = Hiera(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False - model.load_state_dict(state_dict["model_state"], strict=strict) + model.load_state_dict(state_dict["model_state"]) + # model.load_state_dict(state_dict["model_state"], strict=strict) url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - image = Image.open(requests.get(url, stream=True).raw) - image_processor = HieraImageProcessor(size=config.image_size) - inputs = image_processor.process_image(images=image, return_tensors="pt") + image_processor = HieraImageProcessor(size=224) + inputs = image_processor.process_image(image_url=url) # forward pass out = model(inputs[None, ...]) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7e42d5914d44..7bafed5c3cd0 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -25,11 +25,40 @@ import torch import torch.nn as nn import torch.nn.functional as F +from dataclasses import dataclass from timm.models.layers import DropPath, Mlp - -from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll - +from ...modeling_utils import PreTrainedModel +# from ...modeling_outputs import BaseModelOutput +# from ...utils import ( +# ModelOutput, +# add_start_docstrings, +# add_start_docstrings_to_model_forward, +# logging, +# replace_return_docstrings, +# ) + +from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +# @dataclass +# class HieraModelOutput(ModelOutput): +# """ +# Base class for Hiera model's outputs. + +# Args: +# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): +# Last layer hidden-states. +# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): +# Attentions weights from the model, one for each layer. +# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): +# Hidden states of the model at the output of each layer. +# intermediates (list[torch.Tensor], optional): +# Intermediate representations or features from the model, if applicable. 
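
The tail of `convert_Hiera_checkpoint` above builds a config, preprocesses one image with the new `HieraImageProcessor`, and runs a forward pass. A minimal, hedged sketch of that verification flow, using the `hiera_base_224` settings and the image URL from the script; any remaining hyperparameters are assumed to come from the `HieraConfig` defaults:

```python
# Hedged sketch of the verification flow at the end of convert_Hiera_checkpoint:
# build a config, preprocess one image, run a forward pass. The config values mirror
# the hiera_base_224 branch above; everything else is assumed to come from the
# HieraConfig defaults.
import torch

from transformers.models.hiera.configuration_hiera import HieraConfig
from transformers.models.hiera.hiera import Hiera
from transformers.models.hiera.hiera_image_processor import HieraImageProcessor

config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = Hiera(config=config).eval()

image_processor = HieraImageProcessor(size=224)
pixel_values = image_processor.process_image(
    image_url="https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg"
)

with torch.no_grad():
    out = model(pixel_values[None, ...])  # add the batch dimension, as the script does
print(out.shape)  # classification logits, (1, num_classes), since no mask is given
```
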
+# """ +# last_hidden_state: torch.FloatTensor +# attentions: Optional[Tuple[torch.FloatTensor]] = None +# hidden_states: Optional[Tuple[torch.FloatTensor]] = None +# intermediates: Optional[list[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -204,86 +233,110 @@ def forward( return x -class Hiera(nn.Module): +class Hiera(PreTrainedModel): + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "x" + supports_gradient_checkpointing = True + def __init__(self, config: HieraConfig): - super().__init__() + self.input_size = config.input_size + self.in_chans = config.in_chans + self.embedding_dimension = config.embedding_dimension + self.number_of_heads = config.number_of_heads + self.num_classes = config.num_classes + self.stages = config.stages + self.q_pool = config.q_pool + self.q_stride = config.q_stride + self.mask_unit_size = config.mask_unit_size + self.mask_unit_attn = config.mask_unit_attn + self.dim_mul = config.dim_mul + self.head_mul = config.head_mul + self.patch_kernel = config.patch_kernel + self.patch_stride = config.patch_stride + self.patch_padding = config.patch_padding + self.mlp_ratio = config.mlp_ratio + self.drop_path_rate = config.drop_path_rate + self.head_dropout = config.head_dropout + self.head_init_scale = config.head_init_scale + self.sep_position_embeddings = config.sep_position_embeddings + + super().__init__(config) self.config = config - super().__init__() norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed - self.config = config - depth = sum(self.config.stages) - self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + depth = sum(self.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.input_size, self.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(self.config.mask_unit_size) - flat_q_stride = math.prod(self.config.q_stride) + flat_mu_size = math.prod(self.mask_unit_size) + flat_q_stride = math.prod(self.q_stride) - assert self.config.q_pool < len(self.config.stages) - self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size + assert self.q_pool < len(self.stages) + self.q_pool, self.q_stride = self.q_pool, self.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( - self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding + self.in_chans, self.embedding_dimension, self.patch_kernel, self.patch_stride, self.patch_padding ) - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - self.config.embedding_dimension, + self.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) + torch.zeros(1, self.tokens_spatial_shape[0], self.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, 
self.config.embedding_dimension)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) + self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - self.config.input_size, - self.config.patch_stride, - [self.config.q_stride] * len(self.stage_ends[:-1]), + self.input_size, + self.patch_stride, + [self.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - self.config.q_pool, + self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = self.config.embedding_dimension + output_dim = self.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] + use_mask_unit_attention = self.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(self.config.embedding_dimension * self.config.dim_mul) - number_of_heads = int(self.config.number_of_heads * self.config.head_mul) + output_dim = int(self.embedding_dimension * self.dim_mul) + number_of_heads = int(self.number_of_heads * self.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride else: - number_of_heads = self.config.number_of_heads + number_of_heads = self.number_of_heads block = HieraBlock( - input_dim=self.config.embedding_dimension, + input_dim=self.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=self.config.mlp_ratio, + mlp_ratio=self.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -291,21 +344,22 @@ def __init__(self, config: HieraConfig): use_mask_unit_attention=use_mask_unit_attention, ) - self.config.embedding_dimension = output_dim + self.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(self.config.embedding_dimension) - self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) + self.norm = norm_layer(self.embedding_dimension) + self.head = Head(self.embedding_dimension, self.num_classes, dropout_rate=self.head_dropout) # Initialize everything - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(self.config.head_init_scale) - self.head.projection.bias.data.mul_(self.config.head_init_scale) + self.head.projection.weight.data.mul_(self.head_init_scale) + self.head.projection.bias.data.mul_(self.head_init_scale) + self.post_init() def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -318,7 +372,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def 
no_weight_decay(self): - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -350,7 +404,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -411,106 +465,4 @@ def forward( if return_intermediates: return x, intermediates - return x - - -# Image models - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwargs): - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_small_224(**kwdargs): - return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwargs): - config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_large_224(**kwdargs): - return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) - - -# Video models - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_16x224(num_classes: int = 400, **kwdargs): - return Hiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", - "mae_k400": 
"https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_plus_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_large_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_huge_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return x \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py new file mode 100644 index 000000000000..4900e4a4d3fb --- /dev/null +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -0,0 +1,56 @@ + +"""Image processor class for Hirea.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_vision_available, logging +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from PIL import Image +import requests + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class HieraImageProcessor(BaseImageProcessor): + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm \ No newline at end of file diff --git a/tests/models/hiera/__init__.py b/tests/models/hiera/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py new file mode 100644 index 000000000000..014d41766a8e --- /dev/null +++ b/tests/models/hiera/test_modeling_vit_mae.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViTMAE model. """ + + +import math +import tempfile +import unittest + +import numpy as np + +from transformers import ViTMAEConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ViTMAEForPreTraining, ViTMAEModel + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTImageProcessor \ No newline at end of file From 5569dad499855d951b2e0d6096583eee3b5e6916 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 23:41:40 +0000 Subject: [PATCH 005/118] better naming for x in forward pass --- src/transformers/__init__.py | 4 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/hiera/__init__.py | 2 +- .../models/hiera/configuration_hiera.py | 8 +- .../models/hiera/convert_hiera_to_pytorch.py | 10 +- src/transformers/models/hiera/hiera.py | 163 ++++++++++-------- src/transformers/models/hiera/hiera_mae.py | 6 +- src/transformers/models/hiera/hiera_utils.py | 6 +- tests/models/hiera/test_modeling_hiera.py | 87 ++++++++++ tests/models/hiera/test_modeling_vit_mae.py | 44 ----- 11 files changed, 199 insertions(+), 135 deletions(-) create mode 100644 tests/models/hiera/test_modeling_hiera.py delete mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d8018bfba4c3..d3646a75f940 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4120,7 +4120,7 @@ _import_structure["models.hiera"].extend( [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hiera", + "HieraModel", ] ) @@ -6951,7 +6951,7 @@ HubertPreTrainedModel, ) from .models.hiera import ( - Hiera, + HieraModel, HieraBlock ) from .models.ibert import ( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 520399067ec7..58ce7f77f5a8 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -581,7 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera"), + ("hiera","HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index fde580b54580..ddb20abdcc12 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -114,7 +114,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", 
"GroupViTModel"), - ("hiera", "Hiera"), + ("hiera", "HieraModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index f88e32d03c98..0434517bf52c 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -42,7 +42,7 @@ pass else: from .hiera import ( - Hiera, + HieraModel, Head, HieraBlock, MaskUnitAttention, diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index c7dfaeaeedfb..e3133354f6ea 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -13,8 +13,8 @@ class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the Hiera + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an HieraModel model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the HieraModel [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -46,13 +46,13 @@ class HieraConfig(PretrainedConfig): Example: ```python - >>> from transformers import HieraConfig, Hiera + >>> from transformers import HieraConfig, HieraModel >>> # Initializing a ViT MAE vit-mae-base style configuration >>> configuration = HieraConfig() >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = Hiera(configuration) + >>> model = HieraModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d1b6e8a4ad30..d0294f12deab 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,10 +3,10 @@ import requests import torch from PIL import Image -from transformers.models.hiera.configuration_hiera import HieraConfig -from transformers.models.hiera.hiera import Hiera -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, Hiera +# from transformers.models.hiera.configuration_hiera import HieraConfig +# from transformers.models.hiera.hiera import HieraModel +# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +# from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -176,7 +176,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config=config) + model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder 
model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7bafed5c3cd0..72917eb8e1a4 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional +from typing import List, Tuple, Callable, Optional, Union from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -29,36 +29,34 @@ from timm.models.layers import DropPath, Mlp from ...modeling_utils import PreTrainedModel -# from ...modeling_outputs import BaseModelOutput -# from ...utils import ( -# ModelOutput, -# add_start_docstrings, -# add_start_docstrings_to_model_forward, -# logging, -# replace_return_docstrings, -# ) +from ...modeling_outputs import BaseModelOutput +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll -# @dataclass -# class HieraModelOutput(ModelOutput): -# """ -# Base class for Hiera model's outputs. - -# Args: -# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): -# Last layer hidden-states. -# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): -# Attentions weights from the model, one for each layer. -# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): -# Hidden states of the model at the output of each layer. -# intermediates (list[torch.Tensor], optional): -# Intermediate representations or features from the model, if applicable. -# """ -# last_hidden_state: torch.FloatTensor -# attentions: Optional[Tuple[torch.FloatTensor]] = None -# hidden_states: Optional[Tuple[torch.FloatTensor]] = None -# intermediates: Optional[list[torch.Tensor]] = None +@dataclass +class HieraModelOutput(ModelOutput): + """ + Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. + + Args: + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + Last layer hidden-states. + attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): + Attentions weights from the model, one for each layer. + hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): + Hidden states of the model at the output of each layer. + intermediates (List[torch.Tensor], optional): + Intermediate representations or features from the model, if applicable. + """ + last_hidden_state: torch.FloatTensor + intermediates: Optional[List[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -102,15 +100,15 @@ def __init__( self.window_size = window_size self.use_mask_unit_attention = use_mask_unit_attention - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. 
""" - batch_size , num_channels , _ = x.shape + batch_size , num_channels , _ = embeddings.shape num_windows = ( (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( - self.qkv(x) + self.qkv(embeddings) .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) @@ -126,15 +124,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if hasattr(F, "scaled_dot_product_attention"): # Note: the original paper did *not* use SDPA, it's a free boost! - x = F.scaled_dot_product_attention(q, k, v) + embeddings = F.scaled_dot_product_attention(q, k, v) else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - x = (attention @ v) + embeddings = (attention @ v) - x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) - x = self.projection(x) - return x + embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = self.projection(embeddings) + return embeddings class HieraBlock(nn.Module): @@ -168,16 +166,16 @@ def __init__( if input_dim != output_dim: self.projection = nn.Linear(input_dim, output_dim) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - normalized_input = self.norm1(x) + normalized_embeddings = self.norm1(embeddings) if self.input_dim != self.output_dim: - x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) - x = x + self.drop_path(self.attention(normalized_input)) + embeddings = do_pool(self.projection(normalized_embeddings), stride=self.attention.q_stride) + embeddings = embeddings + self.drop_path(self.attention(normalized_embeddings)) # MLP - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x + embeddings = embeddings + self.drop_path(self.mlp(self.norm2(embeddings))) + return embeddings class Head(nn.Module): @@ -226,17 +224,36 @@ def __init__( ) def forward( - self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.projection, mask) - x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) - return x + embeddings = do_masked_conv(pixel_values, self.projection, mask) + embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) + return embeddings + +class HireaModel(PreTrainedModel): + """ + Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + + This model is a PyTorch implementation of the Hiera architecture for image classification. + + The model can be used as follows: + + Args: + config (HieraConfig): Configuration class instance for `Hiera`. 
+ + Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """ -class Hiera(PreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" - main_input_name = "x" + main_input_name = "pixel_values" supports_gradient_checkpointing = True def __init__(self, config: HieraConfig): @@ -417,52 +434,56 @@ def get_position_embeddings(self) -> torch.Tensor: def forward( self, - x: torch.Tensor, + pixel_values: torch.Tensor, mask: torch.Tensor = None, + return_dict: Optional[bool] = True, return_intermediates: bool = False, - ) -> torch.Tensor: + ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. """ # Slowfast training passes in a list - if isinstance(x, list): - x = x[0] + if isinstance(pixel_values, list): + pixel_values = pixel_values[0] intermediates = [] - x = self.patch_embedding( - x, + pached_embeddings = self.patch_embedding( + pixel_values, mask=mask.view( - x.shape[0], 1, *self.mask_spatial_shape + pixel_values.shape[0], 1, *self.mask_spatial_shape ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_position_embeddings() - x = self.unroll(x) + embeddings = pached_embeddings + self.get_position_embeddings() + embeddings = self.unroll(embeddings) # Discard masked tokens if mask is not None: - x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( - x.shape[0], -1, x.shape[-1] + embeddings = embeddings[mask[..., None].tile(1, self.mu_size, embeddings.shape[2])].view( + embeddings.shape[0], -1, embeddings.shape[-1] ) - for i, blk in enumerate(self.blocks): - x = blk(x) + for i, block in enumerate(self.blocks): + embeddings = block(embeddings) if return_intermediates and i in self.stage_ends: - intermediates.append(self.reroll(x, i, mask=mask)) + intermediates.append(self.reroll(embeddings, i, mask=mask)) if mask is None: - x = x.mean(dim=1) - x = self.norm(x) - x = self.head(x) + embeddings = embeddings.mean(dim=1) + embeddings = self.norm(embeddings) + embeddings = self.head(embeddings) - # x may not always be in spatial order here. + # embeddings may not always be in spatial order here. # e.g. 
if q_pool = 2, mask_unit_size = (8, 8), and # q_stride = (2, 2), not all unrolls were consumed, - # intermediates[-1] is x in spatial order - if return_intermediates: - return x, intermediates - - return x \ No newline at end of file + # intermediates[-1] is embeddings in spatial order + if not return_dict: + return tuple(v for v in [embeddings, intermediates] if v is not None) + + return HieraModelOutput( + last_hidden_state=embeddings, + intermediates=intermediates if return_intermediates else None, + ) \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index a0504997350b..c45056318a38 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import Hiera, HieraBlock +from .hiera import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd @@ -36,8 +36,8 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: return x -class MaskedAutoencoderHiera(Hiera): - """Masked Autoencoder with Hiera backbone""" +class MaskedAutoencoderHiera(HieraModel): + """Masked Autoencoder with HieraModel backbone""" def __init__( self, diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index c96c63cbfaf9..a35b33210941 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. # -------------------------------------------------------- # -# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles # # Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, # Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, @@ -27,7 +27,7 @@ from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ def inner(model_func: Callable) -> Callable: def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: @@ -69,7 +69,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) """ return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py new file mode 100644 index 000000000000..8d593af2a622 --- /dev/null +++ b/tests/models/hiera/test_modeling_hiera.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
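
The renamed forward now takes an optional mask-unit mask alongside `return_dict`. A hedged sketch of a masked forward pass follows; note that, per the forward above, the mean-pool/norm/head branch only runs when no mask is given, so `last_hidden_state` stays token-level here. Imports mirror the new test file, and the config values mirror `hiera_base_224`:

```python
# Hedged sketch: passing a mask-unit mask to the renamed forward pass. Imports mirror
# the new test file; the config values mirror hiera_base_224 elsewhere in this series.
import torch

from transformers import HieraConfig, HieraModel

config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = HieraModel(config).eval()

pixel_values = torch.rand(1, 3, 224, 224)
mask = model.get_random_mask(pixel_values, mask_ratio=0.6)  # bool, [batch, #mask units]

with torch.no_grad():
    outputs = model(pixel_values, mask=mask)  # HieraModelOutput (return_dict defaults to True)

# With a mask, the mean-pool / norm / head branch is skipped, so this stays token-level.
print(outputs.last_hidden_state.shape)
```
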
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Hiera model. """ + +import unittest + +from transformers import HieraConfig +from transformers.testing_utils import ( + require_torch, + slow, + torch_device, +) +from transformers.utils import is_torch_available + +if is_torch_available(): + import torch + from transformers import HieraModel + # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model + from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + + +class HieraModelTester: + # Define this tester to initialize Hiera model and its configurations for testing + def __init__( + self, + parent, + batch_size=8, + num_channels=3, + image_size=224, + # Add other model-specific parameters here + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + # Initialize other necessary attributes here + + def prepare_config_and_inputs(self): + # Prepare configuration and inputs for testing your model + pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return HieraConfig( + # Define necessary configuration parameters here + ) + + def create_and_check_model(self, config, pixel_values): + model = HieraModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values=pixel_values) + # Perform checks here, e.g., output shapes, etc. + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + +@require_torch +class HieraModelTest(unittest.TestCase): + + def setUp(self): + self.model_tester = HieraModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = HieraModel.from_pretrained(model_name) + self.assertIsNotNone(model) \ No newline at end of file diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py deleted file mode 100644 index 014d41766a8e..000000000000 --- a/tests/models/hiera/test_modeling_vit_mae.py +++ /dev/null @@ -1,44 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ViTMAE model. 
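
The tester's `create_and_check_model` above asserts a shape built from `num_attention_heads`, `seq_length`, and `hidden_size`, none of which `HieraModelTester` defines, and `get_config()` is left empty. A conservative, hedged alternative that slots into the same class and relies only on attributes the tester actually sets, reusing the file's existing `HieraModel` and `torch_device` imports:

```python
# Hedged sketch: a drop-in create_and_check_model that only relies on attributes the
# tester defines (batch_size) and on the file's existing HieraModel / torch_device
# imports. Without a mask, forward mean-pools tokens and applies the head, so only
# the batch dimension is asserted here rather than a full (hypothetical) shape.
def create_and_check_model(self, config, pixel_values):
    model = HieraModel(config=config)
    model.to(torch_device)
    model.eval()
    with torch.no_grad():
        result = model(pixel_values=pixel_values)
    self.parent.assertEqual(result.last_hidden_state.shape[0], self.batch_size)
```
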
""" - - -import math -import tempfile -import unittest - -import numpy as np - -from transformers import ViTMAEConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.utils import cached_property, is_torch_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_torch_available(): - import torch - from torch import nn - - from transformers import ViTMAEForPreTraining, ViTMAEModel - from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST - - -if is_vision_available(): - from PIL import Image - - from transformers import ViTImageProcessor \ No newline at end of file From 11017c67c64c691911eb660361c219038d08e43a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 17 Feb 2024 00:10:52 +0000 Subject: [PATCH 006/118] Moved utils to hiera --- src/transformers/models/hiera/hiera.py | 226 ++++++++++++++++++++++++- 1 file changed, 223 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 72917eb8e1a4..cca502aa80c9 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional, Union +from typing import List, Tuple, Callable, Optional, Union, Type from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -38,7 +38,227 @@ replace_return_docstrings, ) -from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. + E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. 
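
`do_pool` above only works because `Unroll` has already moved each pooling window's members to the front of the token axis. A self-contained toy demo (not taken from the patch) showing that the view-and-max trick reproduces an ordinary 2x2 max pool for a single (2, 2) stride:

```python
# Toy demo (not from the patch): after Unroll with a single (2, 2) stride, the token
# order is (Sy, Sx, H // Sy, W // Sx), so do_pool's view-and-max over the first 4
# entries of the token axis is exactly a 2x2 max pool on the original grid.
import torch
import torch.nn.functional as F

B, H, W, C = 1, 4, 4, 1
x = torch.arange(H * W, dtype=torch.float32).reshape(B, H, W, C)

# Reproduce Unroll's reordering: [B, H, W, C] -> [B, H//2, 2, W//2, 2, C]
# -> [B, 2, 2, H//2, W//2, C] -> [B, N, C]
unrolled = (
    x.view(B, H // 2, 2, W // 2, 2, C)
    .permute(0, 2, 4, 1, 3, 5)
    .reshape(B, -1, C)
)

pooled = unrolled.view(B, 4, -1, C).max(dim=1).values  # do_pool(unrolled, stride=4)
reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=2)
print(torch.equal(pooled.view(B, H // 2, W // 2, C).permute(0, 3, 1, 2), reference))  # True
```
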
+ self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x + @dataclass class HieraModelOutput(ModelOutput): @@ -231,7 +451,7 @@ def forward( return embeddings -class HireaModel(PreTrainedModel): +class HieraModel(PreTrainedModel): """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. From ad959d49a2c073e1ebce29ee8736d26ad18a8710 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 00:17:14 +0000 Subject: [PATCH 007/118] Change hiera -> hiera_model --- src/transformers/models/hiera/__init__.py | 89 +----- src/transformers/models/hiera/benchmarking.py | 77 ----- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera.py => hiera_model.py} | 0 src/transformers/models/hiera/hiera_utils.py | 287 ------------------ 5 files changed, 3 insertions(+), 452 deletions(-) delete mode 100644 src/transformers/models/hiera/benchmarking.py rename src/transformers/models/hiera/{hiera.py => hiera_model.py} (100%) delete mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0434517bf52c..1f388d5361ab 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -41,7 +41,7 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera import ( + from .hiera_model import ( HieraModel, Head, HieraBlock, @@ -54,89 +54,4 @@ else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) - -####### PREV: - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
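
`Reroll` is what makes `return_intermediates` usable downstream: features collected at stage ends are rolled back to spatial order before being returned. A hedged sketch of requesting those feature maps from the renamed model, again with the `hiera_base_224` settings; output shapes depend on the `HieraConfig` defaults, so they are only printed, not asserted:

```python
# Hedged sketch: collecting the re-rolled intermediate feature maps. Config values
# mirror hiera_base_224; shapes depend on the HieraConfig defaults, so they are only
# printed here.
import torch

from transformers import HieraConfig, HieraModel

config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = HieraModel(config).eval()

with torch.no_grad():
    outputs = model(torch.rand(1, 3, 224, 224), return_intermediates=True)

# One entry per stage end, each rolled back to spatial order, e.g. [B, H', W', C'].
for feature_map in outputs.intermediates:
    print(feature_map.shape)
```
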
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# from typing import TYPE_CHECKING - -# from ...utils import ( -# OptionalDependencyNotAvailable, -# _LazyModule, -# is_flax_available, -# is_tf_available, -# is_torch_available, -# ) - - -# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_vit_mae"] = [ -# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", -# "ViTMAEForPreTraining", -# "ViTMAELayer", -# "ViTMAEModel", -# "ViTMAEPreTrainedModel", -# ] - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_tf_vit_mae"] = [ -# "TFViTMAEForPreTraining", -# "TFViTMAEModel", -# "TFViTMAEPreTrainedModel", -# ] - -# if TYPE_CHECKING: -# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_vit_mae import ( -# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, -# ViTMAEForPreTraining, -# ViTMAELayer, -# ViTMAEModel, -# ViTMAEPreTrainedModel, -# ) - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - -# else: -# import sys - -# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py deleted file mode 100644 index 33166028977a..000000000000 --- a/src/transformers/models/hiera/benchmarking.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- - -import time -from typing import List, Tuple, Union - -import torch -from tqdm import tqdm - -# From https://github.com/facebookresearch/ToMe/ -def benchmark( - model: torch.nn.Module, - device: torch.device = 0, - input_size: Tuple[int] = (3, 224, 224), - batch_size: int = 64, - runs: int = 40, - throw_out: float = 0.25, - use_fp16: bool = False, - verbose: bool = False, -) -> float: - """ - Benchmark the given model with random inputs at the given batch size. 
- - Args: - - model: the module to benchmark - - device: the device to use for benchmarking - - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) - - batch_size: the batch size to use for evaluation - - runs: the number of total runs to do - - throw_out: the percentage of runs to throw out at the start of testing - - use_fp16: whether or not to benchmark with float16 and autocast - - verbose: whether or not to use tqdm to print progress / print throughput at end - - Returns: - - the throughput measured in images / second - """ - if not isinstance(device, torch.device): - device = torch.device(device) - is_cuda = torch.device(device).type == "cuda" - - model = model.eval().to(device) - input = torch.rand(batch_size, *input_size, device=device) - if use_fp16: - input = input.half() - - warm_up = int(runs * throw_out) - total = 0 - start = time.time() - - with torch.autocast(device.type, enabled=use_fp16): - with torch.no_grad(): - for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): - if i == warm_up: - if is_cuda: - torch.cuda.synchronize() - total = 0 - start = time.time() - - model(input) - total += batch_size - - if is_cuda: - torch.cuda.synchronize() - - end = time.time() - elapsed = end - start - - throughput = total / elapsed - - if verbose: - print(f"Throughput: {throughput:.2f} im/s") - - return throughput diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index c45056318a38..f0e2e7854bff 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import HieraModel, HieraBlock +from .hiera_model import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera_model.py similarity index 100% rename from src/transformers/models/hiera/hiera.py rename to src/transformers/models/hiera/hiera_model.py diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py deleted file mode 100644 index a35b33210941..000000000000 --- a/src/transformers/models/hiera/hiera_utils.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- -# -# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles -# -# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, -# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, -# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. -# -# Paper: https://arxiv.org/abs/2306.00989/ -# -# References: -# slowfast: https://github.com/facebookresearch/SlowFast -# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm -# -------------------------------------------------------- - -import math -from typing import List, Tuple, Optional, Type, Callable, Dict - -import torch -import torch.nn as nn -import torch.nn.functional as F -from .convert_hiera_to_pytorch import convert_state_dict - -def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
""" - - def inner(model_func: Callable) -> Callable: - def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: - if pretrained: - if checkpoints is None: - raise RuntimeError("This model currently doesn't have pretrained weights available.") - elif checkpoint is None: - raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) - if "head.projection.weight" in state_dict["model_state"]: - # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it - if "num_classes" not in kwdargs: - kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] - # If the user specified a different number of classes, remove the projection weights or else we'll error out - elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: - del state_dict["model_state"]["head.projection.weight"] - del state_dict["model_state"]["head.projection.bias"] - - model = model_func(**kwdargs) - if pretrained: - # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): - strict = False - - model.load_state_dict(state_dict["model_state"], strict=strict) - - return model - - return model_def - - return inner - - - -def conv_nd(n: int) -> Type[nn.Module]: - """ - Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) - """ - return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] - - -def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: - # Refer to `Unroll` to see how this performs a maxpool-Nd - return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values - - -def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: - # target_size: [(T), (H), W] - # (spatial) mask: [B, C, (t), (h), w] - if mask is None: - return mask - - assert len(mask.shape[2:]) == len(target_size) - if mask.shape[2:] != target_size: - return F.interpolate(mask.float(), size=target_size) - return mask - - -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: - """Zero-out the masked regions of the input before conv. - Prevents leakage of masked regions when using overlapping kernels. - """ - if conv is None: - return x - if mask is None: - return conv(x) - - mask = get_resized_mask(target_size=x.shape[2:], mask=mask) - return conv(x * mask.bool()) - - -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: - """ - Restore spatial organization by undoing windowed organization of mask units. - - Args: - x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] - shape: current spatial shape, if it were not organized into mask unit - windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. - mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] - Returns: - x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] - """ - D = len(shape) - B, C = x.shape[0], x.shape[-1] - # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] - num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] - x = x.view(B, *num_MUs, *mu_shape, C) - - # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] - permute = ( - [0] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], - [], - ) - + [len(x.shape) - 1] - ) - x = x.permute(permute).reshape(B, *shape, C) - - return x - - - -class Unroll(nn.Module): - """ - Reorders the tokens such that patches are contiguous in memory. - E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as - [B, (Sy, Sx, H // Sy, W // Sx), C] - - This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). - Not only is this faster, but it also makes it easy to support inputs of arbitrary - dimensions in addition to patch-wise sparsity. - - Performing this operation multiple times in sequence puts entire windows as contiguous - in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of - size 8x8 would be contiguous in memory, allowing operations like mask unit attention - computed easily and efficiently, while also allowing max to be applied sequentially. - - Note: This means that intermediate values of the model are not in HxW order, so they - need to be re-rolled if you want to use the intermediate values as a HxW feature map. - The last block of the network is fine though, since by then the strides are all consumed. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - self.schedule = unroll_schedule - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Input: Flattened patch embeddings [B, N, C] - Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd - """ - B, _, C = x.shape - - cur_size = self.size - x = x.view(*([B] + cur_size + [C])) - - for strides in self.schedule: - # Move patches with the given strides to the batch dimension - - # Create a view of the tensor with the patch stride as separate dims - # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] - cur_size = [i // s for i, s in zip(cur_size, strides)] - new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] - x = x.view(new_shape) - - # Move the patch stride into the batch dimension - # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] - L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) - x = x.permute(permute) - - # Now finally flatten the relevant dims into the batch dimension - x = x.flatten(0, len(strides)) - B *= math.prod(strides) - - x = x.reshape(-1, math.prod(self.size), C) - return x - - -class Reroll(nn.Module): - """ - Undos the "unroll" operation so that you can use intermediate features. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - stage_ends: List[int], - q_pool: int, - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - - # The first stage has to reverse everything - # The next stage has to reverse all but the first unroll, etc. 
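A note on the `Unroll` module shown above: it exists so that hierarchical pooling never needs an explicit `MaxPool` over a spatial grid; once tokens are reordered, `do_pool` only has to `view` the stride group back out and take a `max`. Below is a standalone sketch of that equivalence for a single (2, 2) stride on a 2-d, channels-last token grid. It illustrates the idea and is not the library code.

```python
import torch
import torch.nn.functional as F

B, H, W, C = 2, 8, 8, 16
x = torch.randn(B, H, W, C)  # channels-last token grid

# "Unroll" a (2, 2) stride into its own leading dimension:
# [B, H, W, C] -> [B, Sy*Sx, (H//Sy)*(W//Sx), C]
u = (
    x.view(B, H // 2, 2, W // 2, 2, C)
    .permute(0, 2, 4, 1, 3, 5)                # [B, Sy, Sx, H//2, W//2, C]
    .reshape(B, 4, (H // 2) * (W // 2), C)
)
pooled_via_view = u.max(dim=1).values         # max over the stride group, as do_pool does

# Reference: an ordinary 2x2 max pool in channels-first layout
reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=2)
reference = reference.flatten(2).transpose(1, 2)  # back to [B, N, C]

assert torch.allclose(pooled_via_view, reference)
```

In the real model the stride groups from successive stages are folded into the batch dimension, which is exactly why the `Reroll` code continuing below is needed to recover a spatial feature map from intermediate blocks.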
- self.schedule = {} - size = self.size - for i in range(stage_ends[-1] + 1): - self.schedule[i] = unroll_schedule, size - # schedule unchanged if no pooling at a stage end - if i in stage_ends[:q_pool]: - if len(unroll_schedule) > 0: - size = [n // s for n, s in zip(size, unroll_schedule[0])] - unroll_schedule = unroll_schedule[1:] - - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: - """ - Roll the given tensor back up to spatial order assuming it's from the given block. - - If no mask is provided: - - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. - If a mask is provided: - - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. - """ - schedule, size = self.schedule[block_idx] - B, N, C = x.shape - - D = len(size) - cur_mu_shape = [1] * D - - for strides in schedule: - # Extract the current patch from N - x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) - - # Move that patch into the current MU - # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] - L = len(x.shape) - permute = ( - [0, 1 + D] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], - [], - ) - + [L - 1] - ) - x = x.permute(permute) - - # Reshape to [B, N//(Sy*Sx), *MU, C] - for i in range(D): - cur_mu_shape[i] *= strides[i] - x = x.reshape(B, -1, *cur_mu_shape, C) - N = x.shape[1] - - # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) - x = x.view(B, N, *cur_mu_shape, C) - - # If masked, return [B, #MUs, MUy, MUx, C] - if mask is not None: - return x - - # If not masked, we can return [B, H, W, C] - x = undo_windowing(x, size, cur_mu_shape) - - return x \ No newline at end of file From fac7b231f7c7277bc202edfc9b9e765e802620fa Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:10:17 +0000 Subject: [PATCH 008/118] Fixed integration into tranformers --- src/transformers/__init__.py | 2 +- src/transformers/models/hiera/__init__.py | 13 ++++++++----- .../models/hiera/hiera_image_processor.py | 2 +- src/transformers/models/hiera/hiera_model.py | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d3646a75f940..359e0f1a3f50 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6951,8 +6951,8 @@ HubertPreTrainedModel, ) from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - HieraBlock ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 1f388d5361ab..2b83a4c8d693 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -9,8 +9,8 @@ _import_structure = { "configuration_hiera": [ - "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", - "HireaConfig", + "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HieraConfig", ], } @@ -20,15 +20,16 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hirea"] = [ - "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hirea", + _import_structure["hiera_model"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", "Head", "HieraBlock", "MaskUnitAttention" "" ] + if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -42,10 +43,12 @@ pass else: from .hiera_model import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, Head, HieraBlock, MaskUnitAttention, + ) from .hiera_image_processor import ( 
HieraImageProcessor diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4900e4a4d3fb..d3f2ce96a64b 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,5 +1,5 @@ -"""Image processor class for Hirea.""" +"""Image processor class for Hiera.""" from typing import Dict, List, Optional, Union diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index cca502aa80c9..5e7493e3c6a7 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -38,6 +38,9 @@ replace_return_docstrings, ) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "", +] def conv_nd(n: int) -> Type[nn.Module]: """ From 866ffc7573e58a2e6341cae78c36ceb75ceeeba4 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:23:55 +0000 Subject: [PATCH 009/118] Fix: Convert Checkpoint --- .../models/hiera/convert_hiera_to_pytorch.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d0294f12deab..76c86bcb0cbb 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,9 +3,9 @@ import requests import torch from PIL import Image -# from transformers.models.hiera.configuration_hiera import HieraConfig -# from transformers.models.hiera.hiera import HieraModel -# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +from transformers import HieraConfig +from transformers import HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -199,11 +199,13 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...]) # 207: golden retriever (imagenet-1k) - out.argmax(dim=-1).item() + out.last_hidden_state.argmax(dim=-1).item() + # If you also want intermediate feature maps + out = model(inputs[None, ...], return_intermediates=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) + for x in out.intermediates: + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) From b3828e19951bdcdd9482fc0b2d3c050d13510c1e Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:38:00 +0000 Subject: [PATCH 010/118] added documentation for hiera --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 ++ docs/source/en/index.md | 1 + 14 files changed, 15 insertions(+) diff --git a/README.md b/README.md index b7077ce61032..8e33f4f20ac4 100644 --- a/README.md +++ b/README.md @@ -388,6 +388,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_de.md b/README_de.md index f21bebdc7811..82f998c3140c 100644 --- a/README_de.md +++ b/README_de.md @@ -384,6 +384,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_es.md b/README_es.md index 9dfbf8931aba..980de1212979 100644 --- a/README_es.md +++ b/README_es.md @@ -361,6 +361,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_fr.md b/README_fr.md index 75ebdd315f65..211ccfcc9e1f 100644 --- a/README_fr.md +++ b/README_fr.md @@ -382,6 +382,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Les Transformers sont-ils vraiment inefficaces pour la représentation graphique ?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT : la segmentation sémantique émerge de la supervision textuelle](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ : référentiel complet pour la compréhension du langage polonais](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (de Facebook) publié avec l'article [Hiera : un transformateur de vision hiérarchique sans cloches et sifflets]( https://arxiv.org/abs/2306.00989) par Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT : Apprentissage de la représentation autonome de la parole par prédiction masquée des unités cachées](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT : Quantification entière de BERT avec des entiers uniquement](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. 
**[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS : Un ensemble de données filtré à l'échelle du Web d'intercalation de documents texte-image](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_hd.md b/README_hd.md index 6402c3ee5eb7..272999ff1cb6 100644 --- a/README_hd.md +++ b/README_hd.md @@ -335,6 +335,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** ((फेसबुक से) पेपर के साथ जारी किया गया [हिरा: बेल्स-एंड-व्हिसल्स के बिना एक पदानुक्रमित विजन ट्रांसफार्मर](https://arxiv.org/abs/2306.00989) by चैतन्य रयाली, युआन-टिंग हू, डैनियल बोल्या, चेन वेई, हाओकी फैन, पो-याओ हुआंग, वैभव अग्रवाल, अर्कबंधु चौधरी, ओमिद पौरसीद, जूडी हॉफमैन, जितेंद्र मलिक, द्वारा यांगहाओ ली, क्रिस्टोफ़ फ़िचटेनहोफ़र 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ja.md b/README_ja.md index bd8a058b7b1b..51fdc9d64710 100644 --- a/README_ja.md +++ b/README_ja.md @@ -395,6 +395,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook から) Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer から公開された研究論文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ko.md b/README_ko.md index 533ab4685bce..b844bc23474c 100644 --- a/README_ko.md +++ b/README_ko.md @@ -310,6 +310,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. +1. 
**[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook 에서) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_pt-br.md b/README_pt-br.md index 40841bd82b9f..279b128a05d8 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -389,6 +389,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. 
**[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ru.md b/README_ru.md index 3e6f3d54f27e..ef7c970f1ae3 100644 --- a/README_ru.md +++ b/README_ru.md @@ -380,6 +380,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_te.md b/README_te.md index 2c0b97dada67..e8073232a6a8 100644 --- a/README_te.md +++ b/README_te.md @@ -382,6 +382,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hans.md b/README_zh-hans.md index f2b9b38273bf..154425954ace 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -334,6 +334,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (来自 Facebook) 伴随论文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 由 Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hant.md b/README_zh-hant.md index 1d5155529aa0..2e3ab3ec4bd4 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -346,6 +346,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 678b679cb143..c169a2c625fb 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -620,6 +620,8 @@ title: CLAP - local: model_doc/encodec title: EnCodec + - local: model_doc/hiera + title: Hiera - local: model_doc/hubert title: Hubert - local: model_doc/mctct diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 81dc97e97134..4c809e6c100b 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -154,6 +154,7 @@ Flax), PyTorch, and/or TensorFlow. | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | +| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ | From 82672b2ab5f12a39478c406102f671b217ef7bde Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:38:31 +0000 Subject: [PATCH 011/118] added documentation for hiera --- docs/source/en/model_doc/hiera.md | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 docs/source/en/model_doc/hiera.md diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md new file mode 100644 index 000000000000..1c46bae9b072 --- /dev/null +++ b/docs/source/en/model_doc/hiera.md @@ -0,0 +1,40 @@ + + +# Hiera + +## Overview + +Hubert was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer + +The abstract from the paper is the following: + +Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy. 
In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera. + +## HireaConfig + +[[autodoc]] HieraConfig + + + + +## HireaModel + +[[autodoc]] HireaModel + - forward + + + \ No newline at end of file From 00478b60cd5cc1e448e64bb4ce25961baf8f8368 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:39:18 +0000 Subject: [PATCH 012/118] added Docstings to models, Transformers based changes --- src/transformers/__init__.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/hiera/__init__.py | 24 ++-- .../models/hiera/configuration_hiera.py | 15 +++ .../models/hiera/convert_hiera_to_pytorch.py | 15 +++ .../models/hiera/hiera_image_processor.py | 14 +++ src/transformers/models/hiera/hiera_mae.py | 113 +----------------- src/transformers/models/hiera/hiera_model.py | 89 +++++++++----- 8 files changed, 124 insertions(+), 149 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 359e0f1a3f50..346eb625808b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4121,6 +4121,7 @@ [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", + "HieraPreTrainedModel" ] ) @@ -6953,6 +6954,7 @@ from .models.hiera import ( HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, + HieraPreTrainedModel ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c9cd6fca69d6..788b671a232d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -69,6 +69,7 @@ ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), ("groupvit", "CLIPImageProcessor"), + ("hiera", "HieraImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 2b83a4c8d693..0787bffe767e 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
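Taken together, the new doc page, the top-level exports (`HieraModel`, `HieraPreTrainedModel`) and the `("hiera", "HieraImageProcessor")` entry in the auto image-processor mapping imply a usage pattern along the following lines. This is a sketch assembled from the docstring example and the conversion script elsewhere in this series (`embedding_dimension`, `number_of_heads`, `stages`, `last_hidden_state`, `return_intermediates`); it is not a tested API.

```python
import torch
from transformers import HieraConfig, HieraModel

# Values mirror the example in the HieraModel docstring added in this series:
# HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = HieraModel(config)

pixel_values = torch.rand(1, 3, 224, 224)

outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # HieraModelOutput.last_hidden_state

# As in convert_hiera_to_pytorch.py, intermediate feature maps can be requested
outputs = model(pixel_values, return_intermediates=True)
for feature_map in outputs.intermediates:
    print(feature_map.shape)
```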
+ from typing import TYPE_CHECKING from ...utils import ( @@ -23,9 +38,7 @@ _import_structure["hiera_model"] = [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", - "Head", - "HieraBlock", - "MaskUnitAttention" + "HieraPreTrainedModel" "" ] @@ -45,10 +58,7 @@ from .hiera_model import ( HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - Head, - HieraBlock, - MaskUnitAttention, - + HieraPreTrainedModel ) from .hiera_image_processor import ( HieraImageProcessor diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index e3133354f6ea..a4ab4fd9d30b 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -1,5 +1,20 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ hiera model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging from typing import Tuple diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 76c86bcb0cbb..5ca2ecd262d9 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import requests diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index d3f2ce96a64b..4e41e14bc6f8 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,3 +1,17 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
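The `_import_structure` / `_LazyModule` arrangement in `hiera/__init__.py` above defers importing the torch-heavy `hiera_model` module until one of its symbols is actually requested. A rough, self-contained illustration of the same idea using a PEP 562 module-level `__getattr__` follows; this is only a stand-in for the concept, not how `_LazyModule` is implemented.

```python
# sketch of a lazy package __init__.py (simplified stand-in, not transformers' _LazyModule)
import importlib
from typing import TYPE_CHECKING

_import_structure = {
    "configuration_hiera": ["HieraConfig"],
    "hiera_model": ["HieraModel", "HieraPreTrainedModel"],
}

if TYPE_CHECKING:
    # Static type checkers and IDEs see the real imports
    from .configuration_hiera import HieraConfig
    from .hiera_model import HieraModel, HieraPreTrainedModel
else:
    _name_to_module = {
        name: module for module, names in _import_structure.items() for name in names
    }

    def __getattr__(name):
        # PEP 562: called only on the first access to a missing attribute,
        # so the heavy submodule is imported on demand
        if name in _name_to_module:
            module = importlib.import_module(f".{_name_to_module[name]}", __name__)
            return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```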
"""Image processor class for Hiera.""" diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index f0e2e7854bff..d4ec15058b2d 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,8 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock -from .hiera_utils import pretrained_model, undo_windowing, conv_nd +from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: @@ -287,112 +286,4 @@ def forward( ) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask - - - - -# Image Models - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k") -def mae_hiera_tiny_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k") -def mae_hiera_small_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k") -def mae_hiera_base_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k") -def mae_hiera_base_plus_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k") -def mae_hiera_large_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k") -def mae_hiera_huge_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - - -# Video Models - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400") -def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): - return MaskedAutoencoderHiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_pos_embed=True, - q_pool=2, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_base_plus_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_large_16x224(**kwdargs): - return mae_hiera_base_16x224( - 
embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400") -def mae_hiera_huge_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index 5e7493e3c6a7..b1ed0db0e4b9 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -271,10 +271,6 @@ class HieraModelOutput(ModelOutput): Args: last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): - Attentions weights from the model, one for each layer. - hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): - Hidden states of the model at the output of each layer. intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ @@ -422,10 +418,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x - +@add_start_docstrings(""" +Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). +""") class PatchEmbedding(nn.Module): - """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" - def __init__( self, dim_in: int, @@ -453,27 +449,49 @@ def forward( embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings - -class HieraModel(PreTrainedModel): +class HieraPreTrainedModel(PreTrainedModel): """ - Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True - This model is a PyTorch implementation of the Hiera architecture for image classification. + def _init_weights(self, module, init_bias=0.02): + if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(module.weight, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + nn.init.constant_(module.bias, init_bias) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, init_bias) + nn.init.constant_(module.weight, 1.0) - The model can be used as follows: - Args: - config (HieraConfig): Configuration class instance for `Hiera`. - Example usage: - >>> from your_model_file import Hiera, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - >>> model = Hiera(config) - >>> inputs = torch.rand((1, 3, 224, 224)) - >>> outputs = model(inputs) - """ +@add_start_docstrings(""" +Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + +This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. 
+ +The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. +Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + +Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """) +class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -601,14 +619,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - def _init_weights(self, m, init_bias=0.02): - if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): - nn.init.trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, init_bias) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, init_bias) - nn.init.constant_(m.weight, 1.0) @torch.jit.ignore def no_weight_decay(self): @@ -655,6 +665,25 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings + @add_start_docstrings_to_model_forward(""" + The forward pass for the Hiera model. + + Args: + pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. + + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. + + + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. + + return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + + + + """) + @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") def forward( self, pixel_values: torch.Tensor, @@ -663,8 +692,6 @@ def forward( return_intermediates: bool = False, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ - mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. - Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list if isinstance(pixel_values, list): From 4144fe8a41da4b088e9db149b756e5b32aa7c934 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:55:51 +0000 Subject: [PATCH 013/118] make style and quality --- src/transformers/__init__.py | 15 +-- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/hiera/__init__.py | 19 +--- .../models/hiera/configuration_hiera.py | 18 ++-- .../models/hiera/convert_hiera_to_pytorch.py | 102 +++++++++--------- .../models/hiera/hiera_image_processor.py | 51 ++++----- src/transformers/models/hiera/hiera_mae.py | 54 +++------- 7 files changed, 104 insertions(+), 161 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 346eb625808b..27141f8fc304 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -496,7 +496,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], - "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], + "models.hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -4118,12 +4118,7 @@ ] ) _import_structure["models.hiera"].extend( - [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - - ] + ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] ) _import_structure["models.hubert"].extend( [ @@ -6944,6 +6939,7 @@ GroupViTTextModel, GroupViTVisionModel, ) + from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, @@ -6951,11 +6947,6 @@ HubertModel, HubertPreTrainedModel, ) - from .models.hiera import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 58ce7f77f5a8..8c47296b1140 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -114,8 +114,8 @@ ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), + ("hiera", "HieraConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -347,8 +347,8 @@ ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera", "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -581,7 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","HieraModel"), + ("hiera", "HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0787bffe767e..fcffbbf7593e 100644 --- a/src/transformers/models/hiera/__init__.py +++ 
b/src/transformers/models/hiera/__init__.py @@ -35,12 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - "" - ] + _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] if TYPE_CHECKING: @@ -55,16 +50,10 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera_model import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) - from .hiera_image_processor import ( - HieraImageProcessor - ) + from .hiera_image_processor import HieraImageProcessor + from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index a4ab4fd9d30b..8d40e7a72777 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -15,15 +15,15 @@ """ hiera model configuration""" +from typing import Tuple + from ...configuration_utils import PretrainedConfig from ...utils import logging -from typing import Tuple -logger = logging.get_logger(__name__) -HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { +logger = logging.get_logger(__name__) -} +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class HieraConfig(PretrainedConfig): @@ -42,7 +42,7 @@ class HieraConfig(PretrainedConfig): embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. num_classes (int, optional): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. @@ -58,7 +58,7 @@ class HieraConfig(PretrainedConfig): head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. - + Example: ```python >>> from transformers import HieraConfig, HieraModel @@ -72,9 +72,10 @@ class HieraConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ``` - """ + """ model_type = "hiera" + def __init__( self, input_size: Tuple[int, ...] 
= (224, 224), @@ -99,7 +100,6 @@ def __init__( head_init_scale: float = 0.001, sep_position_embeddings: bool = False, **kwargs, - ): super().__init__(**kwargs) self.input_size = input_size @@ -121,4 +121,4 @@ def __init__( self.drop_path_rate = drop_path_rate self.head_dropout = head_dropout self.head_init_scale = head_init_scale - self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file + self.sep_position_embeddings = sep_position_embeddings diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 5ca2ecd262d9..794a62147d78 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -15,17 +15,11 @@ import argparse -import requests import torch -from PIL import Image -from transformers import HieraConfig -from transformers import HieraModel -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, HieraModel -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +# from transformers import HieraConfig, HieraModel +from transformers import HieraConfig, HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor def rename_key(name): @@ -51,7 +45,7 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state -def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs): strict = True pretrained_models_links = { "hiera_tiny_224": { @@ -93,21 +87,24 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): "hiera_huge_16x224": { "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", - } + }, } - if "hiera_tiny_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 7, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2), + ) checkpoints = pretrained_models_links["hiera_tiny_224"] checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] elif "hiera_small_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 11, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2), + ) checkpoints = pretrained_models_links["hiera_small_224"] checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] @@ -118,56 +115,57 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] elif "hiera_base_plus_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3),) + config = HieraConfig( + embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3), + ) checkpoints = pretrained_models_links["hiera_base_plus_224"] checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] elif "hiera_large_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4),) + config = HieraConfig( + embedding_dimension=144, + 
number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_224"] checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] elif "hiera_huge_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4)) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_224"] checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] elif "hiera_base_16x224" in checkpoint_url: - config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True,) + config = HieraConfig( + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True, + ) checkpoints = pretrained_models_links["hiera_base_16x224"] checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] elif "hiera_base_plus_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3)) + config = HieraConfig(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3)) checkpoints = pretrained_models_links["hiera_base_plus_16x224"] checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] elif "hiera_large_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4), ) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_16x224"] checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] elif "hiera_huge_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4) ) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] elif checkpoint not in checkpoints: @@ -181,7 +179,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): raise RuntimeError("No checkpoint specified.") state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"], {}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if config.num_classes is None: @@ -194,19 +192,16 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr( + model, "decoder_position_embeddings" + ): strict = False - model.load_state_dict(state_dict["model_state"]) + model.load_state_dict(state_dict["model_state"], 
strict) # model.load_state_dict(state_dict["model_state"], strict=strict) - - - url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - - image_processor = HieraImageProcessor(size=224) inputs = image_processor.process_image(image_url=url) @@ -220,7 +215,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...], return_intermediates=True) for x in out.intermediates: - print(x.shape) + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) @@ -231,4 +226,3 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="~/") - diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4e41e14bc6f8..0200687c4835 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -15,32 +15,18 @@ """Image processor class for Hiera.""" -from typing import Dict, List, Optional, Union -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import rescale, resize, to_channel_dimension_format -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from ...utils import TensorType, is_vision_available, logging -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from PIL import Image import requests +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +from ...image_processing_utils import BaseImageProcessor +from ...utils import is_vision_available, logging if is_vision_available(): - import PIL + from PIL import Image + from torchvision import transforms + from torchvision.transforms.functional import InterpolationMode logger = logging.get_logger(__name__) @@ -51,20 +37,23 @@ def __init__(self, size): self.size = size self.transform_list = [ transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) + transforms.CenterCrop(self.size), ] self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - + self.transform_norm = transforms.Compose( + self.transform_list + + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ] + ) + def process_image(self, image_url): # Load the image img = Image.open(requests.get(image_url, stream=True).raw) - + # Apply transformations - img_vis = self.transform_vis(img) + # img_vis = self.transform_vis(img) img_norm = self.transform_norm(img) - - return img_norm \ No newline at end of file + + return img_norm diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index d4ec15058b2d..56b91bc7acb7 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ 
-10,28 +10,28 @@ # -------------------------------------------------------- +import math from functools import partial -from typing import Tuple, Optional +from typing import Optional, Tuple -import math import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd +from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - batch_size , num_mask_units = x.shape[0:2] + batch_size, num_mask_units = x.shape[0:2] # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size, num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -64,8 +64,7 @@ def __init__( i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) ] self.tokens_spatial_shape_final = [ - i // s ** (self.q_pool) - for i, s in zip(self.tokens_spatial_shape, self.q_stride) + i // s ** (self.q_pool) for i, s in zip(self.tokens_spatial_shape, self.q_stride) ] # -------------------------------------------------------------------------- # Multi-scale fusion heads @@ -73,9 +72,7 @@ def __init__( self.multi_scale_fusion_heads = nn.ModuleList() for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool - kernel = [ - i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) - ] + kernel = [i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final)] curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] self.multi_scale_fusion_heads.append( conv_nd(len(self.q_stride))( @@ -94,9 +91,7 @@ def __init__( self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) self.decoder_pos_embed = nn.Parameter( - torch.zeros( - 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim - ) + torch.zeros(1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim) ) self.decoder_blocks = nn.ModuleList( @@ -113,9 +108,7 @@ def __init__( ) self.decoder_norm = norm_layer(decoder_embed_dim) - self.pred_stride = patch_stride[-1] * ( - self.q_stride[-1] ** self.q_pool - ) # patch stride of prediction + self.pred_stride = patch_stride[-1] * (self.q_stride[-1] ** self.q_pool) # patch stride of prediction self.decoder_pred = nn.Linear( decoder_embed_dim, @@ -143,9 +136,7 @@ def _mae_init_weights(self, m: nn.Module): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) - def get_pixel_label_2d( - self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_2d(self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* input_img = input_img.permute(0, 2, 3, 1) @@ -160,13 +151,11 @@ def get_pixel_label_2d( return label - def get_pixel_label_3d( - self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_3d(self, input_vid: torch.Tensor, mask: torch.Tensor, norm: 
bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* # We use time strided loss, only take the first frame from each token - input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + input_vid = input_vid[:, :, :: self.patch_stride[0], :, :] size = self.pred_stride label = input_vid.unfold(3, size, size).unfold(4, size, size) @@ -181,11 +170,9 @@ def get_pixel_label_3d( return label - def forward_encoder( self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor]: - if mask is None: mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] @@ -203,9 +190,7 @@ def forward_encoder( return x, mask - def forward_decoder( - self, x: torch.Tensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + def forward_decoder(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # Embed tokens x = self.decoder_embed(x) @@ -214,9 +199,7 @@ def forward_decoder( # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) - mask_tokens = self.mask_token.view( - (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) - ) + mask_tokens = self.mask_token.view((1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,)) mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() x_dec[mask] = x.flatten() @@ -279,11 +262,8 @@ def forward( mask_ratio: float = 0.6, mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) - pred, pred_mask = self.forward_decoder( - latent, mask - ) # pred_mask is mask at resolution of *prediction* + pred, pred_mask = self.forward_decoder(latent, mask) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file + return *self.forward_loss(x, pred, ~pred_mask), mask From d23a70d0d14d5ca50a7d20fde8e2ace59231a714 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:56:38 +0000 Subject: [PATCH 014/118] make style and quality --- src/transformers/models/hiera/hiera_model.py | 128 ++++++++----------- 1 file changed, 56 insertions(+), 72 deletions(-) diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index b1ed0db0e4b9..9345084769ec 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -19,29 +19,29 @@ # -------------------------------------------------------- import math +from dataclasses import dataclass from functools import partial -from typing import List, Tuple, Callable, Optional, Union, Type -from .configuration_hiera import HieraConfig +from typing import Callable, List, Optional, Tuple, Type, Union + import torch import torch.nn as nn import torch.nn.functional as F -from dataclasses import dataclass - from timm.models.layers import DropPath, Mlp + from ...modeling_utils import PreTrainedModel -from ...modeling_outputs import BaseModelOutput from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, ) +from .configuration_hiera import HieraConfig + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "", + 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", ] + def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. @@ -67,9 +67,7 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso return mask -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: +def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None) -> torch.Tensor: """Zero-out the masked regions of the input before conv. Prevents leakage of masked regions when using overlapping kernels. """ @@ -82,9 +80,7 @@ def do_masked_conv( return conv(x * mask.bool()) -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: +def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: """ Restore spatial organization by undoing windowed organization of mask units. @@ -116,7 +112,6 @@ def undo_windowing( return x - class Unroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. @@ -169,9 +164,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Move the patch stride into the batch dimension # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) + permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] x = x.permute(permute) # Now finally flatten the relevant dims into the batch dimension @@ -210,9 +203,7 @@ def __init__( size = [n // s for n, s in zip(size, unroll_schedule[0])] unroll_schedule = unroll_schedule[1:] - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: + def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: """ Roll the given tensor back up to spatial order assuming it's from the given block. @@ -269,11 +260,12 @@ class HieraModelOutput(ModelOutput): Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. Args: - last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - intermediates (List[torch.Tensor], optional): + intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ + last_hidden_state: torch.FloatTensor intermediates: Optional[List[torch.Tensor]] = None @@ -320,15 +312,13 @@ def __init__( self.use_mask_unit_attention = use_mask_unit_attention def forward(self, embeddings: torch.Tensor) -> torch.Tensor: - """ Input should be of shape [batch, tokens, channels]. 
""" - batch_size , num_channels , _ = embeddings.shape - num_windows = ( - (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 - ) + """Input should be of shape [batch, tokens, channels].""" + batch_size, num_channels, _ = embeddings.shape + num_windows = (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 qkv = ( self.qkv(embeddings) - .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) + .reshape(batch_size, -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -336,7 +326,7 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size, self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -347,9 +337,9 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - embeddings = (attention @ v) + embeddings = attention @ v - embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = embeddings.transpose(1, 3).reshape(batch_size, -1, self.output_dim) embeddings = self.projection(embeddings) return embeddings @@ -418,9 +408,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x -@add_start_docstrings(""" + +@add_start_docstrings( + """ Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). -""") +""" +) class PatchEmbedding(nn.Module): def __init__( self, @@ -442,18 +435,18 @@ def __init__( padding=padding, ) - def forward( - self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: embeddings = do_masked_conv(pixel_values, self.projection, mask) embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings + class HieraPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -469,9 +462,8 @@ def _init_weights(self, module, init_bias=0.02): nn.init.constant_(module.weight, 1.0) - - -@add_start_docstrings(""" +@add_start_docstrings( + """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. @@ -482,7 +474,7 @@ def _init_weights(self, module, init_bias=0.02): config ([`HieraConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
- + Example usage: >>> from your_model_file import Hiera, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) @@ -490,7 +482,8 @@ def _init_weights(self, module, init_bias=0.02): >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) - """) + """ +) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" @@ -531,9 +524,7 @@ def __init__(self, config: HieraConfig): assert self.q_pool < len(self.stages) self.q_pool, self.q_stride = self.q_pool, self.q_stride self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size - self.mask_spatial_shape = [ - i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) - ] + self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size)] self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( @@ -555,9 +546,7 @@ def __init__(self, config: HieraConfig): self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules - self.unroll = Unroll( - self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) - ) + self.unroll = Unroll(self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1])) self.reroll = Reroll( self.input_size, self.patch_stride, @@ -566,7 +555,7 @@ def __init__(self, config: HieraConfig): self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[: self.q_pool]] # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] @@ -619,7 +608,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - @torch.jit.ignore def no_weight_decay(self): if self.sep_position_embeddings: @@ -632,21 +620,19 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. """ - batch_size = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(batch_size , num_windows, device=x.device) + noise = torch.rand(batch_size, num_windows, device=x.device) # Sort noise for each sample - ids_shuffle = torch.argsort( - noise, dim=1 - ) # ascend: small is keep, large is remove + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove ids_restore = torch.argsort(ids_shuffle, dim=1) # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([batch_size , num_windows], device=x.device) + mask = torch.zeros([batch_size, num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) @@ -665,34 +651,34 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings - @add_start_docstrings_to_model_forward(""" + @add_start_docstrings_to_model_forward( + """ The forward pass for the Hiera model. Args: pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. 
- + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. - - - - """) - @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") + + + + """ + ) def forward( self, pixel_values: torch.Tensor, mask: torch.Tensor = None, return_dict: Optional[bool] = True, - return_intermediates: bool = False, + return_intermediates: bool = True, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - """ - """ + """ """ # Slowfast training passes in a list if isinstance(pixel_values, list): pixel_values = pixel_values[0] @@ -700,9 +686,7 @@ def forward( pached_embeddings = self.patch_embedding( pixel_values, - mask=mask.view( - pixel_values.shape[0], 1, *self.mask_spatial_shape - ) # batch_size , C, *mask_spatial_shape + mask=mask.view(pixel_values.shape[0], 1, *self.mask_spatial_shape) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) @@ -732,8 +716,8 @@ def forward( # intermediates[-1] is embeddings in spatial order if not return_dict: return tuple(v for v in [embeddings, intermediates] if v is not None) - + return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) \ No newline at end of file + ) From c677783fcfd38ea9b0771e61eaaa1091bf7850fa Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Mon, 26 Feb 2024 23:11:01 +0000 Subject: [PATCH 015/118] Integration & Block tests running --- tests/models/hiera/test_modeling_hiera.py | 265 +++++++++++++++++++--- 1 file changed, 235 insertions(+), 30 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 8d593af2a622..72badde557df 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -15,7 +15,8 @@ """ Testing suite for the PyTorch Hiera model. """ import unittest - +from typing import Tuple +from transformers.models.hiera.hiera_model import HieraBlock from transformers import HieraConfig from transformers.testing_utils import ( require_torch, @@ -23,65 +24,269 @@ torch_device, ) from transformers.utils import is_torch_available - +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD if is_torch_available(): import torch from transformers import HieraModel - # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model - from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST - - + from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + from torchvision.transforms.functional import InterpolationMode + from torchvision import transforms + from PIL import Image +import math class HieraModelTester: - # Define this tester to initialize Hiera model and its configurations for testing def __init__( self, parent, - batch_size=8, - num_channels=3, - image_size=224, - # Add other model-specific parameters here + input_size: Tuple[int, ...] 
= (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, ): self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - # Initialize other necessary attributes here + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings - def prepare_config_and_inputs(self): + def prepare_config_and_inputs(self,checkpoint_url): # Prepare configuration and inputs for testing your model - pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + pixel_values = torch.rand((1, self.in_chans, self.input_size[0], self.input_size[1])) - config = self.get_config() + config = self.get_config(checkpoint_url=checkpoint_url) return config, pixel_values - def get_config(self): - return HieraConfig( - # Define necessary configuration parameters here - ) + def get_config(self,checkpoint_url): + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), ) + + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=self.num_classes, + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = 
HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + else: + raise RuntimeError(f"Invalid checkpoint url ({checkpoint_url})") + + return config def create_and_check_model(self, config, pixel_values): + batch_size = 1 model = HieraModel(config=config) - model.to(torch_device) + num_patches = int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1)**2 + flat_q_stride = math.prod(self.q_stride) + embedding_dimension = self.embedding_dimension + indermediate_shapes = [] + for _ in self.stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 model.eval() with torch.no_grad(): result = model(pixel_values=pixel_values) - # Perform checks here, e.g., output shapes, etc. - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + for idx, x in enumerate(result.intermediates): + self.parent.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") @require_torch -class HieraModelTest(unittest.TestCase): +class HieraModelTest(): def setUp(self): self.model_tester = HieraModelTester(self) def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config_and_inputs = self.model_tester.prepare_config_and_inputs(model_name) + self.model_tester.create_and_check_model(*config_and_inputs) - @slow + # @slow def test_model_from_pretrained(self): for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = HieraModel.from_pretrained(model_name) - self.assertIsNotNone(model) \ No newline at end of file + self.assertIsNotNone(model) + +@require_torch +@slow +class HieraModelIntegrationTest(unittest.TestCase): + def test_forward(self): + torch_device = "cpu" + input_size = 224 + batch_size =1 + patch_kernel = (7,7) + patch_padding = (3,3) + patch_stride = (4,4) + q_stride = (2,2) + flat_q_stride = math.prod(q_stride) + stages=(2, 3, 16, 3) + embedding_dimension = 96 + model = HieraModel.from_pretrained("/home/ubuntu/home/hiera/model/") + model.to(torch_device) + + random_tensor = torch.rand(batch_size, 3, input_size, input_size) + num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1)**2 + + indermediate_shapes = [] + for _ in stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 + out = model(random_tensor) + + out.last_hidden_state.argmax(dim=-1).item() + + out = model(random_tensor, return_intermediates=True) + for idx, x in enumerate(out.intermediates): + self.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") + +class TestHieraBlock(unittest.TestCase): + def test_output_shape(self): + batch_size, input_dim, output_dim = 1, 96, 192 + number_of_heads = 2 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 4 + 
window_size = 16 + use_mask_unit_attention = True + num_patches = 3136 + + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches/q_stride, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect") + + def test_input_output_dim_equality(self): + batch_size, input_dim, output_dim = 1, 96, 96 + number_of_heads = 1 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 1 + window_size = 64 + use_mask_unit_attention = True + num_patches = 3136 + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect. Input shape should be equal to output shape") + + +if __name__ == '__main__': + test = HieraModelIntegrationTest() + test.test_forward() + block_test = TestHieraBlock() + block_test.test_output_shape() + block_test.test_input_output_dim_equality() + model_test = HieraModelTest() + model_test.setUp() + model_test.test_model() + model_test.test_model_from_pretrained() From 130b55b1f010ff22058d420dbaff3cf54e87db06 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 06:04:01 +0000 Subject: [PATCH 016/118] Fixed bugs --- src/transformers/__init__.py | 6 +++++- src/transformers/models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 8 ++++++-- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera_model.py => modeling_hiera.py} | 0 6 files changed, 14 insertions(+), 5 deletions(-) rename src/transformers/models/hiera/{hiera_model.py => modeling_hiera.py} (100%) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4d7ef6ce20d3..9d668babbec2 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6981,7 +6981,11 @@ GroupViTTextModel, GroupViTVisionModel, ) - from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, + HieraPreTrainedModel, + ) from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6f824a2e955d..10511e2ff47e 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera", "HieraModel"), + ("hiera", "Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0fc417e795e4..fb4d571632a4 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -501,6 +501,7 @@ ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), ("glpn", "GLPNModel"), + ("hiera", "HieraModel"), ("imagegpt", "ImageGPTModel"), ("levit", "LevitModel"), ("mobilenet_v1", "MobileNetV1Model"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index fcffbbf7593e..d32f0a934fea 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -35,7 +35,11 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] + _import_structure["modeling_hiera"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", + "HieraPreTrainedModel " + ] if TYPE_CHECKING: @@ -51,7 +55,7 @@ pass else: from .hiera_image_processor import HieraImageProcessor - from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 56b91bc7acb7..7c42c22734a1 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing +from .modeling_hiera import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/modeling_hiera.py similarity index 100% rename from src/transformers/models/hiera/hiera_model.py rename to src/transformers/models/hiera/modeling_hiera.py From 733c59e25212ba99ea00a83db780b034bbfa9376 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 00:06:17 +0000 Subject: [PATCH 017/118] initialized Structure --- src/transformers/models/__init__.py | 1 + src/transformers/models/hiera/__init__.py | 82 +++ src/transformers/models/hiera/benchmarking.py | 77 +++ .../models/hiera/configuration_hiera.py | 128 +++++ .../models/hiera/convert_hiera_to_pytorch.py | 27 + src/transformers/models/hiera/hiera.py | 535 ++++++++++++++++++ src/transformers/models/hiera/hiera_mae.py | 398 +++++++++++++ src/transformers/models/hiera/hiera_utils.py | 287 ++++++++++ 8 files changed, 1535 insertions(+) create mode 100644 src/transformers/models/hiera/__init__.py create mode 100644 src/transformers/models/hiera/benchmarking.py create mode 100644 src/transformers/models/hiera/configuration_hiera.py create mode 100644 src/transformers/models/hiera/convert_hiera_to_pytorch.py create mode 100644 src/transformers/models/hiera/hiera.py create mode 100644 src/transformers/models/hiera/hiera_mae.py create mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index ebb3db25fb96..5b9c5404fd7a 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -106,6 +106,7 @@ graphormer, groupvit, herbert, + hiera, hubert, ibert, idefics, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py new file mode 100644 index 000000000000..bfd200e9dcb9 --- /dev/null +++ 
b/src/transformers/models/hiera/__init__.py @@ -0,0 +1,82 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_tf_available, + is_torch_available, +) + + +_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_vit_mae"] = [ + "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTMAEForPreTraining", + "ViTMAELayer", + "ViTMAEModel", + "ViTMAEPreTrainedModel", + ] + +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_vit_mae"] = [ + "TFViTMAEForPreTraining", + "TFViTMAEModel", + "TFViTMAEPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_vit_mae import ( + VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTMAEForPreTraining, + ViTMAELayer, + ViTMAEModel, + ViTMAEPreTrainedModel, + ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py new file mode 100644 index 000000000000..33166028977a --- /dev/null +++ b/src/transformers/models/hiera/benchmarking.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- + +import time +from typing import List, Tuple, Union + +import torch +from tqdm import tqdm + +# From https://github.com/facebookresearch/ToMe/ +def benchmark( + model: torch.nn.Module, + device: torch.device = 0, + input_size: Tuple[int] = (3, 224, 224), + batch_size: int = 64, + runs: int = 40, + throw_out: float = 0.25, + use_fp16: bool = False, + verbose: bool = False, +) -> float: + """ + Benchmark the given model with random inputs at the given batch size. 
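# --- Illustrative usage sketch (not part of the patch): calling `benchmark` as defined
# --- in this file. The tiny Conv2d stand-in, CPU device and small run counts are
# --- assumptions for a quick smoke test; a real Hiera model would normally be passed.
import torch
from transformers.models.hiera.benchmarking import benchmark   # module path introduced by this patch

toy_model = torch.nn.Conv2d(3, 8, kernel_size=3)
images_per_second = benchmark(
    toy_model, device="cpu", input_size=(3, 224, 224), batch_size=2, runs=4, verbose=True
)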
+ + Args: + - model: the module to benchmark + - device: the device to use for benchmarking + - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) + - batch_size: the batch size to use for evaluation + - runs: the number of total runs to do + - throw_out: the percentage of runs to throw out at the start of testing + - use_fp16: whether or not to benchmark with float16 and autocast + - verbose: whether or not to use tqdm to print progress / print throughput at end + + Returns: + - the throughput measured in images / second + """ + if not isinstance(device, torch.device): + device = torch.device(device) + is_cuda = torch.device(device).type == "cuda" + + model = model.eval().to(device) + input = torch.rand(batch_size, *input_size, device=device) + if use_fp16: + input = input.half() + + warm_up = int(runs * throw_out) + total = 0 + start = time.time() + + with torch.autocast(device.type, enabled=use_fp16): + with torch.no_grad(): + for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): + if i == warm_up: + if is_cuda: + torch.cuda.synchronize() + total = 0 + start = time.time() + + model(input) + total += batch_size + + if is_cuda: + torch.cuda.synchronize() + + end = time.time() + elapsed = end - start + + throughput = total / elapsed + + if verbose: + print(f"Throughput: {throughput:.2f} im/s") + + return throughput diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py new file mode 100644 index 000000000000..de5de9e7d9e9 --- /dev/null +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -0,0 +1,128 @@ +""" hiera model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", + # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +} + + +class ViTMAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT + MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the ViT + [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + decoder_num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the decoder. + decoder_num_hidden_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the decoder. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. + mask_ratio (`float`, *optional*, defaults to 0.75): + The ratio of the number of masked tokens in the input sequence. + norm_pix_loss (`bool`, *optional*, defaults to `False`): + Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved + representation quality in the experiments of the authors. + + Example: + + ```python + >>> from transformers import ViTMAEConfig, ViTMAEModel + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = ViTMAEConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = ViTMAEModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vit_mae" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + decoder_num_attention_heads=16, + decoder_hidden_size=512, + decoder_num_hidden_layers=8, + decoder_intermediate_size=2048, + mask_ratio=0.75, + norm_pix_loss=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_hidden_size = decoder_hidden_size + self.decoder_num_hidden_layers = decoder_num_hidden_layers + self.decoder_intermediate_size = decoder_intermediate_size + self.mask_ratio = mask_ratio + self.norm_pix_loss 
= norm_pix_loss diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py new file mode 100644 index 000000000000..506507e4e66e --- /dev/null +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -0,0 +1,27 @@ +import argparse + +import requests +import torch +from PIL import Image + + + +def rename_key(name): + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "patch_embed.projection") + return name + + +def e(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + pass + else: + new_name = rename_key(key) + orig_state_dict[new_name] = val + + return orig_state_dict + + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py new file mode 100644 index 000000000000..35e8c93e160b --- /dev/null +++ b/src/transformers/models/hiera/hiera.py @@ -0,0 +1,535 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from functools import partial +from typing import List, Tuple, Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.models.layers import DropPath, Mlp + +from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll + + + +class MaskUnitAttention(nn.Module): + """ + Computes either Mask Unit or Global Attention. Also is able to perform q pooling. + + Note: this assumes the tokens have already been flattened and unrolled into mask units. + See `Unroll` for more details. + """ + + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + """ + Args: + - dim, dim_out: The input and output feature dimensions. + - heads: The number of attention heads. + - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). + - window_size: The current (flattened) size of a mask unit *after* pooling (if any). + - use_mask_unit_attn: Use Mask Unit or Global Attention. + """ + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.heads = heads + self.q_stride = q_stride + + self.head_dim = dim_out // heads + self.scale = (self.head_dim) ** -0.5 + + self.qkv = nn.Linear(dim, 3 * dim_out) + self.proj = nn.Linear(dim_out, dim_out) + + self.window_size = window_size + self.use_mask_unit_attn = use_mask_unit_attn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Input should be of shape [batch, tokens, channels]. 
""" + B, N, _ = x.shape + num_windows = ( + (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + ) + + qkv = ( + self.qkv(x) + .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .permute(3, 0, 4, 2, 1, 5) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + + if self.q_stride > 1: + # Refer to Unroll to see how this performs a maxpool-Nd + q = ( + q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + .max(dim=3) + .values + ) + + if hasattr(F, "scaled_dot_product_attention"): + # Note: the original paper did *not* use SDPA, it's a free boost! + x = F.scaled_dot_product_attention(q, k, v) + else: + attn = (q * self.scale) @ k.transpose(-1, -2) + attn = attn.softmax(dim=-1) + x = (attn @ v) + + x = x.transpose(1, 3).reshape(B, -1, self.dim_out) + x = self.proj(x) + return x + + +class HieraBlock(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + act_layer: nn.Module = nn.GELU, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + + self.norm1 = norm_layer(dim) + self.attn = MaskUnitAttention( + dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + ) + + self.norm2 = norm_layer(dim_out) + self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + + self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Attention + Q Pooling + x_norm = self.norm1(x) + if self.dim != self.dim_out: + x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) + x = x + self.drop_path(self.attn(x_norm)) + + # MLP + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Head(nn.Module): + def __init__( + self, + dim: int, + num_classes: int, + dropout_rate: float = 0.0, + act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), + ): + super().__init__() + self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() + self.projection = nn.Linear(dim, num_classes) + # act_fun for eval and testing only + self.act_func = act_func + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.dropout(x) + x = self.projection(x) + if not self.training: + x = self.act_func(x) + return x + + +class PatchEmbed(nn.Module): + """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" + + def __init__( + self, + dim_in: int, + dim_out: int, + kernel: Tuple[int, ...], + stride: Tuple[int, ...], + padding: Tuple[int, ...], + ): + super().__init__() + + # Support any number of spatial dimensions + self.spatial_dims = len(kernel) + self.proj = conv_nd(self.spatial_dims)( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward( + self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + x = do_masked_conv(x, self.proj, mask) + x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) + return x + + +class Hiera(nn.Module): + def __init__( + self, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embed_dim: int = 96, # initial embed dim + num_heads: int = 1, # initial number of heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] 
= (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_pos_embed: bool = False, + ): + super().__init__() + + depth = sum(stages) + self.patch_stride = patch_stride + self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + num_tokens = math.prod(self.tokens_spatial_shape) + flat_mu_size = math.prod(mask_unit_size) + flat_q_stride = math.prod(q_stride) + + assert q_pool < len(stages) + self.q_pool, self.q_stride = q_pool, q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + self.mask_spatial_shape = [ + i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) + ] + self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + + self.patch_embed = PatchEmbed( + in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + ) + + self.sep_pos_embed = sep_pos_embed + if sep_pos_embed: + self.pos_embed_spatial = nn.Parameter( + torch.zeros( + 1, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + embed_dim, + ) + ) + self.pos_embed_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + ) + else: + self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + + # Setup roll and reroll modules + self.unroll = Unroll( + input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + ) + self.reroll = Reroll( + input_size, + patch_stride, + [q_stride] * len(self.stage_ends[:-1]), + self.stage_ends, + q_pool, + ) + # q_pool locations + q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + + # Transformer blocks + cur_stage = 0 + self.blocks = nn.ModuleList() + + for i in range(depth): + dim_out = embed_dim + # Mask unit or global attention. 
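# --- Illustrative sketch (not part of the patch): how the surrounding loop schedules
# --- widths, heads and pooling per block for the default config (stages=(2, 3, 16, 3),
# --- embed_dim=96, num_heads=1, q_pool=3, q_stride=(2, 2), mask_unit_size=(8, 8)).
# --- All values below are derived from the code in this file, not hard-coded claims.
import math

stages, dim, heads, q_pool = (2, 3, 16, 3), 96, 1, 3
flat_q, window = math.prod((2, 2)), math.prod((8, 8))
stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]   # [1, 4, 20, 23]
q_pool_blocks = [end + 1 for end in stage_ends[:q_pool]]                # [2, 5, 21]
schedule = []
for i in range(sum(stages)):
    dim_out, heads = (dim * 2, heads * 2) if i - 1 in stage_ends else (dim, heads)
    if i in q_pool_blocks:                  # mask units shrink where q-pooling happens
        window //= flat_q
    schedule.append((i, dim, dim_out, heads, flat_q if i in q_pool_blocks else 1, window))
    dim = dim_out
# e.g. schedule[2] == (2, 96, 192, 2, 4, 16): the first block of stage 2 pools and widens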
+ # Lag by 1 block, so that global attention, + # applied post pooling on lower resolution + use_mask_unit_attn = mask_unit_attn[cur_stage] + + if i - 1 in self.stage_ends: + dim_out = int(embed_dim * dim_mul) + num_heads = int(num_heads * head_mul) + cur_stage += 1 + if i in q_pool_blocks: + flat_mu_size //= flat_q_stride + + block = HieraBlock( + dim=embed_dim, + dim_out=dim_out, + heads=num_heads, + mlp_ratio=mlp_ratio, + drop_path=dpr[i], + norm_layer=norm_layer, + q_stride=(flat_q_stride if i in q_pool_blocks else 1), + window_size=flat_mu_size, + use_mask_unit_attn=use_mask_unit_attn, + ) + + embed_dim = dim_out + self.blocks.append(block) + + self.norm = norm_layer(embed_dim) + self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + + # Initialize everything + if sep_pos_embed: + nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) + nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + else: + nn.init.trunc_normal_(self.pos_embed, std=0.02) + self.apply(partial(self._init_weights)) + self.head.projection.weight.data.mul_(head_init_scale) + self.head.projection.bias.data.mul_(head_init_scale) + + def _init_weights(self, m, init_bias=0.02): + if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, init_bias) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, init_bias) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + if self.sep_pos_embed: + return ["pos_embed_spatial", "pos_embed_temporal"] + else: + return ["pos_embed"] + + def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: + """ + Generates a random mask, mask_ratio fraction are dropped. + 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. + """ + B = x.shape[0] + # Tokens selected for masking at mask unit level + num_windows = math.prod(self.mask_spatial_shape) # num_mask_units + len_keep = int(num_windows * (1 - mask_ratio)) + noise = torch.rand(B, num_windows, device=x.device) + + # Sort noise for each sample + ids_shuffle = torch.argsort( + noise, dim=1 + ) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # Generate the binary mask: 1 is *keep*, 0 is *remove* + # Note this is opposite to original MAE + mask = torch.zeros([B, num_windows], device=x.device) + mask[:, :len_keep] = 1 + # Unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return mask.bool() + + def get_pos_embed(self) -> torch.Tensor: + if self.sep_pos_embed: + return self.pos_embed_spatial.repeat( + 1, self.tokens_spatial_shape[0], 1 + ) + torch.repeat_interleave( + self.pos_embed_temporal, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + dim=1, + ) + else: + return self.pos_embed + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor = None, + return_intermediates: bool = False, + ) -> torch.Tensor: + """ + mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
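# --- Illustrative sketch (not part of the patch): building the boolean mask expected by
# --- this forward pass by hand, mirroring `get_random_mask` above. For a 224x224 input
# --- with patch_stride=(4, 4) and mask_unit_size=(8, 8) there are (224 // 4 // 8) ** 2 = 49
# --- mask units; the 0.6 mask ratio matches the MAE default used in hiera_mae.py.
import torch

batch_size, num_mask_units, mask_ratio = 2, 49, 0.6
len_keep = int(num_mask_units * (1 - mask_ratio))            # 19 mask units kept per sample
noise = torch.rand(batch_size, num_mask_units)
ids_restore = torch.argsort(torch.argsort(noise, dim=1), dim=1)
mask = torch.zeros(batch_size, num_mask_units)
mask[:, :len_keep] = 1                                       # 1 = keep, 0 = remove
mask = torch.gather(mask, dim=1, index=ids_restore).bool()
assert mask.sum(dim=-1).eq(len_keep).all()                   # same keep count across the batch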
+ """ + # Slowfast training passes in a list + if isinstance(x, list): + x = x[0] + intermediates = [] + + x = self.patch_embed( + x, + mask=mask.view( + x.shape[0], 1, *self.mask_spatial_shape + ) # B, C, *mask_spatial_shape + if mask is not None + else None, + ) + x = x + self.get_pos_embed() + x = self.unroll(x) + + # Discard masked tokens + if mask is not None: + x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( + x.shape[0], -1, x.shape[-1] + ) + + for i, blk in enumerate(self.blocks): + x = blk(x) + + if return_intermediates and i in self.stage_ends: + intermediates.append(self.reroll(x, i, mask=mask)) + + if mask is None: + x = x.mean(dim=1) + x = self.norm(x) + x = self.head(x) + + # x may not always be in spatial order here. + # e.g. if q_pool = 2, mask_unit_size = (8, 8), and + # q_stride = (2, 2), not all unrolls were consumed, + # intermediates[-1] is x in spatial order + if return_intermediates: + return x, intermediates + + return x + + +# Image models + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_tiny_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_small_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_plus_224(**kwdargs): + return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_large_224(**kwdargs): + return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_huge_224(**kwdargs): + return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + + +# Video models + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_16x224(num_classes: int = 400, **kwdargs): + return Hiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_plus_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_large_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_huge_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py new file mode 100644 index 000000000000..64c69cc89d71 --- /dev/null +++ b/src/transformers/models/hiera/hiera_mae.py @@ -0,0 +1,398 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# mae: https://github.com/facebookresearch/mae +# slowfast: https://github.com/facebookresearch/SlowFast +# -------------------------------------------------------- + + +from functools import partial +from typing import Tuple, Optional + +import math +import torch +import torch.nn as nn + +from .hiera import Hiera, HieraBlock +from .hiera_utils import pretrained_model, undo_windowing, conv_nd + + +def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: + if isinstance(head, nn.Identity): + return x + + B, num_mask_units = x.shape[0:2] + # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) + x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + + # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + permute = [0] + list(range(2, len(x.shape))) + [1] + x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + return x + + +class MaskedAutoencoderHiera(Hiera): + """Masked Autoencoder with Hiera backbone""" + + def __init__( + self, + in_chans: int = 3, + patch_stride: Tuple[int, ...] 
= (4, 4), + mlp_ratio: float = 4.0, + decoder_embed_dim: int = 512, + decoder_depth: int = 8, + decoder_num_heads: int = 16, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + **kwdargs, + ): + super().__init__( + in_chans=in_chans, + patch_stride=patch_stride, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + **kwdargs, + ) + + del self.norm, self.head + encoder_dim_out = self.blocks[-1].dim_out + self.encoder_norm = norm_layer(encoder_dim_out) + self.mask_unit_spatial_shape_final = [ + i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) + ] + self.tokens_spatial_shape_final = [ + i // s ** (self.q_pool) + for i, s in zip(self.tokens_spatial_shape, self.q_stride) + ] + # -------------------------------------------------------------------------- + # Multi-scale fusion heads + curr_mu_size = self.mask_unit_size + self.multi_scale_fusion_heads = nn.ModuleList() + + for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool + kernel = [ + i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) + ] + curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] + self.multi_scale_fusion_heads.append( + conv_nd(len(self.q_stride))( + self.blocks[i].dim_out, + encoder_dim_out, + kernel_size=kernel, + stride=kernel, + ) + ) + self.multi_scale_fusion_heads.append(nn.Identity()) # final stage, no transform + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = nn.Linear(encoder_dim_out, decoder_embed_dim) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) + + self.decoder_pos_embed = nn.Parameter( + torch.zeros( + 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim + ) + ) + + self.decoder_blocks = nn.ModuleList( + [ + HieraBlock( + dim=decoder_embed_dim, + dim_out=decoder_embed_dim, + heads=decoder_num_heads, + norm_layer=norm_layer, + mlp_ratio=mlp_ratio, + ) + for i in range(decoder_depth) + ] + ) + self.decoder_norm = norm_layer(decoder_embed_dim) + + self.pred_stride = patch_stride[-1] * ( + self.q_stride[-1] ** self.q_pool + ) # patch stride of prediction + + self.decoder_pred = nn.Linear( + decoder_embed_dim, + (self.pred_stride ** min(2, len(self.q_stride))) * in_chans, + ) # predictor + # -------------------------------------------------------------------------- + + self.initialize_weights() + + def initialize_weights(self): + nn.init.trunc_normal_(self.mask_token, std=0.02) + nn.init.trunc_normal_(self.decoder_pos_embed, std=0.02) + self.apply(self._mae_init_weights) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + def _mae_init_weights(self, m: nn.Module): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_pixel_label_2d( + self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + input_img = input_img.permute(0, 2, 3, 1) + + size = self.pred_stride + label = input_img.unfold(1, size, size).unfold(2, size, size) + label = label.flatten(1, 2).flatten(2) + label = label[mask] + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + 
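# --- Illustrative sketch (not part of the patch): what the unfold + normalisation above
# --- produces. With the image MAE configs (patch_stride 4, q_stride 2, q_pool 2) the
# --- pred_stride is 4 * 2**2 = 16, so a 224x224 image gives 14*14 = 196 target patches
# --- of 16*16*3 = 768 values each; the random image below is only for shape checking.
import torch

image = torch.rand(1, 224, 224, 3)                            # channels-last, as permuted above
size = 16
patches = image.unfold(1, size, size).unfold(2, size, size)   # [1, 14, 14, 3, 16, 16]
patches = patches.flatten(1, 2).flatten(2)                    # [1, 196, 768]
normed = (patches - patches.mean(dim=-1, keepdim=True)) / (patches.var(dim=-1, keepdim=True) + 1.0e-6) ** 0.5
assert normed.shape == (1, 196, 768)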
return label + + def get_pixel_label_3d( + self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + + # We use time strided loss, only take the first frame from each token + input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + + size = self.pred_stride + label = input_vid.unfold(3, size, size).unfold(4, size, size) + label = label.permute(0, 2, 3, 4, 5, 6, 1) # Different from 2d, mistake during training lol + label = label.flatten(1, 3).flatten(2) + label = label[mask] + + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + return label + + + def forward_encoder( + self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if mask is None: + mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + + # Get multi-scale representations from encoder + _, intermediates = super().forward(x, mask, return_intermediates=True) + # Resolution unchanged after q_pool stages, so skip those features + intermediates = intermediates[: self.q_pool] + intermediates[-1:] + + # Multi-scale fusion + x = 0.0 + for head, interm_x in zip(self.multi_scale_fusion_heads, intermediates): + x += apply_fusion_head(head, interm_x) + + x = self.encoder_norm(x) + + return x, mask + + def forward_decoder( + self, x: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Embed tokens + x = self.decoder_embed(x) + + # Combine visible and mask tokens + + # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [B, #MUs_all] + x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) + mask_tokens = self.mask_token.view( + (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) + ) + mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) + mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() + x_dec[mask] = x.flatten() + x_dec = ~mask * mask_tokens + mask * x_dec + + # Get back spatial order + x = undo_windowing( + x_dec, + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + mask = undo_windowing( + mask[..., 0:1], + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + + # Flatten + x = x.reshape(x.shape[0], -1, x.shape[-1]) + mask = mask.view(x.shape[0], -1) + + # Add pos embed + x = x + self.decoder_pos_embed + + # Apply decoder blocks + for blk in self.decoder_blocks: + x = blk(x) + x = self.decoder_norm(x) + + # Predictor projection + x = self.decoder_pred(x) + + return x, mask + + def forward_loss( + self, x: torch.Tensor, pred: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Note: in mask, 0 is *visible*, 1 is *masked* + + x: e.g. 
[B, 3, H, W] + pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + """ + if len(self.q_stride) == 2: + label = self.get_pixel_label_2d(x, mask) + elif len(self.q_stride) == 3: + label = self.get_pixel_label_3d(x, mask) + else: + raise NotImplementedError + + pred = pred[mask] + loss = (pred - label) ** 2 + + return loss.mean(), pred, label + + def forward( + self, + x: torch.Tensor, + mask_ratio: float = 0.6, + mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) + pred, pred_mask = self.forward_decoder( + latent, mask + ) # pred_mask is mask at resolution of *prediction* + + # Toggle mask, to generate labels for *masked* tokens + return *self.forward_loss(x, pred, ~pred_mask), mask + + + + +# Image Models + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k") +def mae_hiera_tiny_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k") +def mae_hiera_small_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k") +def mae_hiera_base_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k") +def mae_hiera_base_plus_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k") +def mae_hiera_large_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k") +def mae_hiera_huge_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + + +# Video Models + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400") +def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): + return MaskedAutoencoderHiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + q_pool=2, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_base_plus_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_large_16x224(**kwdargs): + return 
mae_hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400") +def mae_hiera_huge_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py new file mode 100644 index 000000000000..992c03e08079 --- /dev/null +++ b/src/transformers/models/hiera/hiera_utils.py @@ -0,0 +1,287 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from typing import List, Tuple, Optional, Type, Callable, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from .convert_hiera_to_pytorch import e + +def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: + """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + + def inner(model_func: Callable) -> Callable: + def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + # state_dict["model_state"] = e(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if "num_classes" not in kwdargs: + kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = model_func(**kwdargs) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + return model + + return model_def + + return inner + + + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. 
+ E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. + self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. 
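# --- Illustrative sketch (not part of the patch): the re-ordering described above, checked
# --- against a plain 2x2 max-pool on a tiny [B, H, W, C] token grid. Sizes are assumptions;
# --- this mirrors one (2, 2) step of the unroll schedule, not the full Unroll/Reroll classes.
import torch
import torch.nn.functional as F

B, H, W, C, Sy, Sx = 1, 4, 4, 3, 2, 2
grid = torch.randn(B, H, W, C)

unrolled = (
    grid.view(B, H // Sy, Sy, W // Sx, Sx, C)   # split each spatial dim by its stride
    .permute(0, 2, 4, 1, 3, 5)                  # move the strides in front of the spatial dims
    .reshape(B, Sy * Sx, -1, C)
)
pooled = unrolled.max(dim=1).values.reshape(B, H // Sy, W // Sx, C)

reference = F.max_pool2d(grid.permute(0, 3, 1, 2), kernel_size=2).permute(0, 2, 3, 1)
assert torch.allclose(pooled, reference)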
+ """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x \ No newline at end of file From 86a43eddd60f08808a795d10e6565e43148dabf0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 02:17:36 +0000 Subject: [PATCH 018/118] Updated variable names --- .../models/hiera/convert_hiera_to_pytorch.py | 30 +-- src/transformers/models/hiera/hiera.py | 200 +++++++++--------- src/transformers/models/hiera/hiera_mae.py | 42 ++-- src/transformers/models/hiera/hiera_utils.py | 6 +- 4 files changed, 141 insertions(+), 137 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 506507e4e66e..f1d0c4135796 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -7,21 +7,25 @@ def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "patch_embed.projection") + # if "patch_embed.proj" in name: + # name = name.replace("patch_embed.proj", "patch_embed.projection") + # # elif "block.proj" in name: + # # name = name.replace("block.proj", "block.projection") + # elif "attn.proj" in name: + # name = name.replace("attn.proj", "attn.projection") + if ".proj." in name: + name = name.replace(".proj.", ".projection.") + if "attn" in name: + name = name.replace("attn", "attention") + if "pos_embed" in name: + name = name.replace("pos_embed", "position_embeddings") + if "patch_embed" in name: + name = name.replace("patch_embed", "patch_embedding") return name -def e(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - pass - else: - new_name = rename_key(key) - orig_state_dict[new_name] = val - - return orig_state_dict +def convert_state_dict(orig_state_dict, config): + updated_model_state = {rename_key(k): v for k, v in orig_state_dict.items()} + return updated_model_state diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 35e8c93e160b..fcb04f68934e 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -42,47 +42,47 @@ class MaskUnitAttention(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): """ Args: - - dim, dim_out: The input and output feature dimensions. - - heads: The number of attention heads. 
+ - input_dim, output_dim: The input and output feature dimensions. + - number_of_heads: The number of attention number_of_heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - - use_mask_unit_attn: Use Mask Unit or Global Attention. + - use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() - self.dim = dim - self.dim_out = dim_out - self.heads = heads + self.input_dim = input_dim + self.output_dim = output_dim + self.number_of_heads = number_of_heads self.q_stride = q_stride - self.head_dim = dim_out // heads + self.head_dim = output_dim // number_of_heads self.scale = (self.head_dim) ** -0.5 - self.qkv = nn.Linear(dim, 3 * dim_out) - self.proj = nn.Linear(dim_out, dim_out) + self.qkv = nn.Linear(input_dim, 3 * output_dim) + self.projection = nn.Linear(output_dim, output_dim) self.window_size = window_size - self.use_mask_unit_attn = use_mask_unit_attn + self.use_mask_unit_attention = use_mask_unit_attention def forward(self, x: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. """ - B, N, _ = x.shape + batch_size , num_channels , _ = x.shape num_windows = ( - (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( self.qkv(x) - .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -90,7 +90,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -99,52 +99,52 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Note: the original paper did *not* use SDPA, it's a free boost! 
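# --- Illustrative sketch (not part of the patch): the fused call below and the manual
# --- fallback in the else-branch compute the same attention; the shapes are assumptions
# --- matching the [batch, heads, windows, tokens, head_dim] layout used here.
import torch
import torch.nn.functional as F

q = torch.randn(1, 2, 1, 16, 32)
k = torch.randn(1, 2, 1, 16, 32)
v = torch.randn(1, 2, 1, 16, 32)
scale = 32 ** -0.5                      # 1 / sqrt(head_dim), as in self.scale

fused = F.scaled_dot_product_attention(q, k, v)
manual = ((q * scale) @ k.transpose(-1, -2)).softmax(dim=-1) @ v
assert torch.allclose(fused, manual, atol=1e-5)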
x = F.scaled_dot_product_attention(q, k, v) else: - attn = (q * self.scale) @ k.transpose(-1, -2) - attn = attn.softmax(dim=-1) - x = (attn @ v) + attention = (q * self.scale) @ k.transpose(-1, -2) + attention = attention.softmax(dim=-1) + x = (attention @ v) - x = x.transpose(1, 3).reshape(B, -1, self.dim_out) - x = self.proj(x) + x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + x = self.projection(x) return x class HieraBlock(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, mlp_ratio: float = 4.0, drop_path: float = 0.0, norm_layer: nn.Module = nn.LayerNorm, act_layer: nn.Module = nn.GELU, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): super().__init__() - self.dim = dim - self.dim_out = dim_out + self.input_dim = input_dim + self.output_dim = output_dim - self.norm1 = norm_layer(dim) - self.attn = MaskUnitAttention( - dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + self.norm1 = norm_layer(input_dim) + self.attention = MaskUnitAttention( + input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention ) - self.norm2 = norm_layer(dim_out) - self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + self.norm2 = norm_layer(output_dim) + self.mlp = Mlp(output_dim, int(output_dim * mlp_ratio), act_layer=act_layer) self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() - if dim != dim_out: - self.proj = nn.Linear(dim, dim_out) + if input_dim != output_dim: + self.projection = nn.Linear(input_dim, output_dim) def forward(self, x: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - x_norm = self.norm1(x) - if self.dim != self.dim_out: - x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) - x = x + self.drop_path(self.attn(x_norm)) + normalized_input = self.norm1(x) + if self.input_dim != self.output_dim: + x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) + x = x + self.drop_path(self.attention(normalized_input)) # MLP x = x + self.drop_path(self.mlp(self.norm2(x))) @@ -154,14 +154,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Head(nn.Module): def __init__( self, - dim: int, + input_dim: int, num_classes: int, dropout_rate: float = 0.0, act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), ): super().__init__() self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - self.projection = nn.Linear(dim, num_classes) + self.projection = nn.Linear(input_dim, num_classes) # act_fun for eval and testing only self.act_func = act_func @@ -173,13 +173,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchEmbed(nn.Module): - """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" +class PatchEmbedding(nn.Module): + """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" def __init__( self, dim_in: int, - dim_out: int, + output_dim: int, kernel: Tuple[int, ...], stride: Tuple[int, ...], padding: Tuple[int, ...], @@ -188,9 +188,9 @@ def __init__( # Support any number of spatial dimensions self.spatial_dims = len(kernel) - self.proj = conv_nd(self.spatial_dims)( + self.projection = conv_nd(self.spatial_dims)( dim_in, - dim_out, + output_dim, kernel_size=kernel, stride=stride, padding=padding, @@ -199,7 +199,7 @@ def __init__( def forward( self, x: torch.Tensor, mask: 
Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.proj, mask) + x = do_masked_conv(x, self.projection, mask) x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) return x @@ -209,8 +209,8 @@ def __init__( self, input_size: Tuple[int, ...] = (224, 224), in_chans: int = 3, - embed_dim: int = 96, # initial embed dim - num_heads: int = 1, # initial number of heads + embedding_dimention: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads num_classes: int = 1000, stages: Tuple[int, ...] = (2, 3, 16, 3), q_pool: int = 3, # number of q_pool stages @@ -228,7 +228,7 @@ def __init__( norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), head_dropout: float = 0.0, head_init_scale: float = 0.001, - sep_pos_embed: bool = False, + sep_position_embeddings: bool = False, ): super().__init__() @@ -247,24 +247,24 @@ def __init__( ] self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] - self.patch_embed = PatchEmbed( - in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + self.patch_embedding = PatchEmbedding( + in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding ) - self.sep_pos_embed = sep_pos_embed - if sep_pos_embed: - self.pos_embed_spatial = nn.Parameter( + self.sep_position_embeddings = sep_position_embeddings + if sep_position_embeddings: + self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embed_dim, + embedding_dimention, ) ) - self.pos_embed_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + self.position_embeddings_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) ) else: - self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) # Setup roll and reroll modules self.unroll = Unroll( @@ -287,43 +287,43 @@ def __init__( self.blocks = nn.ModuleList() for i in range(depth): - dim_out = embed_dim + output_dim = embedding_dimention # Mask unit or global attention. 
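For intuition about the bookkeeping this constructor performs with its 2-D defaults (224x224 input, 7x7 patch kernel with stride 4 and padding 3, 8x8 mask units, stages (2, 3, 16, 3), q_pool 3), here is a small standalone sketch that reproduces only the arithmetic, not the model:

```python
import torch
import torch.nn as nn

input_size, patch_stride, mask_unit_size = (224, 224), (4, 4), (8, 8)
stages, q_pool = (2, 3, 16, 3), 3

# Token grid produced by the patch-embedding conv (7x7 kernel, stride 4, padding 3);
# flatten(2).transpose(2, 1) mirrors the reshape/transpose in PatchEmbedding.forward
patch_embed = nn.Conv2d(3, 96, kernel_size=7, stride=4, padding=3)
tokens = patch_embed(torch.zeros(1, 3, *input_size)).flatten(2).transpose(2, 1)
print(tokens.shape)  # torch.Size([1, 3136, 96]) -> a 56x56 token grid

tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)]
mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, mask_unit_size)]
print(tokens_spatial_shape, mask_spatial_shape)  # [56, 56] [7, 7] -> 49 mask units

# Last block index of each stage, and the blocks where q-pooling happens
stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
q_pool_blocks = [x + 1 for x in stage_ends[:q_pool]]
print(stage_ends, q_pool_blocks)  # [1, 4, 20, 23] [2, 5, 21]
```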
# Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attn = mask_unit_attn[cur_stage] + use_mask_unit_attention = mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - dim_out = int(embed_dim * dim_mul) - num_heads = int(num_heads * head_mul) + output_dim = int(embedding_dimention * dim_mul) + number_of_heads = int(number_of_heads * head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride block = HieraBlock( - dim=embed_dim, - dim_out=dim_out, - heads=num_heads, + input_dim=embedding_dimention, + output_dim=output_dim, + number_of_heads=number_of_heads, mlp_ratio=mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), window_size=flat_mu_size, - use_mask_unit_attn=use_mask_unit_attn, + use_mask_unit_attention=use_mask_unit_attention, ) - embed_dim = dim_out + embedding_dimention = output_dim self.blocks.append(block) - self.norm = norm_layer(embed_dim) - self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(embedding_dimention) + self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) # Initialize everything - if sep_pos_embed: - nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) - nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + if sep_position_embeddings: + nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) + nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: - nn.init.trunc_normal_(self.pos_embed, std=0.02) + nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) self.head.projection.weight.data.mul_(head_init_scale) self.head.projection.bias.data.mul_(head_init_scale) @@ -339,21 +339,21 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_pos_embed: - return ["pos_embed_spatial", "pos_embed_temporal"] + if self.sep_position_embeddings: + return ["position_embeddings_spatial", "position_embeddings_temporal"] else: - return ["pos_embed"] + return ["position_embeddings"] def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: """ Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. 
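The body of get_random_mask below draws per-sample noise, argsorts it, keeps the first len_keep mask units, and scatters the result back, so every sample keeps exactly the same number of units and only their positions differ. A self-contained sketch of that scheme with toy sizes:

```python
import torch

torch.manual_seed(0)
batch_size, num_windows, mask_ratio = 4, 49, 0.6  # e.g. a 7x7 grid of mask units
len_keep = int(num_windows * (1 - mask_ratio))

noise = torch.rand(batch_size, num_windows)
ids_shuffle = torch.argsort(noise, dim=1)        # random permutation per sample
ids_restore = torch.argsort(ids_shuffle, dim=1)  # inverse permutation

mask = torch.zeros(batch_size, num_windows)
mask[:, :len_keep] = 1                           # 1 = keep, 0 = remove
mask = torch.gather(mask, dim=1, index=ids_restore).bool()

print(mask.sum(dim=-1))  # tensor([19, 19, 19, 19]) -> same count for every sample
```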
""" - B = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(B, num_windows, device=x.device) + noise = torch.rand(batch_size , num_windows, device=x.device) # Sort noise for each sample ids_shuffle = torch.argsort( @@ -363,24 +363,24 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([B, num_windows], device=x.device) + mask = torch.zeros([batch_size , num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) return mask.bool() - def get_pos_embed(self) -> torch.Tensor: - if self.sep_pos_embed: - return self.pos_embed_spatial.repeat( + def get_position_embeddings(self) -> torch.Tensor: + if self.sep_position_embeddings: + return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( - self.pos_embed_temporal, + self.position_embeddings_temporal, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], dim=1, ) else: - return self.pos_embed + return self.position_embeddings def forward( self, @@ -389,7 +389,7 @@ def forward( return_intermediates: bool = False, ) -> torch.Tensor: """ - mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list @@ -397,15 +397,15 @@ def forward( x = x[0] intermediates = [] - x = self.patch_embed( + x = self.patch_embedding( x, mask=mask.view( x.shape[0], 1, *self.mask_spatial_shape - ) # B, C, *mask_spatial_shape + ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_pos_embed() + x = x + self.get_position_embeddings() x = self.unroll(x) # Discard masked tokens @@ -442,7 +442,7 @@ def forward( "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") def hiera_tiny_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) @pretrained_model({ @@ -450,7 +450,7 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ @@ -458,7 +458,7 @@ def hiera_small_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -466,7 +466,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +474,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +482,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -500,7 +500,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): patch_kernel=(3, 7, 7), patch_stride=(2, 4, 4), patch_padding=(1, 3, 3), - sep_pos_embed=True, + sep_position_embeddings=True, **kwdargs ) @@ -511,7 +511,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +521,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +531,5 @@ def 
hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 64c69cc89d71..a0504997350b 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -25,14 +25,14 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - B, num_mask_units = x.shape[0:2] - # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + batch_size , num_mask_units = x.shape[0:2] + # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) - # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -132,7 +132,7 @@ def initialize_weights(self): self.apply(self._mae_init_weights) # initialize patch_embed like nn.Linear (instead of nn.Conv2d) - w = self.patch_embed.proj.weight.data + w = self.patch_embed.projection.weight.data nn.init.xavier_uniform_(w.view([w.shape[0], -1])) def _mae_init_weights(self, m: nn.Module): @@ -188,7 +188,7 @@ def forward_encoder( ) -> Tuple[torch.Tensor, torch.Tensor]: if mask is None: - mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] # Get multi-scale representations from encoder _, intermediates = super().forward(x, mask, return_intermediates=True) @@ -212,8 +212,8 @@ def forward_decoder( # Combine visible and mask tokens - # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] - # mask: [B, #MUs_all] + # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) mask_tokens = self.mask_token.view( (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) @@ -258,9 +258,9 @@ def forward_loss( """ Note: in mask, 0 is *visible*, 1 is *masked* - x: e.g. [B, 3, H, W] - pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + x: e.g. 
[batch_size , 3, H, W] + pred: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] """ if len(self.q_stride) == 2: label = self.get_pixel_label_2d(x, mask) @@ -299,7 +299,7 @@ def forward( }, default="mae_in1k") def mae_hiera_tiny_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, ) @@ -308,7 +308,7 @@ def mae_hiera_tiny_224(**kwargs): }, default="mae_in1k") def mae_hiera_small_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, ) @@ -317,7 +317,7 @@ def mae_hiera_small_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -326,7 +326,7 @@ def mae_hiera_base_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_plus_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -335,7 +335,7 @@ def mae_hiera_base_plus_224(**kwargs): }, default="mae_in1k") def mae_hiera_large_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -344,7 +344,7 @@ def mae_hiera_large_224(**kwargs): }, default="mae_in1k") def mae_hiera_huge_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -375,7 +375,7 @@ def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): @pretrained_model(None) def mae_hiera_base_plus_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -385,7 +385,7 @@ def mae_hiera_base_plus_16x224(**kwdargs): @pretrained_model(None) def mae_hiera_large_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -394,5 +394,5 @@ def mae_hiera_large_16x224(**kwdargs): }, default="mae_k400") def mae_hiera_huge_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index 992c03e08079..c96c63cbfaf9 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -24,7 +24,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .convert_hiera_to_pytorch import e +from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
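The loader body below passes the downloaded weights through convert_state_dict, which applies the rename_key substitutions from convert_hiera_to_pytorch.py in this patch (.proj. -> .projection., attn -> attention, pos_embed -> position_embeddings, patch_embed -> patch_embedding). A quick standalone illustration on a few representative key names (the sample keys are illustrative, not taken from a real checkpoint):

```python
def rename_key(name: str) -> str:
    # Same substitutions as convert_hiera_to_pytorch.rename_key in this patch
    if ".proj." in name:
        name = name.replace(".proj.", ".projection.")
    if "attn" in name:
        name = name.replace("attn", "attention")
    if "pos_embed" in name:
        name = name.replace("pos_embed", "position_embeddings")
    if "patch_embed" in name:
        name = name.replace("patch_embed", "patch_embedding")
    return name

for key in ["patch_embed.proj.weight", "blocks.0.attn.qkv.weight", "pos_embed"]:
    print(f"{key} -> {rename_key(key)}")
# patch_embed.proj.weight -> patch_embedding.projection.weight
# blocks.0.attn.qkv.weight -> blocks.0.attention.qkv.weight
# pos_embed -> position_embeddings
```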
""" @@ -40,7 +40,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - # state_dict["model_state"] = e(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if "num_classes" not in kwdargs: @@ -53,7 +53,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool model = model_func(**kwdargs) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False model.load_state_dict(state_dict["model_state"], strict=strict) From 1433a7c6e835b9f01c092734297841137935fbf1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 08:10:34 +0000 Subject: [PATCH 019/118] Added Config class, basic HF setup, convert_to_hf --- src/transformers/__init__.py | 6 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/hiera/__init__.py | 157 ++++++++----- .../models/hiera/configuration_hiera.py | 193 +++++++--------- .../models/hiera/convert_hiera_to_pytorch.py | 212 ++++++++++++++++++ src/transformers/models/hiera/hiera.py | 129 +++++------ 6 files changed, 470 insertions(+), 230 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 027cf495466c..40c0a56362ac 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -497,6 +497,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], + "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -5280,6 +5281,7 @@ GroupViTVisionConfig, ) from .models.herbert import HerbertTokenizer + from .models.hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig from .models.idefics import ( @@ -6983,6 +6985,10 @@ HubertModel, HubertPreTrainedModel, ) + from .models.hiera import ( + Hiera, + HieraBlock + ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 7bc637f3e106..ed75e74ebfce 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -117,6 +117,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), + ("hiera","HieraConfig") ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -352,6 +353,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") 
("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -588,6 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), + ("hiera","Hiera") ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index bfd200e9dcb9..3ea6efb0056a 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -1,28 +1,18 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import TYPE_CHECKING from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, - is_flax_available, - is_tf_available, is_torch_available, ) -_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} +_import_structure = { + "configuration_hiera": [ + "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HireaConfig", + ], +} try: if not is_torch_available(): @@ -30,28 +20,20 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_vit_mae"] = [ - "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", - "ViTMAEForPreTraining", - "ViTMAELayer", - "ViTMAEModel", - "ViTMAEPreTrainedModel", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_vit_mae"] = [ - "TFViTMAEForPreTraining", - "TFViTMAEModel", - "TFViTMAEPreTrainedModel", + _import_structure["hirea"] = [ + "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hirea", + "Head", + "HieraBlock", + "MaskUnitAttention" + "" ] if TYPE_CHECKING: - from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + from .configuration_hiera import ( + HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, + HieraConfig, + ) try: if not is_torch_available(): @@ -59,24 +41,99 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_vit_mae import ( - VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, - ViTMAEForPreTraining, - ViTMAELayer, - ViTMAEModel, - ViTMAEPreTrainedModel, + from .hiera import ( + Hiera, + Head, + HieraBlock, + MaskUnitAttention, ) - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - else: import sys sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + +####### PREV: + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# from typing import TYPE_CHECKING + +# from ...utils import ( +# OptionalDependencyNotAvailable, +# _LazyModule, +# is_flax_available, +# is_tf_available, +# is_torch_available, +# ) + + +# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_vit_mae"] = [ +# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", +# "ViTMAEForPreTraining", +# "ViTMAELayer", +# "ViTMAEModel", +# "ViTMAEPreTrainedModel", +# ] + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_tf_vit_mae"] = [ +# "TFViTMAEForPreTraining", +# "TFViTMAEModel", +# "TFViTMAEPreTrainedModel", +# ] + +# if TYPE_CHECKING: +# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_vit_mae import ( +# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, +# ViTMAEForPreTraining, +# ViTMAELayer, +# ViTMAEModel, +# ViTMAEPreTrainedModel, +# ) + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +# else: +# import sys + +# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index de5de9e7d9e9..c7dfaeaeedfb 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -2,127 +2,108 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging - +from typing import Tuple logger = logging.get_logger(__name__) -VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", - # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + } -class ViTMAEConfig(PretrainedConfig): +class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT - MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the ViT - [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with + the defaults will yield a similar configuration to that of the Hiera + [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. - decoder_num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the decoder. - decoder_hidden_size (`int`, *optional*, defaults to 512): - Dimensionality of the decoder. - decoder_num_hidden_layers (`int`, *optional*, defaults to 8): - Number of hidden layers in the decoder. - decoder_intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. - mask_ratio (`float`, *optional*, defaults to 0.75): - The ratio of the number of masked tokens in the input sequence. - norm_pix_loss (`bool`, *optional*, defaults to `False`): - Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved - representation quality in the experiments of the authors. - - Example: - - ```python - >>> from transformers import ViTMAEConfig, ViTMAEModel - - >>> # Initializing a ViT MAE vit-mae-base style configuration - >>> configuration = ViTMAEConfig() - - >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = ViTMAEModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "vit_mae" - + input_size (Tuple[int, ...], optional): Dimensions of the input image (height, width). Defaults to (224, 224). + in_chans (int, optional): Number of input channels. Defaults to 3. 
+ embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. + number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. + num_classes (int, optional): Number of output classes. Defaults to 1000. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. + q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). + mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). + dim_mul (float, optional): Factor for increasing the dimensionality through the network. Defaults to 2.0. + head_mul (float, optional): Factor for increasing the number of heads through the network. Defaults to 2.0. + patch_kernel (Tuple[int, ...], optional): Kernel size for patch embedding. Defaults to (7, 7). + patch_stride (Tuple[int, ...], optional): Stride for patch embedding. Defaults to (4, 4). + patch_padding (Tuple[int, ...], optional): Padding for patch embedding. Defaults to (3, 3). + mlp_ratio (float, optional): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. + drop_path_rate (float, optional): Dropout rate for stochastic depth. Defaults to 0.0. + head_dropout (float, optional): Dropout rate for attention heads. Defaults to 0.0. + head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. + sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. + + + Example: + ```python + >>> from transformers import HieraConfig, Hiera + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = HieraConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = Hiera(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "hiera" def __init__( self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - decoder_num_attention_heads=16, - decoder_hidden_size=512, - decoder_num_hidden_layers=8, - decoder_intermediate_size=2048, - mask_ratio=0.75, - norm_pix_loss=False, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] 
= (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, **kwargs, + ): super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_hidden_size = decoder_hidden_size - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.decoder_intermediate_size = decoder_intermediate_size - self.mask_ratio = mask_ratio - self.norm_pix_loss = norm_pix_loss + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index f1d0c4135796..77556120bcb4 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,6 +3,12 @@ import requests import torch from PIL import Image +# from .configuration_hiera import HieraConfig +# from .hiera import Hiera +# from transformers import HieraConfig, Hiera +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -29,3 +35,209 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state + +class HieraImageProcessor: + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm + + + +def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): + pretrained_models_links = { + "hiera_tiny_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", + }, + "hiera_small_224": { + 
"mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", + }, + "hiera_base_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", + }, + "hiera_base_plus_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", + }, + "hiera_large_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", + }, + "hiera_huge_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", + }, + "hiera_base_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", + }, + "hiera_base_plus_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", + }, + "hiera_large_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", + }, + "hiera_huge_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", + } + } + + + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + checkpoints = pretrained_models_links["hiera_tiny_224"] + checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + checkpoints = pretrained_models_links["hiera_small_224"] + checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_224"] + checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_plus_224"] + checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + checkpoints = pretrained_models_links["hiera_large_224"] + checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + checkpoints = pretrained_models_links["hiera_huge_224"] + checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere + input_size=(16, 224, 224), + 
q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + checkpoints = pretrained_models_links["hiera_base_16x224"] + checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + checkpoints = pretrained_models_links["hiera_base_plus_16x224"] + checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + checkpoints = pretrained_models_links["hiera_large_16x224"] + checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + checkpoints = pretrained_models_links["hiera_huge_16x224"] + checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] + + + pretrained = True + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if config.num_classes is None: + config.num_classes = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif config.num_classes != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = Hiera(config) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + + + + url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" + + image = Image.open(requests.get(url, stream=True).raw) + + + image_processor = HieraImageProcessor(size=config.image_size) + inputs = image_processor.process_image(images=image, return_tensors="pt") + + # forward pass + out = model(inputs[None, ...]) + + # 207: golden retriever (imagenet-1k) + out.argmax(dim=-1).item() + + + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" + convert_Hiera_checkpoint(checkpoint_url, 
pytorch_dump_folder_path="~/") + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index fcb04f68934e..7e42d5914d44 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -21,7 +21,7 @@ import math from functools import partial from typing import List, Tuple, Callable, Optional - +from .configuration_hiera import HieraConfig import torch import torch.nn as nn import torch.nn.functional as F @@ -205,106 +205,85 @@ def forward( class Hiera(nn.Module): - def __init__( - self, - input_size: Tuple[int, ...] = (224, 224), - in_chans: int = 3, - embedding_dimention: int = 96, # initial embedding input_dim - number_of_heads: int = 1, # initial number of number_of_heads - num_classes: int = 1000, - stages: Tuple[int, ...] = (2, 3, 16, 3), - q_pool: int = 3, # number of q_pool stages - q_stride: Tuple[int, ...] = (2, 2), - mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) - # mask_unit_attn: which stages use mask unit attention? - mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), - dim_mul: float = 2.0, - head_mul: float = 2.0, - patch_kernel: Tuple[int, ...] = (7, 7), - patch_stride: Tuple[int, ...] = (4, 4), - patch_padding: Tuple[int, ...] = (3, 3), - mlp_ratio: float = 4.0, - drop_path_rate: float = 0.0, - norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), - head_dropout: float = 0.0, - head_init_scale: float = 0.001, - sep_position_embeddings: bool = False, - ): + def __init__(self, config: HieraConfig): super().__init__() - - depth = sum(stages) - self.patch_stride = patch_stride - self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + self.config = config + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed + self.config = config + depth = sum(self.config.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(mask_unit_size) - flat_q_stride = math.prod(q_stride) + flat_mu_size = math.prod(self.config.mask_unit_size) + flat_q_stride = math.prod(self.config.q_stride) - assert q_pool < len(stages) - self.q_pool, self.q_stride = q_pool, q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + assert self.config.q_pool < len(self.config.stages) + self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] self.patch_embedding = PatchEmbedding( - in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding + self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding ) - self.sep_position_embeddings = sep_position_embeddings - if sep_position_embeddings: + if self.config.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embedding_dimention, + self.config.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) + 
torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.config.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - input_size, - patch_stride, - [q_stride] * len(self.stage_ends[:-1]), + self.config.input_size, + self.config.patch_stride, + [self.config.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - q_pool, + self.config.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = embedding_dimention + output_dim = self.config.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = mask_unit_attn[cur_stage] + use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(embedding_dimention * dim_mul) - number_of_heads = int(number_of_heads * head_mul) + output_dim = int(self.config.embedding_dimension * self.config.dim_mul) + number_of_heads = int(self.config.number_of_heads * self.config.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride + else: + number_of_heads = self.config.number_of_heads block = HieraBlock( - input_dim=embedding_dimention, + input_dim=self.config.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, + mlp_ratio=self.config.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -312,21 +291,21 @@ def __init__( use_mask_unit_attention=use_mask_unit_attention, ) - embedding_dimention = output_dim + self.config.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(embedding_dimention) - self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(self.config.embedding_dimension) + self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) # Initialize everything - if sep_position_embeddings: + if self.config.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(head_init_scale) - self.head.projection.bias.data.mul_(head_init_scale) + self.head.projection.weight.data.mul_(self.config.head_init_scale) + self.head.projection.bias.data.mul_(self.config.head_init_scale) def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -339,7 +318,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_position_embeddings: + if 
self.config.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -371,7 +350,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.sep_position_embeddings: + if self.config.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -441,8 +420,9 @@ def forward( "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) +def hiera_tiny_224(**kwargs): + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) + return Hiera(config) @pretrained_model({ @@ -450,15 +430,16 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) +def hiera_base_224(**kwargs): + config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + return Hiera(config) @pretrained_model({ @@ -466,7 +447,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +455,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +463,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -511,7 +492,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +502,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return 
hiera_base_16x224( - embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +512,5 @@ def hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) From 46d495c59bf0dd21a48719c2c5097494d0250fc2 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 21:48:20 +0000 Subject: [PATCH 020/118] Fixed Convert function, added hiera to HF files, Initilized test files --- src/transformers/__init__.py | 7 + .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 3 + .../models/hiera/convert_hiera_to_pytorch.py | 56 ++-- src/transformers/models/hiera/hiera.py | 242 +++++++----------- .../models/hiera/hiera_image_processor.py | 56 ++++ tests/models/hiera/__init__.py | 0 tests/models/hiera/test_modeling_vit_mae.py | 44 ++++ 9 files changed, 226 insertions(+), 189 deletions(-) create mode 100644 src/transformers/models/hiera/hiera_image_processor.py create mode 100644 tests/models/hiera/__init__.py create mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 40c0a56362ac..69eb50a0ca37 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4148,6 +4148,13 @@ "TFGroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hiera", + + ] + ) _import_structure["models.hubert"].extend( [ "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ed75e74ebfce..28b8243dd9ef 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -117,7 +117,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig") + ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -353,7 +353,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera") + ("hiera","Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 05b519d2bcd1..1fa0c71b1537 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -115,6 +115,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", "GroupViTModel"), + ("hiera", "Hiera"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git 
a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 3ea6efb0056a..f88e32d03c98 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -47,6 +47,9 @@ HieraBlock, MaskUnitAttention, ) + from .hiera_image_processor import ( + HieraImageProcessor + ) else: import sys diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 77556120bcb4..d1b6e8a4ad30 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,8 +3,9 @@ import requests import torch from PIL import Image -# from .configuration_hiera import HieraConfig -# from .hiera import Hiera +from transformers.models.hiera.configuration_hiera import HieraConfig +from transformers.models.hiera.hiera import Hiera +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, Hiera from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -35,33 +36,8 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state - -class HieraImageProcessor: - def __init__(self, size): - self.size = size - self.transform_list = [ - transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) - ] - self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - - def process_image(self, image_url): - # Load the image - img = Image.open(requests.get(image_url, stream=True).raw) - - # Apply transformations - img_vis = self.transform_vis(img) - img_norm = self.transform_norm(img) - - return img_norm - - - -def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): + strict = True pretrained_models_links = { "hiera_tiny_224": { "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", @@ -121,9 +97,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(2, 3, 16, 3),) + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + checkpoints = pretrained_models_links["hiera_base_224"] checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] @@ -180,7 +155,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): stages=(2, 6, 36, 4) ) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] - + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") pretrained = True if pretrained: @@ -188,10 +164,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): raise RuntimeError("This model currently doesn't have pretrained weights available.") elif checkpoint is None: raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it @@ -202,24 +176,24 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config) + model = Hiera(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False - model.load_state_dict(state_dict["model_state"], strict=strict) + model.load_state_dict(state_dict["model_state"]) + # model.load_state_dict(state_dict["model_state"], strict=strict) url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - image = Image.open(requests.get(url, stream=True).raw) - image_processor = HieraImageProcessor(size=config.image_size) - inputs = image_processor.process_image(images=image, return_tensors="pt") + image_processor = HieraImageProcessor(size=224) + inputs = image_processor.process_image(image_url=url) # forward pass out = model(inputs[None, ...]) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7e42d5914d44..7bafed5c3cd0 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -25,11 +25,40 @@ import torch import torch.nn as nn import torch.nn.functional as F +from dataclasses import dataclass from timm.models.layers import DropPath, Mlp - -from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll - +from ...modeling_utils import PreTrainedModel +# from ...modeling_outputs import BaseModelOutput +# from ...utils import ( +# ModelOutput, +# add_start_docstrings, +# add_start_docstrings_to_model_forward, +# logging, +# replace_return_docstrings, +# ) + +from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +# @dataclass +# class HieraModelOutput(ModelOutput): +# """ +# Base class for Hiera model's outputs. + +# Args: +# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): +# Last layer hidden-states. +# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): +# Attentions weights from the model, one for each layer. +# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): +# Hidden states of the model at the output of each layer. +# intermediates (list[torch.Tensor], optional): +# Intermediate representations or features from the model, if applicable. 
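Taken together, the conversion entry point above boils down to: pick a HieraConfig for the checkpoint name, build the model from it, remap the released state dict with convert_state_dict, and push one processed image through the network. A condensed sketch of that flow, assuming the imports at the top of convert_hiera_to_pytorch.py and the hiera_base_224 fine-tuned checkpoint (illustrative only; it skips the strict/decoder handling):

    config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
    model = Hiera(config=config)

    state_dict = torch.hub.load_state_dict_from_url(
        "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", map_location="cpu"
    )
    state_dict["model_state"] = convert_state_dict(state_dict["model_state"], {})
    model.load_state_dict(state_dict["model_state"])

    image_processor = HieraImageProcessor(size=224)
    inputs = image_processor.process_image(image_url=url)  # url: the test image defined in the script above
    out = model(inputs[None, ...])  # process_image returns an unbatched tensor, so add the batch dim here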
+# """ +# last_hidden_state: torch.FloatTensor +# attentions: Optional[Tuple[torch.FloatTensor]] = None +# hidden_states: Optional[Tuple[torch.FloatTensor]] = None +# intermediates: Optional[list[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -204,86 +233,110 @@ def forward( return x -class Hiera(nn.Module): +class Hiera(PreTrainedModel): + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "x" + supports_gradient_checkpointing = True + def __init__(self, config: HieraConfig): - super().__init__() + self.input_size = config.input_size + self.in_chans = config.in_chans + self.embedding_dimension = config.embedding_dimension + self.number_of_heads = config.number_of_heads + self.num_classes = config.num_classes + self.stages = config.stages + self.q_pool = config.q_pool + self.q_stride = config.q_stride + self.mask_unit_size = config.mask_unit_size + self.mask_unit_attn = config.mask_unit_attn + self.dim_mul = config.dim_mul + self.head_mul = config.head_mul + self.patch_kernel = config.patch_kernel + self.patch_stride = config.patch_stride + self.patch_padding = config.patch_padding + self.mlp_ratio = config.mlp_ratio + self.drop_path_rate = config.drop_path_rate + self.head_dropout = config.head_dropout + self.head_init_scale = config.head_init_scale + self.sep_position_embeddings = config.sep_position_embeddings + + super().__init__(config) self.config = config - super().__init__() norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed - self.config = config - depth = sum(self.config.stages) - self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + depth = sum(self.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.input_size, self.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(self.config.mask_unit_size) - flat_q_stride = math.prod(self.config.q_stride) + flat_mu_size = math.prod(self.mask_unit_size) + flat_q_stride = math.prod(self.q_stride) - assert self.config.q_pool < len(self.config.stages) - self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size + assert self.q_pool < len(self.stages) + self.q_pool, self.q_stride = self.q_pool, self.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( - self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding + self.in_chans, self.embedding_dimension, self.patch_kernel, self.patch_stride, self.patch_padding ) - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - self.config.embedding_dimension, + self.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) + torch.zeros(1, self.tokens_spatial_shape[0], self.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, 
self.config.embedding_dimension)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) + self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - self.config.input_size, - self.config.patch_stride, - [self.config.q_stride] * len(self.stage_ends[:-1]), + self.input_size, + self.patch_stride, + [self.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - self.config.q_pool, + self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = self.config.embedding_dimension + output_dim = self.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] + use_mask_unit_attention = self.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(self.config.embedding_dimension * self.config.dim_mul) - number_of_heads = int(self.config.number_of_heads * self.config.head_mul) + output_dim = int(self.embedding_dimension * self.dim_mul) + number_of_heads = int(self.number_of_heads * self.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride else: - number_of_heads = self.config.number_of_heads + number_of_heads = self.number_of_heads block = HieraBlock( - input_dim=self.config.embedding_dimension, + input_dim=self.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=self.config.mlp_ratio, + mlp_ratio=self.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -291,21 +344,22 @@ def __init__(self, config: HieraConfig): use_mask_unit_attention=use_mask_unit_attention, ) - self.config.embedding_dimension = output_dim + self.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(self.config.embedding_dimension) - self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) + self.norm = norm_layer(self.embedding_dimension) + self.head = Head(self.embedding_dimension, self.num_classes, dropout_rate=self.head_dropout) # Initialize everything - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(self.config.head_init_scale) - self.head.projection.bias.data.mul_(self.config.head_init_scale) + self.head.projection.weight.data.mul_(self.head_init_scale) + self.head.projection.bias.data.mul_(self.head_init_scale) + self.post_init() def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -318,7 +372,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def 
no_weight_decay(self): - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -350,7 +404,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -411,106 +465,4 @@ def forward( if return_intermediates: return x, intermediates - return x - - -# Image models - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwargs): - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_small_224(**kwdargs): - return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwargs): - config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_large_224(**kwdargs): - return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) - - -# Video models - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_16x224(num_classes: int = 400, **kwdargs): - return Hiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", - "mae_k400": 
"https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_plus_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_large_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_huge_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return x \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py new file mode 100644 index 000000000000..4900e4a4d3fb --- /dev/null +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -0,0 +1,56 @@ + +"""Image processor class for Hirea.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_vision_available, logging +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from PIL import Image +import requests + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class HieraImageProcessor(BaseImageProcessor): + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm \ No newline at end of file diff --git a/tests/models/hiera/__init__.py b/tests/models/hiera/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py new file mode 100644 index 000000000000..014d41766a8e --- /dev/null +++ b/tests/models/hiera/test_modeling_vit_mae.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViTMAE model. """ + + +import math +import tempfile +import unittest + +import numpy as np + +from transformers import ViTMAEConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ViTMAEForPreTraining, ViTMAEModel + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTImageProcessor \ No newline at end of file From a25a3a7fc200913b64070a2781ce5e4ff7f87452 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 23:41:40 +0000 Subject: [PATCH 021/118] better naming for x in forward pass --- src/transformers/__init__.py | 4 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/hiera/__init__.py | 2 +- .../models/hiera/configuration_hiera.py | 8 +- .../models/hiera/convert_hiera_to_pytorch.py | 10 +- src/transformers/models/hiera/hiera.py | 163 ++++++++++-------- src/transformers/models/hiera/hiera_mae.py | 6 +- src/transformers/models/hiera/hiera_utils.py | 6 +- tests/models/hiera/test_modeling_hiera.py | 87 ++++++++++ tests/models/hiera/test_modeling_vit_mae.py | 44 ----- 11 files changed, 199 insertions(+), 135 deletions(-) create mode 100644 tests/models/hiera/test_modeling_hiera.py delete mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 69eb50a0ca37..9e3c4c5f7c96 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4151,7 +4151,7 @@ _import_structure["models.hiera"].extend( [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hiera", + "HieraModel", ] ) @@ -6993,7 +6993,7 @@ HubertPreTrainedModel, ) from .models.hiera import ( - Hiera, + HieraModel, HieraBlock ) from .models.ibert import ( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 28b8243dd9ef..796e524fd0cf 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera"), + ("hiera","HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1fa0c71b1537..0fc417e795e4 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -115,7 +115,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", 
"GroupViTModel"), - ("hiera", "Hiera"), + ("hiera", "HieraModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index f88e32d03c98..0434517bf52c 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -42,7 +42,7 @@ pass else: from .hiera import ( - Hiera, + HieraModel, Head, HieraBlock, MaskUnitAttention, diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index c7dfaeaeedfb..e3133354f6ea 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -13,8 +13,8 @@ class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the Hiera + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an HieraModel model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the HieraModel [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -46,13 +46,13 @@ class HieraConfig(PretrainedConfig): Example: ```python - >>> from transformers import HieraConfig, Hiera + >>> from transformers import HieraConfig, HieraModel >>> # Initializing a ViT MAE vit-mae-base style configuration >>> configuration = HieraConfig() >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = Hiera(configuration) + >>> model = HieraModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d1b6e8a4ad30..d0294f12deab 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,10 +3,10 @@ import requests import torch from PIL import Image -from transformers.models.hiera.configuration_hiera import HieraConfig -from transformers.models.hiera.hiera import Hiera -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, Hiera +# from transformers.models.hiera.configuration_hiera import HieraConfig +# from transformers.models.hiera.hiera import HieraModel +# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +# from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -176,7 +176,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config=config) + model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder 
model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7bafed5c3cd0..72917eb8e1a4 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional +from typing import List, Tuple, Callable, Optional, Union from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -29,36 +29,34 @@ from timm.models.layers import DropPath, Mlp from ...modeling_utils import PreTrainedModel -# from ...modeling_outputs import BaseModelOutput -# from ...utils import ( -# ModelOutput, -# add_start_docstrings, -# add_start_docstrings_to_model_forward, -# logging, -# replace_return_docstrings, -# ) +from ...modeling_outputs import BaseModelOutput +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll -# @dataclass -# class HieraModelOutput(ModelOutput): -# """ -# Base class for Hiera model's outputs. - -# Args: -# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): -# Last layer hidden-states. -# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): -# Attentions weights from the model, one for each layer. -# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): -# Hidden states of the model at the output of each layer. -# intermediates (list[torch.Tensor], optional): -# Intermediate representations or features from the model, if applicable. -# """ -# last_hidden_state: torch.FloatTensor -# attentions: Optional[Tuple[torch.FloatTensor]] = None -# hidden_states: Optional[Tuple[torch.FloatTensor]] = None -# intermediates: Optional[list[torch.Tensor]] = None +@dataclass +class HieraModelOutput(ModelOutput): + """ + Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. + + Args: + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + Last layer hidden-states. + attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): + Attentions weights from the model, one for each layer. + hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): + Hidden states of the model at the output of each layer. + intermediates (List[torch.Tensor], optional): + Intermediate representations or features from the model, if applicable. + """ + last_hidden_state: torch.FloatTensor + intermediates: Optional[List[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -102,15 +100,15 @@ def __init__( self.window_size = window_size self.use_mask_unit_attention = use_mask_unit_attention - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. 
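As a reading aid for the renamed forward pass, the shapes work out as follows when mask unit attention is enabled (descriptive only, derived from the reshape/permute in the body below):

    # embeddings:  [batch, tokens, input_dim]
    # qkv:         [3, batch, heads, windows, q_stride * window_size, head_dim],
    #              with windows = tokens // (q_stride * window_size)
    # q only:      max-pooled by q_stride inside each window when q_stride > 1
    # output:      [batch, tokens // q_stride, output_dim] after the final projection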
""" - batch_size , num_channels , _ = x.shape + batch_size , num_channels , _ = embeddings.shape num_windows = ( (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( - self.qkv(x) + self.qkv(embeddings) .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) @@ -126,15 +124,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if hasattr(F, "scaled_dot_product_attention"): # Note: the original paper did *not* use SDPA, it's a free boost! - x = F.scaled_dot_product_attention(q, k, v) + embeddings = F.scaled_dot_product_attention(q, k, v) else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - x = (attention @ v) + embeddings = (attention @ v) - x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) - x = self.projection(x) - return x + embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = self.projection(embeddings) + return embeddings class HieraBlock(nn.Module): @@ -168,16 +166,16 @@ def __init__( if input_dim != output_dim: self.projection = nn.Linear(input_dim, output_dim) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - normalized_input = self.norm1(x) + normalized_embeddings = self.norm1(embeddings) if self.input_dim != self.output_dim: - x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) - x = x + self.drop_path(self.attention(normalized_input)) + embeddings = do_pool(self.projection(normalized_embeddings), stride=self.attention.q_stride) + embeddings = embeddings + self.drop_path(self.attention(normalized_embeddings)) # MLP - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x + embeddings = embeddings + self.drop_path(self.mlp(self.norm2(embeddings))) + return embeddings class Head(nn.Module): @@ -226,17 +224,36 @@ def __init__( ) def forward( - self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.projection, mask) - x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) - return x + embeddings = do_masked_conv(pixel_values, self.projection, mask) + embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) + return embeddings + +class HireaModel(PreTrainedModel): + """ + Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + + This model is a PyTorch implementation of the Hiera architecture for image classification. + + The model can be used as follows: + + Args: + config (HieraConfig): Configuration class instance for `Hiera`. 
+ + Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """ -class Hiera(PreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" - main_input_name = "x" + main_input_name = "pixel_values" supports_gradient_checkpointing = True def __init__(self, config: HieraConfig): @@ -417,52 +434,56 @@ def get_position_embeddings(self) -> torch.Tensor: def forward( self, - x: torch.Tensor, + pixel_values: torch.Tensor, mask: torch.Tensor = None, + return_dict: Optional[bool] = True, return_intermediates: bool = False, - ) -> torch.Tensor: + ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. """ # Slowfast training passes in a list - if isinstance(x, list): - x = x[0] + if isinstance(pixel_values, list): + pixel_values = pixel_values[0] intermediates = [] - x = self.patch_embedding( - x, + pached_embeddings = self.patch_embedding( + pixel_values, mask=mask.view( - x.shape[0], 1, *self.mask_spatial_shape + pixel_values.shape[0], 1, *self.mask_spatial_shape ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_position_embeddings() - x = self.unroll(x) + embeddings = pached_embeddings + self.get_position_embeddings() + embeddings = self.unroll(embeddings) # Discard masked tokens if mask is not None: - x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( - x.shape[0], -1, x.shape[-1] + embeddings = embeddings[mask[..., None].tile(1, self.mu_size, embeddings.shape[2])].view( + embeddings.shape[0], -1, embeddings.shape[-1] ) - for i, blk in enumerate(self.blocks): - x = blk(x) + for i, block in enumerate(self.blocks): + embeddings = block(embeddings) if return_intermediates and i in self.stage_ends: - intermediates.append(self.reroll(x, i, mask=mask)) + intermediates.append(self.reroll(embeddings, i, mask=mask)) if mask is None: - x = x.mean(dim=1) - x = self.norm(x) - x = self.head(x) + embeddings = embeddings.mean(dim=1) + embeddings = self.norm(embeddings) + embeddings = self.head(embeddings) - # x may not always be in spatial order here. + # embeddings may not always be in spatial order here. # e.g. 
if q_pool = 2, mask_unit_size = (8, 8), and # q_stride = (2, 2), not all unrolls were consumed, - # intermediates[-1] is x in spatial order - if return_intermediates: - return x, intermediates - - return x \ No newline at end of file + # intermediates[-1] is embeddings in spatial order + if not return_dict: + return tuple(v for v in [embeddings, intermediates] if v is not None) + + return HieraModelOutput( + last_hidden_state=embeddings, + intermediates=intermediates if return_intermediates else None, + ) \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index a0504997350b..c45056318a38 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import Hiera, HieraBlock +from .hiera import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd @@ -36,8 +36,8 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: return x -class MaskedAutoencoderHiera(Hiera): - """Masked Autoencoder with Hiera backbone""" +class MaskedAutoencoderHiera(HieraModel): + """Masked Autoencoder with HieraModel backbone""" def __init__( self, diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index c96c63cbfaf9..a35b33210941 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. # -------------------------------------------------------- # -# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles # # Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, # Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, @@ -27,7 +27,7 @@ from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ def inner(model_func: Callable) -> Callable: def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: @@ -69,7 +69,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) """ return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py new file mode 100644 index 000000000000..8d593af2a622 --- /dev/null +++ b/tests/models/hiera/test_modeling_hiera.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
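With forward now returning a HieraModelOutput, downstream code reads named fields instead of unpacking positional values. A short usage sketch (it assumes a HieraModel built and loaded as in the conversion script and an already processed pixel_values tensor; intermediates is only populated when return_intermediates=True):

    outputs = model(pixel_values=pixel_values, return_dict=True, return_intermediates=True)
    hidden = outputs.last_hidden_state      # with mask=None this is mean-pooled, normed, and passed through the head
    stage_features = outputs.intermediates  # one re-rolled feature map per stage end

    # return_dict=False falls back to a plain tuple
    hidden, stage_features = model(pixel_values=pixel_values, return_dict=False, return_intermediates=True)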
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Hiera model. """ + +import unittest + +from transformers import HieraConfig +from transformers.testing_utils import ( + require_torch, + slow, + torch_device, +) +from transformers.utils import is_torch_available + +if is_torch_available(): + import torch + from transformers import HieraModel + # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model + from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + + +class HieraModelTester: + # Define this tester to initialize Hiera model and its configurations for testing + def __init__( + self, + parent, + batch_size=8, + num_channels=3, + image_size=224, + # Add other model-specific parameters here + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + # Initialize other necessary attributes here + + def prepare_config_and_inputs(self): + # Prepare configuration and inputs for testing your model + pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return HieraConfig( + # Define necessary configuration parameters here + ) + + def create_and_check_model(self, config, pixel_values): + model = HieraModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values=pixel_values) + # Perform checks here, e.g., output shapes, etc. + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + +@require_torch +class HieraModelTest(unittest.TestCase): + + def setUp(self): + self.model_tester = HieraModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = HieraModel.from_pretrained(model_name) + self.assertIsNotNone(model) \ No newline at end of file diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py deleted file mode 100644 index 014d41766a8e..000000000000 --- a/tests/models/hiera/test_modeling_vit_mae.py +++ /dev/null @@ -1,44 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ViTMAE model. 
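The new tester above still has a stub get_config, and its shape assertion references num_attention_heads, seq_length and hidden_size, none of which the tester defines. One way to make it self-consistent is a deliberately small configuration plus an assertion derived from the model's actual behavior: with no mask, forward mean-pools the tokens and applies the classification head, so last_hidden_state collapses to (batch_size, num_classes). A sketch (parameter names are ones HieraConfig already accepts elsewhere in this patch; the values are placeholders, not the pretrained defaults):

    def get_config(self):
        return HieraConfig(
            embedding_dimension=32,
            number_of_heads=1,
            stages=(1, 1, 1, 1),
            num_classes=10,
        )

    def create_and_check_model(self, config, pixel_values):
        model = HieraModel(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values=pixel_values)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, config.num_classes))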
""" - - -import math -import tempfile -import unittest - -import numpy as np - -from transformers import ViTMAEConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.utils import cached_property, is_torch_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_torch_available(): - import torch - from torch import nn - - from transformers import ViTMAEForPreTraining, ViTMAEModel - from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST - - -if is_vision_available(): - from PIL import Image - - from transformers import ViTImageProcessor \ No newline at end of file From 51d11f554c2cdc86c6a52319dead96c811f7174a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 17 Feb 2024 00:10:52 +0000 Subject: [PATCH 022/118] Moved utils to hiera --- src/transformers/models/hiera/hiera.py | 226 ++++++++++++++++++++++++- 1 file changed, 223 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 72917eb8e1a4..cca502aa80c9 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional, Union +from typing import List, Tuple, Callable, Optional, Union, Type from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -38,7 +38,227 @@ replace_return_docstrings, ) -from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. + E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. 
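The Unroll docstring above is the crux of how pooling is implemented: once the stride positions are contiguous and moved to the front, an N-d max-pool reduces to a reshape plus a max over one dimension, which is exactly what do_pool does. A small self-contained check of that equivalence for a 2-d input and stride (2, 2) (it inlines the unroll instead of calling the module):

    import torch
    import torch.nn.functional as F

    B, H, W, C = 1, 4, 4, 1
    x = torch.arange(B * H * W * C, dtype=torch.float).reshape(B, H, W, C)

    # unroll with stride (2, 2): [B, H, W, C] -> [B, Sy, Sx, H//2, W//2, C] -> [B, N, C]
    unrolled = x.view(B, H // 2, 2, W // 2, 2, C).permute(0, 2, 4, 1, 3, 5).reshape(B, -1, C)

    # do_pool: take the max over the Sy*Sx positions that were moved up front
    pooled = unrolled.view(B, 4, -1, C).max(dim=1).values

    # identical to an ordinary 2x2 max-pool applied in spatial order
    reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=2).permute(0, 2, 3, 1).reshape(B, -1, C)
    assert torch.allclose(pooled, reference)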
+ self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x + @dataclass class HieraModelOutput(ModelOutput): @@ -231,7 +451,7 @@ def forward( return embeddings -class HireaModel(PreTrainedModel): +class HieraModel(PreTrainedModel): """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. From ea872fe2f81935e5cf5fdfd086fb3b29fb39f4b9 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 00:17:14 +0000 Subject: [PATCH 023/118] Change hiera -> hiera_model --- src/transformers/models/hiera/__init__.py | 89 +----- src/transformers/models/hiera/benchmarking.py | 77 ----- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera.py => hiera_model.py} | 0 src/transformers/models/hiera/hiera_utils.py | 287 ------------------ 5 files changed, 3 insertions(+), 452 deletions(-) delete mode 100644 src/transformers/models/hiera/benchmarking.py rename src/transformers/models/hiera/{hiera.py => hiera_model.py} (100%) delete mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0434517bf52c..1f388d5361ab 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -41,7 +41,7 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera import ( + from .hiera_model import ( HieraModel, Head, HieraBlock, @@ -54,89 +54,4 @@ else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) - -####### PREV: - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# from typing import TYPE_CHECKING - -# from ...utils import ( -# OptionalDependencyNotAvailable, -# _LazyModule, -# is_flax_available, -# is_tf_available, -# is_torch_available, -# ) - - -# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_vit_mae"] = [ -# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", -# "ViTMAEForPreTraining", -# "ViTMAELayer", -# "ViTMAEModel", -# "ViTMAEPreTrainedModel", -# ] - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_tf_vit_mae"] = [ -# "TFViTMAEForPreTraining", -# "TFViTMAEModel", -# "TFViTMAEPreTrainedModel", -# ] - -# if TYPE_CHECKING: -# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_vit_mae import ( -# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, -# ViTMAEForPreTraining, -# ViTMAELayer, -# ViTMAEModel, -# ViTMAEPreTrainedModel, -# ) - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - -# else: -# import sys - -# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py deleted file mode 100644 index 33166028977a..000000000000 --- a/src/transformers/models/hiera/benchmarking.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- - -import time -from typing import List, Tuple, Union - -import torch -from tqdm import tqdm - -# From https://github.com/facebookresearch/ToMe/ -def benchmark( - model: torch.nn.Module, - device: torch.device = 0, - input_size: Tuple[int] = (3, 224, 224), - batch_size: int = 64, - runs: int = 40, - throw_out: float = 0.25, - use_fp16: bool = False, - verbose: bool = False, -) -> float: - """ - Benchmark the given model with random inputs at the given batch size. 
- - Args: - - model: the module to benchmark - - device: the device to use for benchmarking - - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) - - batch_size: the batch size to use for evaluation - - runs: the number of total runs to do - - throw_out: the percentage of runs to throw out at the start of testing - - use_fp16: whether or not to benchmark with float16 and autocast - - verbose: whether or not to use tqdm to print progress / print throughput at end - - Returns: - - the throughput measured in images / second - """ - if not isinstance(device, torch.device): - device = torch.device(device) - is_cuda = torch.device(device).type == "cuda" - - model = model.eval().to(device) - input = torch.rand(batch_size, *input_size, device=device) - if use_fp16: - input = input.half() - - warm_up = int(runs * throw_out) - total = 0 - start = time.time() - - with torch.autocast(device.type, enabled=use_fp16): - with torch.no_grad(): - for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): - if i == warm_up: - if is_cuda: - torch.cuda.synchronize() - total = 0 - start = time.time() - - model(input) - total += batch_size - - if is_cuda: - torch.cuda.synchronize() - - end = time.time() - elapsed = end - start - - throughput = total / elapsed - - if verbose: - print(f"Throughput: {throughput:.2f} im/s") - - return throughput diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index c45056318a38..f0e2e7854bff 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import HieraModel, HieraBlock +from .hiera_model import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera_model.py similarity index 100% rename from src/transformers/models/hiera/hiera.py rename to src/transformers/models/hiera/hiera_model.py diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py deleted file mode 100644 index a35b33210941..000000000000 --- a/src/transformers/models/hiera/hiera_utils.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- -# -# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles -# -# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, -# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, -# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. -# -# Paper: https://arxiv.org/abs/2306.00989/ -# -# References: -# slowfast: https://github.com/facebookresearch/SlowFast -# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm -# -------------------------------------------------------- - -import math -from typing import List, Tuple, Optional, Type, Callable, Dict - -import torch -import torch.nn as nn -import torch.nn.functional as F -from .convert_hiera_to_pytorch import convert_state_dict - -def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
""" - - def inner(model_func: Callable) -> Callable: - def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: - if pretrained: - if checkpoints is None: - raise RuntimeError("This model currently doesn't have pretrained weights available.") - elif checkpoint is None: - raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) - if "head.projection.weight" in state_dict["model_state"]: - # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it - if "num_classes" not in kwdargs: - kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] - # If the user specified a different number of classes, remove the projection weights or else we'll error out - elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: - del state_dict["model_state"]["head.projection.weight"] - del state_dict["model_state"]["head.projection.bias"] - - model = model_func(**kwdargs) - if pretrained: - # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): - strict = False - - model.load_state_dict(state_dict["model_state"], strict=strict) - - return model - - return model_def - - return inner - - - -def conv_nd(n: int) -> Type[nn.Module]: - """ - Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) - """ - return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] - - -def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: - # Refer to `Unroll` to see how this performs a maxpool-Nd - return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values - - -def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: - # target_size: [(T), (H), W] - # (spatial) mask: [B, C, (t), (h), w] - if mask is None: - return mask - - assert len(mask.shape[2:]) == len(target_size) - if mask.shape[2:] != target_size: - return F.interpolate(mask.float(), size=target_size) - return mask - - -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: - """Zero-out the masked regions of the input before conv. - Prevents leakage of masked regions when using overlapping kernels. - """ - if conv is None: - return x - if mask is None: - return conv(x) - - mask = get_resized_mask(target_size=x.shape[2:], mask=mask) - return conv(x * mask.bool()) - - -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: - """ - Restore spatial organization by undoing windowed organization of mask units. - - Args: - x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] - shape: current spatial shape, if it were not organized into mask unit - windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. - mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] - Returns: - x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] - """ - D = len(shape) - B, C = x.shape[0], x.shape[-1] - # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] - num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] - x = x.view(B, *num_MUs, *mu_shape, C) - - # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] - permute = ( - [0] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], - [], - ) - + [len(x.shape) - 1] - ) - x = x.permute(permute).reshape(B, *shape, C) - - return x - - - -class Unroll(nn.Module): - """ - Reorders the tokens such that patches are contiguous in memory. - E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as - [B, (Sy, Sx, H // Sy, W // Sx), C] - - This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). - Not only is this faster, but it also makes it easy to support inputs of arbitrary - dimensions in addition to patch-wise sparsity. - - Performing this operation multiple times in sequence puts entire windows as contiguous - in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of - size 8x8 would be contiguous in memory, allowing operations like mask unit attention - computed easily and efficiently, while also allowing max to be applied sequentially. - - Note: This means that intermediate values of the model are not in HxW order, so they - need to be re-rolled if you want to use the intermediate values as a HxW feature map. - The last block of the network is fine though, since by then the strides are all consumed. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - self.schedule = unroll_schedule - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Input: Flattened patch embeddings [B, N, C] - Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd - """ - B, _, C = x.shape - - cur_size = self.size - x = x.view(*([B] + cur_size + [C])) - - for strides in self.schedule: - # Move patches with the given strides to the batch dimension - - # Create a view of the tensor with the patch stride as separate dims - # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] - cur_size = [i // s for i, s in zip(cur_size, strides)] - new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] - x = x.view(new_shape) - - # Move the patch stride into the batch dimension - # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] - L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) - x = x.permute(permute) - - # Now finally flatten the relevant dims into the batch dimension - x = x.flatten(0, len(strides)) - B *= math.prod(strides) - - x = x.reshape(-1, math.prod(self.size), C) - return x - - -class Reroll(nn.Module): - """ - Undos the "unroll" operation so that you can use intermediate features. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - stage_ends: List[int], - q_pool: int, - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - - # The first stage has to reverse everything - # The next stage has to reverse all but the first unroll, etc. 
- self.schedule = {} - size = self.size - for i in range(stage_ends[-1] + 1): - self.schedule[i] = unroll_schedule, size - # schedule unchanged if no pooling at a stage end - if i in stage_ends[:q_pool]: - if len(unroll_schedule) > 0: - size = [n // s for n, s in zip(size, unroll_schedule[0])] - unroll_schedule = unroll_schedule[1:] - - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: - """ - Roll the given tensor back up to spatial order assuming it's from the given block. - - If no mask is provided: - - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. - If a mask is provided: - - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. - """ - schedule, size = self.schedule[block_idx] - B, N, C = x.shape - - D = len(size) - cur_mu_shape = [1] * D - - for strides in schedule: - # Extract the current patch from N - x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) - - # Move that patch into the current MU - # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] - L = len(x.shape) - permute = ( - [0, 1 + D] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], - [], - ) - + [L - 1] - ) - x = x.permute(permute) - - # Reshape to [B, N//(Sy*Sx), *MU, C] - for i in range(D): - cur_mu_shape[i] *= strides[i] - x = x.reshape(B, -1, *cur_mu_shape, C) - N = x.shape[1] - - # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) - x = x.view(B, N, *cur_mu_shape, C) - - # If masked, return [B, #MUs, MUy, MUx, C] - if mask is not None: - return x - - # If not masked, we can return [B, H, W, C] - x = undo_windowing(x, size, cur_mu_shape) - - return x \ No newline at end of file From fa570f307c814942366dce660ba0407192c14b22 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:10:17 +0000 Subject: [PATCH 024/118] Fixed integration into tranformers --- src/transformers/__init__.py | 2 +- src/transformers/models/hiera/__init__.py | 13 ++++++++----- .../models/hiera/hiera_image_processor.py | 2 +- src/transformers/models/hiera/hiera_model.py | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9e3c4c5f7c96..51771d7f2229 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6993,8 +6993,8 @@ HubertPreTrainedModel, ) from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - HieraBlock ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 1f388d5361ab..2b83a4c8d693 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -9,8 +9,8 @@ _import_structure = { "configuration_hiera": [ - "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", - "HireaConfig", + "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HieraConfig", ], } @@ -20,15 +20,16 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hirea"] = [ - "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hirea", + _import_structure["hiera_model"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", "Head", "HieraBlock", "MaskUnitAttention" "" ] + if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -42,10 +43,12 @@ pass else: from .hiera_model import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, Head, HieraBlock, MaskUnitAttention, + ) from .hiera_image_processor import ( 
HieraImageProcessor diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4900e4a4d3fb..d3f2ce96a64b 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,5 +1,5 @@ -"""Image processor class for Hirea.""" +"""Image processor class for Hiera.""" from typing import Dict, List, Optional, Union diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index cca502aa80c9..5e7493e3c6a7 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -38,6 +38,9 @@ replace_return_docstrings, ) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "", +] def conv_nd(n: int) -> Type[nn.Module]: """ From 7e41f4998e22df8d00c8fc6d378272b6569c5a8e Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:23:55 +0000 Subject: [PATCH 025/118] Fix: Convert Checkpoint --- .../models/hiera/convert_hiera_to_pytorch.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d0294f12deab..76c86bcb0cbb 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,9 +3,9 @@ import requests import torch from PIL import Image -# from transformers.models.hiera.configuration_hiera import HieraConfig -# from transformers.models.hiera.hiera import HieraModel -# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +from transformers import HieraConfig +from transformers import HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -199,11 +199,13 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...]) # 207: golden retriever (imagenet-1k) - out.argmax(dim=-1).item() + out.last_hidden_state.argmax(dim=-1).item() + # If you also want intermediate feature maps + out = model(inputs[None, ...], return_intermediates=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) + for x in out.intermediates: + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) From f47d06a960c74926ad002578edcd48a64af3fba3 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:38:00 +0000 Subject: [PATCH 026/118] added documentation for hiera --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 ++ docs/source/en/index.md | 1 + 14 files changed, 15 insertions(+) diff --git a/README.md b/README.md index 54e228a11502..b6ec0f083527 100644 --- a/README.md +++ b/README.md @@ -390,6 +390,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_de.md b/README_de.md index 71ff7ce4aa33..b98c4c08113c 100644 --- a/README_de.md +++ b/README_de.md @@ -385,6 +385,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_es.md b/README_es.md index b3c6845000d2..e5c596e70634 100644 --- a/README_es.md +++ b/README_es.md @@ -363,6 +363,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_fr.md b/README_fr.md index 4b87eba5bbe1..53d8612c8b94 100644 --- a/README_fr.md +++ b/README_fr.md @@ -384,6 +384,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Les Transformers sont-ils vraiment inefficaces pour la représentation graphique ?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT : la segmentation sémantique émerge de la supervision textuelle](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ : référentiel complet pour la compréhension du langage polonais](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (de Facebook) publié avec l'article [Hiera : un transformateur de vision hiérarchique sans cloches et sifflets]( https://arxiv.org/abs/2306.00989) par Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT : Apprentissage de la représentation autonome de la parole par prédiction masquée des unités cachées](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT : Quantification entière de BERT avec des entiers uniquement](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. 
**[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS : Un ensemble de données filtré à l'échelle du Web d'intercalation de documents texte-image](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_hd.md b/README_hd.md index e68d9d39ba62..1dd181b01b34 100644 --- a/README_hd.md +++ b/README_hd.md @@ -337,6 +337,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** ((फेसबुक से) पेपर के साथ जारी किया गया [हिरा: बेल्स-एंड-व्हिसल्स के बिना एक पदानुक्रमित विजन ट्रांसफार्मर](https://arxiv.org/abs/2306.00989) by चैतन्य रयाली, युआन-टिंग हू, डैनियल बोल्या, चेन वेई, हाओकी फैन, पो-याओ हुआंग, वैभव अग्रवाल, अर्कबंधु चौधरी, ओमिद पौरसीद, जूडी हॉफमैन, जितेंद्र मलिक, द्वारा यांगहाओ ली, क्रिस्टोफ़ फ़िचटेनहोफ़र 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ja.md b/README_ja.md index d314b07140f5..c2103ac6a2b3 100644 --- a/README_ja.md +++ b/README_ja.md @@ -397,6 +397,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook から) Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer から公開された研究論文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ko.md b/README_ko.md index f8679087ad17..bd781f6adf6d 100644 --- a/README_ko.md +++ b/README_ko.md @@ -312,6 +312,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. +1. 
**[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook 에서) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_pt-br.md b/README_pt-br.md index 684d96366aaf..65ff9fdc0f97 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -390,6 +390,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. 
**[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ru.md b/README_ru.md index e552b5cd4f90..4b01b6cf8060 100644 --- a/README_ru.md +++ b/README_ru.md @@ -381,6 +381,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_te.md b/README_te.md index 8da790e18204..3e69e473862e 100644 --- a/README_te.md +++ b/README_te.md @@ -383,6 +383,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hans.md b/README_zh-hans.md index 1832870d52ff..fc616751ee0e 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -336,6 +336,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (来自 Facebook) 伴随论文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 由 Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hant.md b/README_zh-hant.md index 2bf31890f359..5adb28a3070b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -348,6 +348,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index ff6e91dbcf25..4b59e76f5490 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -628,6 +628,8 @@
         title: CLAP
       - local: model_doc/encodec
         title: EnCodec
+      - local: model_doc/hiera
+        title: Hiera
       - local: model_doc/hubert
         title: Hubert
       - local: model_doc/mctct
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 34995edec39c..b26c9f91360c 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -155,6 +155,7 @@ Flax), PyTorch, and/or TensorFlow.
 | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ |
 | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ |
 | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ |
+| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ |
 | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ |
 | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ |
 | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ |

From 9d249e0933d54a255d7baf247762dfbb9b35dd38 Mon Sep 17 00:00:00 2001
From: Naman Garg
Date: Sat, 17 Feb 2024 07:38:31 +0000
Subject: [PATCH 027/118] added documentation for hiera

---
 docs/source/en/model_doc/hiera.md | 40 +++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 docs/source/en/model_doc/hiera.md

diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md
new file mode 100644
index 000000000000..1c46bae9b072
--- /dev/null
+++ b/docs/source/en/model_doc/hiera.md
@@ -0,0 +1,40 @@
+
+
+# Hiera
+
+## Overview
+
+Hiera was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer.
+
+The abstract from the paper is the following:
+
+Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy.
In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera.
+
+## HieraConfig
+
+[[autodoc]] HieraConfig
+
+
+
+
+## HieraModel
+
+[[autodoc]] HieraModel
+    - forward
+
+
+
\ No newline at end of file

From 7cff18690863c605e1ac180005ca2fff0dfb2050 Mon Sep 17 00:00:00 2001
From: Naman Garg
Date: Sat, 17 Feb 2024 07:39:18 +0000
Subject: [PATCH 028/118] added Docstrings to models, Transformers based changes

---
 src/transformers/__init__.py                  |   2 +
 .../models/auto/image_processing_auto.py      |   1 +
 src/transformers/models/hiera/__init__.py     |  24 ++--
 .../models/hiera/configuration_hiera.py       |  15 +++
 .../models/hiera/convert_hiera_to_pytorch.py  |  15 +++
 .../models/hiera/hiera_image_processor.py     |  14 +++
 src/transformers/models/hiera/hiera_mae.py    | 113 +-----------------
 src/transformers/models/hiera/hiera_model.py  |  89 +++++++++-----
 8 files changed, 124 insertions(+), 149 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 51771d7f2229..2e727a215038 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -4152,6 +4152,7 @@
         [
             "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST",
             "HieraModel",
+            "HieraPreTrainedModel"
         ]
     )

@@ -6995,6 +6996,7 @@
     from .models.hiera import (
         HIERA_PRETRAINED_MODEL_ARCHIVE_LIST,
         HieraModel,
+        HieraPreTrainedModel
     )
     from .models.ibert import (
         IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index aef894a425ba..5261753d202d 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -69,6 +69,7 @@
         ("git", "CLIPImageProcessor"),
         ("glpn", "GLPNImageProcessor"),
         ("groupvit", "CLIPImageProcessor"),
+        ("hiera", "HieraImageProcessor"),
         ("idefics", "IdeficsImageProcessor"),
         ("imagegpt", "ImageGPTImageProcessor"),
         ("instructblip", "BlipImageProcessor"),
diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py
index 2b83a4c8d693..0787bffe767e 100644
--- a/src/transformers/models/hiera/__init__.py
+++ b/src/transformers/models/hiera/__init__.py
@@ -1,3 +1,18 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
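# A minimal usage sketch for the Hiera classes wired up in this patch (see also the
# model doc above). The import surface (HieraConfig, HieraModel) and the forward-pass
# keywords mirror what convert_hiera_to_pytorch.py exercises; the config-based
# construction and the default configuration values are assumptions, not a fixed API.
import torch

from transformers import HieraConfig, HieraModel

config = HieraConfig()  # assumed: defaults describe a base-sized image model
model = HieraModel(config)  # assumed: standard config-based PreTrainedModel constructor
model.eval()

pixel_values = torch.randn(1, 3, 224, 224)  # one 224x224 RGB image
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)

# Intermediate feature maps, as exercised in the conversion script:
with torch.no_grad():
    outputs = model(pixel_values, return_intermediates=True)
for feature_map in outputs.intermediates:
    print(feature_map.shape)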
+ from typing import TYPE_CHECKING from ...utils import ( @@ -23,9 +38,7 @@ _import_structure["hiera_model"] = [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", - "Head", - "HieraBlock", - "MaskUnitAttention" + "HieraPreTrainedModel" "" ] @@ -45,10 +58,7 @@ from .hiera_model import ( HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - Head, - HieraBlock, - MaskUnitAttention, - + HieraPreTrainedModel ) from .hiera_image_processor import ( HieraImageProcessor diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index e3133354f6ea..a4ab4fd9d30b 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -1,5 +1,20 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ hiera model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging from typing import Tuple diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 76c86bcb0cbb..5ca2ecd262d9 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import requests diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index d3f2ce96a64b..4e41e14bc6f8 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,3 +1,17 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Image processor class for Hiera.""" diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index f0e2e7854bff..d4ec15058b2d 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,8 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock -from .hiera_utils import pretrained_model, undo_windowing, conv_nd +from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: @@ -287,112 +286,4 @@ def forward( ) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask - - - - -# Image Models - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k") -def mae_hiera_tiny_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k") -def mae_hiera_small_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k") -def mae_hiera_base_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k") -def mae_hiera_base_plus_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k") -def mae_hiera_large_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k") -def mae_hiera_huge_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - - -# Video Models - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400") -def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): - return MaskedAutoencoderHiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_pos_embed=True, - q_pool=2, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_base_plus_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_large_16x224(**kwdargs): - return mae_hiera_base_16x224( - 
embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400") -def mae_hiera_huge_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index 5e7493e3c6a7..b1ed0db0e4b9 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -271,10 +271,6 @@ class HieraModelOutput(ModelOutput): Args: last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): - Attentions weights from the model, one for each layer. - hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): - Hidden states of the model at the output of each layer. intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ @@ -422,10 +418,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x - +@add_start_docstrings(""" +Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). +""") class PatchEmbedding(nn.Module): - """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" - def __init__( self, dim_in: int, @@ -453,27 +449,49 @@ def forward( embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings - -class HieraModel(PreTrainedModel): +class HieraPreTrainedModel(PreTrainedModel): """ - Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True - This model is a PyTorch implementation of the Hiera architecture for image classification. + def _init_weights(self, module, init_bias=0.02): + if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(module.weight, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + nn.init.constant_(module.bias, init_bias) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, init_bias) + nn.init.constant_(module.weight, 1.0) - The model can be used as follows: - Args: - config (HieraConfig): Configuration class instance for `Hiera`. - Example usage: - >>> from your_model_file import Hiera, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - >>> model = Hiera(config) - >>> inputs = torch.rand((1, 3, 224, 224)) - >>> outputs = model(inputs) - """ +@add_start_docstrings(""" +Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + +This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. 
+ +The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. +Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + +Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """) +class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -601,14 +619,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - def _init_weights(self, m, init_bias=0.02): - if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): - nn.init.trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, init_bias) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, init_bias) - nn.init.constant_(m.weight, 1.0) @torch.jit.ignore def no_weight_decay(self): @@ -655,6 +665,25 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings + @add_start_docstrings_to_model_forward(""" + The forward pass for the Hiera model. + + Args: + pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. + + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. + + + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. + + return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + + + + """) + @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") def forward( self, pixel_values: torch.Tensor, @@ -663,8 +692,6 @@ def forward( return_intermediates: bool = False, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ - mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. - Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list if isinstance(pixel_values, list): From c4a4168783bbd9f38b08e30935141e78a282d91f Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:55:51 +0000 Subject: [PATCH 029/118] make style and quality --- src/transformers/__init__.py | 15 +-- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/hiera/__init__.py | 19 +--- .../models/hiera/configuration_hiera.py | 18 ++-- .../models/hiera/convert_hiera_to_pytorch.py | 102 +++++++++--------- .../models/hiera/hiera_image_processor.py | 51 ++++----- src/transformers/models/hiera/hiera_mae.py | 54 +++------- 7 files changed, 104 insertions(+), 161 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2e727a215038..4d7ef6ce20d3 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -497,7 +497,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], - "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], + "models.hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -4149,12 +4149,7 @@ ] ) _import_structure["models.hiera"].extend( - [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - - ] + ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] ) _import_structure["models.hubert"].extend( [ @@ -6986,6 +6981,7 @@ GroupViTTextModel, GroupViTVisionModel, ) + from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, @@ -6993,11 +6989,6 @@ HubertModel, HubertPreTrainedModel, ) - from .models.hiera import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 796e524fd0cf..6f824a2e955d 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -116,8 +116,8 @@ ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), + ("hiera", "HieraConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -352,8 +352,8 @@ ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera", "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","HieraModel"), + ("hiera", "HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0787bffe767e..fcffbbf7593e 100644 --- a/src/transformers/models/hiera/__init__.py +++ 
b/src/transformers/models/hiera/__init__.py @@ -35,12 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - "" - ] + _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] if TYPE_CHECKING: @@ -55,16 +50,10 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera_model import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) - from .hiera_image_processor import ( - HieraImageProcessor - ) + from .hiera_image_processor import HieraImageProcessor + from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index a4ab4fd9d30b..8d40e7a72777 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -15,15 +15,15 @@ """ hiera model configuration""" +from typing import Tuple + from ...configuration_utils import PretrainedConfig from ...utils import logging -from typing import Tuple -logger = logging.get_logger(__name__) -HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { +logger = logging.get_logger(__name__) -} +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class HieraConfig(PretrainedConfig): @@ -42,7 +42,7 @@ class HieraConfig(PretrainedConfig): embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. num_classes (int, optional): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. @@ -58,7 +58,7 @@ class HieraConfig(PretrainedConfig): head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. - + Example: ```python >>> from transformers import HieraConfig, HieraModel @@ -72,9 +72,10 @@ class HieraConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ``` - """ + """ model_type = "hiera" + def __init__( self, input_size: Tuple[int, ...] 
= (224, 224), @@ -99,7 +100,6 @@ def __init__( head_init_scale: float = 0.001, sep_position_embeddings: bool = False, **kwargs, - ): super().__init__(**kwargs) self.input_size = input_size @@ -121,4 +121,4 @@ def __init__( self.drop_path_rate = drop_path_rate self.head_dropout = head_dropout self.head_init_scale = head_init_scale - self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file + self.sep_position_embeddings = sep_position_embeddings diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 5ca2ecd262d9..794a62147d78 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -15,17 +15,11 @@ import argparse -import requests import torch -from PIL import Image -from transformers import HieraConfig -from transformers import HieraModel -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, HieraModel -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +# from transformers import HieraConfig, HieraModel +from transformers import HieraConfig, HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor def rename_key(name): @@ -51,7 +45,7 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state -def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs): strict = True pretrained_models_links = { "hiera_tiny_224": { @@ -93,21 +87,24 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): "hiera_huge_16x224": { "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", - } + }, } - if "hiera_tiny_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 7, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2), + ) checkpoints = pretrained_models_links["hiera_tiny_224"] checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] elif "hiera_small_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 11, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2), + ) checkpoints = pretrained_models_links["hiera_small_224"] checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] @@ -118,56 +115,57 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] elif "hiera_base_plus_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3),) + config = HieraConfig( + embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3), + ) checkpoints = pretrained_models_links["hiera_base_plus_224"] checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] elif "hiera_large_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4),) + config = HieraConfig( + embedding_dimension=144, + 
number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_224"] checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] elif "hiera_huge_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4)) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_224"] checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] elif "hiera_base_16x224" in checkpoint_url: - config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True,) + config = HieraConfig( + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True, + ) checkpoints = pretrained_models_links["hiera_base_16x224"] checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] elif "hiera_base_plus_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3)) + config = HieraConfig(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3)) checkpoints = pretrained_models_links["hiera_base_plus_16x224"] checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] elif "hiera_large_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4), ) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_16x224"] checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] elif "hiera_huge_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4) ) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] elif checkpoint not in checkpoints: @@ -181,7 +179,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): raise RuntimeError("No checkpoint specified.") state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"], {}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if config.num_classes is None: @@ -194,19 +192,16 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr( + model, "decoder_position_embeddings" + ): strict = False - model.load_state_dict(state_dict["model_state"]) + model.load_state_dict(state_dict["model_state"], 
strict) # model.load_state_dict(state_dict["model_state"], strict=strict) - - - url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - - image_processor = HieraImageProcessor(size=224) inputs = image_processor.process_image(image_url=url) @@ -220,7 +215,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...], return_intermediates=True) for x in out.intermediates: - print(x.shape) + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) @@ -231,4 +226,3 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="~/") - diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4e41e14bc6f8..0200687c4835 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -15,32 +15,18 @@ """Image processor class for Hiera.""" -from typing import Dict, List, Optional, Union -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import rescale, resize, to_channel_dimension_format -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from ...utils import TensorType, is_vision_available, logging -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from PIL import Image import requests +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +from ...image_processing_utils import BaseImageProcessor +from ...utils import is_vision_available, logging if is_vision_available(): - import PIL + from PIL import Image + from torchvision import transforms + from torchvision.transforms.functional import InterpolationMode logger = logging.get_logger(__name__) @@ -51,20 +37,23 @@ def __init__(self, size): self.size = size self.transform_list = [ transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) + transforms.CenterCrop(self.size), ] self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - + self.transform_norm = transforms.Compose( + self.transform_list + + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ] + ) + def process_image(self, image_url): # Load the image img = Image.open(requests.get(image_url, stream=True).raw) - + # Apply transformations - img_vis = self.transform_vis(img) + # img_vis = self.transform_vis(img) img_norm = self.transform_norm(img) - - return img_norm \ No newline at end of file + + return img_norm diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index d4ec15058b2d..56b91bc7acb7 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ 
-10,28 +10,28 @@ # -------------------------------------------------------- +import math from functools import partial -from typing import Tuple, Optional +from typing import Optional, Tuple -import math import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd +from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - batch_size , num_mask_units = x.shape[0:2] + batch_size, num_mask_units = x.shape[0:2] # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size, num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -64,8 +64,7 @@ def __init__( i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) ] self.tokens_spatial_shape_final = [ - i // s ** (self.q_pool) - for i, s in zip(self.tokens_spatial_shape, self.q_stride) + i // s ** (self.q_pool) for i, s in zip(self.tokens_spatial_shape, self.q_stride) ] # -------------------------------------------------------------------------- # Multi-scale fusion heads @@ -73,9 +72,7 @@ def __init__( self.multi_scale_fusion_heads = nn.ModuleList() for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool - kernel = [ - i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) - ] + kernel = [i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final)] curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] self.multi_scale_fusion_heads.append( conv_nd(len(self.q_stride))( @@ -94,9 +91,7 @@ def __init__( self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) self.decoder_pos_embed = nn.Parameter( - torch.zeros( - 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim - ) + torch.zeros(1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim) ) self.decoder_blocks = nn.ModuleList( @@ -113,9 +108,7 @@ def __init__( ) self.decoder_norm = norm_layer(decoder_embed_dim) - self.pred_stride = patch_stride[-1] * ( - self.q_stride[-1] ** self.q_pool - ) # patch stride of prediction + self.pred_stride = patch_stride[-1] * (self.q_stride[-1] ** self.q_pool) # patch stride of prediction self.decoder_pred = nn.Linear( decoder_embed_dim, @@ -143,9 +136,7 @@ def _mae_init_weights(self, m: nn.Module): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) - def get_pixel_label_2d( - self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_2d(self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* input_img = input_img.permute(0, 2, 3, 1) @@ -160,13 +151,11 @@ def get_pixel_label_2d( return label - def get_pixel_label_3d( - self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_3d(self, input_vid: torch.Tensor, mask: torch.Tensor, norm: 
bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* # We use time strided loss, only take the first frame from each token - input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + input_vid = input_vid[:, :, :: self.patch_stride[0], :, :] size = self.pred_stride label = input_vid.unfold(3, size, size).unfold(4, size, size) @@ -181,11 +170,9 @@ def get_pixel_label_3d( return label - def forward_encoder( self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor]: - if mask is None: mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] @@ -203,9 +190,7 @@ def forward_encoder( return x, mask - def forward_decoder( - self, x: torch.Tensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + def forward_decoder(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # Embed tokens x = self.decoder_embed(x) @@ -214,9 +199,7 @@ def forward_decoder( # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) - mask_tokens = self.mask_token.view( - (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) - ) + mask_tokens = self.mask_token.view((1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,)) mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() x_dec[mask] = x.flatten() @@ -279,11 +262,8 @@ def forward( mask_ratio: float = 0.6, mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) - pred, pred_mask = self.forward_decoder( - latent, mask - ) # pred_mask is mask at resolution of *prediction* + pred, pred_mask = self.forward_decoder(latent, mask) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file + return *self.forward_loss(x, pred, ~pred_mask), mask From 01e46628895b0e5643fa7a1d60211e5d6e0b32bc Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:56:38 +0000 Subject: [PATCH 030/118] make style and quality --- src/transformers/models/hiera/hiera_model.py | 128 ++++++++----------- 1 file changed, 56 insertions(+), 72 deletions(-) diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index b1ed0db0e4b9..9345084769ec 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -19,29 +19,29 @@ # -------------------------------------------------------- import math +from dataclasses import dataclass from functools import partial -from typing import List, Tuple, Callable, Optional, Union, Type -from .configuration_hiera import HieraConfig +from typing import Callable, List, Optional, Tuple, Type, Union + import torch import torch.nn as nn import torch.nn.functional as F -from dataclasses import dataclass - from timm.models.layers import DropPath, Mlp + from ...modeling_utils import PreTrainedModel -from ...modeling_outputs import BaseModelOutput from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, ) +from .configuration_hiera import HieraConfig + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "", + 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", ] + def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. @@ -67,9 +67,7 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso return mask -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: +def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None) -> torch.Tensor: """Zero-out the masked regions of the input before conv. Prevents leakage of masked regions when using overlapping kernels. """ @@ -82,9 +80,7 @@ def do_masked_conv( return conv(x * mask.bool()) -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: +def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: """ Restore spatial organization by undoing windowed organization of mask units. @@ -116,7 +112,6 @@ def undo_windowing( return x - class Unroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. @@ -169,9 +164,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Move the patch stride into the batch dimension # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) + permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] x = x.permute(permute) # Now finally flatten the relevant dims into the batch dimension @@ -210,9 +203,7 @@ def __init__( size = [n // s for n, s in zip(size, unroll_schedule[0])] unroll_schedule = unroll_schedule[1:] - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: + def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: """ Roll the given tensor back up to spatial order assuming it's from the given block. @@ -269,11 +260,12 @@ class HieraModelOutput(ModelOutput): Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. Args: - last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - intermediates (List[torch.Tensor], optional): + intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ + last_hidden_state: torch.FloatTensor intermediates: Optional[List[torch.Tensor]] = None @@ -320,15 +312,13 @@ def __init__( self.use_mask_unit_attention = use_mask_unit_attention def forward(self, embeddings: torch.Tensor) -> torch.Tensor: - """ Input should be of shape [batch, tokens, channels]. 
""" - batch_size , num_channels , _ = embeddings.shape - num_windows = ( - (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 - ) + """Input should be of shape [batch, tokens, channels].""" + batch_size, num_channels, _ = embeddings.shape + num_windows = (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 qkv = ( self.qkv(embeddings) - .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) + .reshape(batch_size, -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -336,7 +326,7 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size, self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -347,9 +337,9 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - embeddings = (attention @ v) + embeddings = attention @ v - embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = embeddings.transpose(1, 3).reshape(batch_size, -1, self.output_dim) embeddings = self.projection(embeddings) return embeddings @@ -418,9 +408,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x -@add_start_docstrings(""" + +@add_start_docstrings( + """ Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). -""") +""" +) class PatchEmbedding(nn.Module): def __init__( self, @@ -442,18 +435,18 @@ def __init__( padding=padding, ) - def forward( - self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: embeddings = do_masked_conv(pixel_values, self.projection, mask) embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings + class HieraPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -469,9 +462,8 @@ def _init_weights(self, module, init_bias=0.02): nn.init.constant_(module.weight, 1.0) - - -@add_start_docstrings(""" +@add_start_docstrings( + """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. @@ -482,7 +474,7 @@ def _init_weights(self, module, init_bias=0.02): config ([`HieraConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
- + Example usage: >>> from your_model_file import Hiera, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) @@ -490,7 +482,8 @@ def _init_weights(self, module, init_bias=0.02): >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) - """) + """ +) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" @@ -531,9 +524,7 @@ def __init__(self, config: HieraConfig): assert self.q_pool < len(self.stages) self.q_pool, self.q_stride = self.q_pool, self.q_stride self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size - self.mask_spatial_shape = [ - i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) - ] + self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size)] self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( @@ -555,9 +546,7 @@ def __init__(self, config: HieraConfig): self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules - self.unroll = Unroll( - self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) - ) + self.unroll = Unroll(self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1])) self.reroll = Reroll( self.input_size, self.patch_stride, @@ -566,7 +555,7 @@ def __init__(self, config: HieraConfig): self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[: self.q_pool]] # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] @@ -619,7 +608,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - @torch.jit.ignore def no_weight_decay(self): if self.sep_position_embeddings: @@ -632,21 +620,19 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. """ - batch_size = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(batch_size , num_windows, device=x.device) + noise = torch.rand(batch_size, num_windows, device=x.device) # Sort noise for each sample - ids_shuffle = torch.argsort( - noise, dim=1 - ) # ascend: small is keep, large is remove + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove ids_restore = torch.argsort(ids_shuffle, dim=1) # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([batch_size , num_windows], device=x.device) + mask = torch.zeros([batch_size, num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) @@ -665,34 +651,34 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings - @add_start_docstrings_to_model_forward(""" + @add_start_docstrings_to_model_forward( + """ The forward pass for the Hiera model. Args: pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. 
- + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. - - - - """) - @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") + + + + """ + ) def forward( self, pixel_values: torch.Tensor, mask: torch.Tensor = None, return_dict: Optional[bool] = True, - return_intermediates: bool = False, + return_intermediates: bool = True, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - """ - """ + """ """ # Slowfast training passes in a list if isinstance(pixel_values, list): pixel_values = pixel_values[0] @@ -700,9 +686,7 @@ def forward( pached_embeddings = self.patch_embedding( pixel_values, - mask=mask.view( - pixel_values.shape[0], 1, *self.mask_spatial_shape - ) # batch_size , C, *mask_spatial_shape + mask=mask.view(pixel_values.shape[0], 1, *self.mask_spatial_shape) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) @@ -732,8 +716,8 @@ def forward( # intermediates[-1] is embeddings in spatial order if not return_dict: return tuple(v for v in [embeddings, intermediates] if v is not None) - + return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) \ No newline at end of file + ) From cc2c623197701c7371a00b7119d10928c5eae8a4 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Mon, 26 Feb 2024 23:11:01 +0000 Subject: [PATCH 031/118] Integration & Block tests running --- tests/models/hiera/test_modeling_hiera.py | 265 +++++++++++++++++++--- 1 file changed, 235 insertions(+), 30 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 8d593af2a622..72badde557df 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -15,7 +15,8 @@ """ Testing suite for the PyTorch Hiera model. """ import unittest - +from typing import Tuple +from transformers.models.hiera.hiera_model import HieraBlock from transformers import HieraConfig from transformers.testing_utils import ( require_torch, @@ -23,65 +24,269 @@ torch_device, ) from transformers.utils import is_torch_available - +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD if is_torch_available(): import torch from transformers import HieraModel - # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model - from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST - - + from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + from torchvision.transforms.functional import InterpolationMode + from torchvision import transforms + from PIL import Image +import math class HieraModelTester: - # Define this tester to initialize Hiera model and its configurations for testing def __init__( self, parent, - batch_size=8, - num_channels=3, - image_size=224, - # Add other model-specific parameters here + input_size: Tuple[int, ...] 
= (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, ): self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - # Initialize other necessary attributes here + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings - def prepare_config_and_inputs(self): + def prepare_config_and_inputs(self,checkpoint_url): # Prepare configuration and inputs for testing your model - pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + pixel_values = torch.rand((1, self.in_chans, self.input_size[0], self.input_size[1])) - config = self.get_config() + config = self.get_config(checkpoint_url=checkpoint_url) return config, pixel_values - def get_config(self): - return HieraConfig( - # Define necessary configuration parameters here - ) + def get_config(self,checkpoint_url): + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), ) + + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=self.num_classes, + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = 
HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + else: + raise RuntimeError(f"Invalid checkpoint url ({checkpoint_url})") + + return config def create_and_check_model(self, config, pixel_values): + batch_size = 1 model = HieraModel(config=config) - model.to(torch_device) + num_patches = int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1)**2 + flat_q_stride = math.prod(self.q_stride) + embedding_dimension = self.embedding_dimension + indermediate_shapes = [] + for _ in self.stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 model.eval() with torch.no_grad(): result = model(pixel_values=pixel_values) - # Perform checks here, e.g., output shapes, etc. - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + for idx, x in enumerate(result.intermediates): + self.parent.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") @require_torch -class HieraModelTest(unittest.TestCase): +class HieraModelTest(): def setUp(self): self.model_tester = HieraModelTester(self) def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config_and_inputs = self.model_tester.prepare_config_and_inputs(model_name) + self.model_tester.create_and_check_model(*config_and_inputs) - @slow + # @slow def test_model_from_pretrained(self): for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = HieraModel.from_pretrained(model_name) - self.assertIsNotNone(model) \ No newline at end of file + self.assertIsNotNone(model) + +@require_torch +@slow +class HieraModelIntegrationTest(unittest.TestCase): + def test_forward(self): + torch_device = "cpu" + input_size = 224 + batch_size =1 + patch_kernel = (7,7) + patch_padding = (3,3) + patch_stride = (4,4) + q_stride = (2,2) + flat_q_stride = math.prod(q_stride) + stages=(2, 3, 16, 3) + embedding_dimension = 96 + model = HieraModel.from_pretrained("/home/ubuntu/home/hiera/model/") + model.to(torch_device) + + random_tensor = torch.rand(batch_size, 3, input_size, input_size) + num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1)**2 + + indermediate_shapes = [] + for _ in stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 + out = model(random_tensor) + + out.last_hidden_state.argmax(dim=-1).item() + + out = model(random_tensor, return_intermediates=True) + for idx, x in enumerate(out.intermediates): + self.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") + +class TestHieraBlock(unittest.TestCase): + def test_output_shape(self): + batch_size, input_dim, output_dim = 1, 96, 192 + number_of_heads = 2 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 4 + 
window_size = 16 + use_mask_unit_attention = True + num_patches = 3136 + + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches/q_stride, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect") + + def test_input_output_dim_equality(self): + batch_size, input_dim, output_dim = 1, 96, 96 + number_of_heads = 1 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 1 + window_size = 64 + use_mask_unit_attention = True + num_patches = 3136 + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect. Input shape should be equal to output shape") + + +if __name__ == '__main__': + test = HieraModelIntegrationTest() + test.test_forward() + block_test = TestHieraBlock() + block_test.test_output_shape() + block_test.test_input_output_dim_equality() + model_test = HieraModelTest() + model_test.setUp() + model_test.test_model() + model_test.test_model_from_pretrained() From f172b7490835cdebad3ebe3e1fd8fe6940aef728 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 06:04:01 +0000 Subject: [PATCH 032/118] Fixed bugs --- src/transformers/__init__.py | 6 +++++- src/transformers/models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 8 ++++++-- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera_model.py => modeling_hiera.py} | 0 6 files changed, 14 insertions(+), 5 deletions(-) rename src/transformers/models/hiera/{hiera_model.py => modeling_hiera.py} (100%) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4d7ef6ce20d3..9d668babbec2 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6981,7 +6981,11 @@ GroupViTTextModel, GroupViTVisionModel, ) - from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, + HieraPreTrainedModel, + ) from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6f824a2e955d..10511e2ff47e 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera", "HieraModel"), + ("hiera", "Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0fc417e795e4..fb4d571632a4 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -501,6 +501,7 @@ ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), ("glpn", "GLPNModel"), + ("hiera", "HieraModel"), ("imagegpt", "ImageGPTModel"), ("levit", "LevitModel"), ("mobilenet_v1", "MobileNetV1Model"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index fcffbbf7593e..d32f0a934fea 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -35,7 +35,11 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] + _import_structure["modeling_hiera"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", + "HieraPreTrainedModel " + ] if TYPE_CHECKING: @@ -51,7 +55,7 @@ pass else: from .hiera_image_processor import HieraImageProcessor - from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 56b91bc7acb7..7c42c22734a1 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing +from .modeling_hiera import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/modeling_hiera.py similarity index 100% rename from src/transformers/models/hiera/hiera_model.py rename to src/transformers/models/hiera/modeling_hiera.py From 35b3720aa89ad9711df9fc38abc75b2a93c0d012 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:41:40 +0000 Subject: [PATCH 033/118] Removed tim dependency --- .../models/hiera/modeling_hiera.py | 92 ++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 9345084769ec..f463834a437b 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -18,15 +18,16 @@ # timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm # -------------------------------------------------------- +import collections.abc import math from dataclasses import dataclass from functools import partial +from itertools import repeat from typing import Callable, List, Optional, Tuple, Type, Union import torch import torch.nn as nn import torch.nn.functional as F -from timm.models.layers import DropPath, Mlp from ...modeling_utils import PreTrainedModel from ...utils import ( @@ -38,7 +39,7 @@ HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "namangarg110/hiera_base_224", ] @@ -112,6 +113,93 @@ def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> to return x +# Copied from transformers.models.swin.modeling_swin.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual 
blocks). + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f"drop_prob={round(self.drop_prob,3):0.3f}" + + +# Copied from timm.layers.helpers +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + + +# Copied from timm.layers.mlp +class Mlp(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0.0, + use_conv=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + + self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + class Unroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. 
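The patch above removes the `timm` dependency by inlining `drop_path`/`DropPath` (copied from Swin) and a minimal `Mlp`. The stochastic-depth helper draws one Bernoulli sample per batch element, zeroes the dropped samples, and rescales the survivors by `1 / keep_prob` so the expected activation is unchanged; at eval time (or with `drop_prob == 0`) it is a no-op. A small self-contained sketch of that behaviour — the helper is restated here only for illustration, and the shapes and the 0.5 rate are arbitrary:

```python
import torch


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    # Same logic as the Swin-derived helper added above: identity at eval time or when drop_prob == 0.
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dimensions.
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = (keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)).floor_()
    return input.div(keep_prob) * random_tensor


x = torch.ones(4, 3, 8, 8)
out = drop_path(x, drop_prob=0.5, training=True)
# Each sample is either all zeros or rescaled by 1 / 0.5 = 2, so the expectation stays at 1.
print(out.view(4, -1).amax(dim=1))  # e.g. tensor([2., 0., 2., 2.])
print(drop_path(x, drop_prob=0.5, training=False).equal(x))  # True: no-op outside training
```

The inlined `Mlp` keeps the timm layout (`fc1 → GELU → dropout → optional norm → fc2 → dropout`), with `to_2tuple` expanding scalar `bias`/`drop` arguments into per-layer pairs.
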
From 5f90a2559d1807d710c2fcea15678da7ff9054df Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:42:06 +0000 Subject: [PATCH 034/118] added HieraBlock --- src/transformers/models/hiera/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index d32f0a934fea..3346e03f9a88 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -38,8 +38,9 @@ _import_structure["modeling_hiera"] = [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", - "HieraPreTrainedModel " - ] + "HieraPreTrainedModel", + "HieraBlock", + ] if TYPE_CHECKING: @@ -55,7 +56,7 @@ pass else: from .hiera_image_processor import HieraImageProcessor - from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock, HieraModel, HieraPreTrainedModel else: import sys From ebde8c89a821c21063051b184605e5ffb4dbf6c7 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:42:27 +0000 Subject: [PATCH 035/118] fixed: Model name --- src/transformers/models/hiera/convert_hiera_to_pytorch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 794a62147d78..f4f82d59a3c9 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -217,12 +217,12 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs) for x in out.intermediates: print(x.shape) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path, push_to_hub=True, safe_serialization=False) if __name__ == "__main__": parser = argparse.ArgumentParser() checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" - convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="~/") + convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="/home/ubuntu/home/hiera/hiera_base_224") From 772e421b0d56759dacc710e6ab303ea1f0f79900 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:43:02 +0000 Subject: [PATCH 036/118] added tests for HieraModel, HieraBlock --- tests/models/hiera/test_modeling_hiera.py | 272 ++++++++++++---------- 1 file changed, 143 insertions(+), 129 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 72badde557df..21e0f14fe58f 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -15,24 +15,84 @@ """ Testing suite for the PyTorch Hiera model. 
""" import unittest -from typing import Tuple -from transformers.models.hiera.hiera_model import HieraBlock -from transformers import HieraConfig +from typing import Tuple + +from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraConfig, HieraModel +from transformers.models.hiera import HieraBlock from transformers.testing_utils import ( require_torch, slow, - torch_device, ) from transformers.utils import is_torch_available -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + + if is_torch_available(): import torch - from transformers import HieraModel - from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST - from torchvision.transforms.functional import InterpolationMode - from torchvision import transforms - from PIL import Image import math + + +class HieraBlockTester: + def __init__( + self, + parent, + batch_size: int = 1, + input_dim: int = 96, + output_dim: int = 192, + number_of_heads: int = 2, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + q_stride: int = 4, + window_size: int = 16, + use_mask_unit_attention: bool = True, + num_patches: int = 3136, + ): + self.parent = parent + self.batch_size = batch_size + self.input_dim = input_dim + self.output_dim = output_dim + self.number_of_heads = number_of_heads + self.mlp_ratio = mlp_ratio + self.drop_path = drop_path + self.q_stride = q_stride + self.window_size = window_size + self.use_mask_unit_attention = use_mask_unit_attention + self.num_patches = num_patches + + def create_and_check_block(self): + block = HieraBlock( + input_dim=self.input_dim, + output_dim=self.output_dim, + number_of_heads=self.number_of_heads, + mlp_ratio=self.mlp_ratio, + drop_path=self.drop_path, + q_stride=self.q_stride, + window_size=self.window_size, + use_mask_unit_attention=self.use_mask_unit_attention, + ) + + x = torch.randn(self.batch_size, self.num_patches, self.input_dim) + out = block(x) + + expected_shape = (self.batch_size, self.num_patches // self.q_stride, self.output_dim) + self.parent.assertEqual(out.shape, expected_shape, "Output shape is incorrect") + + +@require_torch +class TestHieraBlock(unittest.TestCase): + def setUp(self): + self.block_tester = HieraBlockTester(self) + + def test_output_shape(self): + self.block_tester.create_and_check_block() + + def test_input_output_dim_equality(self): + self.block_tester.output_dim = self.block_tester.input_dim + self.block_tester.q_stride = 1 + self.block_tester.number_of_heads = 1 + self.block_tester.window_size = 64 + self.block_tester.create_and_check_block() + + class HieraModelTester: def __init__( self, @@ -81,7 +141,7 @@ def __init__( self.head_init_scale = head_init_scale self.sep_position_embeddings = sep_position_embeddings - def prepare_config_and_inputs(self,checkpoint_url): + def prepare_config_and_inputs(self, checkpoint_url): # Prepare configuration and inputs for testing your model pixel_values = torch.rand((1, self.in_chans, self.input_size[0], self.input_size[1])) @@ -89,60 +149,69 @@ def prepare_config_and_inputs(self,checkpoint_url): return config, pixel_values - def get_config(self,checkpoint_url): + def get_config(self, checkpoint_url): if "hiera_tiny_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 7, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2), + ) elif "hiera_small_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 11, 2),) + config = HieraConfig( + 
embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2), + ) elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), ) - + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(2, 3, 16, 3), + ) elif "hiera_base_plus_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3),) + config = HieraConfig( + embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3), + ) elif "hiera_large_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4),) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) elif "hiera_huge_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4)) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) elif "hiera_base_16x224" in checkpoint_url: - config = HieraConfig(num_classes=self.num_classes, - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True,) + config = HieraConfig( + num_classes=self.num_classes, + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True, + ) elif "hiera_base_plus_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3)) + config = HieraConfig(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3)) elif "hiera_large_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4), ) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) elif "hiera_huge_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4) ) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) else: raise RuntimeError(f"Invalid checkpoint url ({checkpoint_url})") @@ -151,25 +220,29 @@ def get_config(self,checkpoint_url): def create_and_check_model(self, config, pixel_values): batch_size = 1 model = HieraModel(config=config) - num_patches = int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1)**2 + num_patches = ( + int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) + ** 2 + ) flat_q_stride = math.prod(self.q_stride) embedding_dimension = self.embedding_dimension indermediate_shapes = [] for _ in self.stages: - indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) - num_patches = num_patches/flat_q_stride + indermediate_shapes.append( + (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + ) + num_patches = num_patches / flat_q_stride embedding_dimension = embedding_dimension * 2 model.eval() with torch.no_grad(): result = model(pixel_values=pixel_values) for idx, x in enumerate(result.intermediates): - self.parent.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") + self.parent.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") 
@require_torch -class HieraModelTest(): - +class HieraModelTest(unittest.TestCase): def setUp(self): self.model_tester = HieraModelTester(self) @@ -178,36 +251,39 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs(model_name) self.model_tester.create_and_check_model(*config_and_inputs) - # @slow + @slow def test_model_from_pretrained(self): for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = HieraModel.from_pretrained(model_name) self.assertIsNotNone(model) + @require_torch @slow class HieraModelIntegrationTest(unittest.TestCase): def test_forward(self): torch_device = "cpu" input_size = 224 - batch_size =1 - patch_kernel = (7,7) - patch_padding = (3,3) - patch_stride = (4,4) - q_stride = (2,2) - flat_q_stride = math.prod(q_stride) - stages=(2, 3, 16, 3) + batch_size = 1 + patch_kernel = (7, 7) + patch_padding = (3, 3) + patch_stride = (4, 4) + q_stride = (2, 2) + flat_q_stride = math.prod(q_stride) + stages = (2, 3, 16, 3) embedding_dimension = 96 - model = HieraModel.from_pretrained("/home/ubuntu/home/hiera/model/") + model = HieraModel.from_pretrained("namangarg110/hiera_base_224") model.to(torch_device) - + random_tensor = torch.rand(batch_size, 3, input_size, input_size) - num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1)**2 + num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1) ** 2 indermediate_shapes = [] for _ in stages: - indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) - num_patches = num_patches/flat_q_stride + indermediate_shapes.append( + (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + ) + num_patches = num_patches / flat_q_stride embedding_dimension = embedding_dimension * 2 out = model(random_tensor) @@ -215,72 +291,10 @@ def test_forward(self): out = model(random_tensor, return_intermediates=True) for idx, x in enumerate(out.intermediates): - self.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") - -class TestHieraBlock(unittest.TestCase): - def test_output_shape(self): - batch_size, input_dim, output_dim = 1, 96, 192 - number_of_heads = 2 - mlp_ratio = 4.0 - drop_path = 0.0 - q_stride = 4 - window_size = 16 - use_mask_unit_attention = True - num_patches = 3136 - - block = HieraBlock( - input_dim=input_dim, - output_dim=output_dim, - number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, - drop_path=drop_path, - q_stride=q_stride, - window_size=window_size, - use_mask_unit_attention=use_mask_unit_attention - ) - - # Create a dummy input - x = torch.randn(batch_size, num_patches,input_dim) - - # Forward pass - out = block(x) - - # Check the shape of the output - expected_shape = (batch_size, num_patches/q_stride, output_dim) - self.assertEqual(out.shape, expected_shape, "Output shape is incorrect") - - def test_input_output_dim_equality(self): - batch_size, input_dim, output_dim = 1, 96, 96 - number_of_heads = 1 - mlp_ratio = 4.0 - drop_path = 0.0 - q_stride = 1 - window_size = 64 - use_mask_unit_attention = True - num_patches = 3136 - block = HieraBlock( - input_dim=input_dim, - output_dim=output_dim, - number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, - drop_path=drop_path, - q_stride=q_stride, - window_size=window_size, - use_mask_unit_attention=use_mask_unit_attention - ) - - # Create a dummy input - x = torch.randn(batch_size, num_patches,input_dim) - - # Forward pass - out = block(x) 
- - # Check the shape of the output - expected_shape = (batch_size, num_patches, output_dim) - self.assertEqual(out.shape, expected_shape, "Output shape is incorrect. Input shape should be equal to output shape") + self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") -if __name__ == '__main__': +if __name__ == "__main__": test = HieraModelIntegrationTest() test.test_forward() block_test = TestHieraBlock() From 850350eef1fac3eb23fd68bf4137135114975e56 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 19:07:37 +0000 Subject: [PATCH 037/118] fixed imports --- tests/models/hiera/test_modeling_hiera.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 21e0f14fe58f..326159d9f23e 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -17,8 +17,9 @@ import unittest from typing import Tuple -from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraConfig, HieraModel -from transformers.models.hiera import HieraBlock +from transformers import HieraConfig, HieraModel + +from transformers.models.hiera.modeling_hiera import HieraBlock, HIERA_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.testing_utils import ( require_torch, slow, From 20f3bc04a5a036d2c31b4c861f3c4d1fccd1824b Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 19:26:05 +0000 Subject: [PATCH 038/118] fixed quality & copies --- docs/source/en/index.md | 2 +- .../models/hiera/configuration_hiera.py | 60 ++++++++----------- src/transformers/utils/dummy_pt_objects.py | 17 ++++++ tests/models/hiera/test_modeling_hiera.py | 3 +- 4 files changed, 44 insertions(+), 38 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index b26c9f91360c..1acd49678534 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -155,7 +155,7 @@ Flax), PyTorch, and/or TensorFlow. | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | -| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | +| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 8d40e7a72777..5b5e92688521 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -37,41 +37,31 @@ class HieraConfig(PretrainedConfig): Args: - input_size (Tuple[int, ...], optional): Dimensions of the input image (height, width). Defaults to (224, 224). - in_chans (int, optional): Number of input channels. Defaults to 3. - embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. - number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. - num_classes (int, optional): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. - q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. - q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). - mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. 
- mask_unit_attn (Tuple[bool, ...], optional): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). - dim_mul (float, optional): Factor for increasing the dimensionality through the network. Defaults to 2.0. - head_mul (float, optional): Factor for increasing the number of heads through the network. Defaults to 2.0. - patch_kernel (Tuple[int, ...], optional): Kernel size for patch embedding. Defaults to (7, 7). - patch_stride (Tuple[int, ...], optional): Stride for patch embedding. Defaults to (4, 4). - patch_padding (Tuple[int, ...], optional): Padding for patch embedding. Defaults to (3, 3). - mlp_ratio (float, optional): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. - drop_path_rate (float, optional): Dropout rate for stochastic depth. Defaults to 0.0. - head_dropout (float, optional): Dropout rate for attention heads. Defaults to 0.0. - head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. - sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. - - - Example: - ```python - >>> from transformers import HieraConfig, HieraModel - - >>> # Initializing a ViT MAE vit-mae-base style configuration - >>> configuration = HieraConfig() - - >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = HieraModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ``` + input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). Defaults to (224, 224). + in_chans (int, optional, *optional*, defaults to 3): Number of input channels. Defaults to 3. + embedding_dimension (int, optional, *optional*, defaults to 96): Dimension of the initial embedding. Defaults to 96. + number_of_heads (int, optional, *optional*, defaults to 1): Initial number of attention heads. Defaults to 1. + num_classes (int, optional, *optional*, defaults to 1000): Number of output classes. Defaults to 1000. + stages (Tuple[int, ...], optional, *optional*, defaults to `(2, 3, 16, 3)`): Defines the number of blocks at each stage of the model. + q_pool (int, optional, *optional*, defaults to 3): Number of pooling stages for queries. Defaults to 3. + q_stride (Tuple[int, ...], optional, *optional*, defaults to `(2, 2)`): Stride size for pooling. Defaults to (2, 2). + mask_unit_size (Tuple[int, ...], optional, *optional*, defaults to `(8, 8)`): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional, *optional*, defaults to `(True, True, False, False)`): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). + dim_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the dimensionality through the network. Defaults to 2.0. + head_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the number of heads through the network. Defaults to 2.0. + patch_kernel (Tuple[int, ...], optional, *optional*, defaults to `(7, 7)`): Kernel size for patch embedding. Defaults to (7, 7). + patch_stride (Tuple[int, ...], optional, *optional*, defaults to `(4, 4)`): Stride for patch embedding. Defaults to (4, 4). + patch_padding (Tuple[int, ...], optional, *optional*, defaults to `(3, 3)`): Padding for patch embedding. Defaults to (3, 3). 
+ mlp_ratio (float, optional, *optional*, defaults to 4.0): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. + drop_path_rate (float, optional, *optional*, defaults to 0.0): Dropout rate for stochastic depth. Defaults to 0.0. + head_dropout (float, optional, *optional*, defaults to 0.0): Dropout rate for attention heads. Defaults to 0.0. + head_init_scale (float, optional, *optional*, defaults to 0.001): Initial scaling factor for attention head weights. Defaults to 0.001. + sep_position_embeddings (bool, optional, *optional*, defaults to `False`): Whether to use separate position embeddings. Defaults to False. + + + + + """ model_type = "hiera" diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 5c635cf7af2c..4e1b0211216a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4240,6 +4240,23 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class HieraModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 326159d9f23e..5d90ca9f9f55 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -18,8 +18,7 @@ from typing import Tuple from transformers import HieraConfig, HieraModel - -from transformers.models.hiera.modeling_hiera import HieraBlock, HIERA_PRETRAINED_MODEL_ARCHIVE_LIST +from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock from transformers.testing_utils import ( require_torch, slow, From 12ef68aee902199df6546f1ffe5a58332a896f4d Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 23:30:41 +0000 Subject: [PATCH 039/118] Fixes --- docs/source/en/model_doc/hiera.md | 4 +- .../models/auto/image_processing_auto.py | 1 - src/transformers/models/hiera/__init__.py | 9 ++- tests/models/hiera/test_modeling_hiera.py | 60 ++++++++----------- 4 files changed, 35 insertions(+), 39 deletions(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index 1c46bae9b072..d38e2e70c770 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -31,9 +31,9 @@ Modern hierarchical vision transformers have added several vision-specific compo -## HireaModel +## HieraModel -[[autodoc]] HireaModel +[[autodoc]] HieraModel - forward diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 5261753d202d..aef894a425ba 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -69,7 +69,6 @@ ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), ("groupvit", "CLIPImageProcessor"), - ("hiera", "HieraImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 3346e03f9a88..b04392f55fa5 100644 --- a/src/transformers/models/hiera/__init__.py +++ 
b/src/transformers/models/hiera/__init__.py @@ -43,6 +43,7 @@ ] + if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -55,8 +56,12 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera_image_processor import HieraImageProcessor - from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock, HieraModel, HieraPreTrainedModel + from .modeling_hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, + HieraPreTrainedModel, + HieraBlock, + ) else: import sys diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 5d90ca9f9f55..8f24484a71c8 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -17,7 +17,7 @@ import unittest from typing import Tuple -from transformers import HieraConfig, HieraModel +from transformers import HieraConfig, HieraModel, HieraPreTrainedModel from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock from transformers.testing_utils import ( require_torch, @@ -78,7 +78,7 @@ def create_and_check_block(self): @require_torch -class TestHieraBlock(unittest.TestCase): +class HieraBlockTest(unittest.TestCase): def setUp(self): self.block_tester = HieraBlockTester(self) @@ -94,6 +94,9 @@ def test_input_output_dim_equality(self): class HieraModelTester: + + all_model_classes = (HieraModel, HieraPreTrainedModel) if is_torch_available() else () + def __init__( self, parent, @@ -219,26 +222,27 @@ def get_config(self, checkpoint_url): def create_and_check_model(self, config, pixel_values): batch_size = 1 - model = HieraModel(config=config) - num_patches = ( - int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) - ** 2 - ) - flat_q_stride = math.prod(self.q_stride) - embedding_dimension = self.embedding_dimension - indermediate_shapes = [] - for _ in self.stages: - indermediate_shapes.append( - (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + for model_class in self.all_model_classes: + model = model_class(config=config) + num_patches = ( + int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) + ** 2 ) - num_patches = num_patches / flat_q_stride - embedding_dimension = embedding_dimension * 2 - model.eval() - with torch.no_grad(): - result = model(pixel_values=pixel_values) - - for idx, x in enumerate(result.intermediates): - self.parent.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") + flat_q_stride = math.prod(self.q_stride) + embedding_dimension = self.embedding_dimension + indermediate_shapes = [] + for _ in self.stages: + indermediate_shapes.append( + (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + ) + num_patches = num_patches / flat_q_stride + embedding_dimension = embedding_dimension * 2 + model.eval() + with torch.no_grad(): + result = model(pixel_values=pixel_values) + + for idx, x in enumerate(result.intermediates): + self.parent.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") @require_torch @@ -291,16 +295,4 @@ def test_forward(self): out = model(random_tensor, return_intermediates=True) for idx, x in enumerate(out.intermediates): - self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") - - -if __name__ == "__main__": - test = HieraModelIntegrationTest() - test.test_forward() - 
block_test = TestHieraBlock() - block_test.test_output_shape() - block_test.test_input_output_dim_equality() - model_test = HieraModelTest() - model_test.setUp() - model_test.test_model() - model_test.test_model_from_pretrained() + self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") \ No newline at end of file From 3faf1e708c14a7af87ca7b6cad45cf24edc60714 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:22:15 -0600 Subject: [PATCH 040/118] Update docs/source/en/model_doc/hiera.md Fix name Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/hiera.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index d38e2e70c770..bda5e0b9ad2f 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. ## Overview -Hubert was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer +Hiera was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer The abstract from the paper is the following: From 7debb8dd8d224b0c4681f62126648dd1fcac2426 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:23:03 -0600 Subject: [PATCH 041/118] Update docs/source/en/model_doc/hiera.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/hiera.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index bda5e0b9ad2f..10664868c9bd 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -24,7 +24,7 @@ The abstract from the paper is the following: Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy. In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera. 
-## HireaConfig +## HieraConfig [[autodoc]] HieraConfig From 942e8e9147f46968cb0e1a0362e44a5a16b81ddc Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:23:35 -0600 Subject: [PATCH 042/118] Update docs/source/en/model_doc/hiera.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/hiera.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index 10664868c9bd..8cd6dc1a977a 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -28,7 +28,6 @@ Modern hierarchical vision transformers have added several vision-specific compo [[autodoc]] HieraConfig - ## HieraModel From 4e9ddd47f45690713e9460a11f52acd2c4e0112c Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:24:44 -0600 Subject: [PATCH 043/118] Update src/transformers/models/hiera/configuration_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/configuration_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 5b5e92688521..81910fbdf8f4 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -28,7 +28,7 @@ class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an HieraModel model according to the specified arguments, defining the model architecture. Instantiating a configuration with + This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the HieraModel [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. From c9e77046133a2e77e2c3d4d93c919eea71abd500 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:25:08 -0600 Subject: [PATCH 044/118] Update src/transformers/models/hiera/configuration_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/configuration_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 81910fbdf8f4..1d02957c2b73 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -37,7 +37,7 @@ class HieraConfig(PretrainedConfig): Args: - input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). Defaults to (224, 224). + input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). in_chans (int, optional, *optional*, defaults to 3): Number of input channels. Defaults to 3. embedding_dimension (int, optional, *optional*, defaults to 96): Dimension of the initial embedding. Defaults to 96. number_of_heads (int, optional, *optional*, defaults to 1): Initial number of attention heads. Defaults to 1. 
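The review fixes in the patches above only touch documentation; the checkpoint-to-configuration mapping exercised by `get_config` in the tests (and by the conversion script) is unchanged: each Hiera size is determined by `embedding_dimension`, `number_of_heads`, and `stages`, while the 16-frame video variants additionally override the patch and mask-unit geometry. A short sketch of how two of those configurations would be instantiated, with values copied from the `get_config` branches above (this assumes a `transformers` build from this branch, which exposes `HieraConfig`):

```python
from transformers import HieraConfig

# Image checkpoints, as mapped in get_config above.
base_224 = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
large_224 = HieraConfig(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4))

# Video checkpoint (hiera_base_16x224): same trunk defaults, but 3D patches and mask units.
base_16x224 = HieraConfig(
    input_size=(16, 224, 224),
    q_stride=(1, 2, 2),
    mask_unit_size=(1, 8, 8),
    patch_kernel=(3, 7, 7),
    patch_stride=(2, 4, 4),
    patch_padding=(1, 3, 3),
    sep_position_embeddings=True,
)
```
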
From d8e8b735e697796ad74a0f4cd7775afdf986de9f Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:27:24 -0600 Subject: [PATCH 045/118] Update src/transformers/models/hiera/modeling_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/modeling_hiera.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index f463834a437b..cc678e1c71ea 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -377,7 +377,8 @@ def __init__( ): """ Args: - - input_dim, output_dim: The input and output feature dimensions. + input_dim (`int`): The input feature dimensions. + output_dim (`int`): The output feature dimensions. - number_of_heads: The number of attention number_of_heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). From 6027674616eb077b7bcb10a61be1b4247444c0c2 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:27:41 -0600 Subject: [PATCH 046/118] Update src/transformers/models/hiera/modeling_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index cc678e1c71ea..9165b9a529f0 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -379,7 +379,7 @@ def __init__( Args: input_dim (`int`): The input feature dimensions. output_dim (`int`): The output feature dimensions. - - number_of_heads: The number of attention number_of_heads. + number_of_heads (`int`): The number of attention heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - use_mask_unit_attention: Use Mask Unit or Global Attention. From 0e3a0e5b11133df720ea4f398d75a8686ad305b4 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 18:56:37 +0000 Subject: [PATCH 047/118] Fixed formatting --- .../models/hiera/modeling_hiera.py | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 9165b9a529f0..fe4d67f2e6a4 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -69,7 +69,8 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None) -> torch.Tensor: - """Zero-out the masked regions of the input before conv. + """ + Zero-out the masked regions of the input before conv. Prevents leakage of masked regions when using overlapping kernels. """ if conv is None: @@ -296,9 +297,9 @@ def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> Roll the given tensor back up to spatial order assuming it's from the given block. If no mask is provided: - - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. 
If a mask is provided: - - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + Returns [B, #MUs, MUy, MUx, C] for 2d, etc. """ schedule, size = self.schedule[block_idx] B, N, C = x.shape @@ -377,12 +378,12 @@ def __init__( ): """ Args: - input_dim (`int`): The input feature dimensions. - output_dim (`int`): The output feature dimensions. - number_of_heads (`int`): The number of attention heads. - - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - - use_mask_unit_attention: Use Mask Unit or Global Attention. + input_dim (`int`): The input feature dimensions. + output_dim (`int`): The output feature dimensions. + number_of_heads (`int`): The number of attention heads. + q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). + window_size: The current (flattened) size of a mask unit *after* pooling (if any). + use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() @@ -499,7 +500,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @add_start_docstrings( - """ +""" Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). """ ) @@ -552,7 +553,7 @@ def _init_weights(self, module, init_bias=0.02): @add_start_docstrings( - """ +""" Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. @@ -571,7 +572,7 @@ def _init_weights(self, module, init_bias=0.02): >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) - """ +""" ) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig @@ -741,7 +742,7 @@ def get_position_embeddings(self) -> torch.Tensor: return self.position_embeddings @add_start_docstrings_to_model_forward( - """ + """ The forward pass for the Hiera model. Args: @@ -750,14 +751,8 @@ def get_position_embeddings(self) -> torch.Tensor: mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - - return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. - return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. 
- - - """ ) def forward( @@ -767,7 +762,6 @@ def forward( return_dict: Optional[bool] = True, return_intermediates: bool = True, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - """ """ # Slowfast training passes in a list if isinstance(pixel_values, list): pixel_values = pixel_values[0] @@ -809,4 +803,4 @@ def forward( return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) + ) \ No newline at end of file From e9a41269485f3f541caf686d1a138268c7fbcfcc Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 19:20:35 +0000 Subject: [PATCH 048/118] Code quality & Import differences --- src/transformers/models/hiera/__init__.py | 9 +-- .../models/hiera/modeling_hiera.py | 58 +++++++++---------- tests/models/hiera/test_modeling_hiera.py | 8 ++- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index b04392f55fa5..d8c62fc0800a 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -39,11 +39,9 @@ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel", - "HieraBlock", ] - if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -57,11 +55,10 @@ pass else: from .modeling_hiera import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, HieraPreTrainedModel, - HieraBlock, - ) + ) else: import sys diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index fe4d67f2e6a4..b7267ae7b7f5 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -500,9 +500,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @add_start_docstrings( -""" -Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). -""" + """ + Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). + """ ) class PatchEmbedding(nn.Module): def __init__( @@ -553,26 +553,26 @@ def _init_weights(self, module, init_bias=0.02): @add_start_docstrings( -""" -Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + """ + Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. -This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. + This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. -The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. + The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. -Parameters: - config ([`HieraConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -Example usage: - >>> from your_model_file import Hiera, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - >>> model = Hiera(config) - >>> inputs = torch.rand((1, 3, 224, 224)) - >>> outputs = model(inputs) -""" + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """ ) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig @@ -742,18 +742,18 @@ def get_position_embeddings(self) -> torch.Tensor: return self.position_embeddings @add_start_docstrings_to_model_forward( - """ - The forward pass for the Hiera model. + """ + The forward pass for the Hiera model. - Args: - pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. + Args: + pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. - mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). - mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. - Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. - return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. - """ + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. + return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. 
+ """ ) def forward( self, @@ -803,4 +803,4 @@ def forward( return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) \ No newline at end of file + ) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 8f24484a71c8..de9afd8a0a59 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -94,7 +94,6 @@ def test_input_output_dim_equality(self): class HieraModelTester: - all_model_classes = (HieraModel, HieraPreTrainedModel) if is_torch_available() else () def __init__( @@ -225,7 +224,10 @@ def create_and_check_model(self, config, pixel_values): for model_class in self.all_model_classes: model = model_class(config=config) num_patches = ( - int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) + int( + ((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + + 1 + ) ** 2 ) flat_q_stride = math.prod(self.q_stride) @@ -295,4 +297,4 @@ def test_forward(self): out = model(random_tensor, return_intermediates=True) for idx, x in enumerate(out.intermediates): - self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") \ No newline at end of file + self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") From b057d912d532483d35e44c9d1ccb5f4924490fa1 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:02:11 +0000 Subject: [PATCH 049/118] quality and repo-consistency fix --- src/transformers/__init__.py | 6 +++--- src/transformers/models/hiera/configuration_hiera.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9d668babbec2..de8bbdb00371 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2365,6 +2365,9 @@ "GroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] + ) _import_structure["models.hubert"].extend( [ "HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4148,9 +4151,6 @@ "TFGroupViTVisionModel", ] ) - _import_structure["models.hiera"].extend( - ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] - ) _import_structure["models.hubert"].extend( [ "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 1d02957c2b73..dc4e7d554bee 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -23,14 +23,17 @@ logger = logging.get_logger(__name__) -HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "namangarg110/hiera_base_224": "https://huggingface.co/namangarg110/hiera_base_224/blob/main/config.json", + # See all Hiera models at https://huggingface.co/models?filter=hiera +} class HieraConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the HieraModel - [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. 
+ [namangarg110/hiera_base_224](https://huggingface.co/namangarg110/hiera_base_224/) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. From d7210cc169ee21d3eaff040362883cce548c18f5 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:27:09 +0000 Subject: [PATCH 050/118] fixed no torch error --- tests/models/hiera/test_modeling_hiera.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index de9afd8a0a59..729d1de4247c 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -17,8 +17,7 @@ import unittest from typing import Tuple -from transformers import HieraConfig, HieraModel, HieraPreTrainedModel -from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock +from transformers import HieraConfig from transformers.testing_utils import ( require_torch, slow, @@ -28,6 +27,10 @@ if is_torch_available(): import torch + + from transformers import HieraModel + from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock + import math @@ -94,7 +97,7 @@ def test_input_output_dim_equality(self): class HieraModelTester: - all_model_classes = (HieraModel, HieraPreTrainedModel) if is_torch_available() else () + all_model_classes = (HieraModel,) if is_torch_available() else () def __init__( self, From 10dfa68d4ba8a6f536f6181cd1025a4f56f6485d Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:42:47 +0000 Subject: [PATCH 051/118] Docstring fix --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index b7267ae7b7f5..eb7da758fd6d 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -566,7 +566,7 @@ def _init_weights(self, module, init_bias=0.02): configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. Example usage: - >>> from your_model_file import Hiera, HieraConfig + >>> from transformers import Hiera, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) >>> model = Hiera(config) From f81fa7625d2753f71eabe05f36443cf0825927fb Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:52:22 +0000 Subject: [PATCH 052/118] Docstring fix --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index eb7da758fd6d..0b5a1fa35213 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -566,7 +566,7 @@ def _init_weights(self, module, init_bias=0.02): configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
Example usage: - >>> from transformers import Hiera, HieraConfig + >>> from transformers import HieraModel, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) >>> model = Hiera(config) From 5951e581507d331cbbf56717e1c702b2410466a1 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:53:07 +0000 Subject: [PATCH 053/118] doc string fix --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 0b5a1fa35213..159aaa0b9fa3 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -567,7 +567,7 @@ def _init_weights(self, module, init_bias=0.02): Example usage: >>> from transformers import HieraModel, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3)) >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) From 9b194a4b496e2dbbeb872b6475431ef5a1cd6489 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:58:02 +0000 Subject: [PATCH 054/118] fixed example usage --- src/transformers/models/hiera/modeling_hiera.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 159aaa0b9fa3..536de5592202 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -567,9 +567,9 @@ def _init_weights(self, module, init_bias=0.02): Example usage: >>> from transformers import HieraModel, HieraConfig + >>> import torch >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3)) - - >>> model = Hiera(config) + >>> model = HieraModel(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) """ From dd3da8f4b469677f1de1e89e611084aa06f6780a Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 7 Mar 2024 05:29:43 +0000 Subject: [PATCH 055/118] Resolved issues in modeling_hiera --- .../models/hiera/modeling_hiera.py | 332 +++++++++--------- tests/models/hiera/test_modeling_hiera.py | 66 +--- 2 files changed, 160 insertions(+), 238 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 536de5592202..3cd0d21c56b8 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -1,8 +1,8 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Meta and The HuggingFace Team. All rights reserved. # All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. +# This code is part of a project that uses the model by Meta, licensed under +# the Creative Commons Attribution-NonCommercial 4.0 International License. +# To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/ or # -------------------------------------------------------- # # Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles @@ -12,10 +12,6 @@ # Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. 
# # Paper: https://arxiv.org/abs/2306.00989/ -# -# References: -# slowfast: https://github.com/facebookresearch/SlowFast -# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm # -------------------------------------------------------- import collections.abc @@ -52,7 +48,7 @@ def conv_nd(n: int) -> Type[nn.Module]: def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: - # Refer to `Unroll` to see how this performs a maxpool-Nd + # Refer to `HieraUnroll` to see how this performs a maxpool-Nd return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values @@ -61,8 +57,6 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso # (spatial) mask: [B, C, (t), (h), w] if mask is None: return mask - - assert len(mask.shape[2:]) == len(target_size) if mask.shape[2:] != target_size: return F.interpolate(mask.float(), size=target_size) return mask @@ -82,36 +76,35 @@ def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor return conv(x * mask.bool()) -def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: +def undo_windowing(tensor: torch.Tensor, spatial_shape: List[int], mask_unit_shape: List[int]) -> torch.Tensor: """ Restore spatial organization by undoing windowed organization of mask units. Args: - x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] - shape: current spatial shape, if it were not organized into mask unit - windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. - mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + tensor: Tensor organized by mask units windows, e.g., in 2D [batch_size, num_mask_units_y*num_mask_units_x, mask_unit_height, mask_unit_width, channels]. + spatial_shape: Desired spatial shape if it were not organized into mask unit windows, e.g., in 2D [batch_size, num_mask_units_y*mask_unit_height, num_mask_units_x*mask_unit_width, channels]. + mask_unit_shape: Current mask unit shape, e.g., in 2D [mask_unit_height, mask_unit_width]. Returns: - x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + Restored tensor with spatial organization, e.g., in 2D [batch_size, num_mask_units_y*mask_unit_height, num_mask_units_x*mask_unit_width, channels]. 
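As an illustration of the masking helpers above, a small sketch of how a keep-mask is upsampled and applied before a patch-embedding style convolution; the 7x7 mask-unit grid and the conv hyperparameters are assumptions matching the default configuration, not values taken from the patch:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

conv = nn.Conv2d(3, 96, kernel_size=7, stride=4, padding=3)   # patch-embedding style conv
pixel_values = torch.randn(1, 3, 224, 224)

mask = torch.zeros(1, 1, 7, 7)                                # one entry per mask unit
mask[..., :4, :] = 1                                          # keep the top four rows of units

resized = F.interpolate(mask.float(), size=pixel_values.shape[2:])  # upsample to pixel resolution
embeddings = conv(pixel_values * resized.bool())              # removed units contribute zeros
print(embeddings.shape)                                       # torch.Size([1, 96, 56, 56])
```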
""" - D = len(shape) - B, C = x.shape[0], x.shape[-1] - # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] - num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] - x = x.view(B, *num_MUs, *mu_shape, C) - - # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] - permute = ( + num_dimensions = len(spatial_shape) + batch_size, channels = tensor.shape[0], tensor.shape[-1] + # [batch_size, num_mask_units_y*num_mask_units_x, mask_unit_height, mask_unit_width, channels] -> [batch_size, num_mask_units_y, num_mask_units_x, mask_unit_height, mask_unit_width, channels] + num_mask_units = [spatial_dim // mask_unit_dim for spatial_dim, mask_unit_dim in zip(spatial_shape, mask_unit_shape)] + tensor = tensor.view(batch_size, *num_mask_units, *mask_unit_shape, channels) + + # Calculate the permutation order for restoring spatial organization + permute_order = ( [0] + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [list(p) for p in zip(range(1, 1 + num_dimensions), range(1 + num_dimensions, 1 + 2 * num_dimensions))], [], ) - + [len(x.shape) - 1] + + [len(tensor.shape) - 1] ) - x = x.permute(permute).reshape(B, *shape, C) + tensor = tensor.permute(permute_order).reshape(batch_size, *spatial_shape, channels) - return x + return tensor # Copied from transformers.models.swin.modeling_swin.drop_path @@ -134,42 +127,29 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals return output -class DropPath(nn.Module): +class HieraDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): - super(DropPath, self).__init__() + super(HieraDropPath, self).__init__() self.drop_prob = drop_prob self.scale_by_keep = scale_by_keep def forward(self, x): return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) - def extra_repr(self): - return f"drop_prob={round(self.drop_prob,3):0.3f}" - - -# Copied from timm.layers.helpers -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): - return tuple(x) - return tuple(repeat(x, n)) - - return parse -to_2tuple = _ntuple(2) -# Copied from timm.layers.mlp -class Mlp(nn.Module): +class HieraMlp(nn.Module): """MLP as used in Vision Transformer, MLP-Mixer and related networks""" def __init__( self, + config: HieraConfig, in_features, - hidden_features=None, + # hidden_features=None, out_features=None, act_layer=nn.GELU, norm_layer=None, @@ -178,18 +158,25 @@ def __init__( use_conv=False, ): super().__init__() + self.config = config + hidden_features = int(in_features * self.config.mlp_ratio) out_features = out_features or in_features hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + bias = (bias, bias) if not isinstance(bias, tuple) else bias + + drop_probs = (drop, drop) if not isinstance(drop, tuple) else drop - self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) self.act = act_layer() self.drop1 = nn.Dropout(drop_probs[0]) - self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() - self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) self.drop2 = nn.Dropout(drop_probs[1]) + self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + + if use_conv: + self.fc1 = nn.Conv2d(in_features, hidden_features, 
kernel_size=1, bias=bias[0]) + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, bias=bias[1]) + else: + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) def forward(self, x): x = self.fc1(x) @@ -201,7 +188,7 @@ def forward(self, x): return x -class Unroll(nn.Module): +class HieraUnroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as @@ -223,124 +210,123 @@ class Unroll(nn.Module): def __init__( self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], + config: HieraConfig, ): super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - self.schedule = unroll_schedule + self.config = config + self.size = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + self.schedule = [self.config.q_stride] * len(self.stage_ends[:-1]) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: """ Input: Flattened patch embeddings [B, N, C] Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd """ - B, _, C = x.shape + batch_size, _, channels = embeddings.shape - cur_size = self.size - x = x.view(*([B] + cur_size + [C])) + current_size = self.size + embeddings = embeddings.view(*([batch_size] + current_size + [channels])) - for strides in self.schedule: + for stride_steps in self.schedule: # Move patches with the given strides to the batch dimension # Create a view of the tensor with the patch stride as separate dims # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] - cur_size = [i // s for i, s in zip(cur_size, strides)] - new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] - x = x.view(new_shape) + current_size = [dimension // stride for dimension, stride in zip(current_size, stride_steps)] + new_shape = [batch_size] + sum([[dimension, stride] for dimension, stride in zip(current_size, stride_steps)], []) + [channels] + embeddings = embeddings.view(new_shape) # Move the patch stride into the batch dimension # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] - L = len(new_shape) - permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - x = x.permute(permute) + shape_length = len(new_shape) + permute_order = [0] + list(range(2, shape_length - 1, 2)) + list(range(1, shape_length - 1, 2)) + [shape_length - 1] + embeddings = embeddings.permute(*permute_order) # Now finally flatten the relevant dims into the batch dimension - x = x.flatten(0, len(strides)) - B *= math.prod(strides) + embeddings = embeddings.flatten(0, len(stride_steps)) + batch_size *= math.prod(stride_steps) - x = x.reshape(-1, math.prod(self.size), C) - return x + embeddings = embeddings.reshape(-1, math.prod(self.size), channels) + return embeddings -class Reroll(nn.Module): +class HieraReroll(nn.Module): """ Undos the "unroll" operation so that you can use intermediate features. 
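A standalone sketch of what the unroll plus `do_pool` combination achieves in the 2D case (sizes are arbitrary): tokens are reordered so that a plain max over a leading axis behaves like `MaxPool2d`:

```python
import torch
import torch.nn.functional as F

B, H, W, C = 2, 8, 8, 5
Sy, Sx = 2, 2
x = torch.randn(B, H, W, C)

# unroll (2D case): bring each Sy x Sx window to the front of the token axis
unrolled = (
    x.view(B, H // Sy, Sy, W // Sx, Sx, C)
    .permute(0, 2, 4, 1, 3, 5)                               # [B, Sy, Sx, H//Sy, W//Sx, C]
    .reshape(B, H * W, C)
)

# do_pool: view out the stride dimension and take the max over it
pooled = unrolled.view(B, Sy * Sx, -1, C).max(dim=1).values  # [B, (H//Sy)*(W//Sx), C]

reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=(Sy, Sx))
reference = reference.permute(0, 2, 3, 1).reshape(B, -1, C)
print(torch.allclose(pooled, reference))                     # True
```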
""" def __init__( self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - stage_ends: List[int], - q_pool: int, + config: HieraConfig, + ): super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - + self.config = config + self.size = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + unroll_schedule = [self.config.q_stride] * len(self.stage_ends[:-1]) # The first stage has to reverse everything # The next stage has to reverse all but the first unroll, etc. self.schedule = {} size = self.size - for i in range(stage_ends[-1] + 1): + for i in range(self.stage_ends[-1] + 1): self.schedule[i] = unroll_schedule, size # schedule unchanged if no pooling at a stage end - if i in stage_ends[:q_pool]: + if i in self.stage_ends[:self.config.q_pool]: if len(unroll_schedule) > 0: - size = [n // s for n, s in zip(size, unroll_schedule[0])] + size = [new_size // stride for new_size, stride in zip(size, unroll_schedule[0])] unroll_schedule = unroll_schedule[1:] - def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: """ Roll the given tensor back up to spatial order assuming it's from the given block. If no mask is provided: Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. If a mask is provided: - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + Returns [B, #MaskUnits, MaskUnitHeight, MaskUnitWidth, C] for 2d, etc. """ schedule, size = self.schedule[block_idx] - B, N, C = x.shape + batch_size, num_tokens, num_channels = embeddings.shape - D = len(size) - cur_mu_shape = [1] * D + num_dimensions = len(size) + current_mask_unit_shape = [1] * num_dimensions for strides in schedule: # Extract the current patch from N - x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + embeddings = embeddings.view(batch_size, *strides, num_tokens // math.prod(strides), *current_mask_unit_shape, num_channels) # Move that patch into the current MU # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] - L = len(x.shape) + shape_length = len(embeddings.shape) permute = ( - [0, 1 + D] + [0, 1 + num_dimensions] + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [list(p) for p in zip(range(1, 1 + num_dimensions), range(1 + num_dimensions + 1, shape_length - 1))], [], ) - + [L - 1] + + [shape_length - 1] ) - x = x.permute(permute) + embeddings = embeddings.permute(permute) # Reshape to [B, N//(Sy*Sx), *MU, C] - for i in range(D): - cur_mu_shape[i] *= strides[i] - x = x.reshape(B, -1, *cur_mu_shape, C) - N = x.shape[1] + for i in range(num_dimensions): + current_mask_unit_shape[i] *= strides[i] + embeddings = embeddings.reshape(batch_size, -1, *current_mask_unit_shape, num_channels) + num_tokens = embeddings.shape[1] # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) - x = x.view(B, N, *cur_mu_shape, C) + embeddings = embeddings.view(batch_size, num_tokens, *current_mask_unit_shape, num_channels) # If masked, return [B, #MUs, MUy, MUx, C] if mask is not None: - return x + return embeddings # If not masked, we can return [B, H, W, C] - x = undo_windowing(x, size, cur_mu_shape) + embeddings = undo_windowing(embeddings, size, current_mask_unit_shape) - return x + return embeddings @dataclass 
@@ -357,18 +343,22 @@ class HieraModelOutput(ModelOutput): last_hidden_state: torch.FloatTensor intermediates: Optional[List[torch.Tensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + -class MaskUnitAttention(nn.Module): +class HieraMaskUnitAttention(nn.Module): """ Computes either Mask Unit or Global Attention. Also is able to perform q pooling. Note: this assumes the tokens have already been flattened and unrolled into mask units. - See `Unroll` for more details. + See `HieraUnroll` for more details. """ def __init__( self, + config: HieraConfig, input_dim: int, output_dim: int, number_of_heads: int, @@ -386,7 +376,7 @@ def __init__( use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() - + self.config = config self.input_dim = input_dim self.output_dim = output_dim self.number_of_heads = number_of_heads @@ -414,33 +404,30 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: q, k, v = qkv[0], qkv[1], qkv[2] if self.q_stride > 1: - # Refer to Unroll to see how this performs a maxpool-Nd + # Refer to HieraUnroll to see how this performs a maxpool-Nd q = ( q.view(batch_size, self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) - if hasattr(F, "scaled_dot_product_attention"): - # Note: the original paper did *not* use SDPA, it's a free boost! - embeddings = F.scaled_dot_product_attention(q, k, v) - else: - attention = (q * self.scale) @ k.transpose(-1, -2) - attention = attention.softmax(dim=-1) - embeddings = attention @ v + + attention = (q * self.scale) @ k.transpose(-1, -2) + attention = attention.softmax(dim=-1) + embeddings = attention @ v embeddings = embeddings.transpose(1, 3).reshape(batch_size, -1, self.output_dim) embeddings = self.projection(embeddings) - return embeddings + return embeddings, attention class HieraBlock(nn.Module): def __init__( self, + config: HieraConfig, input_dim: int, output_dim: int, number_of_heads: int, - mlp_ratio: float = 4.0, drop_path: float = 0.0, norm_layer: nn.Module = nn.LayerNorm, act_layer: nn.Module = nn.GELU, @@ -449,19 +436,18 @@ def __init__( use_mask_unit_attention: bool = False, ): super().__init__() - + self.config = config self.input_dim = input_dim self.output_dim = output_dim - self.norm1 = norm_layer(input_dim) - self.attention = MaskUnitAttention( - input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention + self.attention = HieraMaskUnitAttention( + config, input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention ) self.norm2 = norm_layer(output_dim) - self.mlp = Mlp(output_dim, int(output_dim * mlp_ratio), act_layer=act_layer) + self.mlp = HieraMlp(config, output_dim, act_layer=act_layer) - self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() + self.drop_path = HieraDropPath(drop_path) if drop_path > 0 else nn.Identity() if input_dim != output_dim: self.projection = nn.Linear(input_dim, output_dim) @@ -470,24 +456,25 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: normalized_embeddings = self.norm1(embeddings) if self.input_dim != self.output_dim: embeddings = do_pool(self.projection(normalized_embeddings), stride=self.attention.q_stride) - embeddings = embeddings + self.drop_path(self.attention(normalized_embeddings)) + attention_output , attention_weights = self.attention(normalized_embeddings) + embeddings = embeddings + self.drop_path(attention_output) # MLP embeddings 
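A shape-only sketch of the mask-unit attention above. The sizes mimic the first pooling block of the default 224x224 configuration (3136 tokens, 64-token mask units, `q_stride` of 4) and are assumptions for illustration, not values read from a checkpoint:

```python
import torch
import torch.nn as nn

batch_size, seq_len, input_dim, output_dim = 2, 3136, 96, 192
heads, q_stride, window_size = 2, 4, 16            # window_size = 64 // q_stride
head_dim = output_dim // heads

qkv_proj = nn.Linear(input_dim, 3 * output_dim)
tokens = torch.randn(batch_size, seq_len, input_dim)

num_windows = seq_len // (q_stride * window_size)  # 49 mask units attend independently
qkv = (
    qkv_proj(tokens)
    .reshape(batch_size, -1, num_windows, 3, heads, head_dim)
    .permute(3, 0, 4, 2, 1, 5)
)
q, k, v = qkv[0], qkv[1], qkv[2]

# Pool the queries by taking the max over each group of q_stride tokens,
# mirroring the maxpool applied by do_pool on the residual branch.
q = q.view(batch_size, heads, num_windows, q_stride, -1, head_dim).max(dim=3).values
attention = (q * head_dim**-0.5) @ k.transpose(-1, -2)
out = (attention.softmax(dim=-1) @ v).transpose(1, 3).reshape(batch_size, -1, output_dim)
print(out.shape)  # torch.Size([2, 784, 192]): seq_len // q_stride tokens, output_dim channels
```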
= embeddings + self.drop_path(self.mlp(self.norm2(embeddings))) - return embeddings + return embeddings, attention_weights -class Head(nn.Module): +class HieraHead(nn.Module): def __init__( self, + config: HieraConfig, input_dim: int, - num_classes: int, - dropout_rate: float = 0.0, act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), ): super().__init__() - self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - self.projection = nn.Linear(input_dim, num_classes) + self.config = config + self.dropout = nn.Dropout(self.config.head_dropout) if self.config.head_dropout > 0 else nn.Identity() + self.projection = nn.Linear(input_dim, self.config.num_classes) # act_fun for eval and testing only self.act_func = act_func @@ -498,31 +485,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x - -@add_start_docstrings( - """ - Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). - """ -) -class PatchEmbedding(nn.Module): +class HieraPatchEmbedding(nn.Module): def __init__( self, - dim_in: int, - output_dim: int, - kernel: Tuple[int, ...], - stride: Tuple[int, ...], - padding: Tuple[int, ...], + config: HieraConfig, ): super().__init__() - + self.config = config # Support any number of spatial dimensions - self.spatial_dims = len(kernel) + self.spatial_dims = len(self.config.patch_kernel) self.projection = conv_nd(self.spatial_dims)( - dim_in, - output_dim, - kernel_size=kernel, - stride=stride, - padding=padding, + self.config.in_chans, + self.config.embedding_dimension, + kernel_size=self.config.patch_kernel, + stride=self.config.patch_stride, + padding=self.config.patch_padding, ) def forward(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: @@ -604,22 +581,19 @@ def __init__(self, config: HieraConfig): super().__init__(config) self.config = config - norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed + norm_layer = partial(nn.LayerNorm, eps=1e-6) depth = sum(self.stages) self.tokens_spatial_shape = [i // s for i, s in zip(self.input_size, self.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) flat_mu_size = math.prod(self.mask_unit_size) flat_q_stride = math.prod(self.q_stride) - assert self.q_pool < len(self.stages) self.q_pool, self.q_stride = self.q_pool, self.q_stride self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size)] self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] - self.patch_embedding = PatchEmbedding( - self.in_chans, self.embedding_dimension, self.patch_kernel, self.patch_stride, self.patch_padding - ) + self.patch_embedding = HieraPatchEmbedding(config) if self.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( @@ -636,14 +610,9 @@ def __init__(self, config: HieraConfig): self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules - self.unroll = Unroll(self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1])) - self.reroll = Reroll( - self.input_size, - self.patch_stride, - [self.q_stride] * len(self.stage_ends[:-1]), - self.stage_ends, - self.q_pool, - ) + self.unroll = HieraUnroll(config) + self.reroll = HieraReroll(config) + # q_pool locations q_pool_blocks = [x + 1 for x in self.stage_ends[: self.q_pool]] # stochastic depth decay rule @@ 
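A quick sketch of the bookkeeping this constructor performs, using the documented default-style values as assumptions (the drop-path rate is non-default, set here only to make the schedule visible):

```python
import torch

input_size, patch_stride = (224, 224), (4, 4)
mask_unit_size, stages, q_pool = (8, 8), (2, 3, 16, 3), 3
drop_path_rate = 0.1

tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)]            # [56, 56]
mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, mask_unit_size)]  # [7, 7]
stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]                # [1, 4, 20, 23]
q_pool_blocks = [x + 1 for x in stage_ends[:q_pool]]                                 # [2, 5, 21]

# "stochastic depth decay rule": drop-path probability grows linearly with depth
depth = sum(stages)                                                                  # 24
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
print(tokens_spatial_shape, mask_spatial_shape, stage_ends, q_pool_blocks)
print([round(r, 3) for r in dpr[:4]], "...", round(dpr[-1], 3))
```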
-670,10 +639,10 @@ def __init__(self, config: HieraConfig): number_of_heads = self.number_of_heads block = HieraBlock( + config, input_dim=self.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=self.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -685,7 +654,7 @@ def __init__(self, config: HieraConfig): self.blocks.append(block) self.norm = norm_layer(self.embedding_dimension) - self.head = Head(self.embedding_dimension, self.num_classes, dropout_rate=self.head_dropout) + self.head = HieraHead(config, self.embedding_dimension) # Initialize everything if self.sep_position_embeddings: @@ -752,7 +721,10 @@ def get_position_embeddings(self) -> torch.Tensor: mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. - return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + output_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + output_attentions (`bool`, optional): Whether to return attention weights + output_hidden_states(`bool`, optional): Whether to return Hidden States + """ ) def forward( @@ -760,12 +732,14 @@ def forward( pixel_values: torch.Tensor, mask: torch.Tensor = None, return_dict: Optional[bool] = True, - return_intermediates: bool = True, + output_intermediates: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - # Slowfast training passes in a list - if isinstance(pixel_values, list): - pixel_values = pixel_values[0] - intermediates = [] + intermediates = [] if output_intermediates else None + attentions = [] if output_attentions else None + hidden_states = [] if output_hidden_states else None pached_embeddings = self.patch_embedding( pixel_values, @@ -776,6 +750,9 @@ def forward( embeddings = pached_embeddings + self.get_position_embeddings() embeddings = self.unroll(embeddings) + if output_hidden_states: + hidden_states.append(embeddings) + # Discard masked tokens if mask is not None: embeddings = embeddings[mask[..., None].tile(1, self.mu_size, embeddings.shape[2])].view( @@ -783,9 +760,14 @@ def forward( ) for i, block in enumerate(self.blocks): - embeddings = block(embeddings) + embeddings, attention = block(embeddings) + if output_attentions: + attentions.append(attention) + + if output_hidden_states: + hidden_states.append(embeddings) - if return_intermediates and i in self.stage_ends: + if output_intermediates and i in self.stage_ends: intermediates.append(self.reroll(embeddings, i, mask=mask)) if mask is None: @@ -798,9 +780,11 @@ def forward( # q_stride = (2, 2), not all unrolls were consumed, # intermediates[-1] is embeddings in spatial order if not return_dict: - return tuple(v for v in [embeddings, intermediates] if v is not None) + return tuple(v for v in [embeddings, intermediates, attention, hidden_states] if v is not None) return HieraModelOutput( last_hidden_state=embeddings, - intermediates=intermediates if return_intermediates else None, + intermediates=intermediates if output_intermediates else None, + attentions=attentions if output_attentions else None, + hidden_states=hidden_states if 
output_hidden_states else None, ) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 729d1de4247c..38d84d015220 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,68 +34,6 @@ import math -class HieraBlockTester: - def __init__( - self, - parent, - batch_size: int = 1, - input_dim: int = 96, - output_dim: int = 192, - number_of_heads: int = 2, - mlp_ratio: float = 4.0, - drop_path: float = 0.0, - q_stride: int = 4, - window_size: int = 16, - use_mask_unit_attention: bool = True, - num_patches: int = 3136, - ): - self.parent = parent - self.batch_size = batch_size - self.input_dim = input_dim - self.output_dim = output_dim - self.number_of_heads = number_of_heads - self.mlp_ratio = mlp_ratio - self.drop_path = drop_path - self.q_stride = q_stride - self.window_size = window_size - self.use_mask_unit_attention = use_mask_unit_attention - self.num_patches = num_patches - - def create_and_check_block(self): - block = HieraBlock( - input_dim=self.input_dim, - output_dim=self.output_dim, - number_of_heads=self.number_of_heads, - mlp_ratio=self.mlp_ratio, - drop_path=self.drop_path, - q_stride=self.q_stride, - window_size=self.window_size, - use_mask_unit_attention=self.use_mask_unit_attention, - ) - - x = torch.randn(self.batch_size, self.num_patches, self.input_dim) - out = block(x) - - expected_shape = (self.batch_size, self.num_patches // self.q_stride, self.output_dim) - self.parent.assertEqual(out.shape, expected_shape, "Output shape is incorrect") - - -@require_torch -class HieraBlockTest(unittest.TestCase): - def setUp(self): - self.block_tester = HieraBlockTester(self) - - def test_output_shape(self): - self.block_tester.create_and_check_block() - - def test_input_output_dim_equality(self): - self.block_tester.output_dim = self.block_tester.input_dim - self.block_tester.q_stride = 1 - self.block_tester.number_of_heads = 1 - self.block_tester.window_size = 64 - self.block_tester.create_and_check_block() - - class HieraModelTester: all_model_classes = (HieraModel,) if is_torch_available() else () @@ -298,6 +236,6 @@ def test_forward(self): out.last_hidden_state.argmax(dim=-1).item() - out = model(random_tensor, return_intermediates=True) + out = model(random_tensor, output_intermediates=True) for idx, x in enumerate(out.intermediates): self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") From 3475b2d9361069e8caf775eff4a0be283667d980 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 7 Mar 2024 05:29:56 +0000 Subject: [PATCH 056/118] Removed Hiera MAE --- src/transformers/models/hiera/hiera_mae.py | 269 --------------------- 1 file changed, 269 deletions(-) delete mode 100644 src/transformers/models/hiera/hiera_mae.py diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py deleted file mode 100644 index 7c42c22734a1..000000000000 --- a/src/transformers/models/hiera/hiera_mae.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
-# -------------------------------------------------------- -# References: -# mae: https://github.com/facebookresearch/mae -# slowfast: https://github.com/facebookresearch/SlowFast -# -------------------------------------------------------- - - -import math -from functools import partial -from typing import Optional, Tuple - -import torch -import torch.nn as nn - -from .modeling_hiera import HieraBlock, HieraModel, conv_nd, undo_windowing - - -def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: - if isinstance(head, nn.Identity): - return x - - batch_size, num_mask_units = x.shape[0:2] - # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) - permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) - - # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] - permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(batch_size, num_mask_units, *x.shape[2:], x.shape[1]) - return x - - -class MaskedAutoencoderHiera(HieraModel): - """Masked Autoencoder with HieraModel backbone""" - - def __init__( - self, - in_chans: int = 3, - patch_stride: Tuple[int, ...] = (4, 4), - mlp_ratio: float = 4.0, - decoder_embed_dim: int = 512, - decoder_depth: int = 8, - decoder_num_heads: int = 16, - norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), - **kwdargs, - ): - super().__init__( - in_chans=in_chans, - patch_stride=patch_stride, - mlp_ratio=mlp_ratio, - norm_layer=norm_layer, - **kwdargs, - ) - - del self.norm, self.head - encoder_dim_out = self.blocks[-1].dim_out - self.encoder_norm = norm_layer(encoder_dim_out) - self.mask_unit_spatial_shape_final = [ - i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) - ] - self.tokens_spatial_shape_final = [ - i // s ** (self.q_pool) for i, s in zip(self.tokens_spatial_shape, self.q_stride) - ] - # -------------------------------------------------------------------------- - # Multi-scale fusion heads - curr_mu_size = self.mask_unit_size - self.multi_scale_fusion_heads = nn.ModuleList() - - for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool - kernel = [i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final)] - curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] - self.multi_scale_fusion_heads.append( - conv_nd(len(self.q_stride))( - self.blocks[i].dim_out, - encoder_dim_out, - kernel_size=kernel, - stride=kernel, - ) - ) - self.multi_scale_fusion_heads.append(nn.Identity()) # final stage, no transform - - # -------------------------------------------------------------------------- - # MAE decoder specifics - self.decoder_embed = nn.Linear(encoder_dim_out, decoder_embed_dim) - - self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) - - self.decoder_pos_embed = nn.Parameter( - torch.zeros(1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim) - ) - - self.decoder_blocks = nn.ModuleList( - [ - HieraBlock( - dim=decoder_embed_dim, - dim_out=decoder_embed_dim, - heads=decoder_num_heads, - norm_layer=norm_layer, - mlp_ratio=mlp_ratio, - ) - for i in range(decoder_depth) - ] - ) - self.decoder_norm = norm_layer(decoder_embed_dim) - - self.pred_stride = patch_stride[-1] * (self.q_stride[-1] ** self.q_pool) # patch stride of prediction - - self.decoder_pred = nn.Linear( - decoder_embed_dim, - (self.pred_stride ** min(2, len(self.q_stride))) 
* in_chans, - ) # predictor - # -------------------------------------------------------------------------- - - self.initialize_weights() - - def initialize_weights(self): - nn.init.trunc_normal_(self.mask_token, std=0.02) - nn.init.trunc_normal_(self.decoder_pos_embed, std=0.02) - self.apply(self._mae_init_weights) - - # initialize patch_embed like nn.Linear (instead of nn.Conv2d) - w = self.patch_embed.projection.weight.data - nn.init.xavier_uniform_(w.view([w.shape[0], -1])) - - def _mae_init_weights(self, m: nn.Module): - if isinstance(m, nn.Linear): - nn.init.xavier_uniform_(m.weight) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def get_pixel_label_2d(self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: - # mask (boolean tensor): True must correspond to *masked* - input_img = input_img.permute(0, 2, 3, 1) - - size = self.pred_stride - label = input_img.unfold(1, size, size).unfold(2, size, size) - label = label.flatten(1, 2).flatten(2) - label = label[mask] - if norm: - mean = label.mean(dim=-1, keepdim=True) - var = label.var(dim=-1, keepdim=True) - label = (label - mean) / (var + 1.0e-6) ** 0.5 - - return label - - def get_pixel_label_3d(self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: - # mask (boolean tensor): True must correspond to *masked* - - # We use time strided loss, only take the first frame from each token - input_vid = input_vid[:, :, :: self.patch_stride[0], :, :] - - size = self.pred_stride - label = input_vid.unfold(3, size, size).unfold(4, size, size) - label = label.permute(0, 2, 3, 4, 5, 6, 1) # Different from 2d, mistake during training lol - label = label.flatten(1, 3).flatten(2) - label = label[mask] - - if norm: - mean = label.mean(dim=-1, keepdim=True) - var = label.var(dim=-1, keepdim=True) - label = (label - mean) / (var + 1.0e-6) ** 0.5 - - return label - - def forward_encoder( - self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor]: - if mask is None: - mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] - - # Get multi-scale representations from encoder - _, intermediates = super().forward(x, mask, return_intermediates=True) - # Resolution unchanged after q_pool stages, so skip those features - intermediates = intermediates[: self.q_pool] + intermediates[-1:] - - # Multi-scale fusion - x = 0.0 - for head, interm_x in zip(self.multi_scale_fusion_heads, intermediates): - x += apply_fusion_head(head, interm_x) - - x = self.encoder_norm(x) - - return x, mask - - def forward_decoder(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - # Embed tokens - x = self.decoder_embed(x) - - # Combine visible and mask tokens - - # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] - # mask: [batch_size , #MUs_all] - x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) - mask_tokens = self.mask_token.view((1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,)) - mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) - mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() - x_dec[mask] = x.flatten() - x_dec = ~mask * mask_tokens + mask * x_dec - - # Get back spatial order - x = undo_windowing( - x_dec, - self.tokens_spatial_shape_final, - self.mask_unit_spatial_shape_final, - ) - mask = undo_windowing( - mask[..., 0:1], - 
self.tokens_spatial_shape_final, - self.mask_unit_spatial_shape_final, - ) - - # Flatten - x = x.reshape(x.shape[0], -1, x.shape[-1]) - mask = mask.view(x.shape[0], -1) - - # Add pos embed - x = x + self.decoder_pos_embed - - # Apply decoder blocks - for blk in self.decoder_blocks: - x = blk(x) - x = self.decoder_norm(x) - - # Predictor projection - x = self.decoder_pred(x) - - return x, mask - - def forward_loss( - self, x: torch.Tensor, pred: torch.Tensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Note: in mask, 0 is *visible*, 1 is *masked* - - x: e.g. [batch_size , 3, H, W] - pred: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - label: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - """ - if len(self.q_stride) == 2: - label = self.get_pixel_label_2d(x, mask) - elif len(self.q_stride) == 3: - label = self.get_pixel_label_3d(x, mask) - else: - raise NotImplementedError - - pred = pred[mask] - loss = (pred - label) ** 2 - - return loss.mean(), pred, label - - def forward( - self, - x: torch.Tensor, - mask_ratio: float = 0.6, - mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) - pred, pred_mask = self.forward_decoder(latent, mask) # pred_mask is mask at resolution of *prediction* - - # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask From 5ba0aafba75958cebebfd463f6318bb0bf2cece7 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Fri, 15 Mar 2024 07:00:43 +0000 Subject: [PATCH 057/118] Added test and resolved bug --- .../models/hiera/convert_hiera_to_pytorch.py | 60 ++++++++++--------- .../models/hiera/modeling_hiera.py | 14 ++--- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index f4f82d59a3c9..f85f37dd04bf 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,21 +14,17 @@ # limitations under the License. import argparse +from PIL import Image import torch # from transformers import HieraConfig, HieraModel from transformers import HieraConfig, HieraModel -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor - +from transformers import BeitImageProcessor +from transformers.image_utils import PILImageResampling, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +import requests def rename_key(name): - # if "patch_embed.proj" in name: - # name = name.replace("patch_embed.proj", "patch_embed.projection") - # # elif "block.proj" in name: - # # name = name.replace("block.proj", "block.projection") - # elif "attn.proj" in name: - # name = name.replace("attn.proj", "attn.projection") if ".proj." 
in name: name = name.replace(".proj.", ".projection.") if "attn" in name: @@ -109,7 +105,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs) checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3)) checkpoints = pretrained_models_links["hiera_base_224"] checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] @@ -197,29 +193,39 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs) ): strict = False - model.load_state_dict(state_dict["model_state"], strict) - # model.load_state_dict(state_dict["model_state"], strict=strict) - + model.load_state_dict(state_dict["model_state"], strict=strict) + + + image_processor = BeitImageProcessor( + size = {"height":256,"width":256}, + do_rescale=True, + do_center_crop=True, + crop_size = {"height":224,"width":224}, + do_normalize=True, + do_reduce_labels=False, + do_resize=True, + image_std=IMAGENET_DEFAULT_STD, + image_mean=IMAGENET_DEFAULT_MEAN, + resample = PILImageResampling.BICUBIC) + + url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" + image = Image.open(requests.get(url, stream=True).raw) - image_processor = HieraImageProcessor(size=224) - inputs = image_processor.process_image(image_url=url) - - # forward pass - out = model(inputs[None, ...]) - - # 207: golden retriever (imagenet-1k) - out.last_hidden_state.argmax(dim=-1).item() - + processed_image = image_processor(images=image, return_tensors="pt") + model.load_state_dict(state_dict["model_state"], strict=strict) + expected_slice = torch.tensor( + [ 0.1825, 0.8655, 0.5779, 1.1550, 1.1025, 0.6381, 1.0288, -0.0624, 0.1455] + ) # If you also want intermediate feature maps - out = model(inputs[None, ...], return_intermediates=True) + out = model(processed_image.pixel_values) + out.last_hidden_state.argmax(dim=-1).item() + assert torch.allclose(out.last_hidden_state[0, :9], expected_slice, atol=1e-4) - for x in out.intermediates: - print(x.shape) print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path, push_to_hub=True, safe_serialization=False) - + model.save_pretrained(pytorch_dump_folder_path, safe_serialization=False) + if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 3cd0d21c56b8..6c8d6c93cf26 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -624,25 +624,21 @@ def __init__(self, config: HieraConfig): for i in range(depth): output_dim = self.embedding_dimension - # Mask unit or global attention. 
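The key remapping used by the conversion script earlier in this patch boils down to string rewrites applied to the original checkpoint keys before `load_state_dict`. A hedged sketch with an abbreviated, illustrative mapping table (the real script covers more patterns):

```python
import torch

def rename_key(name: str) -> str:
    # Abbreviated, illustrative mapping; the conversion script applies more rules.
    for old, new in ((".proj.", ".projection."), ("attn", "attention")):
        name = name.replace(old, new)
    return name

original_state_dict = {"blocks.0.attn.proj.weight": torch.zeros(96, 96)}
converted = {rename_key(key): value for key, value in original_state_dict.items()}
print(list(converted))  # ['blocks.0.attention.projection.weight']
```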
- # Lag by 1 block, so that global attention, - # applied post pooling on lower resolution use_mask_unit_attention = self.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: output_dim = int(self.embedding_dimension * self.dim_mul) - number_of_heads = int(self.number_of_heads * self.head_mul) + self.number_of_heads = int(self.number_of_heads * self.head_mul) # Update the class variable cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride - else: - number_of_heads = self.number_of_heads + block = HieraBlock( config, input_dim=self.embedding_dimension, output_dim=output_dim, - number_of_heads=number_of_heads, + number_of_heads=self.number_of_heads, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -650,7 +646,7 @@ def __init__(self, config: HieraConfig): use_mask_unit_attention=use_mask_unit_attention, ) - self.embedding_dimension = output_dim + self.embedding_dimension = output_dim self.blocks.append(block) self.norm = norm_layer(self.embedding_dimension) @@ -787,4 +783,4 @@ def forward( intermediates=intermediates if output_intermediates else None, attentions=attentions if output_attentions else None, hidden_states=hidden_states if output_hidden_states else None, - ) + ) \ No newline at end of file From 3adb788e60956511582536bc8b411a9c622f175c Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Fri, 15 Mar 2024 07:01:48 +0000 Subject: [PATCH 058/118] fixed doc string --- .../models/hiera/configuration_hiera.py | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index dc4e7d554bee..885e647ef260 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,31 +40,26 @@ class HieraConfig(PretrainedConfig): Args: - input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). - in_chans (int, optional, *optional*, defaults to 3): Number of input channels. Defaults to 3. - embedding_dimension (int, optional, *optional*, defaults to 96): Dimension of the initial embedding. Defaults to 96. - number_of_heads (int, optional, *optional*, defaults to 1): Initial number of attention heads. Defaults to 1. - num_classes (int, optional, *optional*, defaults to 1000): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional, *optional*, defaults to `(2, 3, 16, 3)`): Defines the number of blocks at each stage of the model. - q_pool (int, optional, *optional*, defaults to 3): Number of pooling stages for queries. Defaults to 3. - q_stride (Tuple[int, ...], optional, *optional*, defaults to `(2, 2)`): Stride size for pooling. Defaults to (2, 2). - mask_unit_size (Tuple[int, ...], optional, *optional*, defaults to `(8, 8)`): Dimensions for the mask unit. Must be compatible with q_stride. - mask_unit_attn (Tuple[bool, ...], optional, *optional*, defaults to `(True, True, False, False)`): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). - dim_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the dimensionality through the network. 
Defaults to 2.0. - head_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the number of heads through the network. Defaults to 2.0. - patch_kernel (Tuple[int, ...], optional, *optional*, defaults to `(7, 7)`): Kernel size for patch embedding. Defaults to (7, 7). - patch_stride (Tuple[int, ...], optional, *optional*, defaults to `(4, 4)`): Stride for patch embedding. Defaults to (4, 4). - patch_padding (Tuple[int, ...], optional, *optional*, defaults to `(3, 3)`): Padding for patch embedding. Defaults to (3, 3). - mlp_ratio (float, optional, *optional*, defaults to 4.0): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. - drop_path_rate (float, optional, *optional*, defaults to 0.0): Dropout rate for stochastic depth. Defaults to 0.0. - head_dropout (float, optional, *optional*, defaults to 0.0): Dropout rate for attention heads. Defaults to 0.0. - head_init_scale (float, optional, *optional*, defaults to 0.001): Initial scaling factor for attention head weights. Defaults to 0.001. - sep_position_embeddings (bool, optional, *optional*, defaults to `False`): Whether to use separate position embeddings. Defaults to False. - - - - - + input_size (Tuple[int, int] or int, , defaults to `(224, 224)`): Dimensions of the input image (height, width). + in_chans (int, optional, , defaults to 3): Number of input channels. + embedding_dimension (int, optional, defaults to 96): Dimension of the initial embedding. + number_of_heads (int, optional, defaults to 1): Initial number of attention heads. + num_classes (int, optional, , defaults to 1000): Number of output classes. + stages (Tuple[int, ...], optional, , defaults to `(2, 3, 16, 3)`): Defines the number of blocks at each stage of the model. + q_pool (int, optional, , defaults to 3): Number of pooling stages for queries. . + q_stride (Tuple[int, ...], optional, , defaults to `(2, 2)`): Stride size for pooling. + mask_unit_size (Tuple[int, ...], optional, , defaults to `(8, 8)`): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional, , defaults to `(True, True, False, False)`): Specifies which stages use mask unit attention. + dim_mul (float, optional, , defaults to 2.0): Factor for increasing the dimensionality through the network. + head_mul (float, optional, , defaults to 2.0): Factor for increasing the number of heads through the network. + patch_kernel (Tuple[int, ...], optional, , defaults to `(7, 7)`): Kernel size for patch embedding. + patch_stride (Tuple[int, ...], optional, , defaults to `(4, 4)`): Stride for patch embedding. + patch_padding (Tuple[int, ...], optional, , defaults to `(3, 3)`): Padding for patch embedding. + mlp_ratio (float, optional, , defaults to 4.0): Ratio of hidden size to feed-forward layer size. + drop_path_rate (float, optional, , defaults to 0.0): Dropout rate for stochastic depth. + head_dropout (float, optional, , defaults to 0.0): Dropout rate for attention heads. + head_init_scale (float, optional, , defaults to 0.001): Initial scaling factor for attention head weights. + sep_position_embeddings (bool, optional, , defaults to `False`): Whether to use separate position embeddings. 
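A small illustration of how the width-related arguments interact: each new stage multiplies the embedding dimension by `dim_mul` and the number of heads by `head_mul`. The values below are the documented defaults, and the cumulative doubling mirrors what the model constructor computes for this configuration:

```python
embedding_dimension, number_of_heads = 96, 1
dim_mul, head_mul, stages = 2.0, 2.0, (2, 3, 16, 3)

dims, heads = [], []
for stage_index, _ in enumerate(stages):
    dims.append(int(embedding_dimension * dim_mul**stage_index))
    heads.append(int(number_of_heads * head_mul**stage_index))
print(dims)   # [96, 192, 384, 768]
print(heads)  # [1, 2, 4, 8]
```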
""" model_type = "hiera" From c69df922789adfa5a872d0088dc4aca41b69aa1f Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Thu, 28 Mar 2024 20:41:25 +0100 Subject: [PATCH 059/118] First commit --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_vi.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/hiera.md | 56 + docs/source/en/tasks/image_classification.md | 2 +- src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/hiera/__init__.py | 59 + .../models/hiera/configuration_hiera.py | 175 +++ .../models/hiera/convert_hiera_to_hf.py | 332 ++++++ .../models/hiera/modeling_hiera.py | 1043 +++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 31 + tests/models/hiera/__init__.py | 0 tests/models/hiera/test_modeling_hiera.py | 317 +++++ 30 files changed, 2056 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/model_doc/hiera.md create mode 100644 src/transformers/models/hiera/__init__.py create mode 100644 src/transformers/models/hiera/configuration_hiera.py create mode 100644 src/transformers/models/hiera/convert_hiera_to_hf.py create mode 100644 src/transformers/models/hiera/modeling_hiera.py create mode 100644 tests/models/hiera/__init__.py create mode 100644 tests/models/hiera/test_modeling_hiera.py diff --git a/README.md b/README.md index 4a3b78756716..783a503237a8 100644 --- a/README.md +++ b/README.md @@ -391,6 +391,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. 
**[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_de.md b/README_de.md index 5c3fa28ccba8..59d347f3438b 100644 --- a/README_de.md +++ b/README_de.md @@ -387,6 +387,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_es.md b/README_es.md index 9a6ea777a790..3c6ef0abe280 100644 --- a/README_es.md +++ b/README_es.md @@ -364,6 +364,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_fr.md b/README_fr.md index 7f7fe2343e27..f8d9cd5f6b3a 100644 --- a/README_fr.md +++ b/README_fr.md @@ -385,6 +385,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_hd.md b/README_hd.md index 12df2d0740c9..90f7145d3811 100644 --- a/README_hd.md +++ b/README_hd.md @@ -338,6 +338,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 
diff --git a/README_ja.md b/README_ja.md index 78cd7b0474be..e053b3409e03 100644 --- a/README_ja.md +++ b/README_ja.md @@ -398,6 +398,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ko.md b/README_ko.md index 1798760d86e9..d9f6577d154e 100644 --- a/README_ko.md +++ b/README_ko.md @@ -313,6 +313,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_pt-br.md b/README_pt-br.md index 899acaf7f1c4..68bd03da9e13 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -396,6 +396,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 
diff --git a/README_ru.md b/README_ru.md index fdb647996556..ef61a742e51e 100644 --- a/README_ru.md +++ b/README_ru.md @@ -386,6 +386,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_te.md b/README_te.md index 8906438d1fb0..711f016548da 100644 --- a/README_te.md +++ b/README_te.md @@ -388,6 +388,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. 
**[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_vi.md b/README_vi.md index 5aabe6ccc353..60f4201fc385 100644 --- a/README_vi.md +++ b/README_vi.md @@ -387,6 +387,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (từ Microsoft) được phát hành với bài báo [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (từ UCSD, NVIDIA) được phát hành với bài báo [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (từ Allegro.pl, AGH University of Science and Technology) được phát hành với bài báo [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (từ Facebook) được phát hành với bài báo [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (từ Berkeley) được phát hành với bài báo [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (từ HuggingFace) được phát hành với bài báo [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. 
Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hans.md b/README_zh-hans.md index ca3d42eb00b9..0a341a1ffc5f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -337,6 +337,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hant.md b/README_zh-hant.md index 78278a76a289..c6a1e4075d7c 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -349,6 +349,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. 
**[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 92ee8eeda447..8508e693916c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -565,6 +565,8 @@ title: FocalNet - local: model_doc/glpn title: GLPN + - local: model_doc/hiera + title: Hiera - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ffa9ae3f4b0b..d52b5a288cc6 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -156,6 +156,7 @@ Flax), PyTorch, and/or TensorFlow. | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | +| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md new file mode 100644 index 000000000000..233f63b29759 --- /dev/null +++ b/docs/source/en/model_doc/hiera.md @@ -0,0 +1,56 @@ + + +# Hiera + +## Overview + +The Hiera model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
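
As a usage sketch (not part of this patch), image classification with the classes added here would look roughly as follows. The checkpoint name `EduardoPacheco/hiera-tiny-224-in1k` is an assumption that mirrors the naming used by the conversion script in this PR, and `AutoImageProcessor` is assumed to resolve to the image processor saved alongside the converted weights:

```python
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, HieraForImageClassification

# Hypothetical checkpoint name, following the push_to_hub naming in the conversion script below.
checkpoint = "EduardoPacheco/hiera-tiny-224-in1k"

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = HieraForImageClassification.from_pretrained(checkpoint)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_idx = logits.argmax(-1).item()
print(model.config.id2label[predicted_class_idx])
```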
+ + +## HieraConfig + +[[autodoc]] HieraConfig + +## HieraModel + +[[autodoc]] HieraModel + - forward + +## HieraForMaskedImageModeling + +[[autodoc]] HieraForMaskedImageModeling + - forward + +## HieraForImageClassification + +[[autodoc]] HieraForImageClassification + - forward + + + diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 22a568f5e446..3f0eee3d5ff8 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -34,7 +34,7 @@ The task illustrated in this tutorial is supported by the following model archit -[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [CLIP](../model_doc/clip), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [PVTv2](../model_doc/pvt_v2), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SigLIP](../model_doc/siglip), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) +[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [CLIP](../model_doc/clip), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [Hiera](../model_doc/hiera), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [PVTv2](../model_doc/pvt_v2), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SigLIP](../model_doc/siglip), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index da29d77972f4..3d3bb6fcd35e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -499,6 +499,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], + "models.hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], 
"models.idefics": [ @@ -2399,6 +2400,15 @@ "GroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraForImageClassification", + "HieraForMaskedImageModeling", + "HieraModel", + "HieraPreTrainedModel", + ] + ) _import_structure["models.hubert"].extend( [ "HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5383,6 +5393,7 @@ GroupViTVisionConfig, ) from .models.herbert import HerbertTokenizer + from .models.hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig from .models.idefics import ( @@ -7110,6 +7121,13 @@ GroupViTTextModel, GroupViTVisionModel, ) + from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraForImageClassification, + HieraForMaskedImageModeling, + HieraModel, + HieraPreTrainedModel, + ) from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0599d3b876e6..5d866b2d51b4 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -107,6 +107,7 @@ graphormer, groupvit, herbert, + hiera, hubert, ibert, idefics, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index bf46066002fe..95bfa104ecb6 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -121,6 +121,7 @@ ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), + ("hiera", "HieraConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), @@ -384,6 +385,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), + ("hiera", "Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index f8cb55091b02..86992edf49c0 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -60,6 +60,7 @@ ("flava", "FlavaFeatureExtractor"), ("glpn", "GLPNFeatureExtractor"), ("groupvit", "CLIPFeatureExtractor"), + ("hiera", "HieraFeatureExtractor"), ("hubert", "Wav2Vec2FeatureExtractor"), ("imagegpt", "ImageGPTFeatureExtractor"), ("layoutlmv2", "LayoutLMv2FeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 3debf97fea20..971f368c900f 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -69,6 +69,7 @@ ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), ("groupvit", "CLIPImageProcessor"), + ("hiera", "HieraImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 150dea04f374..2a08f4ea2c81 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -116,6 +116,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", 
"GroupViTModel"), + ("hiera", "HieraModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), @@ -548,6 +549,7 @@ [ ("deit", "DeiTForMaskedImageModeling"), ("focalnet", "FocalNetForMaskedImageModeling"), + ("hiera", "HieraForMaskedImageModeling"), ("swin", "SwinForMaskedImageModeling"), ("swinv2", "Swinv2ForMaskedImageModeling"), ("vit", "ViTForMaskedImageModeling"), @@ -587,6 +589,7 @@ ), ("efficientnet", "EfficientNetForImageClassification"), ("focalnet", "FocalNetForImageClassification"), + ("hiera", "HieraForImageClassification"), ("imagegpt", "ImageGPTForImageClassification"), ( "levit", diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py new file mode 100644 index 000000000000..fb05b30adcb1 --- /dev/null +++ b/src/transformers/models/hiera/__init__.py @@ -0,0 +1,59 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = {"configuration_hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig", "HieraOnnxConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_hiera"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraForImageClassification", + "HieraForMaskedImageModeling", + "HieraModel", + "HieraPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig, HieraOnnxConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraForImageClassification, + HieraForMaskedImageModeling, + HieraModel, + HieraPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py new file mode 100644 index 000000000000..25e309ef1fe8 --- /dev/null +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -0,0 +1,175 @@ +# coding=utf-8 +# Copyright 2024 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Hiera model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "EduardoPacheco/hiera-tiny-224": "https://huggingface.co/EduardoPacheco/hiera-tiny-224/resolve/main/config.json", +} + + +class HieraConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate an Hiera + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Hiera + [google/hiera-base-patch16-224](https://huggingface.co/google/hiera-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + embed_dim (`int`, *optional*, defaults to 96): + Dimensionality of patch embedding. + input_size (`tuple(int)`, *optional*, defaults to `(224, 224)`): + The size (resolution) of input in the format (height, width) for images + and (frames, height, width) for videos. + patch_kernel (`tuple(int)`, *optional*, defaults to `(7, 7)`): + The size (resolution) of each patch. + patch_stride (`tuple(int)`, *optional*, defaults to `(4, 4)`): + The stride of the patch. + patch_padding (`tuple(int)`, *optional*, defaults to `(3, 3)`): + The padding of the patch. + mlp_ratio (`float`, *optional*, defaults to 4.0): + The ratio of mlp hidden dim to embedding dim. + depths (`tuple(int)`, *optional*, defaults to `[2, 3, 16, 3]`): + Depth of each layer in the Transformer encoder. + initial_num_heads (`int`, *optional*, defaults to 1): + Initial number of attention heads in the first layer of the Transformer encoder. + num_head_multiplier (`float`, *optional*, defaults to 2.0): + The multiplier to the number of attention heads in each layer of the Transformer encoder. + embed_dim_multiplier (`float`, *optional*, defaults to 2.0): + The multiplier to the dimensionality of patch embedding in each layer of the Transformer encoder. + num_query_pool (`int`, *optional*, defaults to 3): + The number of query pool stages. + query_stride (`tuple(int)`, *optional*, defaults to `(2, 2)`): + The stride of the query pool. + masked_unit_size (`tuple(int)`, *optional*, defaults to `(8, 8)`): + The size of the masked unit. + masked_unit_attention (`list(bool)`, *optional*, defaults to `[True, True, False, False]`): + Whether to use masked unit attention in each layer of the Transformer encoder. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop path rate. + sep_pos_embed (`bool`, *optional*, defaults to `False`): + Whether to use separate position embedding for temporal and spatial dimensions. Must be `True` for videos. + and `False` for images. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices and + the zero_initializer for initializing all bias vectors. + layer_norm_init (`float`, *optional*, defaults to 1.0): + The initial weight value for layer normalization layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import HieraConfig, HieraModel + + >>> # Initializing a Hiera hiera-base-patch16-224 style configuration + >>> configuration = HieraConfig() + + >>> # Initializing a model (with random weights) from the hiera-base-patch16-224 style configuration + >>> model = HieraModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "hiera" + + def __init__( + self, + embed_dim=96, + input_size=(224, 224), + patch_kernel=(7, 7), + patch_stride=(4, 4), + patch_padding=(3, 3), + mlp_ratio=4.0, + depths=[2, 3, 16, 3], + initial_num_heads=1, + num_head_multiplier=2.0, + embed_dim_multiplier=2.0, + num_query_pool=3, + query_stride=(2, 2), + masked_unit_size=(8, 8), + masked_unit_attention=[True, True, False, False], + drop_path_rate=0.0, + sep_pos_embed=False, + num_channels=3, + hidden_act="gelu", + initializer_range=0.02, + layer_norm_init=1.0, + layer_norm_eps=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + + self.embed_dim = embed_dim + self.input_size = input_size + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.depths = depths + self.initial_num_heads = initial_num_heads + self.num_head_multiplier = num_head_multiplier + self.embed_dim_multiplier = embed_dim_multiplier + self.num_query_pool = num_query_pool + self.query_stride = query_stride + self.masked_unit_size = masked_unit_size + self.masked_unit_attention = masked_unit_attention + self.drop_path_rate = drop_path_rate + self.sep_pos_embed = sep_pos_embed + self.num_channels = num_channels + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_init = layer_norm_init + self.layer_norm_eps = layer_norm_eps + + self.hidden_size = embed_dim + + +class HieraOnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py new file mode 100644 index 000000000000..e36725baf84f --- /dev/null +++ b/src/transformers/models/hiera/convert_hiera_to_hf.py @@ -0,0 +1,332 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert Hiera checkpoints trained with the DINO method.""" + + +import argparse +from dataclasses import dataclass + +import requests +import torch +from PIL import Image +from torchvision import transforms + +from transformers import BeitImageProcessor, HieraConfig, HieraForImageClassification, HieraModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + # fmt: off + num_stages = len(config.depths) + # embedding dimensions for input and stages + dims = [config.embed_dim] + [int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(num_stages)] + + global_layer_idx = 0 + for stage_idx in range(num_stages): + dim_in = dims[stage_idx] + dim_out = dims[stage_idx + 1] + for layer_idx in range(config.depths[stage_idx]): + rename_keys.append((f"blocks.{global_layer_idx}.norm1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.norm1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.norm2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.norm2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.bias")) + + # projection layer only for the first layer of each stage boundary (except the first stage) + if dim_out != dim_in and layer_idx == 0: + rename_keys.append((f"blocks.{global_layer_idx}.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.bias")) + + global_layer_idx += 1 + + # projection layer + position embeddings + rename_keys.extend( + [ + ("patch_embed.proj.weight", "hiera.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "hiera.embeddings.patch_embeddings.projection.bias") + ] + ) + + if config.sep_pos_embed: + rename_keys.extend( + [ + ("pos_embed_spatial", "hiera.embeddings.position_embeddings_spatial"), + ("pos_embed_temporal", "hiera.embeddings.position_embeddings_temporal") + ] + ) + else: + 
rename_keys.append(("pos_embed", "hiera.embeddings.position_embeddings")) + + if base_model: + # layernorm + pooler + rename_keys.extend([("norm.weight", "layernorm.weight"), ("norm.bias", "layernorm.bias")]) + + # if just the base model, we should remove "hiera" from all keys that start with "hiera" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("hiera") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "hiera.layernorm.weight"), + ("norm.bias", "hiera.layernorm.bias"), + ("head.projection.weight", "classifier.weight"), + ("head.projection.bias", "classifier.bias"), + ] + ) + # fmt: on + return rename_keys + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.projection.weight", "head.projection.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@dataclass +class HieraInfo: + base_checkpoint_url: str + checkpoint_url: str + config: HieraConfig + + +def get_hiera_config(model_name: str, base_model: bool) -> HieraInfo: + kwargs = {} if base_model else {"num_labels": 400 if model_name.endswith("16x224") else 1000} + + if model_name == "hiera-tiny-224": + config = HieraConfig(depths=[1, 2, 7, 2], **kwargs) + elif model_name == "hiera-small-224": + HieraConfig(depths=[1, 2, 11, 2], **kwargs) + elif model_name == "hiera-base-224": + config = HieraConfig(**kwargs) + elif model_name == "hiera-base-plus-224": + config = HieraConfig(embed_dim=112, initial_num_heads=2, **kwargs) + elif model_name == "hiera-large-224": + config = HieraConfig(embed_dim=144, initial_num_heads=2, depths=[2, 6, 36, 4], **kwargs) + elif model_name == "hiera-huge-224": + config = HieraConfig(embed_dim=256, initial_num_heads=4, depths=[2, 6, 36, 4], **kwargs) + elif model_name == "hiera-base-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + **kwargs, + ) + elif model_name == "hiera-base-plus-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + embed_dim=112, + initial_num_heads=2, + **kwargs, + ) + elif model_name == "hiera-large-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + embed_dim=144, + initial_num_heads=2, + depths=[2, 6, 36, 4], + **kwargs, + ) + elif model_name == "hiera-huge-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + embed_dim=256, + initial_num_heads=4, + depths=[2, 6, 36, 4], + **kwargs, + ) + else: + raise ValueError(f"Unrecognized model name: {model_name}") + + return config + + +@torch.no_grad() +def convert_hiera_checkpoint(args): + model_name = args.model_name + base_model = args.base_model + pytorch_dump_folder_path = 
args.pytorch_dump_folder_path + verify_logits = args.verify_logits + push_to_hub = args.push_to_hub + IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] + IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] + + config = get_hiera_config(model_name, base_model) + + # Load original hiera model + original_model = torch.hub.load( + "facebookresearch/hiera", + model=model_name.replace("-", "_"), + pretrained=True, + checkpoint="mae_in1k_ft_in1k" if not base_model else "mae_in1k", + ) + + original_model.eval() + original_state_dict = original_model.state_dict() + if base_model: + remove_classification_head_(original_state_dict) + + # # Rename keys + new_state_dict = original_state_dict.copy() + rename_keys = create_rename_keys(config, base_model) + + for src, dest in rename_keys: + rename_key(new_state_dict, src, dest) + + # Load HF hiera model + model = HieraModel(config) if base_model else HieraForImageClassification(config) + model.eval() + + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) + + input_image = prepare_img() + + if model_name.endswith("16x224"): + original_image_preprocessor = None + else: + original_image_preprocessor = transforms.Compose( + [ + transforms.Resize( + int((256 / 224) * 224), interpolation=transforms.functional.InterpolationMode.BICUBIC + ), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ] + ) + + image_processor = BeitImageProcessor( + image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"height": 224, "width": 224} + ) + inputs = image_processor(images=input_image, return_tensors="pt") + + expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) + + assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) + + outputs = model(**inputs) + # original implementation returns logits.softmax(dim=-1) + expected_prob = original_model(input_image) + + if verify_logits and not base_model: + output_prob = outputs.logits.softmax(dim=-1) + assert torch.allclose(output_prob, expected_prob, atol=1e-4) + print("Looks good!") + else: + print("Converted without verifying logits") + + if pytorch_dump_folder_path is not None: + print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print(f"Pushing model and processor for {model_name} to hub") + hub_name = model_name + if not base_model: + hub_name = f"{model_name}-k400" if model_name.endswith("16x224") else f"{model_name}-in1k" + model.push_to_hub(f"EduardoPacheco/{hub_name}") + image_processor.push_to_hub(f"EduardoPacheco/{hub_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="hiera-tiny-224", + type=str, + choices=[ + "hiera-tiny-224", + "hiera-small-224", + "hiera-base-224", + "hiera-base-plus-224", + "hiera-large-224", + "hiera-huge-224", + "hiera-base-16x224", + "hiera-base-plus-16x224", + "hiera-large-16x224", + "hiera-huge-16x224", + ], + help="Name of the Hiera model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
+ ) + parser.add_argument( + "--verify_logits", + action="store_true", + help="Whether or not to verify the logits against the original implementation.", + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + parser.add_argument( + "--base_model", + action="store_true", + help="Whether to only convert the base model (no projection head weights).", + ) + + args = parser.parse_args() + convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py new file mode 100644 index 000000000000..94740cd64a0b --- /dev/null +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -0,0 +1,1043 @@ +# coding=utf-8 +# Copyright 2024 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Hiera model.""" + + +import math +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ImageClassifierOutput, + MaskedImageModelingOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_hiera import HieraConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "HieraConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "EduardoPacheco/hiera-tiny-224" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/hiera-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "EduardoPacheco/hiera-tiny-224", + # See all Hiera models at https://huggingface.co/models?filter=hiera +] + + +# Taken from https://github.com/facebookresearch/hiera/blob/main/hiera/hiera_utils.py#L73 +def conv_nd(n: int) -> nn.Module: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +# Taken from https://github.com/facebookresearch/hiera/blob/main/hiera/hiera_utils.py#L81 +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +class HieraPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
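+
+    As a rough shape sketch (illustrative only, assuming the 2d defaults
+    `patch_kernel=(7, 7)`, `patch_stride=(4, 4)` and `patch_padding=(3, 3)`):
+
+    ```python
+    >>> height = width = 224
+    >>> kernel, stride, padding = 7, 4, 3
+    >>> tokens_per_side = (height + 2 * padding - kernel) // stride + 1
+    >>> tokens_per_side, tokens_per_side**2  # grid side and seq_length before pooling
+    (56, 3136)
+    ```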
+ """ + + def __init__(self, config): + super().__init__() + + # Support any number of spatial dimensions + self.spatial_dims = len(config.patch_kernel) + if self.spatial_dims not in (2, 3): + raise ValueError( + f"The number of dimensions of the input image should be 2 or 3, but got {self.spatial_dims}." + ) + self.num_channels = config.num_channels + self.image_size = config.input_size + + self.projection = conv_nd(self.spatial_dims)( + self.num_channels, + config.hidden_size, + kernel_size=config.patch_kernel, + stride=config.patch_stride, + padding=config.patch_padding, + ) + + def masked_conv(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if mask is None: + return self.projection(pixel_values) + + target_size = pixel_values.shape[2:] + + if len(mask.shape[2:]) != len(target_size): + raise ValueError( + f"The length of the spatial dimensions of the mask should match the one from input image, but got {len(mask.shape[2:])} and {len(target_size)}." + ) + + if mask.shape[2:] != target_size: + mask = nn.functional.interpolate(mask.float(), size=target_size) + + return self.projection(pixel_values * mask.bool()) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + _, num_channels, _, _ = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + + embeddings = self.masked_conv(pixel_values, bool_masked_pos) + embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) + + return embeddings + + +class HieraEmbeddings(nn.Module): + """ + Construct position and patch embeddings. 
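+
+    When `config.sep_pos_embed` is `True` (used by the video variants), the position
+    embedding is kept as two factorized tables, a spatial one of shape
+    `(1, height_tokens * width_tokens, hidden_size)` and a temporal one of shape
+    `(1, time_tokens, hidden_size)`; `get_position_embedding` broadcasts and sums them
+    back into a single `(1, num_tokens, hidden_size)` tensor.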
+ """ + + def __init__(self, config: HieraConfig, use_mask_token: bool = False) -> None: + super().__init__() + + self.tokens_spatial_shape = [i // s for i, s in zip(config.input_size, config.patch_stride)] + self.num_tokens = math.prod(self.tokens_spatial_shape) + self.sep_pos_embed = config.sep_pos_embed + self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, config.masked_unit_size)] + + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None + + self.patch_embeddings = HieraPatchEmbeddings(config) + + if self.sep_pos_embed: + self.position_embeddings_spatial = nn.Parameter( + torch.zeros( + 1, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + config.hidden_size, + ) + ) + self.position_embeddings_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], config.hidden_size) + ) + else: + self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_tokens, config.hidden_size)) + + def get_position_embedding(self) -> torch.Tensor: + if self.sep_pos_embed: + return self.position_embeddings_spatial.repeat( + 1, self.tokens_spatial_shape[0], 1 + ) + torch.repeat_interleave( + self.position_embeddings_temporal, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + dim=1, + ) + else: + return self.position_embeddings + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: + if len(self.mask_spatial_shape) == 2: + batch_size, num_channels, height, width = pixel_values.shape + else: + batch_size, num_channels, depth, height, width = pixel_values.shape + + if bool_masked_pos is not None: + bool_masked_pos = bool_masked_pos.view(batch_size, 1, *self.mask_spatial_shape) + + embeddings = self.patch_embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + embeddings = embeddings + self.get_position_embedding() + + return embeddings + + +class HieraMaskUnitAttention(nn.Module): + """ + Computes either Mask Unit or Global Attention. Also is able to perform q pooling. + + Note: this assumes the tokens have already been flattened and unrolled into mask units. 
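+
+    When `use_mask_unit_attn` is `True`, the sequence is split into
+    `seq_len // (query_stride * window_size)` independent windows and attention is
+    computed inside each window; otherwise a single window spans the whole sequence
+    (global attention).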
+ """ + + def __init__( + self, + dim: int, + dim_out: int, + num_heads: int, + query_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.num_heads = num_heads + self.query_stride = query_stride + + self.head_dim = dim_out // num_heads + self.scale = (self.head_dim) ** -0.5 + + self.qkv = nn.Linear(dim, 3 * dim_out) + self.proj = nn.Linear(dim_out, dim_out) + + self.window_size = window_size + self.use_mask_unit_attn = use_mask_unit_attn + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + """Input should be of shape [batch, tokens, channels].""" + batch_size, seq_len, _ = hidden_states.shape + + num_windows = 1 + if self.use_mask_unit_attn: + num_windows = seq_len // (self.q_stride * self.window_size) + + qkv = self.qkv(hidden_states) + qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim) + qkv = qkv.permute(3, 0, 4, 2, 1, 5) + + query, key, value = qkv.unbind(0) + + if self.query_stride > 1: + # Refer to Unroll to see how this performs a maxpool-Nd + query = query.view(batch_size, self.num_heads, num_windows, self.query_stride, -1, self.head_dim) + query = query.max(dim=3).values + + attn_weights = (query * self.scale) @ key.transpose(-1, -2) + attn_weights = attn_weights.softmax(dim=-1) + + attn_output = attn_weights @ value + attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.dim_out) + attn_output = self.proj(attn_output) + + return (attn_output, attn_weights) if output_attentions else (attn_output, None) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Hiera +class HieraDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class HieraMlp(nn.Module): + def __init__(self, config, dim: int): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(dim, int(dim * config.mlp_ratio)) + self.fc2 = nn.Linear(int(dim * config.mlp_ratio), dim) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class HieraLayer(nn.Module): + def __init__( + self, + config, + dim: int, + dim_out: int, + num_heads: int, + drop_path: float = 0.0, + query_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.query_stride = query_stride + + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attn = HieraMaskUnitAttention(dim, dim_out, num_heads, query_stride, window_size, use_mask_unit_attn) + + self.layernorm_after = nn.LayerNorm(dim_out, eps=config.layer_norm_eps) + self.mlp = HieraMlp(config, dim_out) + + self.drop_path = HieraDropPath(drop_path) if drop_path > 0 else nn.Identity() + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape + # Attention + Q Pooling + hidden_states_norm = self.layernorm_before(hidden_states) + + if self.dim != self.dim_out: + hidden_states = self.proj(hidden_states_norm) + # Refer to `HieraUnroll` to see how this performs a maxpool-Nd + hidden_states = hidden_states.view(batch_size, self.query_stride, -1, hidden_dim).max(dim=1).values + + (hidden_states_norm, attn_weights) = self.attn(hidden_states_norm, output_attentions=output_attentions) + hidden_states = hidden_states + self.drop_path(hidden_states_norm) + + residual = hidden_states + hidden_states = self.layernorm_after(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.drop_path(hidden_states) + + return (hidden_states, attn_weights) + + +class HieraStage(nn.Module): + def __init__( + self, + config, + depth: int, + dim: int, + dim_out: int, + num_heads: int, + drop_path: List[float], + query_stride: List[int], + window_size: int, + use_mask_unit_attn: bool, + ) -> None: + super().__init__() + self.layers = nn.ModuleList( + [ + HieraLayer( + config=config, + dim=dim if i == 0 else dim_out, + dim_out=dim_out, + num_heads=num_heads, + drop_path=drop_path[i], + query_stride=query_stride[i], + 
window_size=window_size, + use_mask_unit_attn=use_mask_unit_attn, + ) + for i in range(depth) + ] + ) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + for layer_module in self.layers: + (hidden_states, attn_weights) = layer_module(hidden_states, output_attentions=output_attentions) + + return hidden_states, attn_weights + + +class HieraEncoder(nn.Module): + def __init__(self, config: HieraConfig) -> None: + super().__init__() + self.config = config + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + # query strides rule + stage_ends = [sum(config.depths[:i]) - 1 for i in range(1, len(config.depths) + 1)] + query_pool_layer = [stage_end + 1 for stage_end in stage_ends[: config.num_query_pool]] + query_strides = [ + math.prod(config.query_stride) if i in query_pool_layer else 1 for i in range(sum(config.depths)) + ] + + # Transformer blocks + self.stages = nn.ModuleList() + embed_dim = config.embed_dim + + for idx_stage, depth in enumerate(config.depths): + dim_out = int(config.embed_dim * config.embed_dim_multiplier**idx_stage) + + stage = HieraStage( + config=config, + depth=depth, + dim=embed_dim, + dim_out=dim_out, + num_heads=int(config.initial_num_heads * config.num_head_multiplier**idx_stage), + drop_path=dpr[sum(config.depths[:idx_stage]) : sum(config.depths[: idx_stage + 1])], + query_stride=query_strides[sum(config.depths[:idx_stage]) : sum(config.depths[: idx_stage + 1])], + window_size=int(math.prod(config.masked_unit_size) * math.prod(config.query_stride) ** -idx_stage), + use_mask_unit_attn=config.masked_unit_attention[idx_stage], + ) + + embed_dim = dim_out + self.stages.append(stage) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, stage_module in enumerate(self.stages): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + stage_module.__call__, hidden_states, output_attentions + ) + else: + layer_outputs = stage_module(hidden_states, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class HieraUnroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. + E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. 
For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__(self, config) -> None: + super().__init__() + self.size = [i // s for i, s in zip(config.input_size, config.patch_stride)] + self.schedule = [config.query_stride] * len(config.depths[:-1]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + +class HieraReroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. 
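+        # self.schedule maps a block index to (remaining unroll strides, spatial size
+        # at that block); each time a pooling stage end in stage_ends[:q_pool] is
+        # passed, one stride is consumed and the tracked size shrinks accordingly.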
+ self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x + + +class HieraPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["HieraEmbeddings", "HieraLayer"] + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + std = self.config.initializer_range + + if isinstance(module, HieraEmbeddings): + if self.config.sep_pos_embed: + nn.init.trunc_normal_(module.position_embeddings_spatial, std=std) + nn.init.trunc_normal_(module.position_embeddings_temporal, std=std) + else: + nn.init.trunc_normal_(module.position_embeddings, std=std) + + elif isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(module.weight, std=std) + if isinstance(module, nn.Linear) and module.bias is not None: + nn.init.constant_(module.bias, std) + + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, std) + nn.init.constant_(module.weight, self.config.layer_norm_init) + + +HIERA_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + +HIERA_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Hiera Model transformer outputting raw hidden-states without any specific head on top.", + HIERA_START_DOCSTRING, +) +class HieraModel(HieraPreTrainedModel): + def __init__(self, config: HieraConfig, add_pooling_layer: bool = True, use_mask_token: bool = False): + super().__init__(config) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * config.embed_dim_multiplier ** (self.num_layers - 1)) + + self.embeddings = HieraEmbeddings(config, use_mask_token=use_mask_token) + self.unroll = HieraUnroll(config) + self.encoder = HieraEncoder(config) + + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> HieraPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) + expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output) + pooled_output = self.layernorm(pooled_output) + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """Hiera Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). 
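+
+    The decoder is a single 1x1 convolution from `hidden_size` to
+    `encoder_stride**2 * num_channels` channels followed by `nn.PixelShuffle(encoder_stride)`,
+    so every token is upsampled back to an `encoder_stride x encoder_stride` block of pixels.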
+ + + """, + HIERA_START_DOCSTRING, +) +# Copied from transformers.models.vit.modeling_vit.ViTForMaskedImageModeling with VIT->HIERA,ViT->Hiera,vit->hiera,google/vit-base-patch16-224-in21k->EduardoPacheco/hiera-tiny-224 +class HieraForMaskedImageModeling(HieraPreTrainedModel): + def __init__(self, config: HieraConfig) -> None: + super().__init__(config) + + self.hiera = HieraModel(config, add_pooling_layer=False, use_mask_token=True) + + self.decoder = nn.Sequential( + nn.Conv2d( + in_channels=config.hidden_size, + out_channels=config.encoder_stride**2 * config.num_channels, + kernel_size=1, + ), + nn.PixelShuffle(config.encoder_stride), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedImageModelingOutput]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, HieraForMaskedImageModeling + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/hiera-base-patch16-224-in21k") + >>> model = HieraForMaskedImageModeling.from_pretrained("google/hiera-base-patch16-224-in21k") + + >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2 + >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + >>> # create random boolean mask of shape (batch_size, num_patches) + >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool() + + >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) + >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction + >>> list(reconstructed_pixel_values.shape) + [1, 3, 224, 224] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride): + raise ValueError( + "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that " + "the reconstructed image has the same dimensions as the input. " + f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}." 
+ ) + + outputs = self.hiera( + pixel_values, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # Reshape to (batch_size, num_channels, height, width) + sequence_output = sequence_output[:, 1:] + batch_size, sequence_length, num_channels = sequence_output.shape + height = width = math.floor(sequence_length**0.5) + sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) + + # Reconstruct pixel values + reconstructed_pixel_values = self.decoder(sequence_output) + + masked_im_loss = None + if bool_masked_pos is not None: + size = self.config.image_size // self.config.patch_size + bool_masked_pos = bool_masked_pos.reshape(-1, size, size) + mask = ( + bool_masked_pos.repeat_interleave(self.config.patch_size, 1) + .repeat_interleave(self.config.patch_size, 2) + .unsqueeze(1) + .contiguous() + ) + reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none") + masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels + + if not return_dict: + output = (reconstructed_pixel_values,) + outputs[1:] + return ((masked_im_loss,) + output) if masked_im_loss is not None else output + + return MaskedImageModelingOutput( + loss=masked_im_loss, + reconstruction=reconstructed_pixel_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Hiera Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune Hiera on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """, + HIERA_START_DOCSTRING, +) +class HieraForImageClassification(HieraPreTrainedModel): + def __init__(self, config: HieraConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.hiera = HieraModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(self.hiera.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
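+
+        A minimal usage sketch (the checkpoint id below mirrors the naming used by the
+        conversion script and is a placeholder until the converted weights are on the Hub):
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from transformers import AutoImageProcessor, HieraForImageClassification

+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)

+        >>> processor = AutoImageProcessor.from_pretrained("EduardoPacheco/hiera-tiny-224-in1k")
+        >>> model = HieraForImageClassification.from_pretrained("EduardoPacheco/hiera-tiny-224-in1k")

+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits
+        >>> predicted_class_idx = logits.argmax(-1).item()
+        ```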
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.hiera( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1bdab80a13f6..4b20b73414d0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4267,6 +4267,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class HieraForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraForMaskedImageModeling(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/hiera/__init__.py b/tests/models/hiera/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py new file mode 100644 index 000000000000..55fbe76a4d56 --- /dev/null +++ b/tests/models/hiera/test_modeling_hiera.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Hiera model. """ + + +import unittest + +from transformers import HieraConfig +from transformers.testing_utils import ( + require_accelerate, + require_torch, + require_torch_accelerator, + require_torch_fp16, + require_vision, + slow, + torch_device, +) +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import HieraForImageClassification, HieraForMaskedImageModeling, HieraModel + from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTImageProcessor + + +class HieraModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + scope=None, + encoder_stride=2, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.encoder_stride = encoder_stride + + # in Hiera, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return HieraConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + 
+ def create_and_check_model(self, config, pixel_values, labels): + model = HieraModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): + model = HieraForMaskedImageModeling(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) + ) + + # test greyscale images + config.num_channels = 1 + model = HieraForMaskedImageModeling(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = HieraForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = HieraForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class HieraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Hiera does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + ( + HieraModel, + HieraForImageClassification, + HieraForMaskedImageModeling, + ) + if is_torch_available() + else () + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = HieraModelTester(self) + self.config_tester = ConfigTester(self, config_class=HieraConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Hiera does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_image_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = HieraModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class HieraModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ViTImageProcessor.from_pretrained("google/hiera-base-patch16-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = HieraForImageClassification.from_pretrained("google/hiera-base-patch16-224").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # Hiera models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+ model = HieraModel.from_pretrained("facebook/dino-hieras8").to(torch_device) + + image_processor = ViTImageProcessor.from_pretrained("facebook/dino-hieras8", size=480) + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(pixel_values, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 3601, 384)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + @slow + @require_accelerate + @require_torch_accelerator + @require_torch_fp16 + def test_inference_fp16(self): + r""" + A small test to make sure that inference work in half precision without any problem. + """ + model = HieraModel.from_pretrained("facebook/dino-hieras8", torch_dtype=torch.float16, device_map="auto") + image_processor = self.default_image_processor + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass to make sure inference works in fp16 + with torch.no_grad(): + _ = model(pixel_values) From 2e1f8d4005850260887753b5e430bf402f90ac59 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Fri, 29 Mar 2024 22:04:33 +0100 Subject: [PATCH 060/118] Finished conversion script and model forward working --- .../models/hiera/convert_hiera_to_hf.py | 36 +++++++--- .../models/hiera/modeling_hiera.py | 67 ++++++++++++++----- 2 files changed, 74 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py index e36725baf84f..5c48bb55bb79 100644 --- a/src/transformers/models/hiera/convert_hiera_to_hf.py +++ b/src/transformers/models/hiera/convert_hiera_to_hf.py @@ -206,6 +206,7 @@ def convert_hiera_checkpoint(args): base_model = args.base_model pytorch_dump_folder_path = args.pytorch_dump_folder_path verify_logits = args.verify_logits + verify_pixel_values = args.verify_pixel_values push_to_hub = args.push_to_hub IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] @@ -256,23 +257,31 @@ def convert_hiera_checkpoint(args): ] ) - image_processor = BeitImageProcessor( - image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"height": 224, "width": 224} - ) + image_processor = BeitImageProcessor(image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD) inputs = image_processor(images=input_image, return_tensors="pt") expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) + if verify_pixel_values: + input_image = prepare_img() + + inputs = image_processor(images=input_image, return_tensors="pt") + expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) + assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) + print("Pixel values look good!") + else: + print("Converted without verifying pixel values") + inputs = {"pixel_values": torch.rand((1, 3, 224, 224))} + expected_pixel_values = inputs["pixel_values"] outputs = model(**inputs) # original implementation returns logits.softmax(dim=-1) - expected_prob = 
original_model(input_image) + expected_prob = original_model(expected_pixel_values) if verify_logits and not base_model: output_prob = outputs.logits.softmax(dim=-1) assert torch.allclose(output_prob, expected_prob, atol=1e-4) - print("Looks good!") + print("Logits look good!") else: print("Converted without verifying logits") @@ -294,7 +303,7 @@ def convert_hiera_checkpoint(args): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--model_name", + "--model-name", default="hiera-tiny-224", type=str, choices=[ @@ -312,21 +321,26 @@ def convert_hiera_checkpoint(args): help="Name of the Hiera model you'd like to convert.", ) parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + "--pytorch-dump-folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) parser.add_argument( - "--verify_logits", + "--verify-logits", action="store_true", help="Whether or not to verify the logits against the original implementation.", ) parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + "--push-to-hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." ) parser.add_argument( - "--base_model", + "--base-model", action="store_true", help="Whether to only convert the base model (no projection head weights).", ) + parser.add_argument( + "--verify-pixel-values", + action="store_true", + help="Whether to verify the pixel values of the input image.", + ) args = parser.parse_args() convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 94740cd64a0b..bb86a866abb7 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -227,13 +227,18 @@ def __init__( self.window_size = window_size self.use_mask_unit_attn = use_mask_unit_attn - def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> torch.Tensor: """Input should be of shape [batch, tokens, channels].""" batch_size, seq_len, _ = hidden_states.shape num_windows = 1 if self.use_mask_unit_attn: - num_windows = seq_len // (self.q_stride * self.window_size) + num_windows = seq_len // (self.query_stride * self.window_size) qkv = self.qkv(hidden_states) qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim) @@ -249,6 +254,10 @@ def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) attn_weights = (query * self.scale) @ key.transpose(-1, -2) attn_weights = attn_weights.softmax(dim=-1) + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + attn_output = attn_weights @ value attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.dim_out) attn_output = self.proj(attn_output) @@ -335,17 +344,24 @@ def __init__( if dim != dim_out: self.proj = nn.Linear(dim, dim_out) - def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: - batch_size, seq_len, hidden_dim = hidden_states.shape + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> torch.Tensor: + batch_size, seq_len, _ 
= hidden_states.shape # Attention + Q Pooling hidden_states_norm = self.layernorm_before(hidden_states) if self.dim != self.dim_out: hidden_states = self.proj(hidden_states_norm) # Refer to `HieraUnroll` to see how this performs a maxpool-Nd - hidden_states = hidden_states.view(batch_size, self.query_stride, -1, hidden_dim).max(dim=1).values + hidden_states = hidden_states.view(batch_size, self.query_stride, -1, self.dim_out).max(dim=1).values - (hidden_states_norm, attn_weights) = self.attn(hidden_states_norm, output_attentions=output_attentions) + (hidden_states_norm, attn_weights) = self.attn( + hidden_states_norm, head_mask, output_attentions=output_attentions + ) hidden_states = hidden_states + self.drop_path(hidden_states_norm) residual = hidden_states @@ -368,8 +384,14 @@ def __init__( query_stride: List[int], window_size: int, use_mask_unit_attn: bool, + stage_num: int, ) -> None: super().__init__() + # we need to know if the previous stage used masked attention + # mask unit or global attention. + # lag by 1 layer, so that global attention, + # applied post pooling on lower resolution + previous_stage_used_masked_attention = config.masked_unit_attention[stage_num - 1 if stage_num > 0 else 0] self.layers = nn.ModuleList( [ HieraLayer( @@ -380,15 +402,20 @@ def __init__( drop_path=drop_path[i], query_stride=query_stride[i], window_size=window_size, - use_mask_unit_attn=use_mask_unit_attn, + use_mask_unit_attn=use_mask_unit_attn or (previous_stage_used_masked_attention and i == 0), ) for i in range(depth) ] ) - def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: - for layer_module in self.layers: - (hidden_states, attn_weights) = layer_module(hidden_states, output_attentions=output_attentions) + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.FloatTensor], output_attentions: bool = False + ) -> torch.Tensor: + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + (hidden_states, attn_weights) = layer_module( + hidden_states, layer_head_mask, output_attentions=output_attentions + ) return hidden_states, attn_weights @@ -424,6 +451,7 @@ def __init__(self, config: HieraConfig) -> None: query_stride=query_strides[sum(config.depths[:idx_stage]) : sum(config.depths[: idx_stage + 1])], window_size=int(math.prod(config.masked_unit_size) * math.prod(config.query_stride) ** -idx_stage), use_mask_unit_attn=config.masked_unit_attention[idx_stage], + stage_num=idx_stage, ) embed_dim = dim_out @@ -434,6 +462,7 @@ def __init__(self, config: HieraConfig) -> None: def forward( self, hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, @@ -442,15 +471,16 @@ def forward( all_self_attentions = () if output_attentions else None for i, stage_module in enumerate(self.stages): + layer_head_mask = head_mask[i] if head_mask is not None else None if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( - stage_module.__call__, hidden_states, output_attentions + stage_module.__call__, hidden_states, layer_head_mask, output_attentions ) else: - layer_outputs = stage_module(hidden_states, output_attentions) + layer_outputs = stage_module(hidden_states, layer_head_mask, output_attentions) hidden_states = layer_outputs[0] @@ -775,19 +805,19 
@@ def forward( # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + head_mask = self.get_head_mask(head_mask, len(self.config.depths)) # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype if pixel_values.dtype != expected_dtype: pixel_values = pixel_values.to(expected_dtype) - embedding_output = self.embeddings( - pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding - ) + embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + hidden_states = self.unroll(embedding_output) encoder_outputs = self.encoder( - embedding_output, + hidden_states, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -796,7 +826,8 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = None if self.pooler is not None: - pooled_output = self.pooler(sequence_output) + pooled_output = self.pooler(sequence_output.transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) pooled_output = self.layernorm(pooled_output) if not return_dict: From 5924b6cfc2f4be02c1f60f035b1080228665a6c5 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 31 Mar 2024 10:53:20 +0000 Subject: [PATCH 061/118] Resolved all issues --- docs/source/en/model_doc/hiera.md | 18 +- src/transformers/models/hiera/__init__.py | 2 +- .../models/hiera/configuration_hiera.py | 64 ++++-- .../models/hiera/convert_hiera_to_pytorch.py | 45 ++--- .../models/hiera/hiera_image_processor.py | 59 ------ .../models/hiera/modeling_hiera.py | 94 ++++----- tests/models/hiera/test_modeling_hiera.py | 190 ++++++++---------- 7 files changed, 209 insertions(+), 263 deletions(-) delete mode 100644 src/transformers/models/hiera/hiera_image_processor.py diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index 8cd6dc1a977a..f519f00893cc 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -1,4 +1,4 @@ -