From 0cd30ee02ffa7cc99107e294c5cb66db7d2928af Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 00:06:17 +0000 Subject: [PATCH 001/118] initialized Structure --- src/transformers/models/__init__.py | 1 + src/transformers/models/hiera/__init__.py | 82 +++ src/transformers/models/hiera/benchmarking.py | 77 +++ .../models/hiera/configuration_hiera.py | 128 +++++ .../models/hiera/convert_hiera_to_pytorch.py | 27 + src/transformers/models/hiera/hiera.py | 535 ++++++++++++++++++ src/transformers/models/hiera/hiera_mae.py | 398 +++++++++++++ src/transformers/models/hiera/hiera_utils.py | 287 ++++++++++ 8 files changed, 1535 insertions(+) create mode 100644 src/transformers/models/hiera/__init__.py create mode 100644 src/transformers/models/hiera/benchmarking.py create mode 100644 src/transformers/models/hiera/configuration_hiera.py create mode 100644 src/transformers/models/hiera/convert_hiera_to_pytorch.py create mode 100644 src/transformers/models/hiera/hiera.py create mode 100644 src/transformers/models/hiera/hiera_mae.py create mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 5686cf516c49..0ef69742dc18 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -105,6 +105,7 @@ graphormer, groupvit, herbert, + hiera, hubert, ibert, idefics, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py new file mode 100644 index 000000000000..bfd200e9dcb9 --- /dev/null +++ b/src/transformers/models/hiera/__init__.py @@ -0,0 +1,82 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
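+# NOTE: the lazy-import scaffolding below still registers the ViTMAE configuration and
+# modeling symbols as a placeholder for the initial package structure; no Hiera classes
+# are exposed here yet.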
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_tf_available, + is_torch_available, +) + + +_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_vit_mae"] = [ + "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTMAEForPreTraining", + "ViTMAELayer", + "ViTMAEModel", + "ViTMAEPreTrainedModel", + ] + +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_vit_mae"] = [ + "TFViTMAEForPreTraining", + "TFViTMAEModel", + "TFViTMAEPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_vit_mae import ( + VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTMAEForPreTraining, + ViTMAELayer, + ViTMAEModel, + ViTMAEPreTrainedModel, + ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py new file mode 100644 index 000000000000..33166028977a --- /dev/null +++ b/src/transformers/models/hiera/benchmarking.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- + +import time +from typing import List, Tuple, Union + +import torch +from tqdm import tqdm + +# From https://github.com/facebookresearch/ToMe/ +def benchmark( + model: torch.nn.Module, + device: torch.device = 0, + input_size: Tuple[int] = (3, 224, 224), + batch_size: int = 64, + runs: int = 40, + throw_out: float = 0.25, + use_fp16: bool = False, + verbose: bool = False, +) -> float: + """ + Benchmark the given model with random inputs at the given batch size. 
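+    A warm-up fraction of the runs (throw_out) is discarded before timing starts, and CUDA is
+    synchronized before and after the timed region, so the reported images/second reflect
+    steady-state throughput.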
+ + Args: + - model: the module to benchmark + - device: the device to use for benchmarking + - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) + - batch_size: the batch size to use for evaluation + - runs: the number of total runs to do + - throw_out: the percentage of runs to throw out at the start of testing + - use_fp16: whether or not to benchmark with float16 and autocast + - verbose: whether or not to use tqdm to print progress / print throughput at end + + Returns: + - the throughput measured in images / second + """ + if not isinstance(device, torch.device): + device = torch.device(device) + is_cuda = torch.device(device).type == "cuda" + + model = model.eval().to(device) + input = torch.rand(batch_size, *input_size, device=device) + if use_fp16: + input = input.half() + + warm_up = int(runs * throw_out) + total = 0 + start = time.time() + + with torch.autocast(device.type, enabled=use_fp16): + with torch.no_grad(): + for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): + if i == warm_up: + if is_cuda: + torch.cuda.synchronize() + total = 0 + start = time.time() + + model(input) + total += batch_size + + if is_cuda: + torch.cuda.synchronize() + + end = time.time() + elapsed = end - start + + throughput = total / elapsed + + if verbose: + print(f"Throughput: {throughput:.2f} im/s") + + return throughput diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py new file mode 100644 index 000000000000..de5de9e7d9e9 --- /dev/null +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -0,0 +1,128 @@ +""" hiera model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", + # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +} + + +class ViTMAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT + MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the ViT + [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + decoder_num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the decoder. + decoder_num_hidden_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the decoder. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. + mask_ratio (`float`, *optional*, defaults to 0.75): + The ratio of the number of masked tokens in the input sequence. + norm_pix_loss (`bool`, *optional*, defaults to `False`): + Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved + representation quality in the experiments of the authors. + + Example: + + ```python + >>> from transformers import ViTMAEConfig, ViTMAEModel + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = ViTMAEConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = ViTMAEModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vit_mae" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + decoder_num_attention_heads=16, + decoder_hidden_size=512, + decoder_num_hidden_layers=8, + decoder_intermediate_size=2048, + mask_ratio=0.75, + norm_pix_loss=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_hidden_size = decoder_hidden_size + self.decoder_num_hidden_layers = decoder_num_hidden_layers + self.decoder_intermediate_size = decoder_intermediate_size + self.mask_ratio = mask_ratio + self.norm_pix_loss 
= norm_pix_loss diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py new file mode 100644 index 000000000000..506507e4e66e --- /dev/null +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -0,0 +1,27 @@ +import argparse + +import requests +import torch +from PIL import Image + + + +def rename_key(name): + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "patch_embed.projection") + return name + + +def e(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + pass + else: + new_name = rename_key(key) + orig_state_dict[new_name] = val + + return orig_state_dict + + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py new file mode 100644 index 000000000000..35e8c93e160b --- /dev/null +++ b/src/transformers/models/hiera/hiera.py @@ -0,0 +1,535 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from functools import partial +from typing import List, Tuple, Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.models.layers import DropPath, Mlp + +from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll + + + +class MaskUnitAttention(nn.Module): + """ + Computes either Mask Unit or Global Attention. Also is able to perform q pooling. + + Note: this assumes the tokens have already been flattened and unrolled into mask units. + See `Unroll` for more details. + """ + + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + """ + Args: + - dim, dim_out: The input and output feature dimensions. + - heads: The number of attention heads. + - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). + - window_size: The current (flattened) size of a mask unit *after* pooling (if any). + - use_mask_unit_attn: Use Mask Unit or Global Attention. + """ + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.heads = heads + self.q_stride = q_stride + + self.head_dim = dim_out // heads + self.scale = (self.head_dim) ** -0.5 + + self.qkv = nn.Linear(dim, 3 * dim_out) + self.proj = nn.Linear(dim_out, dim_out) + + self.window_size = window_size + self.use_mask_unit_attn = use_mask_unit_attn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Input should be of shape [batch, tokens, channels]. 
""" + B, N, _ = x.shape + num_windows = ( + (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + ) + + qkv = ( + self.qkv(x) + .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .permute(3, 0, 4, 2, 1, 5) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + + if self.q_stride > 1: + # Refer to Unroll to see how this performs a maxpool-Nd + q = ( + q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + .max(dim=3) + .values + ) + + if hasattr(F, "scaled_dot_product_attention"): + # Note: the original paper did *not* use SDPA, it's a free boost! + x = F.scaled_dot_product_attention(q, k, v) + else: + attn = (q * self.scale) @ k.transpose(-1, -2) + attn = attn.softmax(dim=-1) + x = (attn @ v) + + x = x.transpose(1, 3).reshape(B, -1, self.dim_out) + x = self.proj(x) + return x + + +class HieraBlock(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + act_layer: nn.Module = nn.GELU, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + + self.norm1 = norm_layer(dim) + self.attn = MaskUnitAttention( + dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + ) + + self.norm2 = norm_layer(dim_out) + self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + + self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Attention + Q Pooling + x_norm = self.norm1(x) + if self.dim != self.dim_out: + x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) + x = x + self.drop_path(self.attn(x_norm)) + + # MLP + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Head(nn.Module): + def __init__( + self, + dim: int, + num_classes: int, + dropout_rate: float = 0.0, + act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), + ): + super().__init__() + self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() + self.projection = nn.Linear(dim, num_classes) + # act_fun for eval and testing only + self.act_func = act_func + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.dropout(x) + x = self.projection(x) + if not self.training: + x = self.act_func(x) + return x + + +class PatchEmbed(nn.Module): + """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" + + def __init__( + self, + dim_in: int, + dim_out: int, + kernel: Tuple[int, ...], + stride: Tuple[int, ...], + padding: Tuple[int, ...], + ): + super().__init__() + + # Support any number of spatial dimensions + self.spatial_dims = len(kernel) + self.proj = conv_nd(self.spatial_dims)( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward( + self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + x = do_masked_conv(x, self.proj, mask) + x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) + return x + + +class Hiera(nn.Module): + def __init__( + self, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embed_dim: int = 96, # initial embed dim + num_heads: int = 1, # initial number of heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] 
= (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_pos_embed: bool = False, + ): + super().__init__() + + depth = sum(stages) + self.patch_stride = patch_stride + self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + num_tokens = math.prod(self.tokens_spatial_shape) + flat_mu_size = math.prod(mask_unit_size) + flat_q_stride = math.prod(q_stride) + + assert q_pool < len(stages) + self.q_pool, self.q_stride = q_pool, q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + self.mask_spatial_shape = [ + i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) + ] + self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + + self.patch_embed = PatchEmbed( + in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + ) + + self.sep_pos_embed = sep_pos_embed + if sep_pos_embed: + self.pos_embed_spatial = nn.Parameter( + torch.zeros( + 1, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + embed_dim, + ) + ) + self.pos_embed_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + ) + else: + self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + + # Setup roll and reroll modules + self.unroll = Unroll( + input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + ) + self.reroll = Reroll( + input_size, + patch_stride, + [q_stride] * len(self.stage_ends[:-1]), + self.stage_ends, + q_pool, + ) + # q_pool locations + q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + + # Transformer blocks + cur_stage = 0 + self.blocks = nn.ModuleList() + + for i in range(depth): + dim_out = embed_dim + # Mask unit or global attention. 
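+            # (Stages flagged True in mask_unit_attn restrict attention to windows of one mask
+            # unit; the remaining stages use global attention.)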
+ # Lag by 1 block, so that global attention, + # applied post pooling on lower resolution + use_mask_unit_attn = mask_unit_attn[cur_stage] + + if i - 1 in self.stage_ends: + dim_out = int(embed_dim * dim_mul) + num_heads = int(num_heads * head_mul) + cur_stage += 1 + if i in q_pool_blocks: + flat_mu_size //= flat_q_stride + + block = HieraBlock( + dim=embed_dim, + dim_out=dim_out, + heads=num_heads, + mlp_ratio=mlp_ratio, + drop_path=dpr[i], + norm_layer=norm_layer, + q_stride=(flat_q_stride if i in q_pool_blocks else 1), + window_size=flat_mu_size, + use_mask_unit_attn=use_mask_unit_attn, + ) + + embed_dim = dim_out + self.blocks.append(block) + + self.norm = norm_layer(embed_dim) + self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + + # Initialize everything + if sep_pos_embed: + nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) + nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + else: + nn.init.trunc_normal_(self.pos_embed, std=0.02) + self.apply(partial(self._init_weights)) + self.head.projection.weight.data.mul_(head_init_scale) + self.head.projection.bias.data.mul_(head_init_scale) + + def _init_weights(self, m, init_bias=0.02): + if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, init_bias) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, init_bias) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + if self.sep_pos_embed: + return ["pos_embed_spatial", "pos_embed_temporal"] + else: + return ["pos_embed"] + + def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: + """ + Generates a random mask, mask_ratio fraction are dropped. + 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. + """ + B = x.shape[0] + # Tokens selected for masking at mask unit level + num_windows = math.prod(self.mask_spatial_shape) # num_mask_units + len_keep = int(num_windows * (1 - mask_ratio)) + noise = torch.rand(B, num_windows, device=x.device) + + # Sort noise for each sample + ids_shuffle = torch.argsort( + noise, dim=1 + ) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # Generate the binary mask: 1 is *keep*, 0 is *remove* + # Note this is opposite to original MAE + mask = torch.zeros([B, num_windows], device=x.device) + mask[:, :len_keep] = 1 + # Unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return mask.bool() + + def get_pos_embed(self) -> torch.Tensor: + if self.sep_pos_embed: + return self.pos_embed_spatial.repeat( + 1, self.tokens_spatial_shape[0], 1 + ) + torch.repeat_interleave( + self.pos_embed_temporal, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + dim=1, + ) + else: + return self.pos_embed + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor = None, + return_intermediates: bool = False, + ) -> torch.Tensor: + """ + mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
+ """ + # Slowfast training passes in a list + if isinstance(x, list): + x = x[0] + intermediates = [] + + x = self.patch_embed( + x, + mask=mask.view( + x.shape[0], 1, *self.mask_spatial_shape + ) # B, C, *mask_spatial_shape + if mask is not None + else None, + ) + x = x + self.get_pos_embed() + x = self.unroll(x) + + # Discard masked tokens + if mask is not None: + x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( + x.shape[0], -1, x.shape[-1] + ) + + for i, blk in enumerate(self.blocks): + x = blk(x) + + if return_intermediates and i in self.stage_ends: + intermediates.append(self.reroll(x, i, mask=mask)) + + if mask is None: + x = x.mean(dim=1) + x = self.norm(x) + x = self.head(x) + + # x may not always be in spatial order here. + # e.g. if q_pool = 2, mask_unit_size = (8, 8), and + # q_stride = (2, 2), not all unrolls were consumed, + # intermediates[-1] is x in spatial order + if return_intermediates: + return x, intermediates + + return x + + +# Image models + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_tiny_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_small_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_plus_224(**kwdargs): + return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_large_224(**kwdargs): + return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_huge_224(**kwdargs): + return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + + +# Video models + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_16x224(num_classes: int = 400, **kwdargs): + return Hiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_plus_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_large_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_huge_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py new file mode 100644 index 000000000000..64c69cc89d71 --- /dev/null +++ b/src/transformers/models/hiera/hiera_mae.py @@ -0,0 +1,398 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# mae: https://github.com/facebookresearch/mae +# slowfast: https://github.com/facebookresearch/SlowFast +# -------------------------------------------------------- + + +from functools import partial +from typing import Tuple, Optional + +import math +import torch +import torch.nn as nn + +from .hiera import Hiera, HieraBlock +from .hiera_utils import pretrained_model, undo_windowing, conv_nd + + +def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: + if isinstance(head, nn.Identity): + return x + + B, num_mask_units = x.shape[0:2] + # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) + x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + + # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + permute = [0] + list(range(2, len(x.shape))) + [1] + x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + return x + + +class MaskedAutoencoderHiera(Hiera): + """Masked Autoencoder with Hiera backbone""" + + def __init__( + self, + in_chans: int = 3, + patch_stride: Tuple[int, ...] 
= (4, 4), + mlp_ratio: float = 4.0, + decoder_embed_dim: int = 512, + decoder_depth: int = 8, + decoder_num_heads: int = 16, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + **kwdargs, + ): + super().__init__( + in_chans=in_chans, + patch_stride=patch_stride, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + **kwdargs, + ) + + del self.norm, self.head + encoder_dim_out = self.blocks[-1].dim_out + self.encoder_norm = norm_layer(encoder_dim_out) + self.mask_unit_spatial_shape_final = [ + i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) + ] + self.tokens_spatial_shape_final = [ + i // s ** (self.q_pool) + for i, s in zip(self.tokens_spatial_shape, self.q_stride) + ] + # -------------------------------------------------------------------------- + # Multi-scale fusion heads + curr_mu_size = self.mask_unit_size + self.multi_scale_fusion_heads = nn.ModuleList() + + for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool + kernel = [ + i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) + ] + curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] + self.multi_scale_fusion_heads.append( + conv_nd(len(self.q_stride))( + self.blocks[i].dim_out, + encoder_dim_out, + kernel_size=kernel, + stride=kernel, + ) + ) + self.multi_scale_fusion_heads.append(nn.Identity()) # final stage, no transform + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = nn.Linear(encoder_dim_out, decoder_embed_dim) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) + + self.decoder_pos_embed = nn.Parameter( + torch.zeros( + 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim + ) + ) + + self.decoder_blocks = nn.ModuleList( + [ + HieraBlock( + dim=decoder_embed_dim, + dim_out=decoder_embed_dim, + heads=decoder_num_heads, + norm_layer=norm_layer, + mlp_ratio=mlp_ratio, + ) + for i in range(decoder_depth) + ] + ) + self.decoder_norm = norm_layer(decoder_embed_dim) + + self.pred_stride = patch_stride[-1] * ( + self.q_stride[-1] ** self.q_pool + ) # patch stride of prediction + + self.decoder_pred = nn.Linear( + decoder_embed_dim, + (self.pred_stride ** min(2, len(self.q_stride))) * in_chans, + ) # predictor + # -------------------------------------------------------------------------- + + self.initialize_weights() + + def initialize_weights(self): + nn.init.trunc_normal_(self.mask_token, std=0.02) + nn.init.trunc_normal_(self.decoder_pos_embed, std=0.02) + self.apply(self._mae_init_weights) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + def _mae_init_weights(self, m: nn.Module): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_pixel_label_2d( + self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + input_img = input_img.permute(0, 2, 3, 1) + + size = self.pred_stride + label = input_img.unfold(1, size, size).unfold(2, size, size) + label = label.flatten(1, 2).flatten(2) + label = label[mask] + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + 
return label + + def get_pixel_label_3d( + self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + + # We use time strided loss, only take the first frame from each token + input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + + size = self.pred_stride + label = input_vid.unfold(3, size, size).unfold(4, size, size) + label = label.permute(0, 2, 3, 4, 5, 6, 1) # Different from 2d, mistake during training lol + label = label.flatten(1, 3).flatten(2) + label = label[mask] + + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + return label + + + def forward_encoder( + self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if mask is None: + mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + + # Get multi-scale representations from encoder + _, intermediates = super().forward(x, mask, return_intermediates=True) + # Resolution unchanged after q_pool stages, so skip those features + intermediates = intermediates[: self.q_pool] + intermediates[-1:] + + # Multi-scale fusion + x = 0.0 + for head, interm_x in zip(self.multi_scale_fusion_heads, intermediates): + x += apply_fusion_head(head, interm_x) + + x = self.encoder_norm(x) + + return x, mask + + def forward_decoder( + self, x: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Embed tokens + x = self.decoder_embed(x) + + # Combine visible and mask tokens + + # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [B, #MUs_all] + x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) + mask_tokens = self.mask_token.view( + (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) + ) + mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) + mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() + x_dec[mask] = x.flatten() + x_dec = ~mask * mask_tokens + mask * x_dec + + # Get back spatial order + x = undo_windowing( + x_dec, + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + mask = undo_windowing( + mask[..., 0:1], + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + + # Flatten + x = x.reshape(x.shape[0], -1, x.shape[-1]) + mask = mask.view(x.shape[0], -1) + + # Add pos embed + x = x + self.decoder_pos_embed + + # Apply decoder blocks + for blk in self.decoder_blocks: + x = blk(x) + x = self.decoder_norm(x) + + # Predictor projection + x = self.decoder_pred(x) + + return x, mask + + def forward_loss( + self, x: torch.Tensor, pred: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Note: in mask, 0 is *visible*, 1 is *masked* + + x: e.g. 
[B, 3, H, W] + pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + """ + if len(self.q_stride) == 2: + label = self.get_pixel_label_2d(x, mask) + elif len(self.q_stride) == 3: + label = self.get_pixel_label_3d(x, mask) + else: + raise NotImplementedError + + pred = pred[mask] + loss = (pred - label) ** 2 + + return loss.mean(), pred, label + + def forward( + self, + x: torch.Tensor, + mask_ratio: float = 0.6, + mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) + pred, pred_mask = self.forward_decoder( + latent, mask + ) # pred_mask is mask at resolution of *prediction* + + # Toggle mask, to generate labels for *masked* tokens + return *self.forward_loss(x, pred, ~pred_mask), mask + + + + +# Image Models + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k") +def mae_hiera_tiny_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k") +def mae_hiera_small_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k") +def mae_hiera_base_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k") +def mae_hiera_base_plus_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k") +def mae_hiera_large_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k") +def mae_hiera_huge_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + + +# Video Models + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400") +def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): + return MaskedAutoencoderHiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + q_pool=2, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_base_plus_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_large_16x224(**kwdargs): + return 
mae_hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400") +def mae_hiera_huge_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py new file mode 100644 index 000000000000..992c03e08079 --- /dev/null +++ b/src/transformers/models/hiera/hiera_utils.py @@ -0,0 +1,287 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from typing import List, Tuple, Optional, Type, Callable, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from .convert_hiera_to_pytorch import e + +def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: + """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + + def inner(model_func: Callable) -> Callable: + def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + # state_dict["model_state"] = e(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if "num_classes" not in kwdargs: + kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = model_func(**kwdargs) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + return model + + return model_def + + return inner + + + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. 
+ E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. + self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. 
+ """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x \ No newline at end of file From ec4111f144ae048c842d96f729dbcaacc1faf053 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 02:17:36 +0000 Subject: [PATCH 002/118] Updated variable names --- .../models/hiera/convert_hiera_to_pytorch.py | 30 +-- src/transformers/models/hiera/hiera.py | 200 +++++++++--------- src/transformers/models/hiera/hiera_mae.py | 42 ++-- src/transformers/models/hiera/hiera_utils.py | 6 +- 4 files changed, 141 insertions(+), 137 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 506507e4e66e..f1d0c4135796 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -7,21 +7,25 @@ def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "patch_embed.projection") + # if "patch_embed.proj" in name: + # name = name.replace("patch_embed.proj", "patch_embed.projection") + # # elif "block.proj" in name: + # # name = name.replace("block.proj", "block.projection") + # elif "attn.proj" in name: + # name = name.replace("attn.proj", "attn.projection") + if ".proj." in name: + name = name.replace(".proj.", ".projection.") + if "attn" in name: + name = name.replace("attn", "attention") + if "pos_embed" in name: + name = name.replace("pos_embed", "position_embeddings") + if "patch_embed" in name: + name = name.replace("patch_embed", "patch_embedding") return name -def e(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - pass - else: - new_name = rename_key(key) - orig_state_dict[new_name] = val - - return orig_state_dict +def convert_state_dict(orig_state_dict, config): + updated_model_state = {rename_key(k): v for k, v in orig_state_dict.items()} + return updated_model_state diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 35e8c93e160b..fcb04f68934e 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -42,47 +42,47 @@ class MaskUnitAttention(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): """ Args: - - dim, dim_out: The input and output feature dimensions. - - heads: The number of attention heads. 
+ - input_dim, output_dim: The input and output feature dimensions. + - number_of_heads: The number of attention number_of_heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - - use_mask_unit_attn: Use Mask Unit or Global Attention. + - use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() - self.dim = dim - self.dim_out = dim_out - self.heads = heads + self.input_dim = input_dim + self.output_dim = output_dim + self.number_of_heads = number_of_heads self.q_stride = q_stride - self.head_dim = dim_out // heads + self.head_dim = output_dim // number_of_heads self.scale = (self.head_dim) ** -0.5 - self.qkv = nn.Linear(dim, 3 * dim_out) - self.proj = nn.Linear(dim_out, dim_out) + self.qkv = nn.Linear(input_dim, 3 * output_dim) + self.projection = nn.Linear(output_dim, output_dim) self.window_size = window_size - self.use_mask_unit_attn = use_mask_unit_attn + self.use_mask_unit_attention = use_mask_unit_attention def forward(self, x: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. """ - B, N, _ = x.shape + batch_size , num_channels , _ = x.shape num_windows = ( - (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( self.qkv(x) - .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -90,7 +90,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -99,52 +99,52 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Note: the original paper did *not* use SDPA, it's a free boost! 
x = F.scaled_dot_product_attention(q, k, v) else: - attn = (q * self.scale) @ k.transpose(-1, -2) - attn = attn.softmax(dim=-1) - x = (attn @ v) + attention = (q * self.scale) @ k.transpose(-1, -2) + attention = attention.softmax(dim=-1) + x = (attention @ v) - x = x.transpose(1, 3).reshape(B, -1, self.dim_out) - x = self.proj(x) + x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + x = self.projection(x) return x class HieraBlock(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, mlp_ratio: float = 4.0, drop_path: float = 0.0, norm_layer: nn.Module = nn.LayerNorm, act_layer: nn.Module = nn.GELU, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): super().__init__() - self.dim = dim - self.dim_out = dim_out + self.input_dim = input_dim + self.output_dim = output_dim - self.norm1 = norm_layer(dim) - self.attn = MaskUnitAttention( - dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + self.norm1 = norm_layer(input_dim) + self.attention = MaskUnitAttention( + input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention ) - self.norm2 = norm_layer(dim_out) - self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + self.norm2 = norm_layer(output_dim) + self.mlp = Mlp(output_dim, int(output_dim * mlp_ratio), act_layer=act_layer) self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() - if dim != dim_out: - self.proj = nn.Linear(dim, dim_out) + if input_dim != output_dim: + self.projection = nn.Linear(input_dim, output_dim) def forward(self, x: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - x_norm = self.norm1(x) - if self.dim != self.dim_out: - x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) - x = x + self.drop_path(self.attn(x_norm)) + normalized_input = self.norm1(x) + if self.input_dim != self.output_dim: + x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) + x = x + self.drop_path(self.attention(normalized_input)) # MLP x = x + self.drop_path(self.mlp(self.norm2(x))) @@ -154,14 +154,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Head(nn.Module): def __init__( self, - dim: int, + input_dim: int, num_classes: int, dropout_rate: float = 0.0, act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), ): super().__init__() self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - self.projection = nn.Linear(dim, num_classes) + self.projection = nn.Linear(input_dim, num_classes) # act_fun for eval and testing only self.act_func = act_func @@ -173,13 +173,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchEmbed(nn.Module): - """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" +class PatchEmbedding(nn.Module): + """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" def __init__( self, dim_in: int, - dim_out: int, + output_dim: int, kernel: Tuple[int, ...], stride: Tuple[int, ...], padding: Tuple[int, ...], @@ -188,9 +188,9 @@ def __init__( # Support any number of spatial dimensions self.spatial_dims = len(kernel) - self.proj = conv_nd(self.spatial_dims)( + self.projection = conv_nd(self.spatial_dims)( dim_in, - dim_out, + output_dim, kernel_size=kernel, stride=stride, padding=padding, @@ -199,7 +199,7 @@ def __init__( def forward( self, x: torch.Tensor, mask: 
Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.proj, mask) + x = do_masked_conv(x, self.projection, mask) x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) return x @@ -209,8 +209,8 @@ def __init__( self, input_size: Tuple[int, ...] = (224, 224), in_chans: int = 3, - embed_dim: int = 96, # initial embed dim - num_heads: int = 1, # initial number of heads + embedding_dimention: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads num_classes: int = 1000, stages: Tuple[int, ...] = (2, 3, 16, 3), q_pool: int = 3, # number of q_pool stages @@ -228,7 +228,7 @@ def __init__( norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), head_dropout: float = 0.0, head_init_scale: float = 0.001, - sep_pos_embed: bool = False, + sep_position_embeddings: bool = False, ): super().__init__() @@ -247,24 +247,24 @@ def __init__( ] self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] - self.patch_embed = PatchEmbed( - in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + self.patch_embedding = PatchEmbedding( + in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding ) - self.sep_pos_embed = sep_pos_embed - if sep_pos_embed: - self.pos_embed_spatial = nn.Parameter( + self.sep_position_embeddings = sep_position_embeddings + if sep_position_embeddings: + self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embed_dim, + embedding_dimention, ) ) - self.pos_embed_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + self.position_embeddings_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) ) else: - self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) # Setup roll and reroll modules self.unroll = Unroll( @@ -287,43 +287,43 @@ def __init__( self.blocks = nn.ModuleList() for i in range(depth): - dim_out = embed_dim + output_dim = embedding_dimention # Mask unit or global attention. 
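The stage bookkeeping above is easiest to check with concrete numbers. A standalone sketch using the hiera_base_224 defaults from this file (stages=(2, 3, 16, 3), q_pool=3); the two comprehensions are copied from the constructor:

stages = (2, 3, 16, 3)  # block counts per stage for hiera_base_224
q_pool = 3              # number of q-pooling stages

stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
q_pool_blocks = [x + 1 for x in stage_ends[:q_pool]]

print(stage_ends)     # [1, 4, 20, 23] -> index of the last block in each stage
print(q_pool_blocks)  # [2, 5, 21]     -> blocks that pool q and widen the embedding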
# Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attn = mask_unit_attn[cur_stage] + use_mask_unit_attention = mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - dim_out = int(embed_dim * dim_mul) - num_heads = int(num_heads * head_mul) + output_dim = int(embedding_dimention * dim_mul) + number_of_heads = int(number_of_heads * head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride block = HieraBlock( - dim=embed_dim, - dim_out=dim_out, - heads=num_heads, + input_dim=embedding_dimention, + output_dim=output_dim, + number_of_heads=number_of_heads, mlp_ratio=mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), window_size=flat_mu_size, - use_mask_unit_attn=use_mask_unit_attn, + use_mask_unit_attention=use_mask_unit_attention, ) - embed_dim = dim_out + embedding_dimention = output_dim self.blocks.append(block) - self.norm = norm_layer(embed_dim) - self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(embedding_dimention) + self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) # Initialize everything - if sep_pos_embed: - nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) - nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + if sep_position_embeddings: + nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) + nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: - nn.init.trunc_normal_(self.pos_embed, std=0.02) + nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) self.head.projection.weight.data.mul_(head_init_scale) self.head.projection.bias.data.mul_(head_init_scale) @@ -339,21 +339,21 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_pos_embed: - return ["pos_embed_spatial", "pos_embed_temporal"] + if self.sep_position_embeddings: + return ["position_embeddings_spatial", "position_embeddings_temporal"] else: - return ["pos_embed"] + return ["position_embeddings"] def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: """ Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. 
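The method body (continued in the diff below) is the standard MAE-style shuffle, only applied at mask-unit granularity and with inverted semantics (1 keeps a unit, 0 drops it). A self-contained sketch of the same logic; num_windows is the number of mask units, e.g. 49 for the default 224x224 setup:

import torch

batch_size, num_windows, mask_ratio = 2, 49, 0.6
len_keep = int(num_windows * (1 - mask_ratio))

noise = torch.rand(batch_size, num_windows)
ids_shuffle = torch.argsort(noise, dim=1)
ids_restore = torch.argsort(ids_shuffle, dim=1)

mask = torch.zeros(batch_size, num_windows)
mask[:, :len_keep] = 1                                  # 1 = *keep*, 0 = *remove*
mask = torch.gather(mask, dim=1, index=ids_restore).bool()

assert mask.sum(dim=-1).eq(len_keep).all()              # same number kept in every sample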
""" - B = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(B, num_windows, device=x.device) + noise = torch.rand(batch_size , num_windows, device=x.device) # Sort noise for each sample ids_shuffle = torch.argsort( @@ -363,24 +363,24 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([B, num_windows], device=x.device) + mask = torch.zeros([batch_size , num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) return mask.bool() - def get_pos_embed(self) -> torch.Tensor: - if self.sep_pos_embed: - return self.pos_embed_spatial.repeat( + def get_position_embeddings(self) -> torch.Tensor: + if self.sep_position_embeddings: + return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( - self.pos_embed_temporal, + self.position_embeddings_temporal, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], dim=1, ) else: - return self.pos_embed + return self.position_embeddings def forward( self, @@ -389,7 +389,7 @@ def forward( return_intermediates: bool = False, ) -> torch.Tensor: """ - mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list @@ -397,15 +397,15 @@ def forward( x = x[0] intermediates = [] - x = self.patch_embed( + x = self.patch_embedding( x, mask=mask.view( x.shape[0], 1, *self.mask_spatial_shape - ) # B, C, *mask_spatial_shape + ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_pos_embed() + x = x + self.get_position_embeddings() x = self.unroll(x) # Discard masked tokens @@ -442,7 +442,7 @@ def forward( "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") def hiera_tiny_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) @pretrained_model({ @@ -450,7 +450,7 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ @@ -458,7 +458,7 @@ def hiera_small_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -466,7 +466,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +474,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +482,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -500,7 +500,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): patch_kernel=(3, 7, 7), patch_stride=(2, 4, 4), patch_padding=(1, 3, 3), - sep_pos_embed=True, + sep_position_embeddings=True, **kwdargs ) @@ -511,7 +511,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +521,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +531,5 @@ def 
hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 64c69cc89d71..a0504997350b 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -25,14 +25,14 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - B, num_mask_units = x.shape[0:2] - # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + batch_size , num_mask_units = x.shape[0:2] + # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) - # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -132,7 +132,7 @@ def initialize_weights(self): self.apply(self._mae_init_weights) # initialize patch_embed like nn.Linear (instead of nn.Conv2d) - w = self.patch_embed.proj.weight.data + w = self.patch_embed.projection.weight.data nn.init.xavier_uniform_(w.view([w.shape[0], -1])) def _mae_init_weights(self, m: nn.Module): @@ -188,7 +188,7 @@ def forward_encoder( ) -> Tuple[torch.Tensor, torch.Tensor]: if mask is None: - mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] # Get multi-scale representations from encoder _, intermediates = super().forward(x, mask, return_intermediates=True) @@ -212,8 +212,8 @@ def forward_decoder( # Combine visible and mask tokens - # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] - # mask: [B, #MUs_all] + # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) mask_tokens = self.mask_token.view( (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) @@ -258,9 +258,9 @@ def forward_loss( """ Note: in mask, 0 is *visible*, 1 is *masked* - x: e.g. [B, 3, H, W] - pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + x: e.g. 
[batch_size , 3, H, W] + pred: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] """ if len(self.q_stride) == 2: label = self.get_pixel_label_2d(x, mask) @@ -299,7 +299,7 @@ def forward( }, default="mae_in1k") def mae_hiera_tiny_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, ) @@ -308,7 +308,7 @@ def mae_hiera_tiny_224(**kwargs): }, default="mae_in1k") def mae_hiera_small_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, ) @@ -317,7 +317,7 @@ def mae_hiera_small_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -326,7 +326,7 @@ def mae_hiera_base_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_plus_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -335,7 +335,7 @@ def mae_hiera_base_plus_224(**kwargs): }, default="mae_in1k") def mae_hiera_large_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -344,7 +344,7 @@ def mae_hiera_large_224(**kwargs): }, default="mae_in1k") def mae_hiera_huge_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -375,7 +375,7 @@ def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): @pretrained_model(None) def mae_hiera_base_plus_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -385,7 +385,7 @@ def mae_hiera_base_plus_16x224(**kwdargs): @pretrained_model(None) def mae_hiera_large_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -394,5 +394,5 @@ def mae_hiera_large_16x224(**kwdargs): }, default="mae_k400") def mae_hiera_huge_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index 992c03e08079..c96c63cbfaf9 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -24,7 +24,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .convert_hiera_to_pytorch import e +from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
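A usage sketch for the constructors this decorator wraps in hiera.py and hiera_mae.py; the checkpoint keys are the ones registered earlier in this patch, the weights are downloaded on demand, and the import paths assume this patch series is installed:

from transformers.models.hiera.hiera import hiera_base_224
from transformers.models.hiera.hiera_mae import mae_hiera_base_224

model = hiera_base_224(pretrained=True, checkpoint="mae_in1k_ft_in1k")
mae_model = mae_hiera_base_224(pretrained=True, checkpoint="mae_in1k")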
""" @@ -40,7 +40,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - # state_dict["model_state"] = e(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if "num_classes" not in kwdargs: @@ -53,7 +53,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool model = model_func(**kwdargs) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False model.load_state_dict(state_dict["model_state"], strict=strict) From 126de187bdaf7628d04f70f7d788581fdb45be2c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 08:10:34 +0000 Subject: [PATCH 003/118] Added Config class, basic HF setup, convert_to_hf --- src/transformers/__init__.py | 6 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/hiera/__init__.py | 157 ++++++++----- .../models/hiera/configuration_hiera.py | 193 +++++++--------- .../models/hiera/convert_hiera_to_pytorch.py | 212 ++++++++++++++++++ src/transformers/models/hiera/hiera.py | 129 +++++------ 6 files changed, 470 insertions(+), 230 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 84a664580227..aa1d07603390 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -496,6 +496,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], + "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -5247,6 +5248,7 @@ GroupViTVisionConfig, ) from .models.herbert import HerbertTokenizer + from .models.hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig from .models.idefics import ( @@ -6941,6 +6943,10 @@ HubertModel, HubertPreTrainedModel, ) + from .models.hiera import ( + Hiera, + HieraBlock + ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 682241ea4a84..97ca773d1113 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -115,6 +115,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), + ("hiera","HieraConfig") ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -347,6 +348,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") 
("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -579,6 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), + ("hiera","Hiera") ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index bfd200e9dcb9..3ea6efb0056a 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -1,28 +1,18 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import TYPE_CHECKING from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, - is_flax_available, - is_tf_available, is_torch_available, ) -_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} +_import_structure = { + "configuration_hiera": [ + "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HireaConfig", + ], +} try: if not is_torch_available(): @@ -30,28 +20,20 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_vit_mae"] = [ - "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", - "ViTMAEForPreTraining", - "ViTMAELayer", - "ViTMAEModel", - "ViTMAEPreTrainedModel", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_vit_mae"] = [ - "TFViTMAEForPreTraining", - "TFViTMAEModel", - "TFViTMAEPreTrainedModel", + _import_structure["hirea"] = [ + "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hirea", + "Head", + "HieraBlock", + "MaskUnitAttention" + "" ] if TYPE_CHECKING: - from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + from .configuration_hiera import ( + HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, + HieraConfig, + ) try: if not is_torch_available(): @@ -59,24 +41,99 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_vit_mae import ( - VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, - ViTMAEForPreTraining, - ViTMAELayer, - ViTMAEModel, - ViTMAEPreTrainedModel, + from .hiera import ( + Hiera, + Head, + HieraBlock, + MaskUnitAttention, ) - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - else: import sys sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + +####### PREV: + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# from typing import TYPE_CHECKING + +# from ...utils import ( +# OptionalDependencyNotAvailable, +# _LazyModule, +# is_flax_available, +# is_tf_available, +# is_torch_available, +# ) + + +# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_vit_mae"] = [ +# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", +# "ViTMAEForPreTraining", +# "ViTMAELayer", +# "ViTMAEModel", +# "ViTMAEPreTrainedModel", +# ] + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_tf_vit_mae"] = [ +# "TFViTMAEForPreTraining", +# "TFViTMAEModel", +# "TFViTMAEPreTrainedModel", +# ] + +# if TYPE_CHECKING: +# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_vit_mae import ( +# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, +# ViTMAEForPreTraining, +# ViTMAELayer, +# ViTMAEModel, +# ViTMAEPreTrainedModel, +# ) + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +# else: +# import sys + +# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index de5de9e7d9e9..c7dfaeaeedfb 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -2,127 +2,108 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging - +from typing import Tuple logger = logging.get_logger(__name__) -VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", - # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + } -class ViTMAEConfig(PretrainedConfig): +class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT - MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the ViT - [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with + the defaults will yield a similar configuration to that of the Hiera + [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. - decoder_num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the decoder. - decoder_hidden_size (`int`, *optional*, defaults to 512): - Dimensionality of the decoder. - decoder_num_hidden_layers (`int`, *optional*, defaults to 8): - Number of hidden layers in the decoder. - decoder_intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. - mask_ratio (`float`, *optional*, defaults to 0.75): - The ratio of the number of masked tokens in the input sequence. - norm_pix_loss (`bool`, *optional*, defaults to `False`): - Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved - representation quality in the experiments of the authors. - - Example: - - ```python - >>> from transformers import ViTMAEConfig, ViTMAEModel - - >>> # Initializing a ViT MAE vit-mae-base style configuration - >>> configuration = ViTMAEConfig() - - >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = ViTMAEModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "vit_mae" - + input_size (Tuple[int, ...], optional): Dimensions of the input image (height, width). Defaults to (224, 224). + in_chans (int, optional): Number of input channels. Defaults to 3. 
+ embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. + number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. + num_classes (int, optional): Number of output classes. Defaults to 1000. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. + q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). + mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). + dim_mul (float, optional): Factor for increasing the dimensionality through the network. Defaults to 2.0. + head_mul (float, optional): Factor for increasing the number of heads through the network. Defaults to 2.0. + patch_kernel (Tuple[int, ...], optional): Kernel size for patch embedding. Defaults to (7, 7). + patch_stride (Tuple[int, ...], optional): Stride for patch embedding. Defaults to (4, 4). + patch_padding (Tuple[int, ...], optional): Padding for patch embedding. Defaults to (3, 3). + mlp_ratio (float, optional): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. + drop_path_rate (float, optional): Dropout rate for stochastic depth. Defaults to 0.0. + head_dropout (float, optional): Dropout rate for attention heads. Defaults to 0.0. + head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. + sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. + + + Example: + ```python + >>> from transformers import HieraConfig, Hiera + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = HieraConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = Hiera(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "hiera" def __init__( self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - decoder_num_attention_heads=16, - decoder_hidden_size=512, - decoder_num_hidden_layers=8, - decoder_intermediate_size=2048, - mask_ratio=0.75, - norm_pix_loss=False, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] 
= (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, **kwargs, + ): super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_hidden_size = decoder_hidden_size - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.decoder_intermediate_size = decoder_intermediate_size - self.mask_ratio = mask_ratio - self.norm_pix_loss = norm_pix_loss + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index f1d0c4135796..77556120bcb4 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,6 +3,12 @@ import requests import torch from PIL import Image +# from .configuration_hiera import HieraConfig +# from .hiera import Hiera +# from transformers import HieraConfig, Hiera +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -29,3 +35,209 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state + +class HieraImageProcessor: + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm + + + +def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): + pretrained_models_links = { + "hiera_tiny_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", + }, + "hiera_small_224": { + 
"mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", + }, + "hiera_base_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", + }, + "hiera_base_plus_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", + }, + "hiera_large_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", + }, + "hiera_huge_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", + }, + "hiera_base_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", + }, + "hiera_base_plus_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", + }, + "hiera_large_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", + }, + "hiera_huge_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", + } + } + + + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + checkpoints = pretrained_models_links["hiera_tiny_224"] + checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + checkpoints = pretrained_models_links["hiera_small_224"] + checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_224"] + checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_plus_224"] + checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + checkpoints = pretrained_models_links["hiera_large_224"] + checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + checkpoints = pretrained_models_links["hiera_huge_224"] + checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere + input_size=(16, 224, 224), + 
q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + checkpoints = pretrained_models_links["hiera_base_16x224"] + checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + checkpoints = pretrained_models_links["hiera_base_plus_16x224"] + checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + checkpoints = pretrained_models_links["hiera_large_16x224"] + checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + checkpoints = pretrained_models_links["hiera_huge_16x224"] + checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] + + + pretrained = True + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if config.num_classes is None: + config.num_classes = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif config.num_classes != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = Hiera(config) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + + + + url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" + + image = Image.open(requests.get(url, stream=True).raw) + + + image_processor = HieraImageProcessor(size=config.image_size) + inputs = image_processor.process_image(images=image, return_tensors="pt") + + # forward pass + out = model(inputs[None, ...]) + + # 207: golden retriever (imagenet-1k) + out.argmax(dim=-1).item() + + + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" + convert_Hiera_checkpoint(checkpoint_url, 
pytorch_dump_folder_path="~/") + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index fcb04f68934e..7e42d5914d44 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -21,7 +21,7 @@ import math from functools import partial from typing import List, Tuple, Callable, Optional - +from .configuration_hiera import HieraConfig import torch import torch.nn as nn import torch.nn.functional as F @@ -205,106 +205,85 @@ def forward( class Hiera(nn.Module): - def __init__( - self, - input_size: Tuple[int, ...] = (224, 224), - in_chans: int = 3, - embedding_dimention: int = 96, # initial embedding input_dim - number_of_heads: int = 1, # initial number of number_of_heads - num_classes: int = 1000, - stages: Tuple[int, ...] = (2, 3, 16, 3), - q_pool: int = 3, # number of q_pool stages - q_stride: Tuple[int, ...] = (2, 2), - mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) - # mask_unit_attn: which stages use mask unit attention? - mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), - dim_mul: float = 2.0, - head_mul: float = 2.0, - patch_kernel: Tuple[int, ...] = (7, 7), - patch_stride: Tuple[int, ...] = (4, 4), - patch_padding: Tuple[int, ...] = (3, 3), - mlp_ratio: float = 4.0, - drop_path_rate: float = 0.0, - norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), - head_dropout: float = 0.0, - head_init_scale: float = 0.001, - sep_position_embeddings: bool = False, - ): + def __init__(self, config: HieraConfig): super().__init__() - - depth = sum(stages) - self.patch_stride = patch_stride - self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + self.config = config + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed + self.config = config + depth = sum(self.config.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(mask_unit_size) - flat_q_stride = math.prod(q_stride) + flat_mu_size = math.prod(self.config.mask_unit_size) + flat_q_stride = math.prod(self.config.q_stride) - assert q_pool < len(stages) - self.q_pool, self.q_stride = q_pool, q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + assert self.config.q_pool < len(self.config.stages) + self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] self.patch_embedding = PatchEmbedding( - in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding + self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding ) - self.sep_position_embeddings = sep_position_embeddings - if sep_position_embeddings: + if self.config.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embedding_dimention, + self.config.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) + 
torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.config.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - input_size, - patch_stride, - [q_stride] * len(self.stage_ends[:-1]), + self.config.input_size, + self.config.patch_stride, + [self.config.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - q_pool, + self.config.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = embedding_dimention + output_dim = self.config.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = mask_unit_attn[cur_stage] + use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(embedding_dimention * dim_mul) - number_of_heads = int(number_of_heads * head_mul) + output_dim = int(self.config.embedding_dimension * self.config.dim_mul) + number_of_heads = int(self.config.number_of_heads * self.config.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride + else: + number_of_heads = self.config.number_of_heads block = HieraBlock( - input_dim=embedding_dimention, + input_dim=self.config.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, + mlp_ratio=self.config.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -312,21 +291,21 @@ def __init__( use_mask_unit_attention=use_mask_unit_attention, ) - embedding_dimention = output_dim + self.config.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(embedding_dimention) - self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(self.config.embedding_dimension) + self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) # Initialize everything - if sep_position_embeddings: + if self.config.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(head_init_scale) - self.head.projection.bias.data.mul_(head_init_scale) + self.head.projection.weight.data.mul_(self.config.head_init_scale) + self.head.projection.bias.data.mul_(self.config.head_init_scale) def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -339,7 +318,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_position_embeddings: + if 
self.config.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -371,7 +350,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.sep_position_embeddings: + if self.config.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -441,8 +420,9 @@ def forward( "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) +def hiera_tiny_224(**kwargs): + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) + return Hiera(config) @pretrained_model({ @@ -450,15 +430,16 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) +def hiera_base_224(**kwargs): + config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + return Hiera(config) @pretrained_model({ @@ -466,7 +447,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +455,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +463,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -511,7 +492,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +502,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return 
hiera_base_16x224( - embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +512,5 @@ def hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) From 75a34406ccf2afb2f0c80b634007f320da23e5f6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 21:48:20 +0000 Subject: [PATCH 004/118] Fixed Convert function, added hiera to HF files, Initilized test files --- src/transformers/__init__.py | 7 + .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 3 + .../models/hiera/convert_hiera_to_pytorch.py | 56 ++-- src/transformers/models/hiera/hiera.py | 242 +++++++----------- .../models/hiera/hiera_image_processor.py | 56 ++++ tests/models/hiera/__init__.py | 0 tests/models/hiera/test_modeling_vit_mae.py | 44 ++++ 9 files changed, 226 insertions(+), 189 deletions(-) create mode 100644 src/transformers/models/hiera/hiera_image_processor.py create mode 100644 tests/models/hiera/__init__.py create mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aa1d07603390..d8018bfba4c3 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4117,6 +4117,13 @@ "TFGroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hiera", + + ] + ) _import_structure["models.hubert"].extend( [ "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0875b5b4faa4..520399067ec7 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -115,7 +115,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig") + ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -348,7 +348,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -581,7 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera") + ("hiera","Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1de0249831db..fde580b54580 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -114,6 +114,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", "GroupViTModel"), + ("hiera", "Hiera"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git 
a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 3ea6efb0056a..f88e32d03c98 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -47,6 +47,9 @@ HieraBlock, MaskUnitAttention, ) + from .hiera_image_processor import ( + HieraImageProcessor + ) else: import sys diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 77556120bcb4..d1b6e8a4ad30 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,8 +3,9 @@ import requests import torch from PIL import Image -# from .configuration_hiera import HieraConfig -# from .hiera import Hiera +from transformers.models.hiera.configuration_hiera import HieraConfig +from transformers.models.hiera.hiera import Hiera +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, Hiera from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -35,33 +36,8 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state - -class HieraImageProcessor: - def __init__(self, size): - self.size = size - self.transform_list = [ - transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) - ] - self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - - def process_image(self, image_url): - # Load the image - img = Image.open(requests.get(image_url, stream=True).raw) - - # Apply transformations - img_vis = self.transform_vis(img) - img_norm = self.transform_norm(img) - - return img_norm - - - -def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): + strict = True pretrained_models_links = { "hiera_tiny_224": { "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", @@ -121,9 +97,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(2, 3, 16, 3),) + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + checkpoints = pretrained_models_links["hiera_base_224"] checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] @@ -180,7 +155,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): stages=(2, 6, 36, 4) ) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] - + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") pretrained = True if pretrained: @@ -188,10 +164,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): raise RuntimeError("This model currently doesn't have pretrained weights available.") elif checkpoint is None: raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it @@ -202,24 +176,24 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config) + model = Hiera(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False - model.load_state_dict(state_dict["model_state"], strict=strict) + model.load_state_dict(state_dict["model_state"]) + # model.load_state_dict(state_dict["model_state"], strict=strict) url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - image = Image.open(requests.get(url, stream=True).raw) - image_processor = HieraImageProcessor(size=config.image_size) - inputs = image_processor.process_image(images=image, return_tensors="pt") + image_processor = HieraImageProcessor(size=224) + inputs = image_processor.process_image(image_url=url) # forward pass out = model(inputs[None, ...]) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7e42d5914d44..7bafed5c3cd0 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -25,11 +25,40 @@ import torch import torch.nn as nn import torch.nn.functional as F +from dataclasses import dataclass from timm.models.layers import DropPath, Mlp - -from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll - +from ...modeling_utils import PreTrainedModel +# from ...modeling_outputs import BaseModelOutput +# from ...utils import ( +# ModelOutput, +# add_start_docstrings, +# add_start_docstrings_to_model_forward, +# logging, +# replace_return_docstrings, +# ) + +from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +# @dataclass +# class HieraModelOutput(ModelOutput): +# """ +# Base class for Hiera model's outputs. + +# Args: +# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): +# Last layer hidden-states. +# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): +# Attentions weights from the model, one for each layer. +# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): +# Hidden states of the model at the output of each layer. +# intermediates (list[torch.Tensor], optional): +# Intermediate representations or features from the model, if applicable. 
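
The tail of `convert_Hiera_checkpoint` above builds a config, preprocesses one image with the new `HieraImageProcessor`, and runs a forward pass. A minimal, hedged sketch of that verification flow, using the `hiera_base_224` settings and the image URL from the script; any remaining hyperparameters are assumed to come from the `HieraConfig` defaults:

```python
# Hedged sketch of the verification flow at the end of convert_Hiera_checkpoint:
# build a config, preprocess one image, run a forward pass. The config values mirror
# the hiera_base_224 branch above; everything else is assumed to come from the
# HieraConfig defaults.
import torch

from transformers.models.hiera.configuration_hiera import HieraConfig
from transformers.models.hiera.hiera import Hiera
from transformers.models.hiera.hiera_image_processor import HieraImageProcessor

config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = Hiera(config=config).eval()

image_processor = HieraImageProcessor(size=224)
pixel_values = image_processor.process_image(
    image_url="https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg"
)

with torch.no_grad():
    out = model(pixel_values[None, ...])  # add the batch dimension, as the script does
print(out.shape)  # classification logits, (1, num_classes), since no mask is given
```
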
+# """ +# last_hidden_state: torch.FloatTensor +# attentions: Optional[Tuple[torch.FloatTensor]] = None +# hidden_states: Optional[Tuple[torch.FloatTensor]] = None +# intermediates: Optional[list[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -204,86 +233,110 @@ def forward( return x -class Hiera(nn.Module): +class Hiera(PreTrainedModel): + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "x" + supports_gradient_checkpointing = True + def __init__(self, config: HieraConfig): - super().__init__() + self.input_size = config.input_size + self.in_chans = config.in_chans + self.embedding_dimension = config.embedding_dimension + self.number_of_heads = config.number_of_heads + self.num_classes = config.num_classes + self.stages = config.stages + self.q_pool = config.q_pool + self.q_stride = config.q_stride + self.mask_unit_size = config.mask_unit_size + self.mask_unit_attn = config.mask_unit_attn + self.dim_mul = config.dim_mul + self.head_mul = config.head_mul + self.patch_kernel = config.patch_kernel + self.patch_stride = config.patch_stride + self.patch_padding = config.patch_padding + self.mlp_ratio = config.mlp_ratio + self.drop_path_rate = config.drop_path_rate + self.head_dropout = config.head_dropout + self.head_init_scale = config.head_init_scale + self.sep_position_embeddings = config.sep_position_embeddings + + super().__init__(config) self.config = config - super().__init__() norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed - self.config = config - depth = sum(self.config.stages) - self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + depth = sum(self.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.input_size, self.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(self.config.mask_unit_size) - flat_q_stride = math.prod(self.config.q_stride) + flat_mu_size = math.prod(self.mask_unit_size) + flat_q_stride = math.prod(self.q_stride) - assert self.config.q_pool < len(self.config.stages) - self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size + assert self.q_pool < len(self.stages) + self.q_pool, self.q_stride = self.q_pool, self.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( - self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding + self.in_chans, self.embedding_dimension, self.patch_kernel, self.patch_stride, self.patch_padding ) - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - self.config.embedding_dimension, + self.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) + torch.zeros(1, self.tokens_spatial_shape[0], self.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, 
self.config.embedding_dimension)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) + self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - self.config.input_size, - self.config.patch_stride, - [self.config.q_stride] * len(self.stage_ends[:-1]), + self.input_size, + self.patch_stride, + [self.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - self.config.q_pool, + self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = self.config.embedding_dimension + output_dim = self.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] + use_mask_unit_attention = self.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(self.config.embedding_dimension * self.config.dim_mul) - number_of_heads = int(self.config.number_of_heads * self.config.head_mul) + output_dim = int(self.embedding_dimension * self.dim_mul) + number_of_heads = int(self.number_of_heads * self.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride else: - number_of_heads = self.config.number_of_heads + number_of_heads = self.number_of_heads block = HieraBlock( - input_dim=self.config.embedding_dimension, + input_dim=self.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=self.config.mlp_ratio, + mlp_ratio=self.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -291,21 +344,22 @@ def __init__(self, config: HieraConfig): use_mask_unit_attention=use_mask_unit_attention, ) - self.config.embedding_dimension = output_dim + self.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(self.config.embedding_dimension) - self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) + self.norm = norm_layer(self.embedding_dimension) + self.head = Head(self.embedding_dimension, self.num_classes, dropout_rate=self.head_dropout) # Initialize everything - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(self.config.head_init_scale) - self.head.projection.bias.data.mul_(self.config.head_init_scale) + self.head.projection.weight.data.mul_(self.head_init_scale) + self.head.projection.bias.data.mul_(self.head_init_scale) + self.post_init() def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -318,7 +372,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def 
no_weight_decay(self): - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -350,7 +404,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -411,106 +465,4 @@ def forward( if return_intermediates: return x, intermediates - return x - - -# Image models - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwargs): - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_small_224(**kwdargs): - return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwargs): - config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_large_224(**kwdargs): - return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) - - -# Video models - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_16x224(num_classes: int = 400, **kwdargs): - return Hiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", - "mae_k400": 
"https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_plus_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_large_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_huge_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return x \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py new file mode 100644 index 000000000000..4900e4a4d3fb --- /dev/null +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -0,0 +1,56 @@ + +"""Image processor class for Hirea.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_vision_available, logging +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from PIL import Image +import requests + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class HieraImageProcessor(BaseImageProcessor): + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm \ No newline at end of file diff --git a/tests/models/hiera/__init__.py b/tests/models/hiera/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py new file mode 100644 index 000000000000..014d41766a8e --- /dev/null +++ b/tests/models/hiera/test_modeling_vit_mae.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViTMAE model. """ + + +import math +import tempfile +import unittest + +import numpy as np + +from transformers import ViTMAEConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ViTMAEForPreTraining, ViTMAEModel + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTImageProcessor \ No newline at end of file From 5569dad499855d951b2e0d6096583eee3b5e6916 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 23:41:40 +0000 Subject: [PATCH 005/118] better naming for x in forward pass --- src/transformers/__init__.py | 4 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/hiera/__init__.py | 2 +- .../models/hiera/configuration_hiera.py | 8 +- .../models/hiera/convert_hiera_to_pytorch.py | 10 +- src/transformers/models/hiera/hiera.py | 163 ++++++++++-------- src/transformers/models/hiera/hiera_mae.py | 6 +- src/transformers/models/hiera/hiera_utils.py | 6 +- tests/models/hiera/test_modeling_hiera.py | 87 ++++++++++ tests/models/hiera/test_modeling_vit_mae.py | 44 ----- 11 files changed, 199 insertions(+), 135 deletions(-) create mode 100644 tests/models/hiera/test_modeling_hiera.py delete mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d8018bfba4c3..d3646a75f940 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4120,7 +4120,7 @@ _import_structure["models.hiera"].extend( [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hiera", + "HieraModel", ] ) @@ -6951,7 +6951,7 @@ HubertPreTrainedModel, ) from .models.hiera import ( - Hiera, + HieraModel, HieraBlock ) from .models.ibert import ( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 520399067ec7..58ce7f77f5a8 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -581,7 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera"), + ("hiera","HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index fde580b54580..ddb20abdcc12 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -114,7 +114,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", 
"GroupViTModel"), - ("hiera", "Hiera"), + ("hiera", "HieraModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index f88e32d03c98..0434517bf52c 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -42,7 +42,7 @@ pass else: from .hiera import ( - Hiera, + HieraModel, Head, HieraBlock, MaskUnitAttention, diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index c7dfaeaeedfb..e3133354f6ea 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -13,8 +13,8 @@ class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the Hiera + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an HieraModel model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the HieraModel [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -46,13 +46,13 @@ class HieraConfig(PretrainedConfig): Example: ```python - >>> from transformers import HieraConfig, Hiera + >>> from transformers import HieraConfig, HieraModel >>> # Initializing a ViT MAE vit-mae-base style configuration >>> configuration = HieraConfig() >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = Hiera(configuration) + >>> model = HieraModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d1b6e8a4ad30..d0294f12deab 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,10 +3,10 @@ import requests import torch from PIL import Image -from transformers.models.hiera.configuration_hiera import HieraConfig -from transformers.models.hiera.hiera import Hiera -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, Hiera +# from transformers.models.hiera.configuration_hiera import HieraConfig +# from transformers.models.hiera.hiera import HieraModel +# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +# from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -176,7 +176,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config=config) + model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder 
model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7bafed5c3cd0..72917eb8e1a4 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional +from typing import List, Tuple, Callable, Optional, Union from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -29,36 +29,34 @@ from timm.models.layers import DropPath, Mlp from ...modeling_utils import PreTrainedModel -# from ...modeling_outputs import BaseModelOutput -# from ...utils import ( -# ModelOutput, -# add_start_docstrings, -# add_start_docstrings_to_model_forward, -# logging, -# replace_return_docstrings, -# ) +from ...modeling_outputs import BaseModelOutput +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll -# @dataclass -# class HieraModelOutput(ModelOutput): -# """ -# Base class for Hiera model's outputs. - -# Args: -# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): -# Last layer hidden-states. -# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): -# Attentions weights from the model, one for each layer. -# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): -# Hidden states of the model at the output of each layer. -# intermediates (list[torch.Tensor], optional): -# Intermediate representations or features from the model, if applicable. -# """ -# last_hidden_state: torch.FloatTensor -# attentions: Optional[Tuple[torch.FloatTensor]] = None -# hidden_states: Optional[Tuple[torch.FloatTensor]] = None -# intermediates: Optional[list[torch.Tensor]] = None +@dataclass +class HieraModelOutput(ModelOutput): + """ + Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. + + Args: + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + Last layer hidden-states. + attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): + Attentions weights from the model, one for each layer. + hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): + Hidden states of the model at the output of each layer. + intermediates (List[torch.Tensor], optional): + Intermediate representations or features from the model, if applicable. + """ + last_hidden_state: torch.FloatTensor + intermediates: Optional[List[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -102,15 +100,15 @@ def __init__( self.window_size = window_size self.use_mask_unit_attention = use_mask_unit_attention - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. 
""" - batch_size , num_channels , _ = x.shape + batch_size , num_channels , _ = embeddings.shape num_windows = ( (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( - self.qkv(x) + self.qkv(embeddings) .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) @@ -126,15 +124,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if hasattr(F, "scaled_dot_product_attention"): # Note: the original paper did *not* use SDPA, it's a free boost! - x = F.scaled_dot_product_attention(q, k, v) + embeddings = F.scaled_dot_product_attention(q, k, v) else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - x = (attention @ v) + embeddings = (attention @ v) - x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) - x = self.projection(x) - return x + embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = self.projection(embeddings) + return embeddings class HieraBlock(nn.Module): @@ -168,16 +166,16 @@ def __init__( if input_dim != output_dim: self.projection = nn.Linear(input_dim, output_dim) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - normalized_input = self.norm1(x) + normalized_embeddings = self.norm1(embeddings) if self.input_dim != self.output_dim: - x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) - x = x + self.drop_path(self.attention(normalized_input)) + embeddings = do_pool(self.projection(normalized_embeddings), stride=self.attention.q_stride) + embeddings = embeddings + self.drop_path(self.attention(normalized_embeddings)) # MLP - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x + embeddings = embeddings + self.drop_path(self.mlp(self.norm2(embeddings))) + return embeddings class Head(nn.Module): @@ -226,17 +224,36 @@ def __init__( ) def forward( - self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.projection, mask) - x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) - return x + embeddings = do_masked_conv(pixel_values, self.projection, mask) + embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) + return embeddings + +class HireaModel(PreTrainedModel): + """ + Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + + This model is a PyTorch implementation of the Hiera architecture for image classification. + + The model can be used as follows: + + Args: + config (HieraConfig): Configuration class instance for `Hiera`. 
+ + Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """ -class Hiera(PreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" - main_input_name = "x" + main_input_name = "pixel_values" supports_gradient_checkpointing = True def __init__(self, config: HieraConfig): @@ -417,52 +434,56 @@ def get_position_embeddings(self) -> torch.Tensor: def forward( self, - x: torch.Tensor, + pixel_values: torch.Tensor, mask: torch.Tensor = None, + return_dict: Optional[bool] = True, return_intermediates: bool = False, - ) -> torch.Tensor: + ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. """ # Slowfast training passes in a list - if isinstance(x, list): - x = x[0] + if isinstance(pixel_values, list): + pixel_values = pixel_values[0] intermediates = [] - x = self.patch_embedding( - x, + pached_embeddings = self.patch_embedding( + pixel_values, mask=mask.view( - x.shape[0], 1, *self.mask_spatial_shape + pixel_values.shape[0], 1, *self.mask_spatial_shape ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_position_embeddings() - x = self.unroll(x) + embeddings = pached_embeddings + self.get_position_embeddings() + embeddings = self.unroll(embeddings) # Discard masked tokens if mask is not None: - x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( - x.shape[0], -1, x.shape[-1] + embeddings = embeddings[mask[..., None].tile(1, self.mu_size, embeddings.shape[2])].view( + embeddings.shape[0], -1, embeddings.shape[-1] ) - for i, blk in enumerate(self.blocks): - x = blk(x) + for i, block in enumerate(self.blocks): + embeddings = block(embeddings) if return_intermediates and i in self.stage_ends: - intermediates.append(self.reroll(x, i, mask=mask)) + intermediates.append(self.reroll(embeddings, i, mask=mask)) if mask is None: - x = x.mean(dim=1) - x = self.norm(x) - x = self.head(x) + embeddings = embeddings.mean(dim=1) + embeddings = self.norm(embeddings) + embeddings = self.head(embeddings) - # x may not always be in spatial order here. + # embeddings may not always be in spatial order here. # e.g. 
if q_pool = 2, mask_unit_size = (8, 8), and # q_stride = (2, 2), not all unrolls were consumed, - # intermediates[-1] is x in spatial order - if return_intermediates: - return x, intermediates - - return x \ No newline at end of file + # intermediates[-1] is embeddings in spatial order + if not return_dict: + return tuple(v for v in [embeddings, intermediates] if v is not None) + + return HieraModelOutput( + last_hidden_state=embeddings, + intermediates=intermediates if return_intermediates else None, + ) \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index a0504997350b..c45056318a38 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import Hiera, HieraBlock +from .hiera import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd @@ -36,8 +36,8 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: return x -class MaskedAutoencoderHiera(Hiera): - """Masked Autoencoder with Hiera backbone""" +class MaskedAutoencoderHiera(HieraModel): + """Masked Autoencoder with HieraModel backbone""" def __init__( self, diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index c96c63cbfaf9..a35b33210941 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. # -------------------------------------------------------- # -# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles # # Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, # Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, @@ -27,7 +27,7 @@ from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ def inner(model_func: Callable) -> Callable: def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: @@ -69,7 +69,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) """ return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py new file mode 100644 index 000000000000..8d593af2a622 --- /dev/null +++ b/tests/models/hiera/test_modeling_hiera.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
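
The renamed forward now takes an optional mask-unit mask alongside `return_dict`. A hedged sketch of a masked forward pass follows; note that, per the forward above, the mean-pool/norm/head branch only runs when no mask is given, so `last_hidden_state` stays token-level here. Imports mirror the new test file, and the config values mirror `hiera_base_224`:

```python
# Hedged sketch: passing a mask-unit mask to the renamed forward pass. Imports mirror
# the new test file; the config values mirror hiera_base_224 elsewhere in this series.
import torch

from transformers import HieraConfig, HieraModel

config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = HieraModel(config).eval()

pixel_values = torch.rand(1, 3, 224, 224)
mask = model.get_random_mask(pixel_values, mask_ratio=0.6)  # bool, [batch, #mask units]

with torch.no_grad():
    outputs = model(pixel_values, mask=mask)  # HieraModelOutput (return_dict defaults to True)

# With a mask, the mean-pool / norm / head branch is skipped, so this stays token-level.
print(outputs.last_hidden_state.shape)
```
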
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Hiera model. """ + +import unittest + +from transformers import HieraConfig +from transformers.testing_utils import ( + require_torch, + slow, + torch_device, +) +from transformers.utils import is_torch_available + +if is_torch_available(): + import torch + from transformers import HieraModel + # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model + from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + + +class HieraModelTester: + # Define this tester to initialize Hiera model and its configurations for testing + def __init__( + self, + parent, + batch_size=8, + num_channels=3, + image_size=224, + # Add other model-specific parameters here + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + # Initialize other necessary attributes here + + def prepare_config_and_inputs(self): + # Prepare configuration and inputs for testing your model + pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return HieraConfig( + # Define necessary configuration parameters here + ) + + def create_and_check_model(self, config, pixel_values): + model = HieraModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values=pixel_values) + # Perform checks here, e.g., output shapes, etc. + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + +@require_torch +class HieraModelTest(unittest.TestCase): + + def setUp(self): + self.model_tester = HieraModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = HieraModel.from_pretrained(model_name) + self.assertIsNotNone(model) \ No newline at end of file diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py deleted file mode 100644 index 014d41766a8e..000000000000 --- a/tests/models/hiera/test_modeling_vit_mae.py +++ /dev/null @@ -1,44 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ViTMAE model. 
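
The tester's `create_and_check_model` above asserts a shape built from `num_attention_heads`, `seq_length`, and `hidden_size`, none of which `HieraModelTester` defines, and `get_config()` is left empty. A conservative, hedged alternative that slots into the same class and relies only on attributes the tester actually sets, reusing the file's existing `HieraModel` and `torch_device` imports:

```python
# Hedged sketch: a drop-in create_and_check_model that only relies on attributes the
# tester defines (batch_size) and on the file's existing HieraModel / torch_device
# imports. Without a mask, forward mean-pools tokens and applies the head, so only
# the batch dimension is asserted here rather than a full (hypothetical) shape.
def create_and_check_model(self, config, pixel_values):
    model = HieraModel(config=config)
    model.to(torch_device)
    model.eval()
    with torch.no_grad():
        result = model(pixel_values=pixel_values)
    self.parent.assertEqual(result.last_hidden_state.shape[0], self.batch_size)
```
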
""" - - -import math -import tempfile -import unittest - -import numpy as np - -from transformers import ViTMAEConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.utils import cached_property, is_torch_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_torch_available(): - import torch - from torch import nn - - from transformers import ViTMAEForPreTraining, ViTMAEModel - from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST - - -if is_vision_available(): - from PIL import Image - - from transformers import ViTImageProcessor \ No newline at end of file From 11017c67c64c691911eb660361c219038d08e43a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 17 Feb 2024 00:10:52 +0000 Subject: [PATCH 006/118] Moved utils to hiera --- src/transformers/models/hiera/hiera.py | 226 ++++++++++++++++++++++++- 1 file changed, 223 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 72917eb8e1a4..cca502aa80c9 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional, Union +from typing import List, Tuple, Callable, Optional, Union, Type from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -38,7 +38,227 @@ replace_return_docstrings, ) -from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. + E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. 
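
`do_pool` above only works because `Unroll` has already moved each pooling window's members to the front of the token axis. A self-contained toy demo (not taken from the patch) showing that the view-and-max trick reproduces an ordinary 2x2 max pool for a single (2, 2) stride:

```python
# Toy demo (not from the patch): after Unroll with a single (2, 2) stride, the token
# order is (Sy, Sx, H // Sy, W // Sx), so do_pool's view-and-max over the first 4
# entries of the token axis is exactly a 2x2 max pool on the original grid.
import torch
import torch.nn.functional as F

B, H, W, C = 1, 4, 4, 1
x = torch.arange(H * W, dtype=torch.float32).reshape(B, H, W, C)

# Reproduce Unroll's reordering: [B, H, W, C] -> [B, H//2, 2, W//2, 2, C]
# -> [B, 2, 2, H//2, W//2, C] -> [B, N, C]
unrolled = (
    x.view(B, H // 2, 2, W // 2, 2, C)
    .permute(0, 2, 4, 1, 3, 5)
    .reshape(B, -1, C)
)

pooled = unrolled.view(B, 4, -1, C).max(dim=1).values  # do_pool(unrolled, stride=4)
reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=2)
print(torch.equal(pooled.view(B, H // 2, W // 2, C).permute(0, 3, 1, 2), reference))  # True
```
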
+ self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x + @dataclass class HieraModelOutput(ModelOutput): @@ -231,7 +451,7 @@ def forward( return embeddings -class HireaModel(PreTrainedModel): +class HieraModel(PreTrainedModel): """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. From ad959d49a2c073e1ebce29ee8736d26ad18a8710 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 00:17:14 +0000 Subject: [PATCH 007/118] Change hiera -> hiera_model --- src/transformers/models/hiera/__init__.py | 89 +----- src/transformers/models/hiera/benchmarking.py | 77 ----- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera.py => hiera_model.py} | 0 src/transformers/models/hiera/hiera_utils.py | 287 ------------------ 5 files changed, 3 insertions(+), 452 deletions(-) delete mode 100644 src/transformers/models/hiera/benchmarking.py rename src/transformers/models/hiera/{hiera.py => hiera_model.py} (100%) delete mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0434517bf52c..1f388d5361ab 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -41,7 +41,7 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera import ( + from .hiera_model import ( HieraModel, Head, HieraBlock, @@ -54,89 +54,4 @@ else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) - -####### PREV: - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
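
`Reroll` is what makes `return_intermediates` usable downstream: features collected at stage ends are rolled back to spatial order before being returned. A hedged sketch of requesting those feature maps from the renamed model, again with the `hiera_base_224` settings; output shapes depend on the `HieraConfig` defaults, so they are only printed, not asserted:

```python
# Hedged sketch: collecting the re-rolled intermediate feature maps. Config values
# mirror hiera_base_224; shapes depend on the HieraConfig defaults, so they are only
# printed here.
import torch

from transformers import HieraConfig, HieraModel

config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = HieraModel(config).eval()

with torch.no_grad():
    outputs = model(torch.rand(1, 3, 224, 224), return_intermediates=True)

# One entry per stage end, each rolled back to spatial order, e.g. [B, H', W', C'].
for feature_map in outputs.intermediates:
    print(feature_map.shape)
```
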
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# from typing import TYPE_CHECKING - -# from ...utils import ( -# OptionalDependencyNotAvailable, -# _LazyModule, -# is_flax_available, -# is_tf_available, -# is_torch_available, -# ) - - -# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_vit_mae"] = [ -# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", -# "ViTMAEForPreTraining", -# "ViTMAELayer", -# "ViTMAEModel", -# "ViTMAEPreTrainedModel", -# ] - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_tf_vit_mae"] = [ -# "TFViTMAEForPreTraining", -# "TFViTMAEModel", -# "TFViTMAEPreTrainedModel", -# ] - -# if TYPE_CHECKING: -# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_vit_mae import ( -# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, -# ViTMAEForPreTraining, -# ViTMAELayer, -# ViTMAEModel, -# ViTMAEPreTrainedModel, -# ) - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - -# else: -# import sys - -# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py deleted file mode 100644 index 33166028977a..000000000000 --- a/src/transformers/models/hiera/benchmarking.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- - -import time -from typing import List, Tuple, Union - -import torch -from tqdm import tqdm - -# From https://github.com/facebookresearch/ToMe/ -def benchmark( - model: torch.nn.Module, - device: torch.device = 0, - input_size: Tuple[int] = (3, 224, 224), - batch_size: int = 64, - runs: int = 40, - throw_out: float = 0.25, - use_fp16: bool = False, - verbose: bool = False, -) -> float: - """ - Benchmark the given model with random inputs at the given batch size. 
- - Args: - - model: the module to benchmark - - device: the device to use for benchmarking - - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) - - batch_size: the batch size to use for evaluation - - runs: the number of total runs to do - - throw_out: the percentage of runs to throw out at the start of testing - - use_fp16: whether or not to benchmark with float16 and autocast - - verbose: whether or not to use tqdm to print progress / print throughput at end - - Returns: - - the throughput measured in images / second - """ - if not isinstance(device, torch.device): - device = torch.device(device) - is_cuda = torch.device(device).type == "cuda" - - model = model.eval().to(device) - input = torch.rand(batch_size, *input_size, device=device) - if use_fp16: - input = input.half() - - warm_up = int(runs * throw_out) - total = 0 - start = time.time() - - with torch.autocast(device.type, enabled=use_fp16): - with torch.no_grad(): - for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): - if i == warm_up: - if is_cuda: - torch.cuda.synchronize() - total = 0 - start = time.time() - - model(input) - total += batch_size - - if is_cuda: - torch.cuda.synchronize() - - end = time.time() - elapsed = end - start - - throughput = total / elapsed - - if verbose: - print(f"Throughput: {throughput:.2f} im/s") - - return throughput diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index c45056318a38..f0e2e7854bff 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import HieraModel, HieraBlock +from .hiera_model import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera_model.py similarity index 100% rename from src/transformers/models/hiera/hiera.py rename to src/transformers/models/hiera/hiera_model.py diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py deleted file mode 100644 index a35b33210941..000000000000 --- a/src/transformers/models/hiera/hiera_utils.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- -# -# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles -# -# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, -# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, -# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. -# -# Paper: https://arxiv.org/abs/2306.00989/ -# -# References: -# slowfast: https://github.com/facebookresearch/SlowFast -# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm -# -------------------------------------------------------- - -import math -from typing import List, Tuple, Optional, Type, Callable, Dict - -import torch -import torch.nn as nn -import torch.nn.functional as F -from .convert_hiera_to_pytorch import convert_state_dict - -def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
""" - - def inner(model_func: Callable) -> Callable: - def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: - if pretrained: - if checkpoints is None: - raise RuntimeError("This model currently doesn't have pretrained weights available.") - elif checkpoint is None: - raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) - if "head.projection.weight" in state_dict["model_state"]: - # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it - if "num_classes" not in kwdargs: - kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] - # If the user specified a different number of classes, remove the projection weights or else we'll error out - elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: - del state_dict["model_state"]["head.projection.weight"] - del state_dict["model_state"]["head.projection.bias"] - - model = model_func(**kwdargs) - if pretrained: - # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): - strict = False - - model.load_state_dict(state_dict["model_state"], strict=strict) - - return model - - return model_def - - return inner - - - -def conv_nd(n: int) -> Type[nn.Module]: - """ - Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) - """ - return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] - - -def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: - # Refer to `Unroll` to see how this performs a maxpool-Nd - return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values - - -def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: - # target_size: [(T), (H), W] - # (spatial) mask: [B, C, (t), (h), w] - if mask is None: - return mask - - assert len(mask.shape[2:]) == len(target_size) - if mask.shape[2:] != target_size: - return F.interpolate(mask.float(), size=target_size) - return mask - - -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: - """Zero-out the masked regions of the input before conv. - Prevents leakage of masked regions when using overlapping kernels. - """ - if conv is None: - return x - if mask is None: - return conv(x) - - mask = get_resized_mask(target_size=x.shape[2:], mask=mask) - return conv(x * mask.bool()) - - -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: - """ - Restore spatial organization by undoing windowed organization of mask units. - - Args: - x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] - shape: current spatial shape, if it were not organized into mask unit - windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. - mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] - Returns: - x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] - """ - D = len(shape) - B, C = x.shape[0], x.shape[-1] - # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] - num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] - x = x.view(B, *num_MUs, *mu_shape, C) - - # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] - permute = ( - [0] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], - [], - ) - + [len(x.shape) - 1] - ) - x = x.permute(permute).reshape(B, *shape, C) - - return x - - - -class Unroll(nn.Module): - """ - Reorders the tokens such that patches are contiguous in memory. - E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as - [B, (Sy, Sx, H // Sy, W // Sx), C] - - This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). - Not only is this faster, but it also makes it easy to support inputs of arbitrary - dimensions in addition to patch-wise sparsity. - - Performing this operation multiple times in sequence puts entire windows as contiguous - in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of - size 8x8 would be contiguous in memory, allowing operations like mask unit attention - computed easily and efficiently, while also allowing max to be applied sequentially. - - Note: This means that intermediate values of the model are not in HxW order, so they - need to be re-rolled if you want to use the intermediate values as a HxW feature map. - The last block of the network is fine though, since by then the strides are all consumed. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - self.schedule = unroll_schedule - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Input: Flattened patch embeddings [B, N, C] - Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd - """ - B, _, C = x.shape - - cur_size = self.size - x = x.view(*([B] + cur_size + [C])) - - for strides in self.schedule: - # Move patches with the given strides to the batch dimension - - # Create a view of the tensor with the patch stride as separate dims - # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] - cur_size = [i // s for i, s in zip(cur_size, strides)] - new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] - x = x.view(new_shape) - - # Move the patch stride into the batch dimension - # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] - L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) - x = x.permute(permute) - - # Now finally flatten the relevant dims into the batch dimension - x = x.flatten(0, len(strides)) - B *= math.prod(strides) - - x = x.reshape(-1, math.prod(self.size), C) - return x - - -class Reroll(nn.Module): - """ - Undos the "unroll" operation so that you can use intermediate features. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - stage_ends: List[int], - q_pool: int, - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - - # The first stage has to reverse everything - # The next stage has to reverse all but the first unroll, etc. 
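A note on the `Unroll` module shown above: it exists so that hierarchical pooling never needs an explicit `MaxPool` over a spatial grid; once tokens are reordered, `do_pool` only has to `view` the stride group back out and take a `max`. Below is a standalone sketch of that equivalence for a single (2, 2) stride on a 2-d, channels-last token grid. It illustrates the idea and is not the library code.

```python
import torch
import torch.nn.functional as F

B, H, W, C = 2, 8, 8, 16
x = torch.randn(B, H, W, C)  # channels-last token grid

# "Unroll" a (2, 2) stride into its own leading dimension:
# [B, H, W, C] -> [B, Sy*Sx, (H//Sy)*(W//Sx), C]
u = (
    x.view(B, H // 2, 2, W // 2, 2, C)
    .permute(0, 2, 4, 1, 3, 5)                # [B, Sy, Sx, H//2, W//2, C]
    .reshape(B, 4, (H // 2) * (W // 2), C)
)
pooled_via_view = u.max(dim=1).values         # max over the stride group, as do_pool does

# Reference: an ordinary 2x2 max pool in channels-first layout
reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=2)
reference = reference.flatten(2).transpose(1, 2)  # back to [B, N, C]

assert torch.allclose(pooled_via_view, reference)
```

In the real model the stride groups from successive stages are folded into the batch dimension, which is exactly why the `Reroll` code continuing below is needed to recover a spatial feature map from intermediate blocks.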
- self.schedule = {} - size = self.size - for i in range(stage_ends[-1] + 1): - self.schedule[i] = unroll_schedule, size - # schedule unchanged if no pooling at a stage end - if i in stage_ends[:q_pool]: - if len(unroll_schedule) > 0: - size = [n // s for n, s in zip(size, unroll_schedule[0])] - unroll_schedule = unroll_schedule[1:] - - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: - """ - Roll the given tensor back up to spatial order assuming it's from the given block. - - If no mask is provided: - - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. - If a mask is provided: - - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. - """ - schedule, size = self.schedule[block_idx] - B, N, C = x.shape - - D = len(size) - cur_mu_shape = [1] * D - - for strides in schedule: - # Extract the current patch from N - x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) - - # Move that patch into the current MU - # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] - L = len(x.shape) - permute = ( - [0, 1 + D] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], - [], - ) - + [L - 1] - ) - x = x.permute(permute) - - # Reshape to [B, N//(Sy*Sx), *MU, C] - for i in range(D): - cur_mu_shape[i] *= strides[i] - x = x.reshape(B, -1, *cur_mu_shape, C) - N = x.shape[1] - - # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) - x = x.view(B, N, *cur_mu_shape, C) - - # If masked, return [B, #MUs, MUy, MUx, C] - if mask is not None: - return x - - # If not masked, we can return [B, H, W, C] - x = undo_windowing(x, size, cur_mu_shape) - - return x \ No newline at end of file From fac7b231f7c7277bc202edfc9b9e765e802620fa Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:10:17 +0000 Subject: [PATCH 008/118] Fixed integration into tranformers --- src/transformers/__init__.py | 2 +- src/transformers/models/hiera/__init__.py | 13 ++++++++----- .../models/hiera/hiera_image_processor.py | 2 +- src/transformers/models/hiera/hiera_model.py | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d3646a75f940..359e0f1a3f50 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6951,8 +6951,8 @@ HubertPreTrainedModel, ) from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - HieraBlock ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 1f388d5361ab..2b83a4c8d693 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -9,8 +9,8 @@ _import_structure = { "configuration_hiera": [ - "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", - "HireaConfig", + "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HieraConfig", ], } @@ -20,15 +20,16 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hirea"] = [ - "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hirea", + _import_structure["hiera_model"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", "Head", "HieraBlock", "MaskUnitAttention" "" ] + if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -42,10 +43,12 @@ pass else: from .hiera_model import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, Head, HieraBlock, MaskUnitAttention, + ) from .hiera_image_processor import ( 
HieraImageProcessor diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4900e4a4d3fb..d3f2ce96a64b 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,5 +1,5 @@ -"""Image processor class for Hirea.""" +"""Image processor class for Hiera.""" from typing import Dict, List, Optional, Union diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index cca502aa80c9..5e7493e3c6a7 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -38,6 +38,9 @@ replace_return_docstrings, ) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "", +] def conv_nd(n: int) -> Type[nn.Module]: """ From 866ffc7573e58a2e6341cae78c36ceb75ceeeba4 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:23:55 +0000 Subject: [PATCH 009/118] Fix: Convert Checkpoint --- .../models/hiera/convert_hiera_to_pytorch.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d0294f12deab..76c86bcb0cbb 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,9 +3,9 @@ import requests import torch from PIL import Image -# from transformers.models.hiera.configuration_hiera import HieraConfig -# from transformers.models.hiera.hiera import HieraModel -# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +from transformers import HieraConfig +from transformers import HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -199,11 +199,13 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...]) # 207: golden retriever (imagenet-1k) - out.argmax(dim=-1).item() + out.last_hidden_state.argmax(dim=-1).item() + # If you also want intermediate feature maps + out = model(inputs[None, ...], return_intermediates=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) + for x in out.intermediates: + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) From b3828e19951bdcdd9482fc0b2d3c050d13510c1e Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:38:00 +0000 Subject: [PATCH 010/118] added documentation for hiera --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 ++ docs/source/en/index.md | 1 + 14 files changed, 15 insertions(+) diff --git a/README.md b/README.md index b7077ce61032..8e33f4f20ac4 100644 --- a/README.md +++ b/README.md @@ -388,6 +388,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_de.md b/README_de.md index f21bebdc7811..82f998c3140c 100644 --- a/README_de.md +++ b/README_de.md @@ -384,6 +384,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_es.md b/README_es.md index 9dfbf8931aba..980de1212979 100644 --- a/README_es.md +++ b/README_es.md @@ -361,6 +361,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_fr.md b/README_fr.md index 75ebdd315f65..211ccfcc9e1f 100644 --- a/README_fr.md +++ b/README_fr.md @@ -382,6 +382,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Les Transformers sont-ils vraiment inefficaces pour la représentation graphique ?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT : la segmentation sémantique émerge de la supervision textuelle](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ : référentiel complet pour la compréhension du langage polonais](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (de Facebook) publié avec l'article [Hiera : un transformateur de vision hiérarchique sans cloches et sifflets]( https://arxiv.org/abs/2306.00989) par Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT : Apprentissage de la représentation autonome de la parole par prédiction masquée des unités cachées](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT : Quantification entière de BERT avec des entiers uniquement](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. 
**[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS : Un ensemble de données filtré à l'échelle du Web d'intercalation de documents texte-image](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_hd.md b/README_hd.md index 6402c3ee5eb7..272999ff1cb6 100644 --- a/README_hd.md +++ b/README_hd.md @@ -335,6 +335,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** ((फेसबुक से) पेपर के साथ जारी किया गया [हिरा: बेल्स-एंड-व्हिसल्स के बिना एक पदानुक्रमित विजन ट्रांसफार्मर](https://arxiv.org/abs/2306.00989) by चैतन्य रयाली, युआन-टिंग हू, डैनियल बोल्या, चेन वेई, हाओकी फैन, पो-याओ हुआंग, वैभव अग्रवाल, अर्कबंधु चौधरी, ओमिद पौरसीद, जूडी हॉफमैन, जितेंद्र मलिक, द्वारा यांगहाओ ली, क्रिस्टोफ़ फ़िचटेनहोफ़र 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ja.md b/README_ja.md index bd8a058b7b1b..51fdc9d64710 100644 --- a/README_ja.md +++ b/README_ja.md @@ -395,6 +395,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook から) Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer から公開された研究論文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ko.md b/README_ko.md index 533ab4685bce..b844bc23474c 100644 --- a/README_ko.md +++ b/README_ko.md @@ -310,6 +310,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. +1. 
**[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook 에서) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_pt-br.md b/README_pt-br.md index 40841bd82b9f..279b128a05d8 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -389,6 +389,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. 
**[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ru.md b/README_ru.md index 3e6f3d54f27e..ef7c970f1ae3 100644 --- a/README_ru.md +++ b/README_ru.md @@ -380,6 +380,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_te.md b/README_te.md index 2c0b97dada67..e8073232a6a8 100644 --- a/README_te.md +++ b/README_te.md @@ -382,6 +382,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hans.md b/README_zh-hans.md index f2b9b38273bf..154425954ace 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -334,6 +334,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (来自 Facebook) 伴随论文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 由 Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hant.md b/README_zh-hant.md index 1d5155529aa0..2e3ab3ec4bd4 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -346,6 +346,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 678b679cb143..c169a2c625fb 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -620,6 +620,8 @@ title: CLAP - local: model_doc/encodec title: EnCodec + - local: model_doc/hiera + title: Hiera - local: model_doc/hubert title: Hubert - local: model_doc/mctct diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 81dc97e97134..4c809e6c100b 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -154,6 +154,7 @@ Flax), PyTorch, and/or TensorFlow. | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | +| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ | From 82672b2ab5f12a39478c406102f671b217ef7bde Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:38:31 +0000 Subject: [PATCH 011/118] added documentation for hiera --- docs/source/en/model_doc/hiera.md | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 docs/source/en/model_doc/hiera.md diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md new file mode 100644 index 000000000000..1c46bae9b072 --- /dev/null +++ b/docs/source/en/model_doc/hiera.md @@ -0,0 +1,40 @@ + + +# Hiera + +## Overview + +Hubert was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer + +The abstract from the paper is the following: + +Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy. 
In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera. + +## HireaConfig + +[[autodoc]] HieraConfig + + + + +## HireaModel + +[[autodoc]] HireaModel + - forward + + + \ No newline at end of file From 00478b60cd5cc1e448e64bb4ce25961baf8f8368 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:39:18 +0000 Subject: [PATCH 012/118] added Docstings to models, Transformers based changes --- src/transformers/__init__.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/hiera/__init__.py | 24 ++-- .../models/hiera/configuration_hiera.py | 15 +++ .../models/hiera/convert_hiera_to_pytorch.py | 15 +++ .../models/hiera/hiera_image_processor.py | 14 +++ src/transformers/models/hiera/hiera_mae.py | 113 +----------------- src/transformers/models/hiera/hiera_model.py | 89 +++++++++----- 8 files changed, 124 insertions(+), 149 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 359e0f1a3f50..346eb625808b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4121,6 +4121,7 @@ [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", + "HieraPreTrainedModel" ] ) @@ -6953,6 +6954,7 @@ from .models.hiera import ( HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, + HieraPreTrainedModel ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c9cd6fca69d6..788b671a232d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -69,6 +69,7 @@ ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), ("groupvit", "CLIPImageProcessor"), + ("hiera", "HieraImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 2b83a4c8d693..0787bffe767e 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
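Taken together, the new doc page, the top-level exports (`HieraModel`, `HieraPreTrainedModel`) and the `("hiera", "HieraImageProcessor")` entry in the auto image-processor mapping imply a usage pattern along the following lines. This is a sketch assembled from the docstring example and the conversion script elsewhere in this series (`embedding_dimension`, `number_of_heads`, `stages`, `last_hidden_state`, `return_intermediates`); it is not a tested API.

```python
import torch
from transformers import HieraConfig, HieraModel

# Values mirror the example in the HieraModel docstring added in this series:
# HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
model = HieraModel(config)

pixel_values = torch.rand(1, 3, 224, 224)

outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # HieraModelOutput.last_hidden_state

# As in convert_hiera_to_pytorch.py, intermediate feature maps can be requested
outputs = model(pixel_values, return_intermediates=True)
for feature_map in outputs.intermediates:
    print(feature_map.shape)
```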
+ from typing import TYPE_CHECKING from ...utils import ( @@ -23,9 +38,7 @@ _import_structure["hiera_model"] = [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", - "Head", - "HieraBlock", - "MaskUnitAttention" + "HieraPreTrainedModel" "" ] @@ -45,10 +58,7 @@ from .hiera_model import ( HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - Head, - HieraBlock, - MaskUnitAttention, - + HieraPreTrainedModel ) from .hiera_image_processor import ( HieraImageProcessor diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index e3133354f6ea..a4ab4fd9d30b 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -1,5 +1,20 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ hiera model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging from typing import Tuple diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 76c86bcb0cbb..5ca2ecd262d9 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import requests diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index d3f2ce96a64b..4e41e14bc6f8 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,3 +1,17 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
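The `_import_structure` / `_LazyModule` arrangement in `hiera/__init__.py` above defers importing the torch-heavy `hiera_model` module until one of its symbols is actually requested. A rough, self-contained illustration of the same idea using a PEP 562 module-level `__getattr__` follows; this is only a stand-in for the concept, not how `_LazyModule` is implemented.

```python
# sketch of a lazy package __init__.py (simplified stand-in, not transformers' _LazyModule)
import importlib
from typing import TYPE_CHECKING

_import_structure = {
    "configuration_hiera": ["HieraConfig"],
    "hiera_model": ["HieraModel", "HieraPreTrainedModel"],
}

if TYPE_CHECKING:
    # Static type checkers and IDEs see the real imports
    from .configuration_hiera import HieraConfig
    from .hiera_model import HieraModel, HieraPreTrainedModel
else:
    _name_to_module = {
        name: module for module, names in _import_structure.items() for name in names
    }

    def __getattr__(name):
        # PEP 562: called only on the first access to a missing attribute,
        # so the heavy submodule is imported on demand
        if name in _name_to_module:
            module = importlib.import_module(f".{_name_to_module[name]}", __name__)
            return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```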
"""Image processor class for Hiera.""" diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index f0e2e7854bff..d4ec15058b2d 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,8 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock -from .hiera_utils import pretrained_model, undo_windowing, conv_nd +from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: @@ -287,112 +286,4 @@ def forward( ) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask - - - - -# Image Models - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k") -def mae_hiera_tiny_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k") -def mae_hiera_small_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k") -def mae_hiera_base_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k") -def mae_hiera_base_plus_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k") -def mae_hiera_large_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k") -def mae_hiera_huge_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - - -# Video Models - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400") -def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): - return MaskedAutoencoderHiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_pos_embed=True, - q_pool=2, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_base_plus_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_large_16x224(**kwdargs): - return mae_hiera_base_16x224( - 
embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400") -def mae_hiera_huge_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index 5e7493e3c6a7..b1ed0db0e4b9 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -271,10 +271,6 @@ class HieraModelOutput(ModelOutput): Args: last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): - Attentions weights from the model, one for each layer. - hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): - Hidden states of the model at the output of each layer. intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ @@ -422,10 +418,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x - +@add_start_docstrings(""" +Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). +""") class PatchEmbedding(nn.Module): - """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" - def __init__( self, dim_in: int, @@ -453,27 +449,49 @@ def forward( embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings - -class HieraModel(PreTrainedModel): +class HieraPreTrainedModel(PreTrainedModel): """ - Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True - This model is a PyTorch implementation of the Hiera architecture for image classification. + def _init_weights(self, module, init_bias=0.02): + if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(module.weight, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + nn.init.constant_(module.bias, init_bias) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, init_bias) + nn.init.constant_(module.weight, 1.0) - The model can be used as follows: - Args: - config (HieraConfig): Configuration class instance for `Hiera`. - Example usage: - >>> from your_model_file import Hiera, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - >>> model = Hiera(config) - >>> inputs = torch.rand((1, 3, 224, 224)) - >>> outputs = model(inputs) - """ +@add_start_docstrings(""" +Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + +This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. 
+ +The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. +Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + +Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """) +class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -601,14 +619,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - def _init_weights(self, m, init_bias=0.02): - if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): - nn.init.trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, init_bias) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, init_bias) - nn.init.constant_(m.weight, 1.0) @torch.jit.ignore def no_weight_decay(self): @@ -655,6 +665,25 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings + @add_start_docstrings_to_model_forward(""" + The forward pass for the Hiera model. + + Args: + pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. + + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. + + + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. + + return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + + + + """) + @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") def forward( self, pixel_values: torch.Tensor, @@ -663,8 +692,6 @@ def forward( return_intermediates: bool = False, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ - mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. - Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list if isinstance(pixel_values, list): From 4144fe8a41da4b088e9db149b756e5b32aa7c934 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:55:51 +0000 Subject: [PATCH 013/118] make style and quality --- src/transformers/__init__.py | 15 +-- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/hiera/__init__.py | 19 +--- .../models/hiera/configuration_hiera.py | 18 ++-- .../models/hiera/convert_hiera_to_pytorch.py | 102 +++++++++--------- .../models/hiera/hiera_image_processor.py | 51 ++++----- src/transformers/models/hiera/hiera_mae.py | 54 +++------- 7 files changed, 104 insertions(+), 161 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 346eb625808b..27141f8fc304 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -496,7 +496,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], - "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], + "models.hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -4118,12 +4118,7 @@ ] ) _import_structure["models.hiera"].extend( - [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - - ] + ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] ) _import_structure["models.hubert"].extend( [ @@ -6944,6 +6939,7 @@ GroupViTTextModel, GroupViTVisionModel, ) + from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, @@ -6951,11 +6947,6 @@ HubertModel, HubertPreTrainedModel, ) - from .models.hiera import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 58ce7f77f5a8..8c47296b1140 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -114,8 +114,8 @@ ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), + ("hiera", "HieraConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -347,8 +347,8 @@ ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera", "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -581,7 +581,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","HieraModel"), + ("hiera", "HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0787bffe767e..fcffbbf7593e 100644 --- a/src/transformers/models/hiera/__init__.py +++ 
b/src/transformers/models/hiera/__init__.py @@ -35,12 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - "" - ] + _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] if TYPE_CHECKING: @@ -55,16 +50,10 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera_model import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) - from .hiera_image_processor import ( - HieraImageProcessor - ) + from .hiera_image_processor import HieraImageProcessor + from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index a4ab4fd9d30b..8d40e7a72777 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -15,15 +15,15 @@ """ hiera model configuration""" +from typing import Tuple + from ...configuration_utils import PretrainedConfig from ...utils import logging -from typing import Tuple -logger = logging.get_logger(__name__) -HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { +logger = logging.get_logger(__name__) -} +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class HieraConfig(PretrainedConfig): @@ -42,7 +42,7 @@ class HieraConfig(PretrainedConfig): embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. num_classes (int, optional): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. @@ -58,7 +58,7 @@ class HieraConfig(PretrainedConfig): head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. - + Example: ```python >>> from transformers import HieraConfig, HieraModel @@ -72,9 +72,10 @@ class HieraConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ``` - """ + """ model_type = "hiera" + def __init__( self, input_size: Tuple[int, ...] 
= (224, 224), @@ -99,7 +100,6 @@ def __init__( head_init_scale: float = 0.001, sep_position_embeddings: bool = False, **kwargs, - ): super().__init__(**kwargs) self.input_size = input_size @@ -121,4 +121,4 @@ def __init__( self.drop_path_rate = drop_path_rate self.head_dropout = head_dropout self.head_init_scale = head_init_scale - self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file + self.sep_position_embeddings = sep_position_embeddings diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 5ca2ecd262d9..794a62147d78 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -15,17 +15,11 @@ import argparse -import requests import torch -from PIL import Image -from transformers import HieraConfig -from transformers import HieraModel -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, HieraModel -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +# from transformers import HieraConfig, HieraModel +from transformers import HieraConfig, HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor def rename_key(name): @@ -51,7 +45,7 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state -def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs): strict = True pretrained_models_links = { "hiera_tiny_224": { @@ -93,21 +87,24 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): "hiera_huge_16x224": { "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", - } + }, } - if "hiera_tiny_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 7, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2), + ) checkpoints = pretrained_models_links["hiera_tiny_224"] checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] elif "hiera_small_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 11, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2), + ) checkpoints = pretrained_models_links["hiera_small_224"] checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] @@ -118,56 +115,57 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] elif "hiera_base_plus_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3),) + config = HieraConfig( + embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3), + ) checkpoints = pretrained_models_links["hiera_base_plus_224"] checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] elif "hiera_large_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4),) + config = HieraConfig( + embedding_dimension=144, + 
number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_224"] checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] elif "hiera_huge_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4)) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_224"] checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] elif "hiera_base_16x224" in checkpoint_url: - config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True,) + config = HieraConfig( + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True, + ) checkpoints = pretrained_models_links["hiera_base_16x224"] checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] elif "hiera_base_plus_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3)) + config = HieraConfig(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3)) checkpoints = pretrained_models_links["hiera_base_plus_16x224"] checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] elif "hiera_large_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4), ) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_16x224"] checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] elif "hiera_huge_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4) ) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] elif checkpoint not in checkpoints: @@ -181,7 +179,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): raise RuntimeError("No checkpoint specified.") state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"], {}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if config.num_classes is None: @@ -194,19 +192,16 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr( + model, "decoder_position_embeddings" + ): strict = False - model.load_state_dict(state_dict["model_state"]) + model.load_state_dict(state_dict["model_state"], 
strict) # model.load_state_dict(state_dict["model_state"], strict=strict) - - - url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - - image_processor = HieraImageProcessor(size=224) inputs = image_processor.process_image(image_url=url) @@ -220,7 +215,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...], return_intermediates=True) for x in out.intermediates: - print(x.shape) + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) @@ -231,4 +226,3 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="~/") - diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4e41e14bc6f8..0200687c4835 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -15,32 +15,18 @@ """Image processor class for Hiera.""" -from typing import Dict, List, Optional, Union -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import rescale, resize, to_channel_dimension_format -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from ...utils import TensorType, is_vision_available, logging -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from PIL import Image import requests +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +from ...image_processing_utils import BaseImageProcessor +from ...utils import is_vision_available, logging if is_vision_available(): - import PIL + from PIL import Image + from torchvision import transforms + from torchvision.transforms.functional import InterpolationMode logger = logging.get_logger(__name__) @@ -51,20 +37,23 @@ def __init__(self, size): self.size = size self.transform_list = [ transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) + transforms.CenterCrop(self.size), ] self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - + self.transform_norm = transforms.Compose( + self.transform_list + + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ] + ) + def process_image(self, image_url): # Load the image img = Image.open(requests.get(image_url, stream=True).raw) - + # Apply transformations - img_vis = self.transform_vis(img) + # img_vis = self.transform_vis(img) img_norm = self.transform_norm(img) - - return img_norm \ No newline at end of file + + return img_norm diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index d4ec15058b2d..56b91bc7acb7 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ 
-10,28 +10,28 @@ # -------------------------------------------------------- +import math from functools import partial -from typing import Tuple, Optional +from typing import Optional, Tuple -import math import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd +from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - batch_size , num_mask_units = x.shape[0:2] + batch_size, num_mask_units = x.shape[0:2] # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size, num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -64,8 +64,7 @@ def __init__( i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) ] self.tokens_spatial_shape_final = [ - i // s ** (self.q_pool) - for i, s in zip(self.tokens_spatial_shape, self.q_stride) + i // s ** (self.q_pool) for i, s in zip(self.tokens_spatial_shape, self.q_stride) ] # -------------------------------------------------------------------------- # Multi-scale fusion heads @@ -73,9 +72,7 @@ def __init__( self.multi_scale_fusion_heads = nn.ModuleList() for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool - kernel = [ - i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) - ] + kernel = [i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final)] curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] self.multi_scale_fusion_heads.append( conv_nd(len(self.q_stride))( @@ -94,9 +91,7 @@ def __init__( self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) self.decoder_pos_embed = nn.Parameter( - torch.zeros( - 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim - ) + torch.zeros(1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim) ) self.decoder_blocks = nn.ModuleList( @@ -113,9 +108,7 @@ def __init__( ) self.decoder_norm = norm_layer(decoder_embed_dim) - self.pred_stride = patch_stride[-1] * ( - self.q_stride[-1] ** self.q_pool - ) # patch stride of prediction + self.pred_stride = patch_stride[-1] * (self.q_stride[-1] ** self.q_pool) # patch stride of prediction self.decoder_pred = nn.Linear( decoder_embed_dim, @@ -143,9 +136,7 @@ def _mae_init_weights(self, m: nn.Module): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) - def get_pixel_label_2d( - self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_2d(self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* input_img = input_img.permute(0, 2, 3, 1) @@ -160,13 +151,11 @@ def get_pixel_label_2d( return label - def get_pixel_label_3d( - self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_3d(self, input_vid: torch.Tensor, mask: torch.Tensor, norm: 
bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* # We use time strided loss, only take the first frame from each token - input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + input_vid = input_vid[:, :, :: self.patch_stride[0], :, :] size = self.pred_stride label = input_vid.unfold(3, size, size).unfold(4, size, size) @@ -181,11 +170,9 @@ def get_pixel_label_3d( return label - def forward_encoder( self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor]: - if mask is None: mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] @@ -203,9 +190,7 @@ def forward_encoder( return x, mask - def forward_decoder( - self, x: torch.Tensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + def forward_decoder(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # Embed tokens x = self.decoder_embed(x) @@ -214,9 +199,7 @@ def forward_decoder( # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) - mask_tokens = self.mask_token.view( - (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) - ) + mask_tokens = self.mask_token.view((1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,)) mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() x_dec[mask] = x.flatten() @@ -279,11 +262,8 @@ def forward( mask_ratio: float = 0.6, mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) - pred, pred_mask = self.forward_decoder( - latent, mask - ) # pred_mask is mask at resolution of *prediction* + pred, pred_mask = self.forward_decoder(latent, mask) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file + return *self.forward_loss(x, pred, ~pred_mask), mask From d23a70d0d14d5ca50a7d20fde8e2ace59231a714 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:56:38 +0000 Subject: [PATCH 014/118] make style and quality --- src/transformers/models/hiera/hiera_model.py | 128 ++++++++----------- 1 file changed, 56 insertions(+), 72 deletions(-) diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index b1ed0db0e4b9..9345084769ec 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -19,29 +19,29 @@ # -------------------------------------------------------- import math +from dataclasses import dataclass from functools import partial -from typing import List, Tuple, Callable, Optional, Union, Type -from .configuration_hiera import HieraConfig +from typing import Callable, List, Optional, Tuple, Type, Union + import torch import torch.nn as nn import torch.nn.functional as F -from dataclasses import dataclass - from timm.models.layers import DropPath, Mlp + from ...modeling_utils import PreTrainedModel -from ...modeling_outputs import BaseModelOutput from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, ) +from .configuration_hiera import HieraConfig + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "", + 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", ] + def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. @@ -67,9 +67,7 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso return mask -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: +def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None) -> torch.Tensor: """Zero-out the masked regions of the input before conv. Prevents leakage of masked regions when using overlapping kernels. """ @@ -82,9 +80,7 @@ def do_masked_conv( return conv(x * mask.bool()) -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: +def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: """ Restore spatial organization by undoing windowed organization of mask units. @@ -116,7 +112,6 @@ def undo_windowing( return x - class Unroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. @@ -169,9 +164,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Move the patch stride into the batch dimension # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) + permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] x = x.permute(permute) # Now finally flatten the relevant dims into the batch dimension @@ -210,9 +203,7 @@ def __init__( size = [n // s for n, s in zip(size, unroll_schedule[0])] unroll_schedule = unroll_schedule[1:] - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: + def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: """ Roll the given tensor back up to spatial order assuming it's from the given block. @@ -269,11 +260,12 @@ class HieraModelOutput(ModelOutput): Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. Args: - last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - intermediates (List[torch.Tensor], optional): + intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ + last_hidden_state: torch.FloatTensor intermediates: Optional[List[torch.Tensor]] = None @@ -320,15 +312,13 @@ def __init__( self.use_mask_unit_attention = use_mask_unit_attention def forward(self, embeddings: torch.Tensor) -> torch.Tensor: - """ Input should be of shape [batch, tokens, channels]. 
""" - batch_size , num_channels , _ = embeddings.shape - num_windows = ( - (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 - ) + """Input should be of shape [batch, tokens, channels].""" + batch_size, num_channels, _ = embeddings.shape + num_windows = (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 qkv = ( self.qkv(embeddings) - .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) + .reshape(batch_size, -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -336,7 +326,7 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size, self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -347,9 +337,9 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - embeddings = (attention @ v) + embeddings = attention @ v - embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = embeddings.transpose(1, 3).reshape(batch_size, -1, self.output_dim) embeddings = self.projection(embeddings) return embeddings @@ -418,9 +408,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x -@add_start_docstrings(""" + +@add_start_docstrings( + """ Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). -""") +""" +) class PatchEmbedding(nn.Module): def __init__( self, @@ -442,18 +435,18 @@ def __init__( padding=padding, ) - def forward( - self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: embeddings = do_masked_conv(pixel_values, self.projection, mask) embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings + class HieraPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -469,9 +462,8 @@ def _init_weights(self, module, init_bias=0.02): nn.init.constant_(module.weight, 1.0) - - -@add_start_docstrings(""" +@add_start_docstrings( + """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. @@ -482,7 +474,7 @@ def _init_weights(self, module, init_bias=0.02): config ([`HieraConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
- + Example usage: >>> from your_model_file import Hiera, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) @@ -490,7 +482,8 @@ def _init_weights(self, module, init_bias=0.02): >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) - """) + """ +) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" @@ -531,9 +524,7 @@ def __init__(self, config: HieraConfig): assert self.q_pool < len(self.stages) self.q_pool, self.q_stride = self.q_pool, self.q_stride self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size - self.mask_spatial_shape = [ - i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) - ] + self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size)] self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( @@ -555,9 +546,7 @@ def __init__(self, config: HieraConfig): self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules - self.unroll = Unroll( - self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) - ) + self.unroll = Unroll(self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1])) self.reroll = Reroll( self.input_size, self.patch_stride, @@ -566,7 +555,7 @@ def __init__(self, config: HieraConfig): self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[: self.q_pool]] # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] @@ -619,7 +608,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - @torch.jit.ignore def no_weight_decay(self): if self.sep_position_embeddings: @@ -632,21 +620,19 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. """ - batch_size = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(batch_size , num_windows, device=x.device) + noise = torch.rand(batch_size, num_windows, device=x.device) # Sort noise for each sample - ids_shuffle = torch.argsort( - noise, dim=1 - ) # ascend: small is keep, large is remove + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove ids_restore = torch.argsort(ids_shuffle, dim=1) # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([batch_size , num_windows], device=x.device) + mask = torch.zeros([batch_size, num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) @@ -665,34 +651,34 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings - @add_start_docstrings_to_model_forward(""" + @add_start_docstrings_to_model_forward( + """ The forward pass for the Hiera model. Args: pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. 
- + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. - - - - """) - @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") + + + + """ + ) def forward( self, pixel_values: torch.Tensor, mask: torch.Tensor = None, return_dict: Optional[bool] = True, - return_intermediates: bool = False, + return_intermediates: bool = True, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - """ - """ + """ """ # Slowfast training passes in a list if isinstance(pixel_values, list): pixel_values = pixel_values[0] @@ -700,9 +686,7 @@ def forward( pached_embeddings = self.patch_embedding( pixel_values, - mask=mask.view( - pixel_values.shape[0], 1, *self.mask_spatial_shape - ) # batch_size , C, *mask_spatial_shape + mask=mask.view(pixel_values.shape[0], 1, *self.mask_spatial_shape) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) @@ -732,8 +716,8 @@ def forward( # intermediates[-1] is embeddings in spatial order if not return_dict: return tuple(v for v in [embeddings, intermediates] if v is not None) - + return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) \ No newline at end of file + ) From c677783fcfd38ea9b0771e61eaaa1091bf7850fa Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Mon, 26 Feb 2024 23:11:01 +0000 Subject: [PATCH 015/118] Integration & Block tests running --- tests/models/hiera/test_modeling_hiera.py | 265 +++++++++++++++++++--- 1 file changed, 235 insertions(+), 30 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 8d593af2a622..72badde557df 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -15,7 +15,8 @@ """ Testing suite for the PyTorch Hiera model. """ import unittest - +from typing import Tuple +from transformers.models.hiera.hiera_model import HieraBlock from transformers import HieraConfig from transformers.testing_utils import ( require_torch, @@ -23,65 +24,269 @@ torch_device, ) from transformers.utils import is_torch_available - +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD if is_torch_available(): import torch from transformers import HieraModel - # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model - from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST - - + from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + from torchvision.transforms.functional import InterpolationMode + from torchvision import transforms + from PIL import Image +import math class HieraModelTester: - # Define this tester to initialize Hiera model and its configurations for testing def __init__( self, parent, - batch_size=8, - num_channels=3, - image_size=224, - # Add other model-specific parameters here + input_size: Tuple[int, ...] 
= (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, ): self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - # Initialize other necessary attributes here + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings - def prepare_config_and_inputs(self): + def prepare_config_and_inputs(self,checkpoint_url): # Prepare configuration and inputs for testing your model - pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + pixel_values = torch.rand((1, self.in_chans, self.input_size[0], self.input_size[1])) - config = self.get_config() + config = self.get_config(checkpoint_url=checkpoint_url) return config, pixel_values - def get_config(self): - return HieraConfig( - # Define necessary configuration parameters here - ) + def get_config(self,checkpoint_url): + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), ) + + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=self.num_classes, + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = 
HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + else: + raise RuntimeError(f"Invalid checkpoint url ({checkpoint_url})") + + return config def create_and_check_model(self, config, pixel_values): + batch_size = 1 model = HieraModel(config=config) - model.to(torch_device) + num_patches = int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1)**2 + flat_q_stride = math.prod(self.q_stride) + embedding_dimension = self.embedding_dimension + indermediate_shapes = [] + for _ in self.stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 model.eval() with torch.no_grad(): result = model(pixel_values=pixel_values) - # Perform checks here, e.g., output shapes, etc. - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + for idx, x in enumerate(result.intermediates): + self.parent.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") @require_torch -class HieraModelTest(unittest.TestCase): +class HieraModelTest(): def setUp(self): self.model_tester = HieraModelTester(self) def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config_and_inputs = self.model_tester.prepare_config_and_inputs(model_name) + self.model_tester.create_and_check_model(*config_and_inputs) - @slow + # @slow def test_model_from_pretrained(self): for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = HieraModel.from_pretrained(model_name) - self.assertIsNotNone(model) \ No newline at end of file + self.assertIsNotNone(model) + +@require_torch +@slow +class HieraModelIntegrationTest(unittest.TestCase): + def test_forward(self): + torch_device = "cpu" + input_size = 224 + batch_size =1 + patch_kernel = (7,7) + patch_padding = (3,3) + patch_stride = (4,4) + q_stride = (2,2) + flat_q_stride = math.prod(q_stride) + stages=(2, 3, 16, 3) + embedding_dimension = 96 + model = HieraModel.from_pretrained("/home/ubuntu/home/hiera/model/") + model.to(torch_device) + + random_tensor = torch.rand(batch_size, 3, input_size, input_size) + num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1)**2 + + indermediate_shapes = [] + for _ in stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 + out = model(random_tensor) + + out.last_hidden_state.argmax(dim=-1).item() + + out = model(random_tensor, return_intermediates=True) + for idx, x in enumerate(out.intermediates): + self.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") + +class TestHieraBlock(unittest.TestCase): + def test_output_shape(self): + batch_size, input_dim, output_dim = 1, 96, 192 + number_of_heads = 2 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 4 + 
window_size = 16 + use_mask_unit_attention = True + num_patches = 3136 + + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches/q_stride, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect") + + def test_input_output_dim_equality(self): + batch_size, input_dim, output_dim = 1, 96, 96 + number_of_heads = 1 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 1 + window_size = 64 + use_mask_unit_attention = True + num_patches = 3136 + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect. Input shape should be equal to output shape") + + +if __name__ == '__main__': + test = HieraModelIntegrationTest() + test.test_forward() + block_test = TestHieraBlock() + block_test.test_output_shape() + block_test.test_input_output_dim_equality() + model_test = HieraModelTest() + model_test.setUp() + model_test.test_model() + model_test.test_model_from_pretrained() From 130b55b1f010ff22058d420dbaff3cf54e87db06 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 06:04:01 +0000 Subject: [PATCH 016/118] Fixed bugs --- src/transformers/__init__.py | 6 +++++- src/transformers/models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 8 ++++++-- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera_model.py => modeling_hiera.py} | 0 6 files changed, 14 insertions(+), 5 deletions(-) rename src/transformers/models/hiera/{hiera_model.py => modeling_hiera.py} (100%) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4d7ef6ce20d3..9d668babbec2 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6981,7 +6981,11 @@ GroupViTTextModel, GroupViTVisionModel, ) - from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, + HieraPreTrainedModel, + ) from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6f824a2e955d..10511e2ff47e 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera", "HieraModel"), + ("hiera", "Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0fc417e795e4..fb4d571632a4 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -501,6 +501,7 @@ ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), ("glpn", "GLPNModel"), + ("hiera", "HieraModel"), ("imagegpt", "ImageGPTModel"), ("levit", "LevitModel"), ("mobilenet_v1", "MobileNetV1Model"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index fcffbbf7593e..d32f0a934fea 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -35,7 +35,11 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] + _import_structure["modeling_hiera"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", + "HieraPreTrainedModel " + ] if TYPE_CHECKING: @@ -51,7 +55,7 @@ pass else: from .hiera_image_processor import HieraImageProcessor - from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 56b91bc7acb7..7c42c22734a1 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing +from .modeling_hiera import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/modeling_hiera.py similarity index 100% rename from src/transformers/models/hiera/hiera_model.py rename to src/transformers/models/hiera/modeling_hiera.py From 733c59e25212ba99ea00a83db780b034bbfa9376 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 00:06:17 +0000 Subject: [PATCH 017/118] initialized Structure --- src/transformers/models/__init__.py | 1 + src/transformers/models/hiera/__init__.py | 82 +++ src/transformers/models/hiera/benchmarking.py | 77 +++ .../models/hiera/configuration_hiera.py | 128 +++++ .../models/hiera/convert_hiera_to_pytorch.py | 27 + src/transformers/models/hiera/hiera.py | 535 ++++++++++++++++++ src/transformers/models/hiera/hiera_mae.py | 398 +++++++++++++ src/transformers/models/hiera/hiera_utils.py | 287 ++++++++++ 8 files changed, 1535 insertions(+) create mode 100644 src/transformers/models/hiera/__init__.py create mode 100644 src/transformers/models/hiera/benchmarking.py create mode 100644 src/transformers/models/hiera/configuration_hiera.py create mode 100644 src/transformers/models/hiera/convert_hiera_to_pytorch.py create mode 100644 src/transformers/models/hiera/hiera.py create mode 100644 src/transformers/models/hiera/hiera_mae.py create mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index ebb3db25fb96..5b9c5404fd7a 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -106,6 +106,7 @@ graphormer, groupvit, herbert, + hiera, hubert, ibert, idefics, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py new file mode 100644 index 000000000000..bfd200e9dcb9 --- /dev/null +++ 
b/src/transformers/models/hiera/__init__.py @@ -0,0 +1,82 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_tf_available, + is_torch_available, +) + + +_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_vit_mae"] = [ + "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTMAEForPreTraining", + "ViTMAELayer", + "ViTMAEModel", + "ViTMAEPreTrainedModel", + ] + +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_vit_mae"] = [ + "TFViTMAEForPreTraining", + "TFViTMAEModel", + "TFViTMAEPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_vit_mae import ( + VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTMAEForPreTraining, + ViTMAELayer, + ViTMAEModel, + ViTMAEPreTrainedModel, + ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py new file mode 100644 index 000000000000..33166028977a --- /dev/null +++ b/src/transformers/models/hiera/benchmarking.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- + +import time +from typing import List, Tuple, Union + +import torch +from tqdm import tqdm + +# From https://github.com/facebookresearch/ToMe/ +def benchmark( + model: torch.nn.Module, + device: torch.device = 0, + input_size: Tuple[int] = (3, 224, 224), + batch_size: int = 64, + runs: int = 40, + throw_out: float = 0.25, + use_fp16: bool = False, + verbose: bool = False, +) -> float: + """ + Benchmark the given model with random inputs at the given batch size. 
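# --- Illustrative usage sketch (not part of the patch): calling `benchmark` as defined
# --- in this file. The tiny Conv2d stand-in, CPU device and small run counts are
# --- assumptions for a quick smoke test; a real Hiera model would normally be passed.
import torch
from transformers.models.hiera.benchmarking import benchmark   # module path introduced by this patch

toy_model = torch.nn.Conv2d(3, 8, kernel_size=3)
images_per_second = benchmark(
    toy_model, device="cpu", input_size=(3, 224, 224), batch_size=2, runs=4, verbose=True
)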
+ + Args: + - model: the module to benchmark + - device: the device to use for benchmarking + - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) + - batch_size: the batch size to use for evaluation + - runs: the number of total runs to do + - throw_out: the percentage of runs to throw out at the start of testing + - use_fp16: whether or not to benchmark with float16 and autocast + - verbose: whether or not to use tqdm to print progress / print throughput at end + + Returns: + - the throughput measured in images / second + """ + if not isinstance(device, torch.device): + device = torch.device(device) + is_cuda = torch.device(device).type == "cuda" + + model = model.eval().to(device) + input = torch.rand(batch_size, *input_size, device=device) + if use_fp16: + input = input.half() + + warm_up = int(runs * throw_out) + total = 0 + start = time.time() + + with torch.autocast(device.type, enabled=use_fp16): + with torch.no_grad(): + for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): + if i == warm_up: + if is_cuda: + torch.cuda.synchronize() + total = 0 + start = time.time() + + model(input) + total += batch_size + + if is_cuda: + torch.cuda.synchronize() + + end = time.time() + elapsed = end - start + + throughput = total / elapsed + + if verbose: + print(f"Throughput: {throughput:.2f} im/s") + + return throughput diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py new file mode 100644 index 000000000000..de5de9e7d9e9 --- /dev/null +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -0,0 +1,128 @@ +""" hiera model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", + # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +} + + +class ViTMAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT + MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the ViT + [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + decoder_num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the decoder. + decoder_num_hidden_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the decoder. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. + mask_ratio (`float`, *optional*, defaults to 0.75): + The ratio of the number of masked tokens in the input sequence. + norm_pix_loss (`bool`, *optional*, defaults to `False`): + Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved + representation quality in the experiments of the authors. + + Example: + + ```python + >>> from transformers import ViTMAEConfig, ViTMAEModel + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = ViTMAEConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = ViTMAEModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vit_mae" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + decoder_num_attention_heads=16, + decoder_hidden_size=512, + decoder_num_hidden_layers=8, + decoder_intermediate_size=2048, + mask_ratio=0.75, + norm_pix_loss=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_hidden_size = decoder_hidden_size + self.decoder_num_hidden_layers = decoder_num_hidden_layers + self.decoder_intermediate_size = decoder_intermediate_size + self.mask_ratio = mask_ratio + self.norm_pix_loss 
= norm_pix_loss diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py new file mode 100644 index 000000000000..506507e4e66e --- /dev/null +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -0,0 +1,27 @@ +import argparse + +import requests +import torch +from PIL import Image + + + +def rename_key(name): + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "patch_embed.projection") + return name + + +def e(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + pass + else: + new_name = rename_key(key) + orig_state_dict[new_name] = val + + return orig_state_dict + + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py new file mode 100644 index 000000000000..35e8c93e160b --- /dev/null +++ b/src/transformers/models/hiera/hiera.py @@ -0,0 +1,535 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from functools import partial +from typing import List, Tuple, Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.models.layers import DropPath, Mlp + +from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll + + + +class MaskUnitAttention(nn.Module): + """ + Computes either Mask Unit or Global Attention. Also is able to perform q pooling. + + Note: this assumes the tokens have already been flattened and unrolled into mask units. + See `Unroll` for more details. + """ + + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + """ + Args: + - dim, dim_out: The input and output feature dimensions. + - heads: The number of attention heads. + - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). + - window_size: The current (flattened) size of a mask unit *after* pooling (if any). + - use_mask_unit_attn: Use Mask Unit or Global Attention. + """ + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.heads = heads + self.q_stride = q_stride + + self.head_dim = dim_out // heads + self.scale = (self.head_dim) ** -0.5 + + self.qkv = nn.Linear(dim, 3 * dim_out) + self.proj = nn.Linear(dim_out, dim_out) + + self.window_size = window_size + self.use_mask_unit_attn = use_mask_unit_attn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Input should be of shape [batch, tokens, channels]. 
""" + B, N, _ = x.shape + num_windows = ( + (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + ) + + qkv = ( + self.qkv(x) + .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .permute(3, 0, 4, 2, 1, 5) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + + if self.q_stride > 1: + # Refer to Unroll to see how this performs a maxpool-Nd + q = ( + q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + .max(dim=3) + .values + ) + + if hasattr(F, "scaled_dot_product_attention"): + # Note: the original paper did *not* use SDPA, it's a free boost! + x = F.scaled_dot_product_attention(q, k, v) + else: + attn = (q * self.scale) @ k.transpose(-1, -2) + attn = attn.softmax(dim=-1) + x = (attn @ v) + + x = x.transpose(1, 3).reshape(B, -1, self.dim_out) + x = self.proj(x) + return x + + +class HieraBlock(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + heads: int, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + act_layer: nn.Module = nn.GELU, + q_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + + self.norm1 = norm_layer(dim) + self.attn = MaskUnitAttention( + dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + ) + + self.norm2 = norm_layer(dim_out) + self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + + self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Attention + Q Pooling + x_norm = self.norm1(x) + if self.dim != self.dim_out: + x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) + x = x + self.drop_path(self.attn(x_norm)) + + # MLP + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Head(nn.Module): + def __init__( + self, + dim: int, + num_classes: int, + dropout_rate: float = 0.0, + act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), + ): + super().__init__() + self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() + self.projection = nn.Linear(dim, num_classes) + # act_fun for eval and testing only + self.act_func = act_func + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.dropout(x) + x = self.projection(x) + if not self.training: + x = self.act_func(x) + return x + + +class PatchEmbed(nn.Module): + """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" + + def __init__( + self, + dim_in: int, + dim_out: int, + kernel: Tuple[int, ...], + stride: Tuple[int, ...], + padding: Tuple[int, ...], + ): + super().__init__() + + # Support any number of spatial dimensions + self.spatial_dims = len(kernel) + self.proj = conv_nd(self.spatial_dims)( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward( + self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + x = do_masked_conv(x, self.proj, mask) + x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) + return x + + +class Hiera(nn.Module): + def __init__( + self, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embed_dim: int = 96, # initial embed dim + num_heads: int = 1, # initial number of heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] 
= (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_pos_embed: bool = False, + ): + super().__init__() + + depth = sum(stages) + self.patch_stride = patch_stride + self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + num_tokens = math.prod(self.tokens_spatial_shape) + flat_mu_size = math.prod(mask_unit_size) + flat_q_stride = math.prod(q_stride) + + assert q_pool < len(stages) + self.q_pool, self.q_stride = q_pool, q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + self.mask_spatial_shape = [ + i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) + ] + self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + + self.patch_embed = PatchEmbed( + in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + ) + + self.sep_pos_embed = sep_pos_embed + if sep_pos_embed: + self.pos_embed_spatial = nn.Parameter( + torch.zeros( + 1, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + embed_dim, + ) + ) + self.pos_embed_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + ) + else: + self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + + # Setup roll and reroll modules + self.unroll = Unroll( + input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + ) + self.reroll = Reroll( + input_size, + patch_stride, + [q_stride] * len(self.stage_ends[:-1]), + self.stage_ends, + q_pool, + ) + # q_pool locations + q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + + # Transformer blocks + cur_stage = 0 + self.blocks = nn.ModuleList() + + for i in range(depth): + dim_out = embed_dim + # Mask unit or global attention. 
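# --- Illustrative sketch (not part of the patch): how the surrounding loop schedules
# --- widths, heads and pooling per block for the default config (stages=(2, 3, 16, 3),
# --- embed_dim=96, num_heads=1, q_pool=3, q_stride=(2, 2), mask_unit_size=(8, 8)).
# --- All values below are derived from the code in this file, not hard-coded claims.
import math

stages, dim, heads, q_pool = (2, 3, 16, 3), 96, 1, 3
flat_q, window = math.prod((2, 2)), math.prod((8, 8))
stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]   # [1, 4, 20, 23]
q_pool_blocks = [end + 1 for end in stage_ends[:q_pool]]                # [2, 5, 21]
schedule = []
for i in range(sum(stages)):
    dim_out, heads = (dim * 2, heads * 2) if i - 1 in stage_ends else (dim, heads)
    if i in q_pool_blocks:                  # mask units shrink where q-pooling happens
        window //= flat_q
    schedule.append((i, dim, dim_out, heads, flat_q if i in q_pool_blocks else 1, window))
    dim = dim_out
# e.g. schedule[2] == (2, 96, 192, 2, 4, 16): the first block of stage 2 pools and widens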
+ # Lag by 1 block, so that global attention, + # applied post pooling on lower resolution + use_mask_unit_attn = mask_unit_attn[cur_stage] + + if i - 1 in self.stage_ends: + dim_out = int(embed_dim * dim_mul) + num_heads = int(num_heads * head_mul) + cur_stage += 1 + if i in q_pool_blocks: + flat_mu_size //= flat_q_stride + + block = HieraBlock( + dim=embed_dim, + dim_out=dim_out, + heads=num_heads, + mlp_ratio=mlp_ratio, + drop_path=dpr[i], + norm_layer=norm_layer, + q_stride=(flat_q_stride if i in q_pool_blocks else 1), + window_size=flat_mu_size, + use_mask_unit_attn=use_mask_unit_attn, + ) + + embed_dim = dim_out + self.blocks.append(block) + + self.norm = norm_layer(embed_dim) + self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + + # Initialize everything + if sep_pos_embed: + nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) + nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + else: + nn.init.trunc_normal_(self.pos_embed, std=0.02) + self.apply(partial(self._init_weights)) + self.head.projection.weight.data.mul_(head_init_scale) + self.head.projection.bias.data.mul_(head_init_scale) + + def _init_weights(self, m, init_bias=0.02): + if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, init_bias) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, init_bias) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + if self.sep_pos_embed: + return ["pos_embed_spatial", "pos_embed_temporal"] + else: + return ["pos_embed"] + + def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: + """ + Generates a random mask, mask_ratio fraction are dropped. + 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. + """ + B = x.shape[0] + # Tokens selected for masking at mask unit level + num_windows = math.prod(self.mask_spatial_shape) # num_mask_units + len_keep = int(num_windows * (1 - mask_ratio)) + noise = torch.rand(B, num_windows, device=x.device) + + # Sort noise for each sample + ids_shuffle = torch.argsort( + noise, dim=1 + ) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # Generate the binary mask: 1 is *keep*, 0 is *remove* + # Note this is opposite to original MAE + mask = torch.zeros([B, num_windows], device=x.device) + mask[:, :len_keep] = 1 + # Unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return mask.bool() + + def get_pos_embed(self) -> torch.Tensor: + if self.sep_pos_embed: + return self.pos_embed_spatial.repeat( + 1, self.tokens_spatial_shape[0], 1 + ) + torch.repeat_interleave( + self.pos_embed_temporal, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + dim=1, + ) + else: + return self.pos_embed + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor = None, + return_intermediates: bool = False, + ) -> torch.Tensor: + """ + mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
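# --- Illustrative sketch (not part of the patch): building the boolean mask expected by
# --- this forward pass by hand, mirroring `get_random_mask` above. For a 224x224 input
# --- with patch_stride=(4, 4) and mask_unit_size=(8, 8) there are (224 // 4 // 8) ** 2 = 49
# --- mask units; the 0.6 mask ratio matches the MAE default used in hiera_mae.py.
import torch

batch_size, num_mask_units, mask_ratio = 2, 49, 0.6
len_keep = int(num_mask_units * (1 - mask_ratio))            # 19 mask units kept per sample
noise = torch.rand(batch_size, num_mask_units)
ids_restore = torch.argsort(torch.argsort(noise, dim=1), dim=1)
mask = torch.zeros(batch_size, num_mask_units)
mask[:, :len_keep] = 1                                       # 1 = keep, 0 = remove
mask = torch.gather(mask, dim=1, index=ids_restore).bool()
assert mask.sum(dim=-1).eq(len_keep).all()                   # same keep count across the batch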
+ """ + # Slowfast training passes in a list + if isinstance(x, list): + x = x[0] + intermediates = [] + + x = self.patch_embed( + x, + mask=mask.view( + x.shape[0], 1, *self.mask_spatial_shape + ) # B, C, *mask_spatial_shape + if mask is not None + else None, + ) + x = x + self.get_pos_embed() + x = self.unroll(x) + + # Discard masked tokens + if mask is not None: + x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( + x.shape[0], -1, x.shape[-1] + ) + + for i, blk in enumerate(self.blocks): + x = blk(x) + + if return_intermediates and i in self.stage_ends: + intermediates.append(self.reroll(x, i, mask=mask)) + + if mask is None: + x = x.mean(dim=1) + x = self.norm(x) + x = self.head(x) + + # x may not always be in spatial order here. + # e.g. if q_pool = 2, mask_unit_size = (8, 8), and + # q_stride = (2, 2), not all unrolls were consumed, + # intermediates[-1] is x in spatial order + if return_intermediates: + return x, intermediates + + return x + + +# Image models + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_tiny_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_small_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_224(**kwdargs): + return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_base_plus_224(**kwdargs): + return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_large_224(**kwdargs): + return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + + +@pretrained_model({ + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k_ft_in1k") +def hiera_huge_224(**kwdargs): + return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + + +# Video models + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_16x224(num_classes: int = 400, **kwdargs): + return Hiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_base_plus_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_large_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400_ft_k400") +def hiera_huge_16x224(**kwdargs): + return hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py new file mode 100644 index 000000000000..64c69cc89d71 --- /dev/null +++ b/src/transformers/models/hiera/hiera_mae.py @@ -0,0 +1,398 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# mae: https://github.com/facebookresearch/mae +# slowfast: https://github.com/facebookresearch/SlowFast +# -------------------------------------------------------- + + +from functools import partial +from typing import Tuple, Optional + +import math +import torch +import torch.nn as nn + +from .hiera import Hiera, HieraBlock +from .hiera_utils import pretrained_model, undo_windowing, conv_nd + + +def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: + if isinstance(head, nn.Identity): + return x + + B, num_mask_units = x.shape[0:2] + # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) + x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + + # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + permute = [0] + list(range(2, len(x.shape))) + [1] + x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + return x + + +class MaskedAutoencoderHiera(Hiera): + """Masked Autoencoder with Hiera backbone""" + + def __init__( + self, + in_chans: int = 3, + patch_stride: Tuple[int, ...] 
= (4, 4), + mlp_ratio: float = 4.0, + decoder_embed_dim: int = 512, + decoder_depth: int = 8, + decoder_num_heads: int = 16, + norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), + **kwdargs, + ): + super().__init__( + in_chans=in_chans, + patch_stride=patch_stride, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + **kwdargs, + ) + + del self.norm, self.head + encoder_dim_out = self.blocks[-1].dim_out + self.encoder_norm = norm_layer(encoder_dim_out) + self.mask_unit_spatial_shape_final = [ + i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) + ] + self.tokens_spatial_shape_final = [ + i // s ** (self.q_pool) + for i, s in zip(self.tokens_spatial_shape, self.q_stride) + ] + # -------------------------------------------------------------------------- + # Multi-scale fusion heads + curr_mu_size = self.mask_unit_size + self.multi_scale_fusion_heads = nn.ModuleList() + + for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool + kernel = [ + i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) + ] + curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] + self.multi_scale_fusion_heads.append( + conv_nd(len(self.q_stride))( + self.blocks[i].dim_out, + encoder_dim_out, + kernel_size=kernel, + stride=kernel, + ) + ) + self.multi_scale_fusion_heads.append(nn.Identity()) # final stage, no transform + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = nn.Linear(encoder_dim_out, decoder_embed_dim) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) + + self.decoder_pos_embed = nn.Parameter( + torch.zeros( + 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim + ) + ) + + self.decoder_blocks = nn.ModuleList( + [ + HieraBlock( + dim=decoder_embed_dim, + dim_out=decoder_embed_dim, + heads=decoder_num_heads, + norm_layer=norm_layer, + mlp_ratio=mlp_ratio, + ) + for i in range(decoder_depth) + ] + ) + self.decoder_norm = norm_layer(decoder_embed_dim) + + self.pred_stride = patch_stride[-1] * ( + self.q_stride[-1] ** self.q_pool + ) # patch stride of prediction + + self.decoder_pred = nn.Linear( + decoder_embed_dim, + (self.pred_stride ** min(2, len(self.q_stride))) * in_chans, + ) # predictor + # -------------------------------------------------------------------------- + + self.initialize_weights() + + def initialize_weights(self): + nn.init.trunc_normal_(self.mask_token, std=0.02) + nn.init.trunc_normal_(self.decoder_pos_embed, std=0.02) + self.apply(self._mae_init_weights) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + def _mae_init_weights(self, m: nn.Module): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_pixel_label_2d( + self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + input_img = input_img.permute(0, 2, 3, 1) + + size = self.pred_stride + label = input_img.unfold(1, size, size).unfold(2, size, size) + label = label.flatten(1, 2).flatten(2) + label = label[mask] + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + 
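# --- Illustrative sketch (not part of the patch): what the unfold + normalisation above
# --- produces. With the image MAE configs (patch_stride 4, q_stride 2, q_pool 2) the
# --- pred_stride is 4 * 2**2 = 16, so a 224x224 image gives 14*14 = 196 target patches
# --- of 16*16*3 = 768 values each; the random image below is only for shape checking.
import torch

image = torch.rand(1, 224, 224, 3)                            # channels-last, as permuted above
size = 16
patches = image.unfold(1, size, size).unfold(2, size, size)   # [1, 14, 14, 3, 16, 16]
patches = patches.flatten(1, 2).flatten(2)                    # [1, 196, 768]
normed = (patches - patches.mean(dim=-1, keepdim=True)) / (patches.var(dim=-1, keepdim=True) + 1.0e-6) ** 0.5
assert normed.shape == (1, 196, 768)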
return label + + def get_pixel_label_3d( + self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True + ) -> torch.Tensor: + # mask (boolean tensor): True must correspond to *masked* + + # We use time strided loss, only take the first frame from each token + input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + + size = self.pred_stride + label = input_vid.unfold(3, size, size).unfold(4, size, size) + label = label.permute(0, 2, 3, 4, 5, 6, 1) # Different from 2d, mistake during training lol + label = label.flatten(1, 3).flatten(2) + label = label[mask] + + if norm: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + return label + + + def forward_encoder( + self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if mask is None: + mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + + # Get multi-scale representations from encoder + _, intermediates = super().forward(x, mask, return_intermediates=True) + # Resolution unchanged after q_pool stages, so skip those features + intermediates = intermediates[: self.q_pool] + intermediates[-1:] + + # Multi-scale fusion + x = 0.0 + for head, interm_x in zip(self.multi_scale_fusion_heads, intermediates): + x += apply_fusion_head(head, interm_x) + + x = self.encoder_norm(x) + + return x, mask + + def forward_decoder( + self, x: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Embed tokens + x = self.decoder_embed(x) + + # Combine visible and mask tokens + + # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [B, #MUs_all] + x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) + mask_tokens = self.mask_token.view( + (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) + ) + mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) + mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() + x_dec[mask] = x.flatten() + x_dec = ~mask * mask_tokens + mask * x_dec + + # Get back spatial order + x = undo_windowing( + x_dec, + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + mask = undo_windowing( + mask[..., 0:1], + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + + # Flatten + x = x.reshape(x.shape[0], -1, x.shape[-1]) + mask = mask.view(x.shape[0], -1) + + # Add pos embed + x = x + self.decoder_pos_embed + + # Apply decoder blocks + for blk in self.decoder_blocks: + x = blk(x) + x = self.decoder_norm(x) + + # Predictor projection + x = self.decoder_pred(x) + + return x, mask + + def forward_loss( + self, x: torch.Tensor, pred: torch.Tensor, mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Note: in mask, 0 is *visible*, 1 is *masked* + + x: e.g. 
[B, 3, H, W] + pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + """ + if len(self.q_stride) == 2: + label = self.get_pixel_label_2d(x, mask) + elif len(self.q_stride) == 3: + label = self.get_pixel_label_3d(x, mask) + else: + raise NotImplementedError + + pred = pred[mask] + loss = (pred - label) ** 2 + + return loss.mean(), pred, label + + def forward( + self, + x: torch.Tensor, + mask_ratio: float = 0.6, + mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) + pred, pred_mask = self.forward_decoder( + latent, mask + ) # pred_mask is mask at resolution of *prediction* + + # Toggle mask, to generate labels for *masked* tokens + return *self.forward_loss(x, pred, ~pred_mask), mask + + + + +# Image Models + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", +}, default="mae_in1k") +def mae_hiera_tiny_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", +}, default="mae_in1k") +def mae_hiera_small_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", +}, default="mae_in1k") +def mae_hiera_base_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", +}, default="mae_in1k") +def mae_hiera_base_plus_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", +}, default="mae_in1k") +def mae_hiera_large_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + +@pretrained_model({ + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", +}, default="mae_in1k") +def mae_hiera_huge_224(**kwargs): + return MaskedAutoencoderHiera( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + ) + + + +# Video Models + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", +}, default="mae_k400") +def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): + return MaskedAutoencoderHiera( + num_classes=num_classes, # K400 has 400 classes + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + q_pool=2, + **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_base_plus_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", +}, default="mae_k400") +@pretrained_model(None) +def mae_hiera_large_16x224(**kwdargs): + return 
mae_hiera_base_16x224( + embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + ) + + +@pretrained_model({ + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", +}, default="mae_k400") +def mae_hiera_huge_16x224(**kwdargs): + return mae_hiera_base_16x224( + embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py new file mode 100644 index 000000000000..992c03e08079 --- /dev/null +++ b/src/transformers/models/hiera/hiera_utils.py @@ -0,0 +1,287 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# +# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# +# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, +# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, +# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. +# +# Paper: https://arxiv.org/abs/2306.00989/ +# +# References: +# slowfast: https://github.com/facebookresearch/SlowFast +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# -------------------------------------------------------- + +import math +from typing import List, Tuple, Optional, Type, Callable, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from .convert_hiera_to_pytorch import e + +def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: + """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + + def inner(model_func: Callable) -> Callable: + def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + # state_dict["model_state"] = e(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if "num_classes" not in kwdargs: + kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = model_func(**kwdargs) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + return model + + return model_def + + return inner + + + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. 
+ E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. + self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. 
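# --- Illustrative sketch (not part of the patch): the re-ordering described above, checked
# --- against a plain 2x2 max-pool on a tiny [B, H, W, C] token grid. Sizes are assumptions;
# --- this mirrors one (2, 2) step of the unroll schedule, not the full Unroll/Reroll classes.
import torch
import torch.nn.functional as F

B, H, W, C, Sy, Sx = 1, 4, 4, 3, 2, 2
grid = torch.randn(B, H, W, C)

unrolled = (
    grid.view(B, H // Sy, Sy, W // Sx, Sx, C)   # split each spatial dim by its stride
    .permute(0, 2, 4, 1, 3, 5)                  # move the strides in front of the spatial dims
    .reshape(B, Sy * Sx, -1, C)
)
pooled = unrolled.max(dim=1).values.reshape(B, H // Sy, W // Sx, C)

reference = F.max_pool2d(grid.permute(0, 3, 1, 2), kernel_size=2).permute(0, 2, 3, 1)
assert torch.allclose(pooled, reference)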
+ """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x \ No newline at end of file From 86a43eddd60f08808a795d10e6565e43148dabf0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 02:17:36 +0000 Subject: [PATCH 018/118] Updated variable names --- .../models/hiera/convert_hiera_to_pytorch.py | 30 +-- src/transformers/models/hiera/hiera.py | 200 +++++++++--------- src/transformers/models/hiera/hiera_mae.py | 42 ++-- src/transformers/models/hiera/hiera_utils.py | 6 +- 4 files changed, 141 insertions(+), 137 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 506507e4e66e..f1d0c4135796 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -7,21 +7,25 @@ def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "patch_embed.projection") + # if "patch_embed.proj" in name: + # name = name.replace("patch_embed.proj", "patch_embed.projection") + # # elif "block.proj" in name: + # # name = name.replace("block.proj", "block.projection") + # elif "attn.proj" in name: + # name = name.replace("attn.proj", "attn.projection") + if ".proj." in name: + name = name.replace(".proj.", ".projection.") + if "attn" in name: + name = name.replace("attn", "attention") + if "pos_embed" in name: + name = name.replace("pos_embed", "position_embeddings") + if "patch_embed" in name: + name = name.replace("patch_embed", "patch_embedding") return name -def e(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - pass - else: - new_name = rename_key(key) - orig_state_dict[new_name] = val - - return orig_state_dict +def convert_state_dict(orig_state_dict, config): + updated_model_state = {rename_key(k): v for k, v in orig_state_dict.items()} + return updated_model_state diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 35e8c93e160b..fcb04f68934e 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -42,47 +42,47 @@ class MaskUnitAttention(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): """ Args: - - dim, dim_out: The input and output feature dimensions. - - heads: The number of attention heads. 
+ - input_dim, output_dim: The input and output feature dimensions. + - number_of_heads: The number of attention number_of_heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - - use_mask_unit_attn: Use Mask Unit or Global Attention. + - use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() - self.dim = dim - self.dim_out = dim_out - self.heads = heads + self.input_dim = input_dim + self.output_dim = output_dim + self.number_of_heads = number_of_heads self.q_stride = q_stride - self.head_dim = dim_out // heads + self.head_dim = output_dim // number_of_heads self.scale = (self.head_dim) ** -0.5 - self.qkv = nn.Linear(dim, 3 * dim_out) - self.proj = nn.Linear(dim_out, dim_out) + self.qkv = nn.Linear(input_dim, 3 * output_dim) + self.projection = nn.Linear(output_dim, output_dim) self.window_size = window_size - self.use_mask_unit_attn = use_mask_unit_attn + self.use_mask_unit_attention = use_mask_unit_attention def forward(self, x: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. """ - B, N, _ = x.shape + batch_size , num_channels , _ = x.shape num_windows = ( - (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1 + (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( self.qkv(x) - .reshape(B, -1, num_windows, 3, self.heads, self.head_dim) + .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -90,7 +90,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -99,52 +99,52 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Note: the original paper did *not* use SDPA, it's a free boost! 
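# --- Illustrative sketch (not part of the patch): the fused call below and the manual
# --- fallback in the else-branch compute the same attention; the shapes are assumptions
# --- matching the [batch, heads, windows, tokens, head_dim] layout used here.
import torch
import torch.nn.functional as F

q = torch.randn(1, 2, 1, 16, 32)
k = torch.randn(1, 2, 1, 16, 32)
v = torch.randn(1, 2, 1, 16, 32)
scale = 32 ** -0.5                      # 1 / sqrt(head_dim), as in self.scale

fused = F.scaled_dot_product_attention(q, k, v)
manual = ((q * scale) @ k.transpose(-1, -2)).softmax(dim=-1) @ v
assert torch.allclose(fused, manual, atol=1e-5)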
x = F.scaled_dot_product_attention(q, k, v) else: - attn = (q * self.scale) @ k.transpose(-1, -2) - attn = attn.softmax(dim=-1) - x = (attn @ v) + attention = (q * self.scale) @ k.transpose(-1, -2) + attention = attention.softmax(dim=-1) + x = (attention @ v) - x = x.transpose(1, 3).reshape(B, -1, self.dim_out) - x = self.proj(x) + x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + x = self.projection(x) return x class HieraBlock(nn.Module): def __init__( self, - dim: int, - dim_out: int, - heads: int, + input_dim: int, + output_dim: int, + number_of_heads: int, mlp_ratio: float = 4.0, drop_path: float = 0.0, norm_layer: nn.Module = nn.LayerNorm, act_layer: nn.Module = nn.GELU, q_stride: int = 1, window_size: int = 0, - use_mask_unit_attn: bool = False, + use_mask_unit_attention: bool = False, ): super().__init__() - self.dim = dim - self.dim_out = dim_out + self.input_dim = input_dim + self.output_dim = output_dim - self.norm1 = norm_layer(dim) - self.attn = MaskUnitAttention( - dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn + self.norm1 = norm_layer(input_dim) + self.attention = MaskUnitAttention( + input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention ) - self.norm2 = norm_layer(dim_out) - self.mlp = Mlp(dim_out, int(dim_out * mlp_ratio), act_layer=act_layer) + self.norm2 = norm_layer(output_dim) + self.mlp = Mlp(output_dim, int(output_dim * mlp_ratio), act_layer=act_layer) self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() - if dim != dim_out: - self.proj = nn.Linear(dim, dim_out) + if input_dim != output_dim: + self.projection = nn.Linear(input_dim, output_dim) def forward(self, x: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - x_norm = self.norm1(x) - if self.dim != self.dim_out: - x = do_pool(self.proj(x_norm), stride=self.attn.q_stride) - x = x + self.drop_path(self.attn(x_norm)) + normalized_input = self.norm1(x) + if self.input_dim != self.output_dim: + x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) + x = x + self.drop_path(self.attention(normalized_input)) # MLP x = x + self.drop_path(self.mlp(self.norm2(x))) @@ -154,14 +154,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Head(nn.Module): def __init__( self, - dim: int, + input_dim: int, num_classes: int, dropout_rate: float = 0.0, act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), ): super().__init__() self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - self.projection = nn.Linear(dim, num_classes) + self.projection = nn.Linear(input_dim, num_classes) # act_fun for eval and testing only self.act_func = act_func @@ -173,13 +173,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchEmbed(nn.Module): - """Patch embed that supports any number of spatial dimensions (1d, 2d, 3d).""" +class PatchEmbedding(nn.Module): + """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" def __init__( self, dim_in: int, - dim_out: int, + output_dim: int, kernel: Tuple[int, ...], stride: Tuple[int, ...], padding: Tuple[int, ...], @@ -188,9 +188,9 @@ def __init__( # Support any number of spatial dimensions self.spatial_dims = len(kernel) - self.proj = conv_nd(self.spatial_dims)( + self.projection = conv_nd(self.spatial_dims)( dim_in, - dim_out, + output_dim, kernel_size=kernel, stride=stride, padding=padding, @@ -199,7 +199,7 @@ def __init__( def forward( self, x: torch.Tensor, mask: 
Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.proj, mask) + x = do_masked_conv(x, self.projection, mask) x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) return x @@ -209,8 +209,8 @@ def __init__( self, input_size: Tuple[int, ...] = (224, 224), in_chans: int = 3, - embed_dim: int = 96, # initial embed dim - num_heads: int = 1, # initial number of heads + embedding_dimention: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads num_classes: int = 1000, stages: Tuple[int, ...] = (2, 3, 16, 3), q_pool: int = 3, # number of q_pool stages @@ -228,7 +228,7 @@ def __init__( norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), head_dropout: float = 0.0, head_init_scale: float = 0.001, - sep_pos_embed: bool = False, + sep_position_embeddings: bool = False, ): super().__init__() @@ -247,24 +247,24 @@ def __init__( ] self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] - self.patch_embed = PatchEmbed( - in_chans, embed_dim, patch_kernel, patch_stride, patch_padding + self.patch_embedding = PatchEmbedding( + in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding ) - self.sep_pos_embed = sep_pos_embed - if sep_pos_embed: - self.pos_embed_spatial = nn.Parameter( + self.sep_position_embeddings = sep_position_embeddings + if sep_position_embeddings: + self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embed_dim, + embedding_dimention, ) ) - self.pos_embed_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embed_dim) + self.position_embeddings_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) ) else: - self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, embed_dim)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) # Setup roll and reroll modules self.unroll = Unroll( @@ -287,43 +287,43 @@ def __init__( self.blocks = nn.ModuleList() for i in range(depth): - dim_out = embed_dim + output_dim = embedding_dimention # Mask unit or global attention. 
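For intuition about the bookkeeping this constructor performs with its 2-D defaults (224x224 input, 7x7 patch kernel with stride 4 and padding 3, 8x8 mask units, stages (2, 3, 16, 3), q_pool 3), here is a small standalone sketch that reproduces only the arithmetic, not the model:

```python
import torch
import torch.nn as nn

input_size, patch_stride, mask_unit_size = (224, 224), (4, 4), (8, 8)
stages, q_pool = (2, 3, 16, 3), 3

# Token grid produced by the patch-embedding conv (7x7 kernel, stride 4, padding 3);
# flatten(2).transpose(2, 1) mirrors the reshape/transpose in PatchEmbedding.forward
patch_embed = nn.Conv2d(3, 96, kernel_size=7, stride=4, padding=3)
tokens = patch_embed(torch.zeros(1, 3, *input_size)).flatten(2).transpose(2, 1)
print(tokens.shape)  # torch.Size([1, 3136, 96]) -> a 56x56 token grid

tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)]
mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, mask_unit_size)]
print(tokens_spatial_shape, mask_spatial_shape)  # [56, 56] [7, 7] -> 49 mask units

# Last block index of each stage, and the blocks where q-pooling happens
stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
q_pool_blocks = [x + 1 for x in stage_ends[:q_pool]]
print(stage_ends, q_pool_blocks)  # [1, 4, 20, 23] [2, 5, 21]
```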
# Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attn = mask_unit_attn[cur_stage] + use_mask_unit_attention = mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - dim_out = int(embed_dim * dim_mul) - num_heads = int(num_heads * head_mul) + output_dim = int(embedding_dimention * dim_mul) + number_of_heads = int(number_of_heads * head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride block = HieraBlock( - dim=embed_dim, - dim_out=dim_out, - heads=num_heads, + input_dim=embedding_dimention, + output_dim=output_dim, + number_of_heads=number_of_heads, mlp_ratio=mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), window_size=flat_mu_size, - use_mask_unit_attn=use_mask_unit_attn, + use_mask_unit_attention=use_mask_unit_attention, ) - embed_dim = dim_out + embedding_dimention = output_dim self.blocks.append(block) - self.norm = norm_layer(embed_dim) - self.head = Head(embed_dim, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(embedding_dimention) + self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) # Initialize everything - if sep_pos_embed: - nn.init.trunc_normal_(self.pos_embed_spatial, std=0.02) - nn.init.trunc_normal_(self.pos_embed_temporal, std=0.02) + if sep_position_embeddings: + nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) + nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: - nn.init.trunc_normal_(self.pos_embed, std=0.02) + nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) self.head.projection.weight.data.mul_(head_init_scale) self.head.projection.bias.data.mul_(head_init_scale) @@ -339,21 +339,21 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_pos_embed: - return ["pos_embed_spatial", "pos_embed_temporal"] + if self.sep_position_embeddings: + return ["position_embeddings_spatial", "position_embeddings_temporal"] else: - return ["pos_embed"] + return ["position_embeddings"] def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: """ Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. 
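The body of get_random_mask below draws per-sample noise, argsorts it, keeps the first len_keep mask units, and scatters the result back, so every sample keeps exactly the same number of units and only their positions differ. A self-contained sketch of that scheme with toy sizes:

```python
import torch

torch.manual_seed(0)
batch_size, num_windows, mask_ratio = 4, 49, 0.6  # e.g. a 7x7 grid of mask units
len_keep = int(num_windows * (1 - mask_ratio))

noise = torch.rand(batch_size, num_windows)
ids_shuffle = torch.argsort(noise, dim=1)        # random permutation per sample
ids_restore = torch.argsort(ids_shuffle, dim=1)  # inverse permutation

mask = torch.zeros(batch_size, num_windows)
mask[:, :len_keep] = 1                           # 1 = keep, 0 = remove
mask = torch.gather(mask, dim=1, index=ids_restore).bool()

print(mask.sum(dim=-1))  # tensor([19, 19, 19, 19]) -> same count for every sample
```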
""" - B = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(B, num_windows, device=x.device) + noise = torch.rand(batch_size , num_windows, device=x.device) # Sort noise for each sample ids_shuffle = torch.argsort( @@ -363,24 +363,24 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([B, num_windows], device=x.device) + mask = torch.zeros([batch_size , num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) return mask.bool() - def get_pos_embed(self) -> torch.Tensor: - if self.sep_pos_embed: - return self.pos_embed_spatial.repeat( + def get_position_embeddings(self) -> torch.Tensor: + if self.sep_position_embeddings: + return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( - self.pos_embed_temporal, + self.position_embeddings_temporal, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], dim=1, ) else: - return self.pos_embed + return self.position_embeddings def forward( self, @@ -389,7 +389,7 @@ def forward( return_intermediates: bool = False, ) -> torch.Tensor: """ - mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim. + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list @@ -397,15 +397,15 @@ def forward( x = x[0] intermediates = [] - x = self.patch_embed( + x = self.patch_embedding( x, mask=mask.view( x.shape[0], 1, *self.mask_spatial_shape - ) # B, C, *mask_spatial_shape + ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_pos_embed() + x = x + self.get_position_embeddings() x = self.unroll(x) # Discard masked tokens @@ -442,7 +442,7 @@ def forward( "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") def hiera_tiny_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) @pretrained_model({ @@ -450,7 +450,7 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ @@ -458,7 +458,7 @@ def hiera_small_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_224(**kwdargs): - return Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -466,7 +466,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +474,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +482,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -500,7 +500,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): patch_kernel=(3, 7, 7), patch_stride=(2, 4, 4), patch_padding=(1, 3, 3), - sep_pos_embed=True, + sep_position_embeddings=True, **kwdargs ) @@ -511,7 +511,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +521,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +531,5 @@ def 
hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 64c69cc89d71..a0504997350b 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -25,14 +25,14 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - B, num_mask_units = x.shape[0:2] - # Apply head, e.g [B, #MUs, My, Mx, C] -> head([B * #MUs, C, My, Mx]) + batch_size , num_mask_units = x.shape[0:2] + # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(B * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) - # Restore original layout, e.g. [B * #MUs, C', My', Mx'] -> [B, #MUs, My', Mx', C'] + # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(B, num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -132,7 +132,7 @@ def initialize_weights(self): self.apply(self._mae_init_weights) # initialize patch_embed like nn.Linear (instead of nn.Conv2d) - w = self.patch_embed.proj.weight.data + w = self.patch_embed.projection.weight.data nn.init.xavier_uniform_(w.view([w.shape[0], -1])) def _mae_init_weights(self, m: nn.Module): @@ -188,7 +188,7 @@ def forward_encoder( ) -> Tuple[torch.Tensor, torch.Tensor]: if mask is None: - mask = self.get_random_mask(x, mask_ratio) # [B, #MUs_all] + mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] # Get multi-scale representations from encoder _, intermediates = super().forward(x, mask, return_intermediates=True) @@ -212,8 +212,8 @@ def forward_decoder( # Combine visible and mask tokens - # x: [B, #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] - # mask: [B, #MUs_all] + # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] + # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) mask_tokens = self.mask_token.view( (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) @@ -258,9 +258,9 @@ def forward_loss( """ Note: in mask, 0 is *visible*, 1 is *masked* - x: e.g. [B, 3, H, W] - pred: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - label: [B * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + x: e.g. 
[batch_size , 3, H, W] + pred: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] + label: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] """ if len(self.q_stride) == 2: label = self.get_pixel_label_2d(x, mask) @@ -299,7 +299,7 @@ def forward( }, default="mae_in1k") def mae_hiera_tiny_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, ) @@ -308,7 +308,7 @@ def mae_hiera_tiny_224(**kwargs): }, default="mae_in1k") def mae_hiera_small_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, ) @@ -317,7 +317,7 @@ def mae_hiera_small_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -326,7 +326,7 @@ def mae_hiera_base_224(**kwargs): }, default="mae_in1k") def mae_hiera_base_plus_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, ) @@ -335,7 +335,7 @@ def mae_hiera_base_plus_224(**kwargs): }, default="mae_in1k") def mae_hiera_large_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -344,7 +344,7 @@ def mae_hiera_large_224(**kwargs): }, default="mae_in1k") def mae_hiera_huge_224(**kwargs): return MaskedAutoencoderHiera( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, ) @@ -375,7 +375,7 @@ def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): @pretrained_model(None) def mae_hiera_base_plus_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -385,7 +385,7 @@ def mae_hiera_base_plus_16x224(**kwdargs): @pretrained_model(None) def mae_hiera_large_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -394,5 +394,5 @@ def mae_hiera_large_16x224(**kwdargs): }, default="mae_k400") def mae_hiera_huge_16x224(**kwdargs): return mae_hiera_base_16x224( - embed_dim=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs ) diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index 992c03e08079..c96c63cbfaf9 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -24,7 +24,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .convert_hiera_to_pytorch import e +from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
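The loader body below passes the downloaded weights through convert_state_dict, which applies the rename_key substitutions from convert_hiera_to_pytorch.py in this patch (.proj. -> .projection., attn -> attention, pos_embed -> position_embeddings, patch_embed -> patch_embedding). A quick standalone illustration on a few representative key names (the sample keys are illustrative, not taken from a real checkpoint):

```python
def rename_key(name: str) -> str:
    # Same substitutions as convert_hiera_to_pytorch.rename_key in this patch
    if ".proj." in name:
        name = name.replace(".proj.", ".projection.")
    if "attn" in name:
        name = name.replace("attn", "attention")
    if "pos_embed" in name:
        name = name.replace("pos_embed", "position_embeddings")
    if "patch_embed" in name:
        name = name.replace("patch_embed", "patch_embedding")
    return name

for key in ["patch_embed.proj.weight", "blocks.0.attn.qkv.weight", "pos_embed"]:
    print(f"{key} -> {rename_key(key)}")
# patch_embed.proj.weight -> patch_embedding.projection.weight
# blocks.0.attn.qkv.weight -> blocks.0.attention.qkv.weight
# pos_embed -> position_embeddings
```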
""" @@ -40,7 +40,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - # state_dict["model_state"] = e(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if "num_classes" not in kwdargs: @@ -53,7 +53,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool model = model_func(**kwdargs) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_pos_embed" in state_dict["model_state"] and not hasattr(model, "decoder_pos_embed"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False model.load_state_dict(state_dict["model_state"], strict=strict) From 1433a7c6e835b9f01c092734297841137935fbf1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 08:10:34 +0000 Subject: [PATCH 019/118] Added Config class, basic HF setup, convert_to_hf --- src/transformers/__init__.py | 6 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/hiera/__init__.py | 157 ++++++++----- .../models/hiera/configuration_hiera.py | 193 +++++++--------- .../models/hiera/convert_hiera_to_pytorch.py | 212 ++++++++++++++++++ src/transformers/models/hiera/hiera.py | 129 +++++------ 6 files changed, 470 insertions(+), 230 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 027cf495466c..40c0a56362ac 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -497,6 +497,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], + "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -5280,6 +5281,7 @@ GroupViTVisionConfig, ) from .models.herbert import HerbertTokenizer + from .models.hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig from .models.idefics import ( @@ -6983,6 +6985,10 @@ HubertModel, HubertPreTrainedModel, ) + from .models.hiera import ( + Hiera, + HieraBlock + ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 7bc637f3e106..ed75e74ebfce 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -117,6 +117,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), + ("hiera","HieraConfig") ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -352,6 +353,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") 
("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -588,6 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), + ("hiera","Hiera") ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index bfd200e9dcb9..3ea6efb0056a 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -1,28 +1,18 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import TYPE_CHECKING from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, - is_flax_available, - is_tf_available, is_torch_available, ) -_import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} +_import_structure = { + "configuration_hiera": [ + "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HireaConfig", + ], +} try: if not is_torch_available(): @@ -30,28 +20,20 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_vit_mae"] = [ - "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", - "ViTMAEForPreTraining", - "ViTMAELayer", - "ViTMAEModel", - "ViTMAEPreTrainedModel", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_vit_mae"] = [ - "TFViTMAEForPreTraining", - "TFViTMAEModel", - "TFViTMAEPreTrainedModel", + _import_structure["hirea"] = [ + "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hirea", + "Head", + "HieraBlock", + "MaskUnitAttention" + "" ] if TYPE_CHECKING: - from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + from .configuration_hiera import ( + HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, + HieraConfig, + ) try: if not is_torch_available(): @@ -59,24 +41,99 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_vit_mae import ( - VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, - ViTMAEForPreTraining, - ViTMAELayer, - ViTMAEModel, - ViTMAEPreTrainedModel, + from .hiera import ( + Hiera, + Head, + HieraBlock, + MaskUnitAttention, ) - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - else: import sys sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + +####### PREV: + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# from typing import TYPE_CHECKING + +# from ...utils import ( +# OptionalDependencyNotAvailable, +# _LazyModule, +# is_flax_available, +# is_tf_available, +# is_torch_available, +# ) + + +# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_vit_mae"] = [ +# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", +# "ViTMAEForPreTraining", +# "ViTMAELayer", +# "ViTMAEModel", +# "ViTMAEPreTrainedModel", +# ] + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# _import_structure["modeling_tf_vit_mae"] = [ +# "TFViTMAEForPreTraining", +# "TFViTMAEModel", +# "TFViTMAEPreTrainedModel", +# ] + +# if TYPE_CHECKING: +# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + +# try: +# if not is_torch_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_vit_mae import ( +# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, +# ViTMAEForPreTraining, +# ViTMAELayer, +# ViTMAEModel, +# ViTMAEPreTrainedModel, +# ) + +# try: +# if not is_tf_available(): +# raise OptionalDependencyNotAvailable() +# except OptionalDependencyNotAvailable: +# pass +# else: +# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel + + +# else: +# import sys + +# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index de5de9e7d9e9..c7dfaeaeedfb 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -2,127 +2,108 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging - +from typing import Tuple logger = logging.get_logger(__name__) -VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/vit-mae-base": "https://huggingface.co/facebook/vit-mae-base/resolve/main/config.json", - # See all ViT MAE models at https://huggingface.co/models?filter=vit-mae +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + } -class ViTMAEConfig(PretrainedConfig): +class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT - MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the ViT - [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with + the defaults will yield a similar configuration to that of the Hiera + [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. - decoder_num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the decoder. - decoder_hidden_size (`int`, *optional*, defaults to 512): - Dimensionality of the decoder. - decoder_num_hidden_layers (`int`, *optional*, defaults to 8): - Number of hidden layers in the decoder. - decoder_intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. - mask_ratio (`float`, *optional*, defaults to 0.75): - The ratio of the number of masked tokens in the input sequence. - norm_pix_loss (`bool`, *optional*, defaults to `False`): - Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved - representation quality in the experiments of the authors. - - Example: - - ```python - >>> from transformers import ViTMAEConfig, ViTMAEModel - - >>> # Initializing a ViT MAE vit-mae-base style configuration - >>> configuration = ViTMAEConfig() - - >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = ViTMAEModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "vit_mae" - + input_size (Tuple[int, ...], optional): Dimensions of the input image (height, width). Defaults to (224, 224). + in_chans (int, optional): Number of input channels. Defaults to 3. 
+ embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. + number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. + num_classes (int, optional): Number of output classes. Defaults to 1000. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. + q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). + mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). + dim_mul (float, optional): Factor for increasing the dimensionality through the network. Defaults to 2.0. + head_mul (float, optional): Factor for increasing the number of heads through the network. Defaults to 2.0. + patch_kernel (Tuple[int, ...], optional): Kernel size for patch embedding. Defaults to (7, 7). + patch_stride (Tuple[int, ...], optional): Stride for patch embedding. Defaults to (4, 4). + patch_padding (Tuple[int, ...], optional): Padding for patch embedding. Defaults to (3, 3). + mlp_ratio (float, optional): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. + drop_path_rate (float, optional): Dropout rate for stochastic depth. Defaults to 0.0. + head_dropout (float, optional): Dropout rate for attention heads. Defaults to 0.0. + head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. + sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. + + + Example: + ```python + >>> from transformers import HieraConfig, Hiera + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = HieraConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = Hiera(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "hiera" def __init__( self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - decoder_num_attention_heads=16, - decoder_hidden_size=512, - decoder_num_hidden_layers=8, - decoder_intermediate_size=2048, - mask_ratio=0.75, - norm_pix_loss=False, + input_size: Tuple[int, ...] = (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] 
= (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, **kwargs, + ): super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_hidden_size = decoder_hidden_size - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.decoder_intermediate_size = decoder_intermediate_size - self.mask_ratio = mask_ratio - self.norm_pix_loss = norm_pix_loss + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index f1d0c4135796..77556120bcb4 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,6 +3,12 @@ import requests import torch from PIL import Image +# from .configuration_hiera import HieraConfig +# from .hiera import Hiera +# from transformers import HieraConfig, Hiera +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -29,3 +35,209 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state + +class HieraImageProcessor: + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm + + + +def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): + pretrained_models_links = { + "hiera_tiny_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", + }, + "hiera_small_224": { + 
"mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", + }, + "hiera_base_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", + }, + "hiera_base_plus_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", + }, + "hiera_large_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", + }, + "hiera_huge_224": { + "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", + "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", + }, + "hiera_base_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", + }, + "hiera_base_plus_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", + }, + "hiera_large_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", + }, + "hiera_huge_16x224": { + "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", + "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", + } + } + + + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + checkpoints = pretrained_models_links["hiera_tiny_224"] + checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + checkpoints = pretrained_models_links["hiera_small_224"] + checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_224"] + checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + checkpoints = pretrained_models_links["hiera_base_plus_224"] + checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + checkpoints = pretrained_models_links["hiera_large_224"] + checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + checkpoints = pretrained_models_links["hiera_huge_224"] + checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere + input_size=(16, 224, 224), + 
q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + checkpoints = pretrained_models_links["hiera_base_16x224"] + checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + checkpoints = pretrained_models_links["hiera_base_plus_16x224"] + checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + checkpoints = pretrained_models_links["hiera_large_16x224"] + checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + checkpoints = pretrained_models_links["hiera_huge_16x224"] + checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] + + + pretrained = True + if pretrained: + if checkpoints is None: + raise RuntimeError("This model currently doesn't have pretrained weights available.") + elif checkpoint is None: + raise RuntimeError("No checkpoint specified.") + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") + + state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + if "head.projection.weight" in state_dict["model_state"]: + # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it + if config.num_classes is None: + config.num_classes = state_dict["model_state"]["head.projection.weight"].shape[0] + # If the user specified a different number of classes, remove the projection weights or else we'll error out + elif config.num_classes != state_dict["model_state"]["head.projection.weight"].shape[0]: + del state_dict["model_state"]["head.projection.weight"] + del state_dict["model_state"]["head.projection.bias"] + + model = Hiera(config) + if pretrained: + # Disable being strict when trying to load a encoder-decoder model into an encoder-only model + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + strict = False + + model.load_state_dict(state_dict["model_state"], strict=strict) + + + + + url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" + + image = Image.open(requests.get(url, stream=True).raw) + + + image_processor = HieraImageProcessor(size=config.image_size) + inputs = image_processor.process_image(images=image, return_tensors="pt") + + # forward pass + out = model(inputs[None, ...]) + + # 207: golden retriever (imagenet-1k) + out.argmax(dim=-1).item() + + + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" + convert_Hiera_checkpoint(checkpoint_url, 
pytorch_dump_folder_path="~/") + diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index fcb04f68934e..7e42d5914d44 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -21,7 +21,7 @@ import math from functools import partial from typing import List, Tuple, Callable, Optional - +from .configuration_hiera import HieraConfig import torch import torch.nn as nn import torch.nn.functional as F @@ -205,106 +205,85 @@ def forward( class Hiera(nn.Module): - def __init__( - self, - input_size: Tuple[int, ...] = (224, 224), - in_chans: int = 3, - embedding_dimention: int = 96, # initial embedding input_dim - number_of_heads: int = 1, # initial number of number_of_heads - num_classes: int = 1000, - stages: Tuple[int, ...] = (2, 3, 16, 3), - q_pool: int = 3, # number of q_pool stages - q_stride: Tuple[int, ...] = (2, 2), - mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) - # mask_unit_attn: which stages use mask unit attention? - mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), - dim_mul: float = 2.0, - head_mul: float = 2.0, - patch_kernel: Tuple[int, ...] = (7, 7), - patch_stride: Tuple[int, ...] = (4, 4), - patch_padding: Tuple[int, ...] = (3, 3), - mlp_ratio: float = 4.0, - drop_path_rate: float = 0.0, - norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), - head_dropout: float = 0.0, - head_init_scale: float = 0.001, - sep_position_embeddings: bool = False, - ): + def __init__(self, config: HieraConfig): super().__init__() - - depth = sum(stages) - self.patch_stride = patch_stride - self.tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)] + self.config = config + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed + self.config = config + depth = sum(self.config.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(mask_unit_size) - flat_q_stride = math.prod(q_stride) + flat_mu_size = math.prod(self.config.mask_unit_size) + flat_q_stride = math.prod(self.config.q_stride) - assert q_pool < len(stages) - self.q_pool, self.q_stride = q_pool, q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, mask_unit_size + assert self.config.q_pool < len(self.config.stages) + self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] self.patch_embedding = PatchEmbedding( - in_chans, embedding_dimention, patch_kernel, patch_stride, patch_padding + self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding ) - self.sep_position_embeddings = sep_position_embeddings - if sep_position_embeddings: + if self.config.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - embedding_dimention, + self.config.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], embedding_dimention) + 
torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, embedding_dimention)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.config.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - input_size, patch_stride, [q_stride] * len(self.stage_ends[:-1]) + self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - input_size, - patch_stride, - [q_stride] * len(self.stage_ends[:-1]), + self.config.input_size, + self.config.patch_stride, + [self.config.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - q_pool, + self.config.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = embedding_dimention + output_dim = self.config.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = mask_unit_attn[cur_stage] + use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(embedding_dimention * dim_mul) - number_of_heads = int(number_of_heads * head_mul) + output_dim = int(self.config.embedding_dimension * self.config.dim_mul) + number_of_heads = int(self.config.number_of_heads * self.config.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride + else: + number_of_heads = self.config.number_of_heads block = HieraBlock( - input_dim=embedding_dimention, + input_dim=self.config.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, + mlp_ratio=self.config.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -312,21 +291,21 @@ def __init__( use_mask_unit_attention=use_mask_unit_attention, ) - embedding_dimention = output_dim + self.config.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(embedding_dimention) - self.head = Head(embedding_dimention, num_classes, dropout_rate=head_dropout) + self.norm = norm_layer(self.config.embedding_dimension) + self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) # Initialize everything - if sep_position_embeddings: + if self.config.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(head_init_scale) - self.head.projection.bias.data.mul_(head_init_scale) + self.head.projection.weight.data.mul_(self.config.head_init_scale) + self.head.projection.bias.data.mul_(self.config.head_init_scale) def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -339,7 +318,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def no_weight_decay(self): - if self.sep_position_embeddings: + if 
self.config.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -371,7 +350,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.sep_position_embeddings: + if self.config.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -441,8 +420,9 @@ def forward( "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwdargs) +def hiera_tiny_224(**kwargs): + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) + return Hiera(config) @pretrained_model({ @@ -450,15 +430,16 @@ def hiera_tiny_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", }, default="mae_in1k_ft_in1k") def hiera_small_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) + return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) @pretrained_model({ "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", }, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwdargs): - return Hiera(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwdargs) +def hiera_base_224(**kwargs): + config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + return Hiera(config) @pretrained_model({ @@ -466,7 +447,7 @@ def hiera_base_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", }, default="mae_in1k_ft_in1k") def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) + return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) @pretrained_model({ @@ -474,7 +455,7 @@ def hiera_base_plus_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", }, default="mae_in1k_ft_in1k") def hiera_large_224(**kwdargs): - return Hiera(embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) @pretrained_model({ @@ -482,7 +463,7 @@ def hiera_large_224(**kwdargs): "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", }, default="mae_in1k_ft_in1k") def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) + return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) # Video models @@ -511,7 +492,7 @@ def hiera_base_16x224(num_classes: int = 400, **kwdargs): }, default="mae_k400_ft_k400") def hiera_base_plus_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs + embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs ) @@ -521,7 +502,7 @@ def hiera_base_plus_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_large_16x224(**kwdargs): return 
hiera_base_16x224( - embedding_dimention=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs ) @@ -531,5 +512,5 @@ def hiera_large_16x224(**kwdargs): }, default="mae_k400_ft_k400") def hiera_huge_16x224(**kwdargs): return hiera_base_16x224( - embedding_dimention=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs + embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs ) From 46d495c59bf0dd21a48719c2c5097494d0250fc2 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 21:48:20 +0000 Subject: [PATCH 020/118] Fixed Convert function, added hiera to HF files, Initilized test files --- src/transformers/__init__.py | 7 + .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 3 + .../models/hiera/convert_hiera_to_pytorch.py | 56 ++-- src/transformers/models/hiera/hiera.py | 242 +++++++----------- .../models/hiera/hiera_image_processor.py | 56 ++++ tests/models/hiera/__init__.py | 0 tests/models/hiera/test_modeling_vit_mae.py | 44 ++++ 9 files changed, 226 insertions(+), 189 deletions(-) create mode 100644 src/transformers/models/hiera/hiera_image_processor.py create mode 100644 tests/models/hiera/__init__.py create mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 40c0a56362ac..69eb50a0ca37 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4148,6 +4148,13 @@ "TFGroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "Hiera", + + ] + ) _import_structure["models.hubert"].extend( [ "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ed75e74ebfce..28b8243dd9ef 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -117,7 +117,7 @@ ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig") + ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -353,7 +353,7 @@ ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP") + ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera") + ("hiera","Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 05b519d2bcd1..1fa0c71b1537 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -115,6 +115,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", "GroupViTModel"), + ("hiera", "Hiera"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git 
a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 3ea6efb0056a..f88e32d03c98 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -47,6 +47,9 @@ HieraBlock, MaskUnitAttention, ) + from .hiera_image_processor import ( + HieraImageProcessor + ) else: import sys diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 77556120bcb4..d1b6e8a4ad30 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,8 +3,9 @@ import requests import torch from PIL import Image -# from .configuration_hiera import HieraConfig -# from .hiera import Hiera +from transformers.models.hiera.configuration_hiera import HieraConfig +from transformers.models.hiera.hiera import Hiera +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, Hiera from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -35,33 +36,8 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state - -class HieraImageProcessor: - def __init__(self, size): - self.size = size - self.transform_list = [ - transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) - ] - self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - - def process_image(self, image_url): - # Load the image - img = Image.open(requests.get(image_url, stream=True).raw) - - # Apply transformations - img_vis = self.transform_vis(img) - img_norm = self.transform_norm(img) - - return img_norm - - - -def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): + strict = True pretrained_models_links = { "hiera_tiny_224": { "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", @@ -121,9 +97,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(2, 3, 16, 3),) + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + checkpoints = pretrained_models_links["hiera_base_224"] checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] @@ -180,7 +155,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): stages=(2, 6, 36, 4) ) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] - + elif checkpoint not in checkpoints: + raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). 
Options are: {list(checkpoints.keys())}.") pretrained = True if pretrained: @@ -188,10 +164,8 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): raise RuntimeError("This model currently doesn't have pretrained weights available.") elif checkpoint is None: raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") + state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it @@ -202,24 +176,24 @@ def convert_Hiera_checkpoint( checkpoint_url, pytorch_dump_folder_path): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config) + model = Hiera(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): strict = False - model.load_state_dict(state_dict["model_state"], strict=strict) + model.load_state_dict(state_dict["model_state"]) + # model.load_state_dict(state_dict["model_state"], strict=strict) url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - image = Image.open(requests.get(url, stream=True).raw) - image_processor = HieraImageProcessor(size=config.image_size) - inputs = image_processor.process_image(images=image, return_tensors="pt") + image_processor = HieraImageProcessor(size=224) + inputs = image_processor.process_image(image_url=url) # forward pass out = model(inputs[None, ...]) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7e42d5914d44..7bafed5c3cd0 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -25,11 +25,40 @@ import torch import torch.nn as nn import torch.nn.functional as F +from dataclasses import dataclass from timm.models.layers import DropPath, Mlp - -from .hiera_utils import pretrained_model, conv_nd, do_pool, do_masked_conv, Unroll, Reroll - +from ...modeling_utils import PreTrainedModel +# from ...modeling_outputs import BaseModelOutput +# from ...utils import ( +# ModelOutput, +# add_start_docstrings, +# add_start_docstrings_to_model_forward, +# logging, +# replace_return_docstrings, +# ) + +from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +# @dataclass +# class HieraModelOutput(ModelOutput): +# """ +# Base class for Hiera model's outputs. + +# Args: +# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): +# Last layer hidden-states. +# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): +# Attentions weights from the model, one for each layer. +# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): +# Hidden states of the model at the output of each layer. +# intermediates (list[torch.Tensor], optional): +# Intermediate representations or features from the model, if applicable. 
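Taken together, the conversion entry point above boils down to: pick a HieraConfig for the checkpoint name, build the model from it, remap the released state dict with convert_state_dict, and push one processed image through the network. A condensed sketch of that flow, assuming the imports at the top of convert_hiera_to_pytorch.py and the hiera_base_224 fine-tuned checkpoint (illustrative only; it skips the strict/decoder handling):

    config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
    model = Hiera(config=config)

    state_dict = torch.hub.load_state_dict_from_url(
        "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", map_location="cpu"
    )
    state_dict["model_state"] = convert_state_dict(state_dict["model_state"], {})
    model.load_state_dict(state_dict["model_state"])

    image_processor = HieraImageProcessor(size=224)
    inputs = image_processor.process_image(image_url=url)  # url: the test image defined in the script above
    out = model(inputs[None, ...])  # process_image returns an unbatched tensor, so add the batch dim here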
+# """ +# last_hidden_state: torch.FloatTensor +# attentions: Optional[Tuple[torch.FloatTensor]] = None +# hidden_states: Optional[Tuple[torch.FloatTensor]] = None +# intermediates: Optional[list[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -204,86 +233,110 @@ def forward( return x -class Hiera(nn.Module): +class Hiera(PreTrainedModel): + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "x" + supports_gradient_checkpointing = True + def __init__(self, config: HieraConfig): - super().__init__() + self.input_size = config.input_size + self.in_chans = config.in_chans + self.embedding_dimension = config.embedding_dimension + self.number_of_heads = config.number_of_heads + self.num_classes = config.num_classes + self.stages = config.stages + self.q_pool = config.q_pool + self.q_stride = config.q_stride + self.mask_unit_size = config.mask_unit_size + self.mask_unit_attn = config.mask_unit_attn + self.dim_mul = config.dim_mul + self.head_mul = config.head_mul + self.patch_kernel = config.patch_kernel + self.patch_stride = config.patch_stride + self.patch_padding = config.patch_padding + self.mlp_ratio = config.mlp_ratio + self.drop_path_rate = config.drop_path_rate + self.head_dropout = config.head_dropout + self.head_init_scale = config.head_init_scale + self.sep_position_embeddings = config.sep_position_embeddings + + super().__init__(config) self.config = config - super().__init__() norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed - self.config = config - depth = sum(self.config.stages) - self.tokens_spatial_shape = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + depth = sum(self.stages) + self.tokens_spatial_shape = [i // s for i, s in zip(self.input_size, self.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) - flat_mu_size = math.prod(self.config.mask_unit_size) - flat_q_stride = math.prod(self.config.q_stride) + flat_mu_size = math.prod(self.mask_unit_size) + flat_q_stride = math.prod(self.q_stride) - assert self.config.q_pool < len(self.config.stages) - self.q_pool, self.q_stride = self.config.q_pool, self.config.q_stride - self.mu_size, self.mask_unit_size = flat_mu_size, self.config.mask_unit_size + assert self.q_pool < len(self.stages) + self.q_pool, self.q_stride = self.q_pool, self.q_stride + self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size self.mask_spatial_shape = [ i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) ] - self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( - self.config.in_chans, self.config.embedding_dimension, self.config.patch_kernel, self.config.patch_stride, self.config.patch_padding + self.in_chans, self.embedding_dimension, self.patch_kernel, self.patch_stride, self.patch_padding ) - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( torch.zeros( 1, self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], - self.config.embedding_dimension, + self.embedding_dimension, ) ) self.position_embeddings_temporal = nn.Parameter( - torch.zeros(1, self.tokens_spatial_shape[0], self.config.embedding_dimension) + torch.zeros(1, self.tokens_spatial_shape[0], self.embedding_dimension) ) else: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, 
self.config.embedding_dimension)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules self.unroll = Unroll( - self.config.input_size, self.config.patch_stride, [self.config.q_stride] * len(self.stage_ends[:-1]) + self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) ) self.reroll = Reroll( - self.config.input_size, - self.config.patch_stride, - [self.config.q_stride] * len(self.stage_ends[:-1]), + self.input_size, + self.patch_stride, + [self.q_stride] * len(self.stage_ends[:-1]), self.stage_ends, - self.config.q_pool, + self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.config.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, self.config.drop_path_rate, depth)] + dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] # Transformer blocks cur_stage = 0 self.blocks = nn.ModuleList() for i in range(depth): - output_dim = self.config.embedding_dimension + output_dim = self.embedding_dimension # Mask unit or global attention. # Lag by 1 block, so that global attention, # applied post pooling on lower resolution - use_mask_unit_attention = self.config.mask_unit_attn[cur_stage] + use_mask_unit_attention = self.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: - output_dim = int(self.config.embedding_dimension * self.config.dim_mul) - number_of_heads = int(self.config.number_of_heads * self.config.head_mul) + output_dim = int(self.embedding_dimension * self.dim_mul) + number_of_heads = int(self.number_of_heads * self.head_mul) cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride else: - number_of_heads = self.config.number_of_heads + number_of_heads = self.number_of_heads block = HieraBlock( - input_dim=self.config.embedding_dimension, + input_dim=self.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=self.config.mlp_ratio, + mlp_ratio=self.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -291,21 +344,22 @@ def __init__(self, config: HieraConfig): use_mask_unit_attention=use_mask_unit_attention, ) - self.config.embedding_dimension = output_dim + self.embedding_dimension = output_dim self.blocks.append(block) - self.norm = norm_layer(self.config.embedding_dimension) - self.head = Head(self.config.embedding_dimension, self.config.num_classes, dropout_rate=self.config.head_dropout) + self.norm = norm_layer(self.embedding_dimension) + self.head = Head(self.embedding_dimension, self.num_classes, dropout_rate=self.head_dropout) # Initialize everything - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: nn.init.trunc_normal_(self.position_embeddings_spatial, std=0.02) nn.init.trunc_normal_(self.position_embeddings_temporal, std=0.02) else: nn.init.trunc_normal_(self.position_embeddings, std=0.02) self.apply(partial(self._init_weights)) - self.head.projection.weight.data.mul_(self.config.head_init_scale) - self.head.projection.bias.data.mul_(self.config.head_init_scale) + self.head.projection.weight.data.mul_(self.head_init_scale) + self.head.projection.bias.data.mul_(self.head_init_scale) + self.post_init() def _init_weights(self, m, init_bias=0.02): if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): @@ -318,7 +372,7 @@ def _init_weights(self, m, init_bias=0.02): @torch.jit.ignore def 
no_weight_decay(self): - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return ["position_embeddings_spatial", "position_embeddings_temporal"] else: return ["position_embeddings"] @@ -350,7 +404,7 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: return mask.bool() def get_position_embeddings(self) -> torch.Tensor: - if self.config.sep_position_embeddings: + if self.sep_position_embeddings: return self.position_embeddings_spatial.repeat( 1, self.tokens_spatial_shape[0], 1 ) + torch.repeat_interleave( @@ -411,106 +465,4 @@ def forward( if return_intermediates: return x, intermediates - return x - - -# Image models - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_tiny_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_tiny_224(**kwargs): - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 7, 2), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_small_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_small_224(**kwdargs): - return Hiera(embedding_dimension=96, number_of_heads=1, stages=(1, 2, 11, 2), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_224(**kwargs): - config = HieraConfig(embedding_dimention=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - return Hiera(config) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_base_plus_224(**kwdargs): - return Hiera(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_large_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_large_224(**kwdargs): - return Hiera(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs) - - -@pretrained_model({ - "mae_in1k_ft_in1k": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_224.pth", - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k_ft_in1k") -def hiera_huge_224(**kwdargs): - return Hiera(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs) - - -# Video models - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_16x224(num_classes: int = 400, **kwdargs): - return Hiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_base_plus_16x224.pth", - "mae_k400": 
"https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_base_plus_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_large_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_large_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400_ft_k400") -def hiera_huge_16x224(**kwdargs): - return hiera_base_16x224( - embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return x \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py new file mode 100644 index 000000000000..4900e4a4d3fb --- /dev/null +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -0,0 +1,56 @@ + +"""Image processor class for Hirea.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_vision_available, logging +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from PIL import Image +import requests + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class HieraImageProcessor(BaseImageProcessor): + def __init__(self, size): + self.size = size + self.transform_list = [ + transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), + transforms.CenterCrop(self.size) + ] + self.transform_vis = transforms.Compose(self.transform_list) + self.transform_norm = transforms.Compose(self.transform_list + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + + def process_image(self, image_url): + # Load the image + img = Image.open(requests.get(image_url, stream=True).raw) + + # Apply transformations + img_vis = self.transform_vis(img) + img_norm = self.transform_norm(img) + + return img_norm \ No newline at end of file diff --git a/tests/models/hiera/__init__.py b/tests/models/hiera/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py new file mode 100644 index 000000000000..014d41766a8e --- /dev/null +++ b/tests/models/hiera/test_modeling_vit_mae.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViTMAE model. """ + + +import math +import tempfile +import unittest + +import numpy as np + +from transformers import ViTMAEConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ViTMAEForPreTraining, ViTMAEModel + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTImageProcessor \ No newline at end of file From a25a3a7fc200913b64070a2781ce5e4ff7f87452 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Feb 2024 23:41:40 +0000 Subject: [PATCH 021/118] better naming for x in forward pass --- src/transformers/__init__.py | 4 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/hiera/__init__.py | 2 +- .../models/hiera/configuration_hiera.py | 8 +- .../models/hiera/convert_hiera_to_pytorch.py | 10 +- src/transformers/models/hiera/hiera.py | 163 ++++++++++-------- src/transformers/models/hiera/hiera_mae.py | 6 +- src/transformers/models/hiera/hiera_utils.py | 6 +- tests/models/hiera/test_modeling_hiera.py | 87 ++++++++++ tests/models/hiera/test_modeling_vit_mae.py | 44 ----- 11 files changed, 199 insertions(+), 135 deletions(-) create mode 100644 tests/models/hiera/test_modeling_hiera.py delete mode 100644 tests/models/hiera/test_modeling_vit_mae.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 69eb50a0ca37..9e3c4c5f7c96 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4151,7 +4151,7 @@ _import_structure["models.hiera"].extend( [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hiera", + "HieraModel", ] ) @@ -6993,7 +6993,7 @@ HubertPreTrainedModel, ) from .models.hiera import ( - Hiera, + HieraModel, HieraBlock ) from .models.ibert import ( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 28b8243dd9ef..796e524fd0cf 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","Hiera"), + ("hiera","HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1fa0c71b1537..0fc417e795e4 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -115,7 +115,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", 
"GroupViTModel"), - ("hiera", "Hiera"), + ("hiera", "HieraModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index f88e32d03c98..0434517bf52c 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -42,7 +42,7 @@ pass else: from .hiera import ( - Hiera, + HieraModel, Head, HieraBlock, MaskUnitAttention, diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index c7dfaeaeedfb..e3133354f6ea 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -13,8 +13,8 @@ class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the Hiera + This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an HieraModel model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the HieraModel [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -46,13 +46,13 @@ class HieraConfig(PretrainedConfig): Example: ```python - >>> from transformers import HieraConfig, Hiera + >>> from transformers import HieraConfig, HieraModel >>> # Initializing a ViT MAE vit-mae-base style configuration >>> configuration = HieraConfig() >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = Hiera(configuration) + >>> model = HieraModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d1b6e8a4ad30..d0294f12deab 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,10 +3,10 @@ import requests import torch from PIL import Image -from transformers.models.hiera.configuration_hiera import HieraConfig -from transformers.models.hiera.hiera import Hiera -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, Hiera +# from transformers.models.hiera.configuration_hiera import HieraConfig +# from transformers.models.hiera.hiera import HieraModel +# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +# from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD @@ -176,7 +176,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): del state_dict["model_state"]["head.projection.weight"] del state_dict["model_state"]["head.projection.bias"] - model = Hiera(config=config) + model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder 
model into an encoder-only model if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 7bafed5c3cd0..72917eb8e1a4 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional +from typing import List, Tuple, Callable, Optional, Union from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -29,36 +29,34 @@ from timm.models.layers import DropPath, Mlp from ...modeling_utils import PreTrainedModel -# from ...modeling_outputs import BaseModelOutput -# from ...utils import ( -# ModelOutput, -# add_start_docstrings, -# add_start_docstrings_to_model_forward, -# logging, -# replace_return_docstrings, -# ) +from ...modeling_outputs import BaseModelOutput +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll -# @dataclass -# class HieraModelOutput(ModelOutput): -# """ -# Base class for Hiera model's outputs. - -# Args: -# last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): -# Last layer hidden-states. -# attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True): -# Attentions weights from the model, one for each layer. -# hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True): -# Hidden states of the model at the output of each layer. -# intermediates (list[torch.Tensor], optional): -# Intermediate representations or features from the model, if applicable. -# """ -# last_hidden_state: torch.FloatTensor -# attentions: Optional[Tuple[torch.FloatTensor]] = None -# hidden_states: Optional[Tuple[torch.FloatTensor]] = None -# intermediates: Optional[list[torch.Tensor]] = None +@dataclass +class HieraModelOutput(ModelOutput): + """ + Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. + + Args: + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + Last layer hidden-states. + attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): + Attentions weights from the model, one for each layer. + hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): + Hidden states of the model at the output of each layer. + intermediates (List[torch.Tensor], optional): + Intermediate representations or features from the model, if applicable. + """ + last_hidden_state: torch.FloatTensor + intermediates: Optional[List[torch.Tensor]] = None class MaskUnitAttention(nn.Module): @@ -102,15 +100,15 @@ def __init__( self.window_size = window_size self.use_mask_unit_attention = use_mask_unit_attention - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: """ Input should be of shape [batch, tokens, channels]. 
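As a reading aid for the renamed forward pass, the shapes work out as follows when mask unit attention is enabled (descriptive only, derived from the reshape/permute in the body below):

    # embeddings:  [batch, tokens, input_dim]
    # qkv:         [3, batch, heads, windows, q_stride * window_size, head_dim],
    #              with windows = tokens // (q_stride * window_size)
    # q only:      max-pooled by q_stride inside each window when q_stride > 1
    # output:      [batch, tokens // q_stride, output_dim] after the final projection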
""" - batch_size , num_channels , _ = x.shape + batch_size , num_channels , _ = embeddings.shape num_windows = ( (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 ) qkv = ( - self.qkv(x) + self.qkv(embeddings) .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) @@ -126,15 +124,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if hasattr(F, "scaled_dot_product_attention"): # Note: the original paper did *not* use SDPA, it's a free boost! - x = F.scaled_dot_product_attention(q, k, v) + embeddings = F.scaled_dot_product_attention(q, k, v) else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - x = (attention @ v) + embeddings = (attention @ v) - x = x.transpose(1, 3).reshape(batch_size , -1, self.output_dim) - x = self.projection(x) - return x + embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = self.projection(embeddings) + return embeddings class HieraBlock(nn.Module): @@ -168,16 +166,16 @@ def __init__( if input_dim != output_dim: self.projection = nn.Linear(input_dim, output_dim) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: # Attention + Q Pooling - normalized_input = self.norm1(x) + normalized_embeddings = self.norm1(embeddings) if self.input_dim != self.output_dim: - x = do_pool(self.projection(normalized_input), stride=self.attention.q_stride) - x = x + self.drop_path(self.attention(normalized_input)) + embeddings = do_pool(self.projection(normalized_embeddings), stride=self.attention.q_stride) + embeddings = embeddings + self.drop_path(self.attention(normalized_embeddings)) # MLP - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x + embeddings = embeddings + self.drop_path(self.mlp(self.norm2(embeddings))) + return embeddings class Head(nn.Module): @@ -226,17 +224,36 @@ def __init__( ) def forward( - self, x: torch.Tensor, mask: Optional[torch.Tensor] = None + self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None ) -> torch.Tensor: - x = do_masked_conv(x, self.projection, mask) - x = x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 1) - return x + embeddings = do_masked_conv(pixel_values, self.projection, mask) + embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) + return embeddings + +class HireaModel(PreTrainedModel): + """ + Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + + This model is a PyTorch implementation of the Hiera architecture for image classification. + + The model can be used as follows: + + Args: + config (HieraConfig): Configuration class instance for `Hiera`. 
+ + Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """ -class Hiera(PreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" - main_input_name = "x" + main_input_name = "pixel_values" supports_gradient_checkpointing = True def __init__(self, config: HieraConfig): @@ -417,52 +434,56 @@ def get_position_embeddings(self) -> torch.Tensor: def forward( self, - x: torch.Tensor, + pixel_values: torch.Tensor, mask: torch.Tensor = None, + return_dict: Optional[bool] = True, return_intermediates: bool = False, - ) -> torch.Tensor: + ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. """ # Slowfast training passes in a list - if isinstance(x, list): - x = x[0] + if isinstance(pixel_values, list): + pixel_values = pixel_values[0] intermediates = [] - x = self.patch_embedding( - x, + pached_embeddings = self.patch_embedding( + pixel_values, mask=mask.view( - x.shape[0], 1, *self.mask_spatial_shape + pixel_values.shape[0], 1, *self.mask_spatial_shape ) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) - x = x + self.get_position_embeddings() - x = self.unroll(x) + embeddings = pached_embeddings + self.get_position_embeddings() + embeddings = self.unroll(embeddings) # Discard masked tokens if mask is not None: - x = x[mask[..., None].tile(1, self.mu_size, x.shape[2])].view( - x.shape[0], -1, x.shape[-1] + embeddings = embeddings[mask[..., None].tile(1, self.mu_size, embeddings.shape[2])].view( + embeddings.shape[0], -1, embeddings.shape[-1] ) - for i, blk in enumerate(self.blocks): - x = blk(x) + for i, block in enumerate(self.blocks): + embeddings = block(embeddings) if return_intermediates and i in self.stage_ends: - intermediates.append(self.reroll(x, i, mask=mask)) + intermediates.append(self.reroll(embeddings, i, mask=mask)) if mask is None: - x = x.mean(dim=1) - x = self.norm(x) - x = self.head(x) + embeddings = embeddings.mean(dim=1) + embeddings = self.norm(embeddings) + embeddings = self.head(embeddings) - # x may not always be in spatial order here. + # embeddings may not always be in spatial order here. # e.g. 
if q_pool = 2, mask_unit_size = (8, 8), and # q_stride = (2, 2), not all unrolls were consumed, - # intermediates[-1] is x in spatial order - if return_intermediates: - return x, intermediates - - return x \ No newline at end of file + # intermediates[-1] is embeddings in spatial order + if not return_dict: + return tuple(v for v in [embeddings, intermediates] if v is not None) + + return HieraModelOutput( + last_hidden_state=embeddings, + intermediates=intermediates if return_intermediates else None, + ) \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index a0504997350b..c45056318a38 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import Hiera, HieraBlock +from .hiera import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd @@ -36,8 +36,8 @@ def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: return x -class MaskedAutoencoderHiera(Hiera): - """Masked Autoencoder with Hiera backbone""" +class MaskedAutoencoderHiera(HieraModel): + """Masked Autoencoder with HieraModel backbone""" def __init__( self, diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py index c96c63cbfaf9..a35b33210941 100644 --- a/src/transformers/models/hiera/hiera_utils.py +++ b/src/transformers/models/hiera/hiera_utils.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. # -------------------------------------------------------- # -# Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles # # Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, # Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, @@ -27,7 +27,7 @@ from .convert_hiera_to_pytorch import convert_state_dict def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a Hiera model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ + """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. """ def inner(model_func: Callable) -> Callable: def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: @@ -69,7 +69,7 @@ def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) """ return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py new file mode 100644 index 000000000000..8d593af2a622 --- /dev/null +++ b/tests/models/hiera/test_modeling_hiera.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
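With forward now returning a HieraModelOutput, downstream code reads named fields instead of unpacking positional values. A short usage sketch (it assumes a HieraModel built and loaded as in the conversion script and an already processed pixel_values tensor; intermediates is only populated when return_intermediates=True):

    outputs = model(pixel_values=pixel_values, return_dict=True, return_intermediates=True)
    hidden = outputs.last_hidden_state      # with mask=None this is mean-pooled, normed, and passed through the head
    stage_features = outputs.intermediates  # one re-rolled feature map per stage end

    # return_dict=False falls back to a plain tuple
    hidden, stage_features = model(pixel_values=pixel_values, return_dict=False, return_intermediates=True)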
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Hiera model. """ + +import unittest + +from transformers import HieraConfig +from transformers.testing_utils import ( + require_torch, + slow, + torch_device, +) +from transformers.utils import is_torch_available + +if is_torch_available(): + import torch + from transformers import HieraModel + # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model + from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + + +class HieraModelTester: + # Define this tester to initialize Hiera model and its configurations for testing + def __init__( + self, + parent, + batch_size=8, + num_channels=3, + image_size=224, + # Add other model-specific parameters here + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + # Initialize other necessary attributes here + + def prepare_config_and_inputs(self): + # Prepare configuration and inputs for testing your model + pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return HieraConfig( + # Define necessary configuration parameters here + ) + + def create_and_check_model(self, config, pixel_values): + model = HieraModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values=pixel_values) + # Perform checks here, e.g., output shapes, etc. + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + +@require_torch +class HieraModelTest(unittest.TestCase): + + def setUp(self): + self.model_tester = HieraModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = HieraModel.from_pretrained(model_name) + self.assertIsNotNone(model) \ No newline at end of file diff --git a/tests/models/hiera/test_modeling_vit_mae.py b/tests/models/hiera/test_modeling_vit_mae.py deleted file mode 100644 index 014d41766a8e..000000000000 --- a/tests/models/hiera/test_modeling_vit_mae.py +++ /dev/null @@ -1,44 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ViTMAE model. 
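The new tester above still has a stub get_config, and its shape assertion references num_attention_heads, seq_length and hidden_size, none of which the tester defines. One way to make it self-consistent is a deliberately small configuration plus an assertion derived from the model's actual behavior: with no mask, forward mean-pools the tokens and applies the classification head, so last_hidden_state collapses to (batch_size, num_classes). A sketch (parameter names are ones HieraConfig already accepts elsewhere in this patch; the values are placeholders, not the pretrained defaults):

    def get_config(self):
        return HieraConfig(
            embedding_dimension=32,
            number_of_heads=1,
            stages=(1, 1, 1, 1),
            num_classes=10,
        )

    def create_and_check_model(self, config, pixel_values):
        model = HieraModel(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values=pixel_values)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, config.num_classes))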
""" - - -import math -import tempfile -import unittest - -import numpy as np - -from transformers import ViTMAEConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.utils import cached_property, is_torch_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_torch_available(): - import torch - from torch import nn - - from transformers import ViTMAEForPreTraining, ViTMAEModel - from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST - - -if is_vision_available(): - from PIL import Image - - from transformers import ViTImageProcessor \ No newline at end of file From 51d11f554c2cdc86c6a52319dead96c811f7174a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 17 Feb 2024 00:10:52 +0000 Subject: [PATCH 022/118] Moved utils to hiera --- src/transformers/models/hiera/hiera.py | 226 ++++++++++++++++++++++++- 1 file changed, 223 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera.py index 72917eb8e1a4..cca502aa80c9 100644 --- a/src/transformers/models/hiera/hiera.py +++ b/src/transformers/models/hiera/hiera.py @@ -20,7 +20,7 @@ import math from functools import partial -from typing import List, Tuple, Callable, Optional, Union +from typing import List, Tuple, Callable, Optional, Union, Type from .configuration_hiera import HieraConfig import torch import torch.nn as nn @@ -38,7 +38,227 @@ replace_return_docstrings, ) -from .hiera_utils import conv_nd, do_pool, do_masked_conv, Unroll, Reroll + +def conv_nd(n: int) -> Type[nn.Module]: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: + # target_size: [(T), (H), W] + # (spatial) mask: [B, C, (t), (h), w] + if mask is None: + return mask + + assert len(mask.shape[2:]) == len(target_size) + if mask.shape[2:] != target_size: + return F.interpolate(mask.float(), size=target_size) + return mask + + +def do_masked_conv( + x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if conv is None: + return x + if mask is None: + return conv(x) + + mask = get_resized_mask(target_size=x.shape[2:], mask=mask) + return conv(x * mask.bool()) + + +def undo_windowing( + x: torch.Tensor, shape: List[int], mu_shape: List[int] +) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + + +class Unroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. + E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + self.schedule = unroll_schedule + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = ( + [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + ) + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +class Reroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. 
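The Unroll docstring above is the crux of how pooling is implemented: once the stride positions are contiguous and moved to the front, an N-d max-pool reduces to a reshape plus a max over one dimension, which is exactly what do_pool does. A small self-contained check of that equivalence for a 2-d input and stride (2, 2) (it inlines the unroll instead of calling the module):

    import torch
    import torch.nn.functional as F

    B, H, W, C = 1, 4, 4, 1
    x = torch.arange(B * H * W * C, dtype=torch.float).reshape(B, H, W, C)

    # unroll with stride (2, 2): [B, H, W, C] -> [B, Sy, Sx, H//2, W//2, C] -> [B, N, C]
    unrolled = x.view(B, H // 2, 2, W // 2, 2, C).permute(0, 2, 4, 1, 3, 5).reshape(B, -1, C)

    # do_pool: take the max over the Sy*Sx positions that were moved up front
    pooled = unrolled.view(B, 4, -1, C).max(dim=1).values

    # identical to an ordinary 2x2 max-pool applied in spatial order
    reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=2).permute(0, 2, 3, 1).reshape(B, -1, C)
    assert torch.allclose(pooled, reference)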
+ self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward( + self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x + @dataclass class HieraModelOutput(ModelOutput): @@ -231,7 +451,7 @@ def forward( return embeddings -class HireaModel(PreTrainedModel): +class HieraModel(PreTrainedModel): """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. From ea872fe2f81935e5cf5fdfd086fb3b29fb39f4b9 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 00:17:14 +0000 Subject: [PATCH 023/118] Change hiera -> hiera_model --- src/transformers/models/hiera/__init__.py | 89 +----- src/transformers/models/hiera/benchmarking.py | 77 ----- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera.py => hiera_model.py} | 0 src/transformers/models/hiera/hiera_utils.py | 287 ------------------ 5 files changed, 3 insertions(+), 452 deletions(-) delete mode 100644 src/transformers/models/hiera/benchmarking.py rename src/transformers/models/hiera/{hiera.py => hiera_model.py} (100%) delete mode 100644 src/transformers/models/hiera/hiera_utils.py diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0434517bf52c..1f388d5361ab 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -41,7 +41,7 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera import ( + from .hiera_model import ( HieraModel, Head, HieraBlock, @@ -54,89 +54,4 @@ else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) - -####### PREV: - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# from typing import TYPE_CHECKING - -# from ...utils import ( -# OptionalDependencyNotAvailable, -# _LazyModule, -# is_flax_available, -# is_tf_available, -# is_torch_available, -# ) - - -# _import_structure = {"configuration_vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"]} - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_vit_mae"] = [ -# "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", -# "ViTMAEForPreTraining", -# "ViTMAELayer", -# "ViTMAEModel", -# "ViTMAEPreTrainedModel", -# ] - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# _import_structure["modeling_tf_vit_mae"] = [ -# "TFViTMAEForPreTraining", -# "TFViTMAEModel", -# "TFViTMAEPreTrainedModel", -# ] - -# if TYPE_CHECKING: -# from .configuration_vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig - -# try: -# if not is_torch_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_vit_mae import ( -# VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, -# ViTMAEForPreTraining, -# ViTMAELayer, -# ViTMAEModel, -# ViTMAEPreTrainedModel, -# ) - -# try: -# if not is_tf_available(): -# raise OptionalDependencyNotAvailable() -# except OptionalDependencyNotAvailable: -# pass -# else: -# from .modeling_tf_vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel - - -# else: -# import sys - -# sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/hiera/benchmarking.py b/src/transformers/models/hiera/benchmarking.py deleted file mode 100644 index 33166028977a..000000000000 --- a/src/transformers/models/hiera/benchmarking.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- - -import time -from typing import List, Tuple, Union - -import torch -from tqdm import tqdm - -# From https://github.com/facebookresearch/ToMe/ -def benchmark( - model: torch.nn.Module, - device: torch.device = 0, - input_size: Tuple[int] = (3, 224, 224), - batch_size: int = 64, - runs: int = 40, - throw_out: float = 0.25, - use_fp16: bool = False, - verbose: bool = False, -) -> float: - """ - Benchmark the given model with random inputs at the given batch size. 
- - Args: - - model: the module to benchmark - - device: the device to use for benchmarking - - input_size: the input size to pass to the model e.g., (ch, h, w) or (ch, t, h, w) - - batch_size: the batch size to use for evaluation - - runs: the number of total runs to do - - throw_out: the percentage of runs to throw out at the start of testing - - use_fp16: whether or not to benchmark with float16 and autocast - - verbose: whether or not to use tqdm to print progress / print throughput at end - - Returns: - - the throughput measured in images / second - """ - if not isinstance(device, torch.device): - device = torch.device(device) - is_cuda = torch.device(device).type == "cuda" - - model = model.eval().to(device) - input = torch.rand(batch_size, *input_size, device=device) - if use_fp16: - input = input.half() - - warm_up = int(runs * throw_out) - total = 0 - start = time.time() - - with torch.autocast(device.type, enabled=use_fp16): - with torch.no_grad(): - for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"): - if i == warm_up: - if is_cuda: - torch.cuda.synchronize() - total = 0 - start = time.time() - - model(input) - total += batch_size - - if is_cuda: - torch.cuda.synchronize() - - end = time.time() - elapsed = end - start - - throughput = total / elapsed - - if verbose: - print(f"Throughput: {throughput:.2f} im/s") - - return throughput diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index c45056318a38..f0e2e7854bff 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera import HieraModel, HieraBlock +from .hiera_model import HieraModel, HieraBlock from .hiera_utils import pretrained_model, undo_windowing, conv_nd diff --git a/src/transformers/models/hiera/hiera.py b/src/transformers/models/hiera/hiera_model.py similarity index 100% rename from src/transformers/models/hiera/hiera.py rename to src/transformers/models/hiera/hiera_model.py diff --git a/src/transformers/models/hiera/hiera_utils.py b/src/transformers/models/hiera/hiera_utils.py deleted file mode 100644 index a35b33210941..000000000000 --- a/src/transformers/models/hiera/hiera_utils.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- -# -# HieraModel: A Hierarchical Vision Transformer without the Bells-and-Whistles -# -# Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, -# Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, -# Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. -# -# Paper: https://arxiv.org/abs/2306.00989/ -# -# References: -# slowfast: https://github.com/facebookresearch/SlowFast -# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm -# -------------------------------------------------------- - -import math -from typing import List, Tuple, Optional, Type, Callable, Dict - -import torch -import torch.nn as nn -import torch.nn.functional as F -from .convert_hiera_to_pytorch import convert_state_dict - -def pretrained_model(checkpoints: Dict[str, str], default: str = None) -> Callable: - """ Loads a HieraModel model from a pretrained source (if pretrained=True). Use "checkpoint" to specify the checkpoint. 
""" - - def inner(model_func: Callable) -> Callable: - def model_def(pretrained: bool = False, checkpoint: str = default, strict: bool = True, **kwdargs) -> nn.Module: - if pretrained: - if checkpoints is None: - raise RuntimeError("This model currently doesn't have pretrained weights available.") - elif checkpoint is None: - raise RuntimeError("No checkpoint specified.") - elif checkpoint not in checkpoints: - raise RuntimeError(f"Invalid checkpoint specified ({checkpoint}). Options are: {list(checkpoints.keys())}.") - - state_dict = torch.hub.load_state_dict_from_url(checkpoints[checkpoint], map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) - if "head.projection.weight" in state_dict["model_state"]: - # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it - if "num_classes" not in kwdargs: - kwdargs["num_classes"] = state_dict["model_state"]["head.projection.weight"].shape[0] - # If the user specified a different number of classes, remove the projection weights or else we'll error out - elif kwdargs["num_classes"] != state_dict["model_state"]["head.projection.weight"].shape[0]: - del state_dict["model_state"]["head.projection.weight"] - del state_dict["model_state"]["head.projection.bias"] - - model = model_func(**kwdargs) - if pretrained: - # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): - strict = False - - model.load_state_dict(state_dict["model_state"], strict=strict) - - return model - - return model_def - - return inner - - - -def conv_nd(n: int) -> Type[nn.Module]: - """ - Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. - If you wanted a 4d HieraModel, you could probably just implement this for n=4. (no promises) - """ - return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] - - -def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: - # Refer to `Unroll` to see how this performs a maxpool-Nd - return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values - - -def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tensor: - # target_size: [(T), (H), W] - # (spatial) mask: [B, C, (t), (h), w] - if mask is None: - return mask - - assert len(mask.shape[2:]) == len(target_size) - if mask.shape[2:] != target_size: - return F.interpolate(mask.float(), size=target_size) - return mask - - -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: - """Zero-out the masked regions of the input before conv. - Prevents leakage of masked regions when using overlapping kernels. - """ - if conv is None: - return x - if mask is None: - return conv(x) - - mask = get_resized_mask(target_size=x.shape[2:], mask=mask) - return conv(x * mask.bool()) - - -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: - """ - Restore spatial organization by undoing windowed organization of mask units. - - Args: - x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] - shape: current spatial shape, if it were not organized into mask unit - windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. - mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] - Returns: - x: e.g. 
in 2d, [B, #MUy*MUy, #MUx*MUx, C] - """ - D = len(shape) - B, C = x.shape[0], x.shape[-1] - # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] - num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] - x = x.view(B, *num_MUs, *mu_shape, C) - - # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] - permute = ( - [0] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], - [], - ) - + [len(x.shape) - 1] - ) - x = x.permute(permute).reshape(B, *shape, C) - - return x - - - -class Unroll(nn.Module): - """ - Reorders the tokens such that patches are contiguous in memory. - E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as - [B, (Sy, Sx, H // Sy, W // Sx), C] - - This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). - Not only is this faster, but it also makes it easy to support inputs of arbitrary - dimensions in addition to patch-wise sparsity. - - Performing this operation multiple times in sequence puts entire windows as contiguous - in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of - size 8x8 would be contiguous in memory, allowing operations like mask unit attention - computed easily and efficiently, while also allowing max to be applied sequentially. - - Note: This means that intermediate values of the model are not in HxW order, so they - need to be re-rolled if you want to use the intermediate values as a HxW feature map. - The last block of the network is fine though, since by then the strides are all consumed. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - self.schedule = unroll_schedule - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Input: Flattened patch embeddings [B, N, C] - Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd - """ - B, _, C = x.shape - - cur_size = self.size - x = x.view(*([B] + cur_size + [C])) - - for strides in self.schedule: - # Move patches with the given strides to the batch dimension - - # Create a view of the tensor with the patch stride as separate dims - # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] - cur_size = [i // s for i, s in zip(cur_size, strides)] - new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] - x = x.view(new_shape) - - # Move the patch stride into the batch dimension - # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] - L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) - x = x.permute(permute) - - # Now finally flatten the relevant dims into the batch dimension - x = x.flatten(0, len(strides)) - B *= math.prod(strides) - - x = x.reshape(-1, math.prod(self.size), C) - return x - - -class Reroll(nn.Module): - """ - Undos the "unroll" operation so that you can use intermediate features. - """ - - def __init__( - self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - stage_ends: List[int], - q_pool: int, - ): - super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - - # The first stage has to reverse everything - # The next stage has to reverse all but the first unroll, etc. 
- self.schedule = {} - size = self.size - for i in range(stage_ends[-1] + 1): - self.schedule[i] = unroll_schedule, size - # schedule unchanged if no pooling at a stage end - if i in stage_ends[:q_pool]: - if len(unroll_schedule) > 0: - size = [n // s for n, s in zip(size, unroll_schedule[0])] - unroll_schedule = unroll_schedule[1:] - - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: - """ - Roll the given tensor back up to spatial order assuming it's from the given block. - - If no mask is provided: - - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. - If a mask is provided: - - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. - """ - schedule, size = self.schedule[block_idx] - B, N, C = x.shape - - D = len(size) - cur_mu_shape = [1] * D - - for strides in schedule: - # Extract the current patch from N - x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) - - # Move that patch into the current MU - # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] - L = len(x.shape) - permute = ( - [0, 1 + D] - + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], - [], - ) - + [L - 1] - ) - x = x.permute(permute) - - # Reshape to [B, N//(Sy*Sx), *MU, C] - for i in range(D): - cur_mu_shape[i] *= strides[i] - x = x.reshape(B, -1, *cur_mu_shape, C) - N = x.shape[1] - - # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) - x = x.view(B, N, *cur_mu_shape, C) - - # If masked, return [B, #MUs, MUy, MUx, C] - if mask is not None: - return x - - # If not masked, we can return [B, H, W, C] - x = undo_windowing(x, size, cur_mu_shape) - - return x \ No newline at end of file From fa570f307c814942366dce660ba0407192c14b22 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:10:17 +0000 Subject: [PATCH 024/118] Fixed integration into tranformers --- src/transformers/__init__.py | 2 +- src/transformers/models/hiera/__init__.py | 13 ++++++++----- .../models/hiera/hiera_image_processor.py | 2 +- src/transformers/models/hiera/hiera_model.py | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9e3c4c5f7c96..51771d7f2229 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6993,8 +6993,8 @@ HubertPreTrainedModel, ) from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - HieraBlock ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 1f388d5361ab..2b83a4c8d693 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -9,8 +9,8 @@ _import_structure = { "configuration_hiera": [ - "HIREA_PRETRAINED_CONFIG_ARCHIVE_MAP", - "HireaConfig", + "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "HieraConfig", ], } @@ -20,15 +20,16 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hirea"] = [ - "HIREA_PRETRAINED_MODEL_ARCHIVE_LIST", - "Hirea", + _import_structure["hiera_model"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", "Head", "HieraBlock", "MaskUnitAttention" "" ] + if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -42,10 +43,12 @@ pass else: from .hiera_model import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, Head, HieraBlock, MaskUnitAttention, + ) from .hiera_image_processor import ( 
HieraImageProcessor diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4900e4a4d3fb..d3f2ce96a64b 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,5 +1,5 @@ -"""Image processor class for Hirea.""" +"""Image processor class for Hiera.""" from typing import Dict, List, Optional, Union diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index cca502aa80c9..5e7493e3c6a7 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -38,6 +38,9 @@ replace_return_docstrings, ) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "", +] def conv_nd(n: int) -> Type[nn.Module]: """ From 7e41f4998e22df8d00c8fc6d378272b6569c5a8e Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 01:23:55 +0000 Subject: [PATCH 025/118] Fix: Convert Checkpoint --- .../models/hiera/convert_hiera_to_pytorch.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index d0294f12deab..76c86bcb0cbb 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -3,9 +3,9 @@ import requests import torch from PIL import Image -# from transformers.models.hiera.configuration_hiera import HieraConfig -# from transformers.models.hiera.hiera import HieraModel -# from transformers.models.hiera.hiera_image_processor import HieraImageProcessor +from transformers import HieraConfig +from transformers import HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor # from transformers import HieraConfig, HieraModel from torchvision import transforms from torchvision.transforms.functional import InterpolationMode @@ -199,11 +199,13 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...]) # 207: golden retriever (imagenet-1k) - out.argmax(dim=-1).item() + out.last_hidden_state.argmax(dim=-1).item() + # If you also want intermediate feature maps + out = model(inputs[None, ...], return_intermediates=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) + for x in out.intermediates: + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) From f47d06a960c74926ad002578edcd48a64af3fba3 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sat, 17 Feb 2024 07:38:00 +0000 Subject: [PATCH 026/118] added documentation for hiera --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 ++ docs/source/en/index.md | 1 + 14 files changed, 15 insertions(+) diff --git a/README.md b/README.md index 54e228a11502..b6ec0f083527 100644 --- a/README.md +++ b/README.md @@ -390,6 +390,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_de.md b/README_de.md index 71ff7ce4aa33..b98c4c08113c 100644 --- a/README_de.md +++ b/README_de.md @@ -385,6 +385,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_es.md b/README_es.md index b3c6845000d2..e5c596e70634 100644 --- a/README_es.md +++ b/README_es.md @@ -363,6 +363,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_fr.md b/README_fr.md index 4b87eba5bbe1..53d8612c8b94 100644 --- a/README_fr.md +++ b/README_fr.md @@ -384,6 +384,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Les Transformers sont-ils vraiment inefficaces pour la représentation graphique ?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT : la segmentation sémantique émerge de la supervision textuelle](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ : référentiel complet pour la compréhension du langage polonais](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (de Facebook) publié avec l'article [Hiera : un transformateur de vision hiérarchique sans cloches et sifflets]( https://arxiv.org/abs/2306.00989) par Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT : Apprentissage de la représentation autonome de la parole par prédiction masquée des unités cachées](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT : Quantification entière de BERT avec des entiers uniquement](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. 
**[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS : Un ensemble de données filtré à l'échelle du Web d'intercalation de documents texte-image](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_hd.md b/README_hd.md index e68d9d39ba62..1dd181b01b34 100644 --- a/README_hd.md +++ b/README_hd.md @@ -337,6 +337,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** ((फेसबुक से) पेपर के साथ जारी किया गया [हिरा: बेल्स-एंड-व्हिसल्स के बिना एक पदानुक्रमित विजन ट्रांसफार्मर](https://arxiv.org/abs/2306.00989) by चैतन्य रयाली, युआन-टिंग हू, डैनियल बोल्या, चेन वेई, हाओकी फैन, पो-याओ हुआंग, वैभव अग्रवाल, अर्कबंधु चौधरी, ओमिद पौरसीद, जूडी हॉफमैन, जितेंद्र मलिक, द्वारा यांगहाओ ली, क्रिस्टोफ़ फ़िचटेनहोफ़र 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ja.md b/README_ja.md index d314b07140f5..c2103ac6a2b3 100644 --- a/README_ja.md +++ b/README_ja.md @@ -397,6 +397,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook から) Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer から公開された研究論文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ko.md b/README_ko.md index f8679087ad17..bd781f6adf6d 100644 --- a/README_ko.md +++ b/README_ko.md @@ -312,6 +312,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. +1. 
**[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (Facebook 에서) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_pt-br.md b/README_pt-br.md index 684d96366aaf..65ff9fdc0f97 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -390,6 +390,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. 
**[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ru.md b/README_ru.md index e552b5cd4f90..4b01b6cf8060 100644 --- a/README_ru.md +++ b/README_ru.md @@ -381,6 +381,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_te.md b/README_te.md index 8da790e18204..3e69e473862e 100644 --- a/README_te.md +++ b/README_te.md @@ -383,6 +383,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hans.md b/README_zh-hans.md index 1832870d52ff..fc616751ee0e 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -336,6 +336,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (来自 Facebook) 伴随论文 [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) 由 Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hant.md b/README_zh-hant.md index 2bf31890f359..5adb28a3070b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -348,6 +348,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/Hiera)** (from Facebook) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index ff6e91dbcf25..4b59e76f5490 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -628,6 +628,8 @@
         title: CLAP
       - local: model_doc/encodec
         title: EnCodec
+      - local: model_doc/hiera
+        title: Hiera
       - local: model_doc/hubert
         title: Hubert
       - local: model_doc/mctct
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 34995edec39c..b26c9f91360c 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -155,6 +155,7 @@ Flax), PyTorch, and/or TensorFlow.
 | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ |
 | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ |
 | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ |
+| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ |
 | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ |
 | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ |
 | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ |

From 9d249e0933d54a255d7baf247762dfbb9b35dd38 Mon Sep 17 00:00:00 2001
From: Naman Garg
Date: Sat, 17 Feb 2024 07:38:31 +0000
Subject: [PATCH 027/118] added documentation for hiera

---
 docs/source/en/model_doc/hiera.md | 40 +++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 docs/source/en/model_doc/hiera.md

diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md
new file mode 100644
index 000000000000..1c46bae9b072
--- /dev/null
+++ b/docs/source/en/model_doc/hiera.md
@@ -0,0 +1,40 @@
+
+
+# Hiera
+
+## Overview
+
+Hiera was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer.
+
+The abstract from the paper is the following:
+
+Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy.
In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera.
+
+## HieraConfig
+
+[[autodoc]] HieraConfig
+
+
+
+
+## HieraModel
+
+[[autodoc]] HieraModel
+    - forward
+
+
+
\ No newline at end of file

From 7cff18690863c605e1ac180005ca2fff0dfb2050 Mon Sep 17 00:00:00 2001
From: Naman Garg
Date: Sat, 17 Feb 2024 07:39:18 +0000
Subject: [PATCH 028/118] added Docstrings to models, Transformers based changes

---
 src/transformers/__init__.py                  |   2 +
 .../models/auto/image_processing_auto.py      |   1 +
 src/transformers/models/hiera/__init__.py     |  24 ++--
 .../models/hiera/configuration_hiera.py       |  15 +++
 .../models/hiera/convert_hiera_to_pytorch.py  |  15 +++
 .../models/hiera/hiera_image_processor.py     |  14 +++
 src/transformers/models/hiera/hiera_mae.py    | 113 +-----------------
 src/transformers/models/hiera/hiera_model.py  |  89 +++++++++-----
 8 files changed, 124 insertions(+), 149 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 51771d7f2229..2e727a215038 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -4152,6 +4152,7 @@
         [
             "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST",
             "HieraModel",
+            "HieraPreTrainedModel"
         ]
     )

@@ -6995,6 +6996,7 @@
     from .models.hiera import (
         HIERA_PRETRAINED_MODEL_ARCHIVE_LIST,
         HieraModel,
+        HieraPreTrainedModel
     )
     from .models.ibert import (
         IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index aef894a425ba..5261753d202d 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -69,6 +69,7 @@
         ("git", "CLIPImageProcessor"),
         ("glpn", "GLPNImageProcessor"),
         ("groupvit", "CLIPImageProcessor"),
+        ("hiera", "HieraImageProcessor"),
         ("idefics", "IdeficsImageProcessor"),
         ("imagegpt", "ImageGPTImageProcessor"),
         ("instructblip", "BlipImageProcessor"),
diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py
index 2b83a4c8d693..0787bffe767e 100644
--- a/src/transformers/models/hiera/__init__.py
+++ b/src/transformers/models/hiera/__init__.py
@@ -1,3 +1,18 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
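# A minimal usage sketch for the Hiera classes wired up in this patch (see also the
# model doc above). The import surface (HieraConfig, HieraModel) and the forward-pass
# keywords mirror what convert_hiera_to_pytorch.py exercises; the config-based
# construction and the default configuration values are assumptions, not a fixed API.
import torch

from transformers import HieraConfig, HieraModel

config = HieraConfig()  # assumed: defaults describe a base-sized image model
model = HieraModel(config)  # assumed: standard config-based PreTrainedModel constructor
model.eval()

pixel_values = torch.randn(1, 3, 224, 224)  # one 224x224 RGB image
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)

# Intermediate feature maps, as exercised in the conversion script:
with torch.no_grad():
    outputs = model(pixel_values, return_intermediates=True)
for feature_map in outputs.intermediates:
    print(feature_map.shape)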
+ from typing import TYPE_CHECKING from ...utils import ( @@ -23,9 +38,7 @@ _import_structure["hiera_model"] = [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", - "Head", - "HieraBlock", - "MaskUnitAttention" + "HieraPreTrainedModel" "" ] @@ -45,10 +58,7 @@ from .hiera_model import ( HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, - Head, - HieraBlock, - MaskUnitAttention, - + HieraPreTrainedModel ) from .hiera_image_processor import ( HieraImageProcessor diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index e3133354f6ea..a4ab4fd9d30b 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -1,5 +1,20 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ hiera model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging from typing import Tuple diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 76c86bcb0cbb..5ca2ecd262d9 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import requests diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index d3f2ce96a64b..4e41e14bc6f8 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -1,3 +1,17 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Image processor class for Hiera.""" diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index f0e2e7854bff..d4ec15058b2d 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,8 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock -from .hiera_utils import pretrained_model, undo_windowing, conv_nd +from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: @@ -287,112 +286,4 @@ def forward( ) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask - - - - -# Image Models - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_tiny_224.pth", -}, default="mae_in1k") -def mae_hiera_tiny_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 7, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_small_224.pth", -}, default="mae_in1k") -def mae_hiera_small_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(1, 2, 11, 2), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_224.pth", -}, default="mae_in1k") -def mae_hiera_base_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=96, num_heads=1, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_224.pth", -}, default="mae_in1k") -def mae_hiera_base_plus_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_224.pth", -}, default="mae_in1k") -def mae_hiera_large_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - -@pretrained_model({ - "mae_in1k": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_224.pth", -}, default="mae_in1k") -def mae_hiera_huge_224(**kwargs): - return MaskedAutoencoderHiera( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), q_pool=2, **kwargs, - ) - - - -# Video Models - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_16x224.pth", -}, default="mae_k400") -def mae_hiera_base_16x224(num_classes: int = 400, **kwdargs): - return MaskedAutoencoderHiera( - num_classes=num_classes, # K400 has 400 classes - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_pos_embed=True, - q_pool=2, - **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_base_plus_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_base_plus_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=112, num_heads=2, stages=(2, 3, 16, 3), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_large_16x224.pth", -}, default="mae_k400") -@pretrained_model(None) -def mae_hiera_large_16x224(**kwdargs): - return mae_hiera_base_16x224( - 
embedding_dimention=144, num_heads=2, stages=(2, 6, 36, 4), **kwdargs - ) - - -@pretrained_model({ - "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", -}, default="mae_k400") -def mae_hiera_huge_16x224(**kwdargs): - return mae_hiera_base_16x224( - embedding_dimention=256, num_heads=4, stages=(2, 6, 36, 4), **kwdargs - ) + return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index 5e7493e3c6a7..b1ed0db0e4b9 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -271,10 +271,6 @@ class HieraModelOutput(ModelOutput): Args: last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - attentions (Tuple[torch.FloatTensor], optional, returned when output_attentions=True): - Attentions weights from the model, one for each layer. - hidden_states (Tuple[torch.FloatTensor], optional, returned when output_hidden_states=True): - Hidden states of the model at the output of each layer. intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ @@ -422,10 +418,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x - +@add_start_docstrings(""" +Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). +""") class PatchEmbedding(nn.Module): - """Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d).""" - def __init__( self, dim_in: int, @@ -453,27 +449,49 @@ def forward( embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings - -class HieraModel(PreTrainedModel): +class HieraPreTrainedModel(PreTrainedModel): """ - Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True - This model is a PyTorch implementation of the Hiera architecture for image classification. + def _init_weights(self, module, init_bias=0.02): + if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(module.weight, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + nn.init.constant_(module.bias, init_bias) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, init_bias) + nn.init.constant_(module.weight, 1.0) - The model can be used as follows: - Args: - config (HieraConfig): Configuration class instance for `Hiera`. - Example usage: - >>> from your_model_file import Hiera, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - >>> model = Hiera(config) - >>> inputs = torch.rand((1, 3, 224, 224)) - >>> outputs = model(inputs) - """ +@add_start_docstrings(""" +Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + +This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. 
+ +The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. +Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + +Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """) +class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -601,14 +619,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - def _init_weights(self, m, init_bias=0.02): - if isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): - nn.init.trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, init_bias) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, init_bias) - nn.init.constant_(m.weight, 1.0) @torch.jit.ignore def no_weight_decay(self): @@ -655,6 +665,25 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings + @add_start_docstrings_to_model_forward(""" + The forward pass for the Hiera model. + + Args: + pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. + + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. + + + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. + + return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + + + + """) + @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") def forward( self, pixel_values: torch.Tensor, @@ -663,8 +692,6 @@ def forward( return_intermediates: bool = False, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: """ - mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. - Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. 
""" # Slowfast training passes in a list if isinstance(pixel_values, list): From c4a4168783bbd9f38b08e30935141e78a282d91f Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:55:51 +0000 Subject: [PATCH 029/118] make style and quality --- src/transformers/__init__.py | 15 +-- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/hiera/__init__.py | 19 +--- .../models/hiera/configuration_hiera.py | 18 ++-- .../models/hiera/convert_hiera_to_pytorch.py | 102 +++++++++--------- .../models/hiera/hiera_image_processor.py | 51 ++++----- src/transformers/models/hiera/hiera_mae.py | 54 +++------- 7 files changed, 104 insertions(+), 161 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2e727a215038..4d7ef6ce20d3 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -497,7 +497,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], - "models.hiera":["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP","HieraConfig"], + "models.hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], "models.idefics": [ @@ -4149,12 +4149,7 @@ ] ) _import_structure["models.hiera"].extend( - [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - - ] + ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] ) _import_structure["models.hubert"].extend( [ @@ -6986,6 +6981,7 @@ GroupViTTextModel, GroupViTVisionModel, ) + from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, @@ -6993,11 +6989,6 @@ HubertModel, HubertPreTrainedModel, ) - from .models.hiera import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) from .models.ibert import ( IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, IBertForMaskedLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 796e524fd0cf..6f824a2e955d 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -116,8 +116,8 @@ ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), + ("hiera", "HieraConfig"), ("hubert", "HubertConfig"), - ("hiera","HieraConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), @@ -352,8 +352,8 @@ ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hiera", "HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("hiera","HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera","HieraModel"), + ("hiera", "HieraModel"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 0787bffe767e..fcffbbf7593e 100644 --- a/src/transformers/models/hiera/__init__.py +++ 
b/src/transformers/models/hiera/__init__.py @@ -35,12 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = [ - "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", - "HieraModel", - "HieraPreTrainedModel" - "" - ] + _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] if TYPE_CHECKING: @@ -55,16 +50,10 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera_model import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, - HieraPreTrainedModel - ) - from .hiera_image_processor import ( - HieraImageProcessor - ) + from .hiera_image_processor import HieraImageProcessor + from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index a4ab4fd9d30b..8d40e7a72777 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -15,15 +15,15 @@ """ hiera model configuration""" +from typing import Tuple + from ...configuration_utils import PretrainedConfig from ...utils import logging -from typing import Tuple -logger = logging.get_logger(__name__) -HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { +logger = logging.get_logger(__name__) -} +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class HieraConfig(PretrainedConfig): @@ -42,7 +42,7 @@ class HieraConfig(PretrainedConfig): embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. num_classes (int, optional): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. + stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. @@ -58,7 +58,7 @@ class HieraConfig(PretrainedConfig): head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. - + Example: ```python >>> from transformers import HieraConfig, HieraModel @@ -72,9 +72,10 @@ class HieraConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ``` - """ + """ model_type = "hiera" + def __init__( self, input_size: Tuple[int, ...] 
= (224, 224), @@ -99,7 +100,6 @@ def __init__( head_init_scale: float = 0.001, sep_position_embeddings: bool = False, **kwargs, - ): super().__init__(**kwargs) self.input_size = input_size @@ -121,4 +121,4 @@ def __init__( self.drop_path_rate = drop_path_rate self.head_dropout = head_dropout self.head_init_scale = head_init_scale - self.sep_position_embeddings = sep_position_embeddings \ No newline at end of file + self.sep_position_embeddings = sep_position_embeddings diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 5ca2ecd262d9..794a62147d78 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -15,17 +15,11 @@ import argparse -import requests import torch -from PIL import Image -from transformers import HieraConfig -from transformers import HieraModel -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor -# from transformers import HieraConfig, HieraModel -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +# from transformers import HieraConfig, HieraModel +from transformers import HieraConfig, HieraModel +from transformers.models.hiera.hiera_image_processor import HieraImageProcessor def rename_key(name): @@ -51,7 +45,7 @@ def convert_state_dict(orig_state_dict, config): return updated_model_state -def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): +def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs): strict = True pretrained_models_links = { "hiera_tiny_224": { @@ -93,21 +87,24 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): "hiera_huge_16x224": { "mae_k400_ft_k400": "https://dl.fbaipublicfiles.com/hiera/hiera_huge_16x224.pth", "mae_k400": "https://dl.fbaipublicfiles.com/hiera/mae_hiera_huge_16x224.pth", - } + }, } - if "hiera_tiny_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 7, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2), + ) checkpoints = pretrained_models_links["hiera_tiny_224"] checkpoint = pretrained_models_links["hiera_tiny_224"]["mae_in1k_ft_in1k"] elif "hiera_small_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 11, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2), + ) checkpoints = pretrained_models_links["hiera_small_224"] checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] @@ -118,56 +115,57 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] elif "hiera_base_plus_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3),) + config = HieraConfig( + embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3), + ) checkpoints = pretrained_models_links["hiera_base_plus_224"] checkpoint = pretrained_models_links["hiera_base_plus_224"]["mae_in1k_ft_in1k"] elif "hiera_large_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4),) + config = HieraConfig( + embedding_dimension=144, + 
number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_224"] checkpoint = pretrained_models_links["hiera_large_224"]["mae_in1k_ft_in1k"] elif "hiera_huge_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4)) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_224"] checkpoint = pretrained_models_links["hiera_huge_224"]["mae_in1k_ft_in1k"] elif "hiera_base_16x224" in checkpoint_url: - config = HieraConfig(num_classes=num_classes, # Assuming num_classes is defined elsewhere - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True,) + config = HieraConfig( + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True, + ) checkpoints = pretrained_models_links["hiera_base_16x224"] checkpoint = pretrained_models_links["hiera_base_16x224"]["mae_k400_ft_k400"] elif "hiera_base_plus_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3)) + config = HieraConfig(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3)) checkpoints = pretrained_models_links["hiera_base_plus_16x224"] checkpoint = pretrained_models_links["hiera_base_plus_16x224"]["mae_k400_ft_k400"] elif "hiera_large_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4), ) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) checkpoints = pretrained_models_links["hiera_large_16x224"] checkpoint = pretrained_models_links["hiera_large_16x224"]["mae_k400_ft_k400"] elif "hiera_huge_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4) ) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) checkpoints = pretrained_models_links["hiera_huge_16x224"] checkpoint = pretrained_models_links["hiera_huge_16x224"]["mae_k400_ft_k400"] elif checkpoint not in checkpoints: @@ -181,7 +179,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): raise RuntimeError("No checkpoint specified.") state_dict = torch.hub.load_state_dict_from_url(checkpoint, map_location="cpu") - state_dict["model_state"] = convert_state_dict(state_dict["model_state"],{}) + state_dict["model_state"] = convert_state_dict(state_dict["model_state"], {}) if "head.projection.weight" in state_dict["model_state"]: # Set the number of classes equal to the state_dict only if the user doesn't want to overwrite it if config.num_classes is None: @@ -194,19 +192,16 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): model = HieraModel(config=config) if pretrained: # Disable being strict when trying to load a encoder-decoder model into an encoder-only model - if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr(model, "decoder_position_embeddings"): + if "decoder_position_embeddings" in state_dict["model_state"] and not hasattr( + model, "decoder_position_embeddings" + ): strict = False - model.load_state_dict(state_dict["model_state"]) + model.load_state_dict(state_dict["model_state"], 
strict) # model.load_state_dict(state_dict["model_state"], strict=strict) - - - url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - - image_processor = HieraImageProcessor(size=224) inputs = image_processor.process_image(image_url=url) @@ -220,7 +215,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): out = model(inputs[None, ...], return_intermediates=True) for x in out.intermediates: - print(x.shape) + print(x.shape) print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) @@ -231,4 +226,3 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path,**kwargs): checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="~/") - diff --git a/src/transformers/models/hiera/hiera_image_processor.py b/src/transformers/models/hiera/hiera_image_processor.py index 4e41e14bc6f8..0200687c4835 100644 --- a/src/transformers/models/hiera/hiera_image_processor.py +++ b/src/transformers/models/hiera/hiera_image_processor.py @@ -15,32 +15,18 @@ """Image processor class for Hiera.""" -from typing import Dict, List, Optional, Union -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import rescale, resize, to_channel_dimension_format -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from ...utils import TensorType, is_vision_available, logging -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from PIL import Image import requests +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +from ...image_processing_utils import BaseImageProcessor +from ...utils import is_vision_available, logging if is_vision_available(): - import PIL + from PIL import Image + from torchvision import transforms + from torchvision.transforms.functional import InterpolationMode logger = logging.get_logger(__name__) @@ -51,20 +37,23 @@ def __init__(self, size): self.size = size self.transform_list = [ transforms.Resize(int((256 / 224) * self.size), interpolation=InterpolationMode.BICUBIC), - transforms.CenterCrop(self.size) + transforms.CenterCrop(self.size), ] self.transform_vis = transforms.Compose(self.transform_list) - self.transform_norm = transforms.Compose(self.transform_list + [ - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ]) - + self.transform_norm = transforms.Compose( + self.transform_list + + [ + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ] + ) + def process_image(self, image_url): # Load the image img = Image.open(requests.get(image_url, stream=True).raw) - + # Apply transformations - img_vis = self.transform_vis(img) + # img_vis = self.transform_vis(img) img_norm = self.transform_norm(img) - - return img_norm \ No newline at end of file + + return img_norm diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index d4ec15058b2d..56b91bc7acb7 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ 
-10,28 +10,28 @@ # -------------------------------------------------------- +import math from functools import partial -from typing import Tuple, Optional +from typing import Optional, Tuple -import math import torch import torch.nn as nn -from .hiera_model import HieraModel, HieraBlock, undo_windowing, conv_nd +from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: if isinstance(head, nn.Identity): return x - batch_size , num_mask_units = x.shape[0:2] + batch_size, num_mask_units = x.shape[0:2] # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) + x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(batch_size , num_mask_units, *x.shape[2:], x.shape[1]) + x = x.permute(permute).reshape(batch_size, num_mask_units, *x.shape[2:], x.shape[1]) return x @@ -64,8 +64,7 @@ def __init__( i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) ] self.tokens_spatial_shape_final = [ - i // s ** (self.q_pool) - for i, s in zip(self.tokens_spatial_shape, self.q_stride) + i // s ** (self.q_pool) for i, s in zip(self.tokens_spatial_shape, self.q_stride) ] # -------------------------------------------------------------------------- # Multi-scale fusion heads @@ -73,9 +72,7 @@ def __init__( self.multi_scale_fusion_heads = nn.ModuleList() for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool - kernel = [ - i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final) - ] + kernel = [i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final)] curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] self.multi_scale_fusion_heads.append( conv_nd(len(self.q_stride))( @@ -94,9 +91,7 @@ def __init__( self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) self.decoder_pos_embed = nn.Parameter( - torch.zeros( - 1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim - ) + torch.zeros(1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim) ) self.decoder_blocks = nn.ModuleList( @@ -113,9 +108,7 @@ def __init__( ) self.decoder_norm = norm_layer(decoder_embed_dim) - self.pred_stride = patch_stride[-1] * ( - self.q_stride[-1] ** self.q_pool - ) # patch stride of prediction + self.pred_stride = patch_stride[-1] * (self.q_stride[-1] ** self.q_pool) # patch stride of prediction self.decoder_pred = nn.Linear( decoder_embed_dim, @@ -143,9 +136,7 @@ def _mae_init_weights(self, m: nn.Module): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) - def get_pixel_label_2d( - self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_2d(self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* input_img = input_img.permute(0, 2, 3, 1) @@ -160,13 +151,11 @@ def get_pixel_label_2d( return label - def get_pixel_label_3d( - self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True - ) -> torch.Tensor: + def get_pixel_label_3d(self, input_vid: torch.Tensor, mask: torch.Tensor, norm: 
bool = True) -> torch.Tensor: # mask (boolean tensor): True must correspond to *masked* # We use time strided loss, only take the first frame from each token - input_vid = input_vid[:, :, ::self.patch_stride[0], :, :] + input_vid = input_vid[:, :, :: self.patch_stride[0], :, :] size = self.pred_stride label = input_vid.unfold(3, size, size).unfold(4, size, size) @@ -181,11 +170,9 @@ def get_pixel_label_3d( return label - def forward_encoder( self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor]: - if mask is None: mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] @@ -203,9 +190,7 @@ def forward_encoder( return x, mask - def forward_decoder( - self, x: torch.Tensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + def forward_decoder(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # Embed tokens x = self.decoder_embed(x) @@ -214,9 +199,7 @@ def forward_decoder( # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] # mask: [batch_size , #MUs_all] x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) - mask_tokens = self.mask_token.view( - (1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,) - ) + mask_tokens = self.mask_token.view((1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,)) mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() x_dec[mask] = x.flatten() @@ -279,11 +262,8 @@ def forward( mask_ratio: float = 0.6, mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) - pred, pred_mask = self.forward_decoder( - latent, mask - ) # pred_mask is mask at resolution of *prediction* + pred, pred_mask = self.forward_decoder(latent, mask) # pred_mask is mask at resolution of *prediction* # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask \ No newline at end of file + return *self.forward_loss(x, pred, ~pred_mask), mask From 01e46628895b0e5643fa7a1d60211e5d6e0b32bc Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 18 Feb 2024 06:56:38 +0000 Subject: [PATCH 030/118] make style and quality --- src/transformers/models/hiera/hiera_model.py | 128 ++++++++----------- 1 file changed, 56 insertions(+), 72 deletions(-) diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/hiera_model.py index b1ed0db0e4b9..9345084769ec 100644 --- a/src/transformers/models/hiera/hiera_model.py +++ b/src/transformers/models/hiera/hiera_model.py @@ -19,29 +19,29 @@ # -------------------------------------------------------- import math +from dataclasses import dataclass from functools import partial -from typing import List, Tuple, Callable, Optional, Union, Type -from .configuration_hiera import HieraConfig +from typing import Callable, List, Optional, Tuple, Type, Union + import torch import torch.nn as nn import torch.nn.functional as F -from dataclasses import dataclass - from timm.models.layers import DropPath, Mlp + from ...modeling_utils import PreTrainedModel -from ...modeling_outputs import BaseModelOutput from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, ) +from .configuration_hiera import HieraConfig + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "", + 
"https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", ] + def conv_nd(n: int) -> Type[nn.Module]: """ Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. @@ -67,9 +67,7 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso return mask -def do_masked_conv( - x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None -) -> torch.Tensor: +def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None) -> torch.Tensor: """Zero-out the masked regions of the input before conv. Prevents leakage of masked regions when using overlapping kernels. """ @@ -82,9 +80,7 @@ def do_masked_conv( return conv(x * mask.bool()) -def undo_windowing( - x: torch.Tensor, shape: List[int], mu_shape: List[int] -) -> torch.Tensor: +def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: """ Restore spatial organization by undoing windowed organization of mask units. @@ -116,7 +112,6 @@ def undo_windowing( return x - class Unroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. @@ -169,9 +164,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Move the patch stride into the batch dimension # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] L = len(new_shape) - permute = ( - [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - ) + permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] x = x.permute(permute) # Now finally flatten the relevant dims into the batch dimension @@ -210,9 +203,7 @@ def __init__( size = [n // s for n, s in zip(size, unroll_schedule[0])] unroll_schedule = unroll_schedule[1:] - def forward( - self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None - ) -> torch.Tensor: + def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: """ Roll the given tensor back up to spatial order assuming it's from the given block. @@ -269,11 +260,12 @@ class HieraModelOutput(ModelOutput): Base class for HieraModel model's outputs, conforming to Hugging Face's ModelOutput. Args: - last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): + last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)): Last layer hidden-states. - intermediates (List[torch.Tensor], optional): + intermediates (List[torch.Tensor], optional): Intermediate representations or features from the model, if applicable. """ + last_hidden_state: torch.FloatTensor intermediates: Optional[List[torch.Tensor]] = None @@ -320,15 +312,13 @@ def __init__( self.use_mask_unit_attention = use_mask_unit_attention def forward(self, embeddings: torch.Tensor) -> torch.Tensor: - """ Input should be of shape [batch, tokens, channels]. 
""" - batch_size , num_channels , _ = embeddings.shape - num_windows = ( - (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 - ) + """Input should be of shape [batch, tokens, channels].""" + batch_size, num_channels, _ = embeddings.shape + num_windows = (num_channels // (self.q_stride * self.window_size)) if self.use_mask_unit_attention else 1 qkv = ( self.qkv(embeddings) - .reshape(batch_size , -1, num_windows, 3, self.number_of_heads, self.head_dim) + .reshape(batch_size, -1, num_windows, 3, self.number_of_heads, self.head_dim) .permute(3, 0, 4, 2, 1, 5) ) q, k, v = qkv[0], qkv[1], qkv[2] @@ -336,7 +326,7 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: if self.q_stride > 1: # Refer to Unroll to see how this performs a maxpool-Nd q = ( - q.view(batch_size , self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) + q.view(batch_size, self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) @@ -347,9 +337,9 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: else: attention = (q * self.scale) @ k.transpose(-1, -2) attention = attention.softmax(dim=-1) - embeddings = (attention @ v) + embeddings = attention @ v - embeddings = embeddings.transpose(1, 3).reshape(batch_size , -1, self.output_dim) + embeddings = embeddings.transpose(1, 3).reshape(batch_size, -1, self.output_dim) embeddings = self.projection(embeddings) return embeddings @@ -418,9 +408,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x -@add_start_docstrings(""" + +@add_start_docstrings( + """ Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). -""") +""" +) class PatchEmbedding(nn.Module): def __init__( self, @@ -442,18 +435,18 @@ def __init__( padding=padding, ) - def forward( - self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: embeddings = do_masked_conv(pixel_values, self.projection, mask) embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) return embeddings + class HieraPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" @@ -469,9 +462,8 @@ def _init_weights(self, module, init_bias=0.02): nn.init.constant_(module.weight, 1.0) - - -@add_start_docstrings(""" +@add_start_docstrings( + """ Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. @@ -482,7 +474,7 @@ def _init_weights(self, module, init_bias=0.02): config ([`HieraConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
- + Example usage: >>> from your_model_file import Hiera, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) @@ -490,7 +482,8 @@ def _init_weights(self, module, init_bias=0.02): >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) - """) + """ +) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig base_model_prefix = "hiera" @@ -531,9 +524,7 @@ def __init__(self, config: HieraConfig): assert self.q_pool < len(self.stages) self.q_pool, self.q_stride = self.q_pool, self.q_stride self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size - self.mask_spatial_shape = [ - i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size) - ] + self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size)] self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] self.patch_embedding = PatchEmbedding( @@ -555,9 +546,7 @@ def __init__(self, config: HieraConfig): self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules - self.unroll = Unroll( - self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1]) - ) + self.unroll = Unroll(self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1])) self.reroll = Reroll( self.input_size, self.patch_stride, @@ -566,7 +555,7 @@ def __init__(self, config: HieraConfig): self.q_pool, ) # q_pool locations - q_pool_blocks = [x + 1 for x in self.stage_ends[:self.q_pool]] + q_pool_blocks = [x + 1 for x in self.stage_ends[: self.q_pool]] # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, depth)] @@ -619,7 +608,6 @@ def __init__(self, config: HieraConfig): self.head.projection.bias.data.mul_(self.head_init_scale) self.post_init() - @torch.jit.ignore def no_weight_decay(self): if self.sep_position_embeddings: @@ -632,21 +620,19 @@ def get_random_mask(self, x: torch.Tensor, mask_ratio: float) -> torch.Tensor: Generates a random mask, mask_ratio fraction are dropped. 1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc. """ - batch_size = x.shape[0] + batch_size = x.shape[0] # Tokens selected for masking at mask unit level num_windows = math.prod(self.mask_spatial_shape) # num_mask_units len_keep = int(num_windows * (1 - mask_ratio)) - noise = torch.rand(batch_size , num_windows, device=x.device) + noise = torch.rand(batch_size, num_windows, device=x.device) # Sort noise for each sample - ids_shuffle = torch.argsort( - noise, dim=1 - ) # ascend: small is keep, large is remove + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove ids_restore = torch.argsort(ids_shuffle, dim=1) # Generate the binary mask: 1 is *keep*, 0 is *remove* # Note this is opposite to original MAE - mask = torch.zeros([batch_size , num_windows], device=x.device) + mask = torch.zeros([batch_size, num_windows], device=x.device) mask[:, :len_keep] = 1 # Unshuffle to get the binary mask mask = torch.gather(mask, dim=1, index=ids_restore) @@ -665,34 +651,34 @@ def get_position_embeddings(self) -> torch.Tensor: else: return self.position_embeddings - @add_start_docstrings_to_model_forward(""" + @add_start_docstrings_to_model_forward( + """ The forward pass for the Hiera model. Args: pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. 
- + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. - - - - """) - @replace_return_docstrings(output_type=HieraModelOutput,config_class="HieraConfig") + + + + """ + ) def forward( self, pixel_values: torch.Tensor, mask: torch.Tensor = None, return_dict: Optional[bool] = True, - return_intermediates: bool = False, + return_intermediates: bool = True, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - """ - """ + """ """ # Slowfast training passes in a list if isinstance(pixel_values, list): pixel_values = pixel_values[0] @@ -700,9 +686,7 @@ def forward( pached_embeddings = self.patch_embedding( pixel_values, - mask=mask.view( - pixel_values.shape[0], 1, *self.mask_spatial_shape - ) # batch_size , C, *mask_spatial_shape + mask=mask.view(pixel_values.shape[0], 1, *self.mask_spatial_shape) # batch_size , C, *mask_spatial_shape if mask is not None else None, ) @@ -732,8 +716,8 @@ def forward( # intermediates[-1] is embeddings in spatial order if not return_dict: return tuple(v for v in [embeddings, intermediates] if v is not None) - + return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) \ No newline at end of file + ) From cc2c623197701c7371a00b7119d10928c5eae8a4 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Mon, 26 Feb 2024 23:11:01 +0000 Subject: [PATCH 031/118] Integration & Block tests running --- tests/models/hiera/test_modeling_hiera.py | 265 +++++++++++++++++++--- 1 file changed, 235 insertions(+), 30 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 8d593af2a622..72badde557df 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -15,7 +15,8 @@ """ Testing suite for the PyTorch Hiera model. """ import unittest - +from typing import Tuple +from transformers.models.hiera.hiera_model import HieraBlock from transformers import HieraConfig from transformers.testing_utils import ( require_torch, @@ -23,65 +24,269 @@ torch_device, ) from transformers.utils import is_torch_available - +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD if is_torch_available(): import torch from transformers import HieraModel - # Assuming HIERA_PRETRAINED_MODEL_ARCHIVE_LIST is defined somewhere for your model - from transformers.models.hiera.configuration_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST - - + from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + from torchvision.transforms.functional import InterpolationMode + from torchvision import transforms + from PIL import Image +import math class HieraModelTester: - # Define this tester to initialize Hiera model and its configurations for testing def __init__( self, parent, - batch_size=8, - num_channels=3, - image_size=224, - # Add other model-specific parameters here + input_size: Tuple[int, ...] 
= (224, 224), + in_chans: int = 3, + embedding_dimension: int = 96, # initial embedding input_dim + number_of_heads: int = 1, # initial number of number_of_heads + num_classes: int = 1000, + stages: Tuple[int, ...] = (2, 3, 16, 3), + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, ...] = (2, 2), + mask_unit_size: Tuple[int, ...] = (8, 8), # must divide q_stride ** (#stages-1) + # mask_unit_attn: which stages use mask unit attention? + mask_unit_attn: Tuple[bool, ...] = (True, True, False, False), + dim_mul: float = 2.0, + head_mul: float = 2.0, + patch_kernel: Tuple[int, ...] = (7, 7), + patch_stride: Tuple[int, ...] = (4, 4), + patch_padding: Tuple[int, ...] = (3, 3), + mlp_ratio: float = 4.0, + drop_path_rate: float = 0.0, + head_dropout: float = 0.0, + head_init_scale: float = 0.001, + sep_position_embeddings: bool = False, ): self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - # Initialize other necessary attributes here + self.input_size = input_size + self.in_chans = in_chans + self.embedding_dimension = embedding_dimension + self.number_of_heads = number_of_heads + self.num_classes = num_classes + self.stages = stages + self.q_pool = q_pool + self.q_stride = q_stride + self.mask_unit_size = mask_unit_size + self.mask_unit_attn = mask_unit_attn + self.dim_mul = dim_mul + self.head_mul = head_mul + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.drop_path_rate = drop_path_rate + self.head_dropout = head_dropout + self.head_init_scale = head_init_scale + self.sep_position_embeddings = sep_position_embeddings - def prepare_config_and_inputs(self): + def prepare_config_and_inputs(self,checkpoint_url): # Prepare configuration and inputs for testing your model - pixel_values = torch.rand((self.batch_size, self.num_channels, self.image_size, self.image_size), device=torch_device) + pixel_values = torch.rand((1, self.in_chans, self.input_size[0], self.input_size[1])) - config = self.get_config() + config = self.get_config(checkpoint_url=checkpoint_url) return config, pixel_values - def get_config(self): - return HieraConfig( - # Define necessary configuration parameters here - ) + def get_config(self,checkpoint_url): + if "hiera_tiny_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2),) + + elif "hiera_small_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2),) + + elif "hiera_base_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), ) + + + elif "hiera_base_plus_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3),) + + elif "hiera_large_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4),) + + elif "hiera_huge_224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4)) + + elif "hiera_base_16x224" in checkpoint_url: + config = HieraConfig(num_classes=self.num_classes, + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True,) + + elif "hiera_base_plus_16x224" in checkpoint_url: + config = 
HieraConfig(embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3)) + + elif "hiera_large_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), ) + + elif "hiera_huge_16x224" in checkpoint_url: + config = HieraConfig(embedding_dimension=256, + number_of_heads=4, + stages=(2, 6, 36, 4) ) + else: + raise RuntimeError(f"Invalid checkpoint url ({checkpoint_url})") + + return config def create_and_check_model(self, config, pixel_values): + batch_size = 1 model = HieraModel(config=config) - model.to(torch_device) + num_patches = int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1)**2 + flat_q_stride = math.prod(self.q_stride) + embedding_dimension = self.embedding_dimension + indermediate_shapes = [] + for _ in self.stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 model.eval() with torch.no_grad(): result = model(pixel_values=pixel_values) - # Perform checks here, e.g., output shapes, etc. - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_attention_heads, self.seq_length, self.hidden_size)) + + for idx, x in enumerate(result.intermediates): + self.parent.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") @require_torch -class HieraModelTest(unittest.TestCase): +class HieraModelTest(): def setUp(self): self.model_tester = HieraModelTester(self) def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config_and_inputs = self.model_tester.prepare_config_and_inputs(model_name) + self.model_tester.create_and_check_model(*config_and_inputs) - @slow + # @slow def test_model_from_pretrained(self): for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = HieraModel.from_pretrained(model_name) - self.assertIsNotNone(model) \ No newline at end of file + self.assertIsNotNone(model) + +@require_torch +@slow +class HieraModelIntegrationTest(unittest.TestCase): + def test_forward(self): + torch_device = "cpu" + input_size = 224 + batch_size =1 + patch_kernel = (7,7) + patch_padding = (3,3) + patch_stride = (4,4) + q_stride = (2,2) + flat_q_stride = math.prod(q_stride) + stages=(2, 3, 16, 3) + embedding_dimension = 96 + model = HieraModel.from_pretrained("/home/ubuntu/home/hiera/model/") + model.to(torch_device) + + random_tensor = torch.rand(batch_size, 3, input_size, input_size) + num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1)**2 + + indermediate_shapes = [] + for _ in stages: + indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) + num_patches = num_patches/flat_q_stride + embedding_dimension = embedding_dimension * 2 + out = model(random_tensor) + + out.last_hidden_state.argmax(dim=-1).item() + + out = model(random_tensor, return_intermediates=True) + for idx, x in enumerate(out.intermediates): + self.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") + +class TestHieraBlock(unittest.TestCase): + def test_output_shape(self): + batch_size, input_dim, output_dim = 1, 96, 192 + number_of_heads = 2 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 4 + 
window_size = 16 + use_mask_unit_attention = True + num_patches = 3136 + + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches/q_stride, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect") + + def test_input_output_dim_equality(self): + batch_size, input_dim, output_dim = 1, 96, 96 + number_of_heads = 1 + mlp_ratio = 4.0 + drop_path = 0.0 + q_stride = 1 + window_size = 64 + use_mask_unit_attention = True + num_patches = 3136 + block = HieraBlock( + input_dim=input_dim, + output_dim=output_dim, + number_of_heads=number_of_heads, + mlp_ratio=mlp_ratio, + drop_path=drop_path, + q_stride=q_stride, + window_size=window_size, + use_mask_unit_attention=use_mask_unit_attention + ) + + # Create a dummy input + x = torch.randn(batch_size, num_patches,input_dim) + + # Forward pass + out = block(x) + + # Check the shape of the output + expected_shape = (batch_size, num_patches, output_dim) + self.assertEqual(out.shape, expected_shape, "Output shape is incorrect. Input shape should be equal to output shape") + + +if __name__ == '__main__': + test = HieraModelIntegrationTest() + test.test_forward() + block_test = TestHieraBlock() + block_test.test_output_shape() + block_test.test_input_output_dim_equality() + model_test = HieraModelTest() + model_test.setUp() + model_test.test_model() + model_test.test_model_from_pretrained() From f172b7490835cdebad3ebe3e1fd8fe6940aef728 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 06:04:01 +0000 Subject: [PATCH 032/118] Fixed bugs --- src/transformers/__init__.py | 6 +++++- src/transformers/models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/hiera/__init__.py | 8 ++++++-- src/transformers/models/hiera/hiera_mae.py | 2 +- .../models/hiera/{hiera_model.py => modeling_hiera.py} | 0 6 files changed, 14 insertions(+), 5 deletions(-) rename src/transformers/models/hiera/{hiera_model.py => modeling_hiera.py} (100%) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4d7ef6ce20d3..9d668babbec2 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6981,7 +6981,11 @@ GroupViTTextModel, GroupViTVisionModel, ) - from .models.hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, + HieraPreTrainedModel, + ) from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6f824a2e955d..10511e2ff47e 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -590,7 +590,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), - ("hiera", "HieraModel"), + ("hiera", "Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0fc417e795e4..fb4d571632a4 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -501,6 +501,7 @@ ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), ("glpn", "GLPNModel"), + ("hiera", "HieraModel"), ("imagegpt", "ImageGPTModel"), ("levit", "LevitModel"), ("mobilenet_v1", "MobileNetV1Model"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index fcffbbf7593e..d32f0a934fea 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -35,7 +35,11 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["hiera_model"] = ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel "] + _import_structure["modeling_hiera"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraModel", + "HieraPreTrainedModel " + ] if TYPE_CHECKING: @@ -51,7 +55,7 @@ pass else: from .hiera_image_processor import HieraImageProcessor - from .hiera_model import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel else: import sys diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py index 56b91bc7acb7..7c42c22734a1 100644 --- a/src/transformers/models/hiera/hiera_mae.py +++ b/src/transformers/models/hiera/hiera_mae.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from .hiera_model import HieraBlock, HieraModel, conv_nd, undo_windowing +from .modeling_hiera import HieraBlock, HieraModel, conv_nd, undo_windowing def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: diff --git a/src/transformers/models/hiera/hiera_model.py b/src/transformers/models/hiera/modeling_hiera.py similarity index 100% rename from src/transformers/models/hiera/hiera_model.py rename to src/transformers/models/hiera/modeling_hiera.py From 35b3720aa89ad9711df9fc38abc75b2a93c0d012 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:41:40 +0000 Subject: [PATCH 033/118] Removed tim dependency --- .../models/hiera/modeling_hiera.py | 92 ++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 9345084769ec..f463834a437b 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -18,15 +18,16 @@ # timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm # -------------------------------------------------------- +import collections.abc import math from dataclasses import dataclass from functools import partial +from itertools import repeat from typing import Callable, List, Optional, Tuple, Type, Union import torch import torch.nn as nn import torch.nn.functional as F -from timm.models.layers import DropPath, Mlp from ...modeling_utils import PreTrainedModel from ...utils import ( @@ -38,7 +39,7 @@ HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth", + "namangarg110/hiera_base_224", ] @@ -112,6 +113,93 @@ def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> to return x +# Copied from transformers.models.swin.modeling_swin.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual 
blocks). + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f"drop_prob={round(self.drop_prob,3):0.3f}" + + +# Copied from timm.layers.helpers +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + + +# Copied from timm.layers.mlp +class Mlp(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0.0, + use_conv=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + + self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + class Unroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. 
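The patch above removes the `timm` dependency by inlining `drop_path`/`DropPath` (copied from Swin) and a minimal `Mlp`. The stochastic-depth helper draws one Bernoulli sample per batch element, zeroes the dropped samples, and rescales the survivors by `1 / keep_prob` so the expected activation is unchanged; at eval time (or with `drop_prob == 0`) it is a no-op. A small self-contained sketch of that behaviour — the helper is restated here only for illustration, and the shapes and the 0.5 rate are arbitrary:

```python
import torch


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    # Same logic as the Swin-derived helper added above: identity at eval time or when drop_prob == 0.
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dimensions.
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = (keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)).floor_()
    return input.div(keep_prob) * random_tensor


x = torch.ones(4, 3, 8, 8)
out = drop_path(x, drop_prob=0.5, training=True)
# Each sample is either all zeros or rescaled by 1 / 0.5 = 2, so the expectation stays at 1.
print(out.view(4, -1).amax(dim=1))  # e.g. tensor([2., 0., 2., 2.])
print(drop_path(x, drop_prob=0.5, training=False).equal(x))  # True: no-op outside training
```

The inlined `Mlp` keeps the timm layout (`fc1 → GELU → dropout → optional norm → fc2 → dropout`), with `to_2tuple` expanding scalar `bias`/`drop` arguments into per-layer pairs.
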
From 5f90a2559d1807d710c2fcea15678da7ff9054df Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:42:06 +0000 Subject: [PATCH 034/118] added HieraBlock --- src/transformers/models/hiera/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index d32f0a934fea..3346e03f9a88 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -38,8 +38,9 @@ _import_structure["modeling_hiera"] = [ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", - "HieraPreTrainedModel " - ] + "HieraPreTrainedModel", + "HieraBlock", + ] if TYPE_CHECKING: @@ -55,7 +56,7 @@ pass else: from .hiera_image_processor import HieraImageProcessor - from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraModel, HieraPreTrainedModel + from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock, HieraModel, HieraPreTrainedModel else: import sys From ebde8c89a821c21063051b184605e5ffb4dbf6c7 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:42:27 +0000 Subject: [PATCH 035/118] fixed: Model name --- src/transformers/models/hiera/convert_hiera_to_pytorch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index 794a62147d78..f4f82d59a3c9 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -217,12 +217,12 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs) for x in out.intermediates: print(x.shape) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path, push_to_hub=True, safe_serialization=False) if __name__ == "__main__": parser = argparse.ArgumentParser() checkpoint_url = "https://dl.fbaipublicfiles.com/hiera/hiera_base_224.pth" - convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="~/") + convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path="/home/ubuntu/home/hiera/hiera_base_224") From 772e421b0d56759dacc710e6ab303ea1f0f79900 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 17:43:02 +0000 Subject: [PATCH 036/118] added tests for HieraModel, HieraBlock --- tests/models/hiera/test_modeling_hiera.py | 272 ++++++++++++---------- 1 file changed, 143 insertions(+), 129 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 72badde557df..21e0f14fe58f 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -15,24 +15,84 @@ """ Testing suite for the PyTorch Hiera model. 
""" import unittest -from typing import Tuple -from transformers.models.hiera.hiera_model import HieraBlock -from transformers import HieraConfig +from typing import Tuple + +from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraConfig, HieraModel +from transformers.models.hiera import HieraBlock from transformers.testing_utils import ( require_torch, slow, - torch_device, ) from transformers.utils import is_torch_available -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + + if is_torch_available(): import torch - from transformers import HieraModel - from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST - from torchvision.transforms.functional import InterpolationMode - from torchvision import transforms - from PIL import Image import math + + +class HieraBlockTester: + def __init__( + self, + parent, + batch_size: int = 1, + input_dim: int = 96, + output_dim: int = 192, + number_of_heads: int = 2, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + q_stride: int = 4, + window_size: int = 16, + use_mask_unit_attention: bool = True, + num_patches: int = 3136, + ): + self.parent = parent + self.batch_size = batch_size + self.input_dim = input_dim + self.output_dim = output_dim + self.number_of_heads = number_of_heads + self.mlp_ratio = mlp_ratio + self.drop_path = drop_path + self.q_stride = q_stride + self.window_size = window_size + self.use_mask_unit_attention = use_mask_unit_attention + self.num_patches = num_patches + + def create_and_check_block(self): + block = HieraBlock( + input_dim=self.input_dim, + output_dim=self.output_dim, + number_of_heads=self.number_of_heads, + mlp_ratio=self.mlp_ratio, + drop_path=self.drop_path, + q_stride=self.q_stride, + window_size=self.window_size, + use_mask_unit_attention=self.use_mask_unit_attention, + ) + + x = torch.randn(self.batch_size, self.num_patches, self.input_dim) + out = block(x) + + expected_shape = (self.batch_size, self.num_patches // self.q_stride, self.output_dim) + self.parent.assertEqual(out.shape, expected_shape, "Output shape is incorrect") + + +@require_torch +class TestHieraBlock(unittest.TestCase): + def setUp(self): + self.block_tester = HieraBlockTester(self) + + def test_output_shape(self): + self.block_tester.create_and_check_block() + + def test_input_output_dim_equality(self): + self.block_tester.output_dim = self.block_tester.input_dim + self.block_tester.q_stride = 1 + self.block_tester.number_of_heads = 1 + self.block_tester.window_size = 64 + self.block_tester.create_and_check_block() + + class HieraModelTester: def __init__( self, @@ -81,7 +141,7 @@ def __init__( self.head_init_scale = head_init_scale self.sep_position_embeddings = sep_position_embeddings - def prepare_config_and_inputs(self,checkpoint_url): + def prepare_config_and_inputs(self, checkpoint_url): # Prepare configuration and inputs for testing your model pixel_values = torch.rand((1, self.in_chans, self.input_size[0], self.input_size[1])) @@ -89,60 +149,69 @@ def prepare_config_and_inputs(self,checkpoint_url): return config, pixel_values - def get_config(self,checkpoint_url): + def get_config(self, checkpoint_url): if "hiera_tiny_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 7, 2),) + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 7, 2), + ) elif "hiera_small_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, - number_of_heads=1, - stages=(1, 2, 11, 2),) + config = HieraConfig( + 
embedding_dimension=96, + number_of_heads=1, + stages=(1, 2, 11, 2), + ) elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), ) - + config = HieraConfig( + embedding_dimension=96, + number_of_heads=1, + stages=(2, 3, 16, 3), + ) elif "hiera_base_plus_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3),) + config = HieraConfig( + embedding_dimension=112, + number_of_heads=2, + stages=(2, 3, 16, 3), + ) elif "hiera_large_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4),) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) elif "hiera_huge_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4)) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) elif "hiera_base_16x224" in checkpoint_url: - config = HieraConfig(num_classes=self.num_classes, - input_size=(16, 224, 224), - q_stride=(1, 2, 2), - mask_unit_size=(1, 8, 8), - patch_kernel=(3, 7, 7), - patch_stride=(2, 4, 4), - patch_padding=(1, 3, 3), - sep_position_embeddings=True,) + config = HieraConfig( + num_classes=self.num_classes, + input_size=(16, 224, 224), + q_stride=(1, 2, 2), + mask_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_position_embeddings=True, + ) elif "hiera_base_plus_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=112, - number_of_heads=2, - stages=(2, 3, 16, 3)) + config = HieraConfig(embedding_dimension=112, number_of_heads=2, stages=(2, 3, 16, 3)) elif "hiera_large_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=144, - number_of_heads=2, - stages=(2, 6, 36, 4), ) + config = HieraConfig( + embedding_dimension=144, + number_of_heads=2, + stages=(2, 6, 36, 4), + ) elif "hiera_huge_16x224" in checkpoint_url: - config = HieraConfig(embedding_dimension=256, - number_of_heads=4, - stages=(2, 6, 36, 4) ) + config = HieraConfig(embedding_dimension=256, number_of_heads=4, stages=(2, 6, 36, 4)) else: raise RuntimeError(f"Invalid checkpoint url ({checkpoint_url})") @@ -151,25 +220,29 @@ def get_config(self,checkpoint_url): def create_and_check_model(self, config, pixel_values): batch_size = 1 model = HieraModel(config=config) - num_patches = int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1)**2 + num_patches = ( + int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) + ** 2 + ) flat_q_stride = math.prod(self.q_stride) embedding_dimension = self.embedding_dimension indermediate_shapes = [] for _ in self.stages: - indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) - num_patches = num_patches/flat_q_stride + indermediate_shapes.append( + (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + ) + num_patches = num_patches / flat_q_stride embedding_dimension = embedding_dimension * 2 model.eval() with torch.no_grad(): result = model(pixel_values=pixel_values) for idx, x in enumerate(result.intermediates): - self.parent.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") + self.parent.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") 
@require_torch -class HieraModelTest(): - +class HieraModelTest(unittest.TestCase): def setUp(self): self.model_tester = HieraModelTester(self) @@ -178,36 +251,39 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs(model_name) self.model_tester.create_and_check_model(*config_and_inputs) - # @slow + @slow def test_model_from_pretrained(self): for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = HieraModel.from_pretrained(model_name) self.assertIsNotNone(model) + @require_torch @slow class HieraModelIntegrationTest(unittest.TestCase): def test_forward(self): torch_device = "cpu" input_size = 224 - batch_size =1 - patch_kernel = (7,7) - patch_padding = (3,3) - patch_stride = (4,4) - q_stride = (2,2) - flat_q_stride = math.prod(q_stride) - stages=(2, 3, 16, 3) + batch_size = 1 + patch_kernel = (7, 7) + patch_padding = (3, 3) + patch_stride = (4, 4) + q_stride = (2, 2) + flat_q_stride = math.prod(q_stride) + stages = (2, 3, 16, 3) embedding_dimension = 96 - model = HieraModel.from_pretrained("/home/ubuntu/home/hiera/model/") + model = HieraModel.from_pretrained("namangarg110/hiera_base_224") model.to(torch_device) - + random_tensor = torch.rand(batch_size, 3, input_size, input_size) - num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1)**2 + num_patches = int(((input_size - patch_kernel[0] + 2 * patch_padding[0]) / patch_stride[0]) + 1) ** 2 indermediate_shapes = [] for _ in stages: - indermediate_shapes.append((batch_size,int(math.sqrt(num_patches)),int(math.sqrt(num_patches)),embedding_dimension)) - num_patches = num_patches/flat_q_stride + indermediate_shapes.append( + (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + ) + num_patches = num_patches / flat_q_stride embedding_dimension = embedding_dimension * 2 out = model(random_tensor) @@ -215,72 +291,10 @@ def test_forward(self): out = model(random_tensor, return_intermediates=True) for idx, x in enumerate(out.intermediates): - self.assertEqual(x.shape,indermediate_shapes[idx],"Invalid Intermediate shape") - -class TestHieraBlock(unittest.TestCase): - def test_output_shape(self): - batch_size, input_dim, output_dim = 1, 96, 192 - number_of_heads = 2 - mlp_ratio = 4.0 - drop_path = 0.0 - q_stride = 4 - window_size = 16 - use_mask_unit_attention = True - num_patches = 3136 - - block = HieraBlock( - input_dim=input_dim, - output_dim=output_dim, - number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, - drop_path=drop_path, - q_stride=q_stride, - window_size=window_size, - use_mask_unit_attention=use_mask_unit_attention - ) - - # Create a dummy input - x = torch.randn(batch_size, num_patches,input_dim) - - # Forward pass - out = block(x) - - # Check the shape of the output - expected_shape = (batch_size, num_patches/q_stride, output_dim) - self.assertEqual(out.shape, expected_shape, "Output shape is incorrect") - - def test_input_output_dim_equality(self): - batch_size, input_dim, output_dim = 1, 96, 96 - number_of_heads = 1 - mlp_ratio = 4.0 - drop_path = 0.0 - q_stride = 1 - window_size = 64 - use_mask_unit_attention = True - num_patches = 3136 - block = HieraBlock( - input_dim=input_dim, - output_dim=output_dim, - number_of_heads=number_of_heads, - mlp_ratio=mlp_ratio, - drop_path=drop_path, - q_stride=q_stride, - window_size=window_size, - use_mask_unit_attention=use_mask_unit_attention - ) - - # Create a dummy input - x = torch.randn(batch_size, num_patches,input_dim) - - # Forward pass - out = block(x) 
- - # Check the shape of the output - expected_shape = (batch_size, num_patches, output_dim) - self.assertEqual(out.shape, expected_shape, "Output shape is incorrect. Input shape should be equal to output shape") + self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") -if __name__ == '__main__': +if __name__ == "__main__": test = HieraModelIntegrationTest() test.test_forward() block_test = TestHieraBlock() From 850350eef1fac3eb23fd68bf4137135114975e56 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 19:07:37 +0000 Subject: [PATCH 037/118] fixed imports --- tests/models/hiera/test_modeling_hiera.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 21e0f14fe58f..326159d9f23e 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -17,8 +17,9 @@ import unittest from typing import Tuple -from transformers import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraConfig, HieraModel -from transformers.models.hiera import HieraBlock +from transformers import HieraConfig, HieraModel + +from transformers.models.hiera.modeling_hiera import HieraBlock, HIERA_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.testing_utils import ( require_torch, slow, From 20f3bc04a5a036d2c31b4c861f3c4d1fccd1824b Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 19:26:05 +0000 Subject: [PATCH 038/118] fixed quality & copies --- docs/source/en/index.md | 2 +- .../models/hiera/configuration_hiera.py | 60 ++++++++----------- src/transformers/utils/dummy_pt_objects.py | 17 ++++++ tests/models/hiera/test_modeling_hiera.py | 3 +- 4 files changed, 44 insertions(+), 38 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index b26c9f91360c..1acd49678534 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -155,7 +155,7 @@ Flax), PyTorch, and/or TensorFlow. | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | -| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | +| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 8d40e7a72777..5b5e92688521 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -37,41 +37,31 @@ class HieraConfig(PretrainedConfig): Args: - input_size (Tuple[int, ...], optional): Dimensions of the input image (height, width). Defaults to (224, 224). - in_chans (int, optional): Number of input channels. Defaults to 3. - embedding_dimension (int, optional): Dimension of the initial embedding. Defaults to 96. - number_of_heads (int, optional): Initial number of attention heads. Defaults to 1. - num_classes (int, optional): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional): Defines the number of blocks at each stage of the model. - q_pool (int, optional): Number of pooling stages for queries. Defaults to 3. - q_stride (Tuple[int, ...], optional): Stride size for pooling. Defaults to (2, 2). - mask_unit_size (Tuple[int, ...], optional): Dimensions for the mask unit. Must be compatible with q_stride. 
- mask_unit_attn (Tuple[bool, ...], optional): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). - dim_mul (float, optional): Factor for increasing the dimensionality through the network. Defaults to 2.0. - head_mul (float, optional): Factor for increasing the number of heads through the network. Defaults to 2.0. - patch_kernel (Tuple[int, ...], optional): Kernel size for patch embedding. Defaults to (7, 7). - patch_stride (Tuple[int, ...], optional): Stride for patch embedding. Defaults to (4, 4). - patch_padding (Tuple[int, ...], optional): Padding for patch embedding. Defaults to (3, 3). - mlp_ratio (float, optional): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. - drop_path_rate (float, optional): Dropout rate for stochastic depth. Defaults to 0.0. - head_dropout (float, optional): Dropout rate for attention heads. Defaults to 0.0. - head_init_scale (float, optional): Initial scaling factor for attention head weights. Defaults to 0.001. - sep_position_embeddings (bool, optional): Whether to use separate position embeddings. Defaults to False. - - - Example: - ```python - >>> from transformers import HieraConfig, HieraModel - - >>> # Initializing a ViT MAE vit-mae-base style configuration - >>> configuration = HieraConfig() - - >>> # Initializing a model (with random weights) from the vit-mae-base style configuration - >>> model = HieraModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ``` + input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). Defaults to (224, 224). + in_chans (int, optional, *optional*, defaults to 3): Number of input channels. Defaults to 3. + embedding_dimension (int, optional, *optional*, defaults to 96): Dimension of the initial embedding. Defaults to 96. + number_of_heads (int, optional, *optional*, defaults to 1): Initial number of attention heads. Defaults to 1. + num_classes (int, optional, *optional*, defaults to 1000): Number of output classes. Defaults to 1000. + stages (Tuple[int, ...], optional, *optional*, defaults to `(2, 3, 16, 3)`): Defines the number of blocks at each stage of the model. + q_pool (int, optional, *optional*, defaults to 3): Number of pooling stages for queries. Defaults to 3. + q_stride (Tuple[int, ...], optional, *optional*, defaults to `(2, 2)`): Stride size for pooling. Defaults to (2, 2). + mask_unit_size (Tuple[int, ...], optional, *optional*, defaults to `(8, 8)`): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional, *optional*, defaults to `(True, True, False, False)`): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). + dim_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the dimensionality through the network. Defaults to 2.0. + head_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the number of heads through the network. Defaults to 2.0. + patch_kernel (Tuple[int, ...], optional, *optional*, defaults to `(7, 7)`): Kernel size for patch embedding. Defaults to (7, 7). + patch_stride (Tuple[int, ...], optional, *optional*, defaults to `(4, 4)`): Stride for patch embedding. Defaults to (4, 4). + patch_padding (Tuple[int, ...], optional, *optional*, defaults to `(3, 3)`): Padding for patch embedding. Defaults to (3, 3). 
+ mlp_ratio (float, optional, *optional*, defaults to 4.0): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. + drop_path_rate (float, optional, *optional*, defaults to 0.0): Dropout rate for stochastic depth. Defaults to 0.0. + head_dropout (float, optional, *optional*, defaults to 0.0): Dropout rate for attention heads. Defaults to 0.0. + head_init_scale (float, optional, *optional*, defaults to 0.001): Initial scaling factor for attention head weights. Defaults to 0.001. + sep_position_embeddings (bool, optional, *optional*, defaults to `False`): Whether to use separate position embeddings. Defaults to False. + + + + + """ model_type = "hiera" diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 5c635cf7af2c..4e1b0211216a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4240,6 +4240,23 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class HieraModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 326159d9f23e..5d90ca9f9f55 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -18,8 +18,7 @@ from typing import Tuple from transformers import HieraConfig, HieraModel - -from transformers.models.hiera.modeling_hiera import HieraBlock, HIERA_PRETRAINED_MODEL_ARCHIVE_LIST +from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock from transformers.testing_utils import ( require_torch, slow, From 12ef68aee902199df6546f1ffe5a58332a896f4d Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Wed, 28 Feb 2024 23:30:41 +0000 Subject: [PATCH 039/118] Fixes --- docs/source/en/model_doc/hiera.md | 4 +- .../models/auto/image_processing_auto.py | 1 - src/transformers/models/hiera/__init__.py | 9 ++- tests/models/hiera/test_modeling_hiera.py | 60 ++++++++----------- 4 files changed, 35 insertions(+), 39 deletions(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index 1c46bae9b072..d38e2e70c770 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -31,9 +31,9 @@ Modern hierarchical vision transformers have added several vision-specific compo -## HireaModel +## HieraModel -[[autodoc]] HireaModel +[[autodoc]] HieraModel - forward diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 5261753d202d..aef894a425ba 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -69,7 +69,6 @@ ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), ("groupvit", "CLIPImageProcessor"), - ("hiera", "HieraImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index 3346e03f9a88..b04392f55fa5 100644 --- a/src/transformers/models/hiera/__init__.py +++ 
b/src/transformers/models/hiera/__init__.py @@ -43,6 +43,7 @@ ] + if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -55,8 +56,12 @@ except OptionalDependencyNotAvailable: pass else: - from .hiera_image_processor import HieraImageProcessor - from .modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock, HieraModel, HieraPreTrainedModel + from .modeling_hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, + HieraPreTrainedModel, + HieraBlock, + ) else: import sys diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 5d90ca9f9f55..8f24484a71c8 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -17,7 +17,7 @@ import unittest from typing import Tuple -from transformers import HieraConfig, HieraModel +from transformers import HieraConfig, HieraModel, HieraPreTrainedModel from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock from transformers.testing_utils import ( require_torch, @@ -78,7 +78,7 @@ def create_and_check_block(self): @require_torch -class TestHieraBlock(unittest.TestCase): +class HieraBlockTest(unittest.TestCase): def setUp(self): self.block_tester = HieraBlockTester(self) @@ -94,6 +94,9 @@ def test_input_output_dim_equality(self): class HieraModelTester: + + all_model_classes = (HieraModel, HieraPreTrainedModel) if is_torch_available() else () + def __init__( self, parent, @@ -219,26 +222,27 @@ def get_config(self, checkpoint_url): def create_and_check_model(self, config, pixel_values): batch_size = 1 - model = HieraModel(config=config) - num_patches = ( - int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) - ** 2 - ) - flat_q_stride = math.prod(self.q_stride) - embedding_dimension = self.embedding_dimension - indermediate_shapes = [] - for _ in self.stages: - indermediate_shapes.append( - (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + for model_class in self.all_model_classes: + model = model_class(config=config) + num_patches = ( + int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) + ** 2 ) - num_patches = num_patches / flat_q_stride - embedding_dimension = embedding_dimension * 2 - model.eval() - with torch.no_grad(): - result = model(pixel_values=pixel_values) - - for idx, x in enumerate(result.intermediates): - self.parent.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") + flat_q_stride = math.prod(self.q_stride) + embedding_dimension = self.embedding_dimension + indermediate_shapes = [] + for _ in self.stages: + indermediate_shapes.append( + (batch_size, int(math.sqrt(num_patches)), int(math.sqrt(num_patches)), embedding_dimension) + ) + num_patches = num_patches / flat_q_stride + embedding_dimension = embedding_dimension * 2 + model.eval() + with torch.no_grad(): + result = model(pixel_values=pixel_values) + + for idx, x in enumerate(result.intermediates): + self.parent.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") @require_torch @@ -291,16 +295,4 @@ def test_forward(self): out = model(random_tensor, return_intermediates=True) for idx, x in enumerate(out.intermediates): - self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") - - -if __name__ == "__main__": - test = HieraModelIntegrationTest() - test.test_forward() - 
block_test = TestHieraBlock() - block_test.test_output_shape() - block_test.test_input_output_dim_equality() - model_test = HieraModelTest() - model_test.setUp() - model_test.test_model() - model_test.test_model_from_pretrained() + self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") \ No newline at end of file From 3faf1e708c14a7af87ca7b6cad45cf24edc60714 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:22:15 -0600 Subject: [PATCH 040/118] Update docs/source/en/model_doc/hiera.md Fix name Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/hiera.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index d38e2e70c770..bda5e0b9ad2f 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. ## Overview -Hubert was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer +Hiera was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer The abstract from the paper is the following: From 7debb8dd8d224b0c4681f62126648dd1fcac2426 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:23:03 -0600 Subject: [PATCH 041/118] Update docs/source/en/model_doc/hiera.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/hiera.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index bda5e0b9ad2f..10664868c9bd 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -24,7 +24,7 @@ The abstract from the paper is the following: Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy. In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera. 
-## HireaConfig +## HieraConfig [[autodoc]] HieraConfig From 942e8e9147f46968cb0e1a0362e44a5a16b81ddc Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:23:35 -0600 Subject: [PATCH 042/118] Update docs/source/en/model_doc/hiera.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/hiera.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index 10664868c9bd..8cd6dc1a977a 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -28,7 +28,6 @@ Modern hierarchical vision transformers have added several vision-specific compo [[autodoc]] HieraConfig - ## HieraModel From 4e9ddd47f45690713e9460a11f52acd2c4e0112c Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:24:44 -0600 Subject: [PATCH 043/118] Update src/transformers/models/hiera/configuration_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/configuration_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 5b5e92688521..81910fbdf8f4 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -28,7 +28,7 @@ class HieraConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`hiera`]. It is used to instantiate an HieraModel model according to the specified arguments, defining the model architecture. Instantiating a configuration with + This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the HieraModel [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. From c9e77046133a2e77e2c3d4d93c919eea71abd500 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:25:08 -0600 Subject: [PATCH 044/118] Update src/transformers/models/hiera/configuration_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/configuration_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 81910fbdf8f4..1d02957c2b73 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -37,7 +37,7 @@ class HieraConfig(PretrainedConfig): Args: - input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). Defaults to (224, 224). + input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). in_chans (int, optional, *optional*, defaults to 3): Number of input channels. Defaults to 3. embedding_dimension (int, optional, *optional*, defaults to 96): Dimension of the initial embedding. Defaults to 96. number_of_heads (int, optional, *optional*, defaults to 1): Initial number of attention heads. Defaults to 1. 
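The review fixes in the patches above only touch documentation; the checkpoint-to-configuration mapping exercised by `get_config` in the tests (and by the conversion script) is unchanged: each Hiera size is determined by `embedding_dimension`, `number_of_heads`, and `stages`, while the 16-frame video variants additionally override the patch and mask-unit geometry. A short sketch of how two of those configurations would be instantiated, with values copied from the `get_config` branches above (this assumes a `transformers` build from this branch, which exposes `HieraConfig`):

```python
from transformers import HieraConfig

# Image checkpoints, as mapped in get_config above.
base_224 = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3))
large_224 = HieraConfig(embedding_dimension=144, number_of_heads=2, stages=(2, 6, 36, 4))

# Video checkpoint (hiera_base_16x224): same trunk defaults, but 3D patches and mask units.
base_16x224 = HieraConfig(
    input_size=(16, 224, 224),
    q_stride=(1, 2, 2),
    mask_unit_size=(1, 8, 8),
    patch_kernel=(3, 7, 7),
    patch_stride=(2, 4, 4),
    patch_padding=(1, 3, 3),
    sep_position_embeddings=True,
)
```
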
From d8e8b735e697796ad74a0f4cd7775afdf986de9f Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:27:24 -0600 Subject: [PATCH 045/118] Update src/transformers/models/hiera/modeling_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/modeling_hiera.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index f463834a437b..cc678e1c71ea 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -377,7 +377,8 @@ def __init__( ): """ Args: - - input_dim, output_dim: The input and output feature dimensions. + input_dim (`int`): The input feature dimensions. + output_dim (`int`): The output feature dimensions. - number_of_heads: The number of attention number_of_heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). From 6027674616eb077b7bcb10a61be1b4247444c0c2 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 11:27:41 -0600 Subject: [PATCH 046/118] Update src/transformers/models/hiera/modeling_hiera.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index cc678e1c71ea..9165b9a529f0 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -379,7 +379,7 @@ def __init__( Args: input_dim (`int`): The input feature dimensions. output_dim (`int`): The output feature dimensions. - - number_of_heads: The number of attention number_of_heads. + number_of_heads (`int`): The number of attention heads. - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - use_mask_unit_attention: Use Mask Unit or Global Attention. From 0e3a0e5b11133df720ea4f398d75a8686ad305b4 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 18:56:37 +0000 Subject: [PATCH 047/118] Fixed formatting --- .../models/hiera/modeling_hiera.py | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 9165b9a529f0..fe4d67f2e6a4 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -69,7 +69,8 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor] = None) -> torch.Tensor: - """Zero-out the masked regions of the input before conv. + """ + Zero-out the masked regions of the input before conv. Prevents leakage of masked regions when using overlapping kernels. """ if conv is None: @@ -296,9 +297,9 @@ def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> Roll the given tensor back up to spatial order assuming it's from the given block. If no mask is provided: - - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. 
If a mask is provided: - - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + Returns [B, #MUs, MUy, MUx, C] for 2d, etc. """ schedule, size = self.schedule[block_idx] B, N, C = x.shape @@ -377,12 +378,12 @@ def __init__( ): """ Args: - input_dim (`int`): The input feature dimensions. - output_dim (`int`): The output feature dimensions. - number_of_heads (`int`): The number of attention heads. - - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). - - window_size: The current (flattened) size of a mask unit *after* pooling (if any). - - use_mask_unit_attention: Use Mask Unit or Global Attention. + input_dim (`int`): The input feature dimensions. + output_dim (`int`): The output feature dimensions. + number_of_heads (`int`): The number of attention heads. + q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4). + window_size: The current (flattened) size of a mask unit *after* pooling (if any). + use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() @@ -499,7 +500,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @add_start_docstrings( - """ +""" Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). """ ) @@ -552,7 +553,7 @@ def _init_weights(self, module, init_bias=0.02): @add_start_docstrings( - """ +""" Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. @@ -571,7 +572,7 @@ def _init_weights(self, module, init_bias=0.02): >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) - """ +""" ) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig @@ -741,7 +742,7 @@ def get_position_embeddings(self) -> torch.Tensor: return self.position_embeddings @add_start_docstrings_to_model_forward( - """ + """ The forward pass for the Hiera model. Args: @@ -750,14 +751,8 @@ def get_position_embeddings(self) -> torch.Tensor: mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - - return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. - return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. 
- - - """ ) def forward( @@ -767,7 +762,6 @@ def forward( return_dict: Optional[bool] = True, return_intermediates: bool = True, ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - """ """ # Slowfast training passes in a list if isinstance(pixel_values, list): pixel_values = pixel_values[0] @@ -809,4 +803,4 @@ def forward( return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) + ) \ No newline at end of file From e9a41269485f3f541caf686d1a138268c7fbcfcc Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 19:20:35 +0000 Subject: [PATCH 048/118] Code quality & Import differences --- src/transformers/models/hiera/__init__.py | 9 +-- .../models/hiera/modeling_hiera.py | 58 +++++++++---------- tests/models/hiera/test_modeling_hiera.py | 8 ++- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py index b04392f55fa5..d8c62fc0800a 100644 --- a/src/transformers/models/hiera/__init__.py +++ b/src/transformers/models/hiera/__init__.py @@ -39,11 +39,9 @@ "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel", - "HieraBlock", ] - if TYPE_CHECKING: from .configuration_hiera import ( HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -57,11 +55,10 @@ pass else: from .modeling_hiera import ( - HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, - HieraModel, + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraModel, HieraPreTrainedModel, - HieraBlock, - ) + ) else: import sys diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index fe4d67f2e6a4..b7267ae7b7f5 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -500,9 +500,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @add_start_docstrings( -""" -Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). -""" + """ + Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). + """ ) class PatchEmbedding(nn.Module): def __init__( @@ -553,26 +553,26 @@ def _init_weights(self, module, init_bias=0.02): @add_start_docstrings( -""" -Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. + """ + Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles. -This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. + This model is a PyTorch implementation of the Hiera architecture for image classification. It introduces a hierarchical design that processes images in a coarse-to-fine manner, efficiently handling various scales and complexities within the images. -The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. + The model is built on the principles of Vision Transformers but introduces mask units to focus on specific regions of interest, significantly reducing computational requirements while maintaining competitive performance. -Parameters: - config ([`HieraConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -Example usage: - >>> from your_model_file import Hiera, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + Example usage: + >>> from your_model_file import Hiera, HieraConfig + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) - >>> model = Hiera(config) - >>> inputs = torch.rand((1, 3, 224, 224)) - >>> outputs = model(inputs) -""" + >>> model = Hiera(config) + >>> inputs = torch.rand((1, 3, 224, 224)) + >>> outputs = model(inputs) + """ ) class HieraModel(HieraPreTrainedModel): config_class = HieraConfig @@ -742,18 +742,18 @@ def get_position_embeddings(self) -> torch.Tensor: return self.position_embeddings @add_start_docstrings_to_model_forward( - """ - The forward pass for the Hiera model. + """ + The forward pass for the Hiera model. - Args: - pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. + Args: + pixel_values (`torch.Tensor`): Input tensor of shape `(batch_size, channels, height, width)`. - mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). - mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. - Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. - return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. - return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. - """ + mask (`torch.Tensor`, optional): A boolean tensor of shape `(batch_size, num_mask_units)` indicating which mask units to keep (True) or remove (False). + mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. + Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. + return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. + return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. 
+ """ ) def forward( self, @@ -803,4 +803,4 @@ def forward( return HieraModelOutput( last_hidden_state=embeddings, intermediates=intermediates if return_intermediates else None, - ) \ No newline at end of file + ) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 8f24484a71c8..de9afd8a0a59 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -94,7 +94,6 @@ def test_input_output_dim_equality(self): class HieraModelTester: - all_model_classes = (HieraModel, HieraPreTrainedModel) if is_torch_available() else () def __init__( @@ -225,7 +224,10 @@ def create_and_check_model(self, config, pixel_values): for model_class in self.all_model_classes: model = model_class(config=config) num_patches = ( - int(((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + 1) + int( + ((self.input_size[0] - self.patch_kernel[0] + 2 * self.patch_padding[0]) / self.patch_stride[0]) + + 1 + ) ** 2 ) flat_q_stride = math.prod(self.q_stride) @@ -295,4 +297,4 @@ def test_forward(self): out = model(random_tensor, return_intermediates=True) for idx, x in enumerate(out.intermediates): - self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") \ No newline at end of file + self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") From b057d912d532483d35e44c9d1ccb5f4924490fa1 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:02:11 +0000 Subject: [PATCH 049/118] quality and repo-consistency fix --- src/transformers/__init__.py | 6 +++--- src/transformers/models/hiera/configuration_hiera.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9d668babbec2..de8bbdb00371 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2365,6 +2365,9 @@ "GroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] + ) _import_structure["models.hubert"].extend( [ "HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4148,9 +4151,6 @@ "TFGroupViTVisionModel", ] ) - _import_structure["models.hiera"].extend( - ["HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", "HieraModel", "HieraPreTrainedModel"] - ) _import_structure["models.hubert"].extend( [ "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 1d02957c2b73..dc4e7d554bee 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -23,14 +23,17 @@ logger = logging.get_logger(__name__) -HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "namangarg110/hiera_base_224": "https://huggingface.co/namangarg110/hiera_base_224/blob/main/config.json", + # See all Hiera models at https://huggingface.co/models?filter=hiera +} class HieraConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the HieraModel - [facebookresearch/hiera](https://github.com/facebookresearch/hiera) architecture. 
+ [namangarg110/hiera_base_224](https://huggingface.co/namangarg110/hiera_base_224/) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. From d7210cc169ee21d3eaff040362883cce548c18f5 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:27:09 +0000 Subject: [PATCH 050/118] fixed no torch error --- tests/models/hiera/test_modeling_hiera.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index de9afd8a0a59..729d1de4247c 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -17,8 +17,7 @@ import unittest from typing import Tuple -from transformers import HieraConfig, HieraModel, HieraPreTrainedModel -from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock +from transformers import HieraConfig from transformers.testing_utils import ( require_torch, slow, @@ -28,6 +27,10 @@ if is_torch_available(): import torch + + from transformers import HieraModel + from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, HieraBlock + import math @@ -94,7 +97,7 @@ def test_input_output_dim_equality(self): class HieraModelTester: - all_model_classes = (HieraModel, HieraPreTrainedModel) if is_torch_available() else () + all_model_classes = (HieraModel,) if is_torch_available() else () def __init__( self, From 10dfa68d4ba8a6f536f6181cd1025a4f56f6485d Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:42:47 +0000 Subject: [PATCH 051/118] Docstring fix --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index b7267ae7b7f5..eb7da758fd6d 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -566,7 +566,7 @@ def _init_weights(self, module, init_bias=0.02): configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. Example usage: - >>> from your_model_file import Hiera, HieraConfig + >>> from transformers import Hiera, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) >>> model = Hiera(config) From f81fa7625d2753f71eabe05f36443cf0825927fb Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:52:22 +0000 Subject: [PATCH 052/118] Docstring fix --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index eb7da758fd6d..0b5a1fa35213 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -566,7 +566,7 @@ def _init_weights(self, module, init_bias=0.02): configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
Example usage: - >>> from transformers import Hiera, HieraConfig + >>> from transformers import HieraModel, HieraConfig >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) >>> model = Hiera(config) From 5951e581507d331cbbf56717e1c702b2410466a1 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:53:07 +0000 Subject: [PATCH 053/118] doc string fix --- src/transformers/models/hiera/modeling_hiera.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 0b5a1fa35213..159aaa0b9fa3 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -567,7 +567,7 @@ def _init_weights(self, module, init_bias=0.02): Example usage: >>> from transformers import HieraModel, HieraConfig - >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3)) >>> model = Hiera(config) >>> inputs = torch.rand((1, 3, 224, 224)) From 9b194a4b496e2dbbeb872b6475431ef5a1cd6489 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 29 Feb 2024 20:58:02 +0000 Subject: [PATCH 054/118] fixed example usage --- src/transformers/models/hiera/modeling_hiera.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 159aaa0b9fa3..536de5592202 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -567,9 +567,9 @@ def _init_weights(self, module, init_bias=0.02): Example usage: >>> from transformers import HieraModel, HieraConfig + >>> import torch >>> config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3)) - - >>> model = Hiera(config) + >>> model = HieraModel(config) >>> inputs = torch.rand((1, 3, 224, 224)) >>> outputs = model(inputs) """ From dd3da8f4b469677f1de1e89e611084aa06f6780a Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 7 Mar 2024 05:29:43 +0000 Subject: [PATCH 055/118] Resolved issues in modeling_hiera --- .../models/hiera/modeling_hiera.py | 332 +++++++++--------- tests/models/hiera/test_modeling_hiera.py | 66 +--- 2 files changed, 160 insertions(+), 238 deletions(-) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 536de5592202..3cd0d21c56b8 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -1,8 +1,8 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Meta and The HuggingFace Team. All rights reserved. # All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. +# This code is part of a project that uses the model by Meta, licensed under +# the Creative Commons Attribution-NonCommercial 4.0 International License. +# To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/ or # -------------------------------------------------------- # # Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles @@ -12,10 +12,6 @@ # Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer. 
# # Paper: https://arxiv.org/abs/2306.00989/ -# -# References: -# slowfast: https://github.com/facebookresearch/SlowFast -# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm # -------------------------------------------------------- import collections.abc @@ -52,7 +48,7 @@ def conv_nd(n: int) -> Type[nn.Module]: def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: - # Refer to `Unroll` to see how this performs a maxpool-Nd + # Refer to `HieraUnroll` to see how this performs a maxpool-Nd return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values @@ -61,8 +57,6 @@ def get_resized_mask(target_size: torch.Size, mask: torch.Tensor) -> torch.Tenso # (spatial) mask: [B, C, (t), (h), w] if mask is None: return mask - - assert len(mask.shape[2:]) == len(target_size) if mask.shape[2:] != target_size: return F.interpolate(mask.float(), size=target_size) return mask @@ -82,36 +76,35 @@ def do_masked_conv(x: torch.Tensor, conv: nn.Module, mask: Optional[torch.Tensor return conv(x * mask.bool()) -def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: +def undo_windowing(tensor: torch.Tensor, spatial_shape: List[int], mask_unit_shape: List[int]) -> torch.Tensor: """ Restore spatial organization by undoing windowed organization of mask units. Args: - x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] - shape: current spatial shape, if it were not organized into mask unit - windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. - mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + tensor: Tensor organized by mask units windows, e.g., in 2D [batch_size, num_mask_units_y*num_mask_units_x, mask_unit_height, mask_unit_width, channels]. + spatial_shape: Desired spatial shape if it were not organized into mask unit windows, e.g., in 2D [batch_size, num_mask_units_y*mask_unit_height, num_mask_units_x*mask_unit_width, channels]. + mask_unit_shape: Current mask unit shape, e.g., in 2D [mask_unit_height, mask_unit_width]. Returns: - x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + Restored tensor with spatial organization, e.g., in 2D [batch_size, num_mask_units_y*mask_unit_height, num_mask_units_x*mask_unit_width, channels]. 
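As an illustration of the masking helpers above, a small sketch of how a keep-mask is upsampled and applied before a patch-embedding style convolution; the 7x7 mask-unit grid and the conv hyperparameters are assumptions matching the default configuration, not values taken from the patch:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

conv = nn.Conv2d(3, 96, kernel_size=7, stride=4, padding=3)   # patch-embedding style conv
pixel_values = torch.randn(1, 3, 224, 224)

mask = torch.zeros(1, 1, 7, 7)                                # one entry per mask unit
mask[..., :4, :] = 1                                          # keep the top four rows of units

resized = F.interpolate(mask.float(), size=pixel_values.shape[2:])  # upsample to pixel resolution
embeddings = conv(pixel_values * resized.bool())              # removed units contribute zeros
print(embeddings.shape)                                       # torch.Size([1, 96, 56, 56])
```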
""" - D = len(shape) - B, C = x.shape[0], x.shape[-1] - # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] - num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] - x = x.view(B, *num_MUs, *mu_shape, C) - - # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] - permute = ( + num_dimensions = len(spatial_shape) + batch_size, channels = tensor.shape[0], tensor.shape[-1] + # [batch_size, num_mask_units_y*num_mask_units_x, mask_unit_height, mask_unit_width, channels] -> [batch_size, num_mask_units_y, num_mask_units_x, mask_unit_height, mask_unit_width, channels] + num_mask_units = [spatial_dim // mask_unit_dim for spatial_dim, mask_unit_dim in zip(spatial_shape, mask_unit_shape)] + tensor = tensor.view(batch_size, *num_mask_units, *mask_unit_shape, channels) + + # Calculate the permutation order for restoring spatial organization + permute_order = ( [0] + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [list(p) for p in zip(range(1, 1 + num_dimensions), range(1 + num_dimensions, 1 + 2 * num_dimensions))], [], ) - + [len(x.shape) - 1] + + [len(tensor.shape) - 1] ) - x = x.permute(permute).reshape(B, *shape, C) + tensor = tensor.permute(permute_order).reshape(batch_size, *spatial_shape, channels) - return x + return tensor # Copied from transformers.models.swin.modeling_swin.drop_path @@ -134,42 +127,29 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals return output -class DropPath(nn.Module): +class HieraDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): - super(DropPath, self).__init__() + super(HieraDropPath, self).__init__() self.drop_prob = drop_prob self.scale_by_keep = scale_by_keep def forward(self, x): return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) - def extra_repr(self): - return f"drop_prob={round(self.drop_prob,3):0.3f}" - - -# Copied from timm.layers.helpers -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): - return tuple(x) - return tuple(repeat(x, n)) - - return parse -to_2tuple = _ntuple(2) -# Copied from timm.layers.mlp -class Mlp(nn.Module): +class HieraMlp(nn.Module): """MLP as used in Vision Transformer, MLP-Mixer and related networks""" def __init__( self, + config: HieraConfig, in_features, - hidden_features=None, + # hidden_features=None, out_features=None, act_layer=nn.GELU, norm_layer=None, @@ -178,18 +158,25 @@ def __init__( use_conv=False, ): super().__init__() + self.config = config + hidden_features = int(in_features * self.config.mlp_ratio) out_features = out_features or in_features hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + bias = (bias, bias) if not isinstance(bias, tuple) else bias + + drop_probs = (drop, drop) if not isinstance(drop, tuple) else drop - self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) self.act = act_layer() self.drop1 = nn.Dropout(drop_probs[0]) - self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() - self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) self.drop2 = nn.Dropout(drop_probs[1]) + self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + + if use_conv: + self.fc1 = nn.Conv2d(in_features, hidden_features, 
kernel_size=1, bias=bias[0]) + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, bias=bias[1]) + else: + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) def forward(self, x): x = self.fc1(x) @@ -201,7 +188,7 @@ def forward(self, x): return x -class Unroll(nn.Module): +class HieraUnroll(nn.Module): """ Reorders the tokens such that patches are contiguous in memory. E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as @@ -223,124 +210,123 @@ class Unroll(nn.Module): def __init__( self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], + config: HieraConfig, ): super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - self.schedule = unroll_schedule + self.config = config + self.size = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + self.schedule = [self.config.q_stride] * len(self.stage_ends[:-1]) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: """ Input: Flattened patch embeddings [B, N, C] Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd """ - B, _, C = x.shape + batch_size, _, channels = embeddings.shape - cur_size = self.size - x = x.view(*([B] + cur_size + [C])) + current_size = self.size + embeddings = embeddings.view(*([batch_size] + current_size + [channels])) - for strides in self.schedule: + for stride_steps in self.schedule: # Move patches with the given strides to the batch dimension # Create a view of the tensor with the patch stride as separate dims # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] - cur_size = [i // s for i, s in zip(cur_size, strides)] - new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] - x = x.view(new_shape) + current_size = [dimension // stride for dimension, stride in zip(current_size, stride_steps)] + new_shape = [batch_size] + sum([[dimension, stride] for dimension, stride in zip(current_size, stride_steps)], []) + [channels] + embeddings = embeddings.view(new_shape) # Move the patch stride into the batch dimension # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] - L = len(new_shape) - permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] - x = x.permute(permute) + shape_length = len(new_shape) + permute_order = [0] + list(range(2, shape_length - 1, 2)) + list(range(1, shape_length - 1, 2)) + [shape_length - 1] + embeddings = embeddings.permute(*permute_order) # Now finally flatten the relevant dims into the batch dimension - x = x.flatten(0, len(strides)) - B *= math.prod(strides) + embeddings = embeddings.flatten(0, len(stride_steps)) + batch_size *= math.prod(stride_steps) - x = x.reshape(-1, math.prod(self.size), C) - return x + embeddings = embeddings.reshape(-1, math.prod(self.size), channels) + return embeddings -class Reroll(nn.Module): +class HieraReroll(nn.Module): """ Undos the "unroll" operation so that you can use intermediate features. 
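A standalone sketch of what the unroll plus `do_pool` combination achieves in the 2D case (sizes are arbitrary): tokens are reordered so that a plain max over a leading axis behaves like `MaxPool2d`:

```python
import torch
import torch.nn.functional as F

B, H, W, C = 2, 8, 8, 5
Sy, Sx = 2, 2
x = torch.randn(B, H, W, C)

# unroll (2D case): bring each Sy x Sx window to the front of the token axis
unrolled = (
    x.view(B, H // Sy, Sy, W // Sx, Sx, C)
    .permute(0, 2, 4, 1, 3, 5)                               # [B, Sy, Sx, H//Sy, W//Sx, C]
    .reshape(B, H * W, C)
)

# do_pool: view out the stride dimension and take the max over it
pooled = unrolled.view(B, Sy * Sx, -1, C).max(dim=1).values  # [B, (H//Sy)*(W//Sx), C]

reference = F.max_pool2d(x.permute(0, 3, 1, 2), kernel_size=(Sy, Sx))
reference = reference.permute(0, 2, 3, 1).reshape(B, -1, C)
print(torch.allclose(pooled, reference))                     # True
```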
""" def __init__( self, - input_size: Tuple[int, ...], - patch_stride: Tuple[int, ...], - unroll_schedule: List[Tuple[int, ...]], - stage_ends: List[int], - q_pool: int, + config: HieraConfig, + ): super().__init__() - self.size = [i // s for i, s in zip(input_size, patch_stride)] - + self.config = config + self.size = [i // s for i, s in zip(self.config.input_size, self.config.patch_stride)] + self.stage_ends = [sum(self.config.stages[:i]) - 1 for i in range(1, len(self.config.stages) + 1)] + unroll_schedule = [self.config.q_stride] * len(self.stage_ends[:-1]) # The first stage has to reverse everything # The next stage has to reverse all but the first unroll, etc. self.schedule = {} size = self.size - for i in range(stage_ends[-1] + 1): + for i in range(self.stage_ends[-1] + 1): self.schedule[i] = unroll_schedule, size # schedule unchanged if no pooling at a stage end - if i in stage_ends[:q_pool]: + if i in self.stage_ends[:self.config.q_pool]: if len(unroll_schedule) > 0: - size = [n // s for n, s in zip(size, unroll_schedule[0])] + size = [new_size // stride for new_size, stride in zip(size, unroll_schedule[0])] unroll_schedule = unroll_schedule[1:] - def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: + def forward(self, embeddings: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: """ Roll the given tensor back up to spatial order assuming it's from the given block. If no mask is provided: Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. If a mask is provided: - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + Returns [B, #MaskUnits, MaskUnitHeight, MaskUnitWidth, C] for 2d, etc. """ schedule, size = self.schedule[block_idx] - B, N, C = x.shape + batch_size, num_tokens, num_channels = embeddings.shape - D = len(size) - cur_mu_shape = [1] * D + num_dimensions = len(size) + current_mask_unit_shape = [1] * num_dimensions for strides in schedule: # Extract the current patch from N - x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + embeddings = embeddings.view(batch_size, *strides, num_tokens // math.prod(strides), *current_mask_unit_shape, num_channels) # Move that patch into the current MU # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] - L = len(x.shape) + shape_length = len(embeddings.shape) permute = ( - [0, 1 + D] + [0, 1 + num_dimensions] + sum( - [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [list(p) for p in zip(range(1, 1 + num_dimensions), range(1 + num_dimensions + 1, shape_length - 1))], [], ) - + [L - 1] + + [shape_length - 1] ) - x = x.permute(permute) + embeddings = embeddings.permute(permute) # Reshape to [B, N//(Sy*Sx), *MU, C] - for i in range(D): - cur_mu_shape[i] *= strides[i] - x = x.reshape(B, -1, *cur_mu_shape, C) - N = x.shape[1] + for i in range(num_dimensions): + current_mask_unit_shape[i] *= strides[i] + embeddings = embeddings.reshape(batch_size, -1, *current_mask_unit_shape, num_channels) + num_tokens = embeddings.shape[1] # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) - x = x.view(B, N, *cur_mu_shape, C) + embeddings = embeddings.view(batch_size, num_tokens, *current_mask_unit_shape, num_channels) # If masked, return [B, #MUs, MUy, MUx, C] if mask is not None: - return x + return embeddings # If not masked, we can return [B, H, W, C] - x = undo_windowing(x, size, cur_mu_shape) + embeddings = undo_windowing(embeddings, size, current_mask_unit_shape) - return x + return embeddings @dataclass 
@@ -357,18 +343,22 @@ class HieraModelOutput(ModelOutput): last_hidden_state: torch.FloatTensor intermediates: Optional[List[torch.Tensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + -class MaskUnitAttention(nn.Module): +class HieraMaskUnitAttention(nn.Module): """ Computes either Mask Unit or Global Attention. Also is able to perform q pooling. Note: this assumes the tokens have already been flattened and unrolled into mask units. - See `Unroll` for more details. + See `HieraUnroll` for more details. """ def __init__( self, + config: HieraConfig, input_dim: int, output_dim: int, number_of_heads: int, @@ -386,7 +376,7 @@ def __init__( use_mask_unit_attention: Use Mask Unit or Global Attention. """ super().__init__() - + self.config = config self.input_dim = input_dim self.output_dim = output_dim self.number_of_heads = number_of_heads @@ -414,33 +404,30 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: q, k, v = qkv[0], qkv[1], qkv[2] if self.q_stride > 1: - # Refer to Unroll to see how this performs a maxpool-Nd + # Refer to HieraUnroll to see how this performs a maxpool-Nd q = ( q.view(batch_size, self.number_of_heads, num_windows, self.q_stride, -1, self.head_dim) .max(dim=3) .values ) - if hasattr(F, "scaled_dot_product_attention"): - # Note: the original paper did *not* use SDPA, it's a free boost! - embeddings = F.scaled_dot_product_attention(q, k, v) - else: - attention = (q * self.scale) @ k.transpose(-1, -2) - attention = attention.softmax(dim=-1) - embeddings = attention @ v + + attention = (q * self.scale) @ k.transpose(-1, -2) + attention = attention.softmax(dim=-1) + embeddings = attention @ v embeddings = embeddings.transpose(1, 3).reshape(batch_size, -1, self.output_dim) embeddings = self.projection(embeddings) - return embeddings + return embeddings, attention class HieraBlock(nn.Module): def __init__( self, + config: HieraConfig, input_dim: int, output_dim: int, number_of_heads: int, - mlp_ratio: float = 4.0, drop_path: float = 0.0, norm_layer: nn.Module = nn.LayerNorm, act_layer: nn.Module = nn.GELU, @@ -449,19 +436,18 @@ def __init__( use_mask_unit_attention: bool = False, ): super().__init__() - + self.config = config self.input_dim = input_dim self.output_dim = output_dim - self.norm1 = norm_layer(input_dim) - self.attention = MaskUnitAttention( - input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention + self.attention = HieraMaskUnitAttention( + config, input_dim, output_dim, number_of_heads, q_stride, window_size, use_mask_unit_attention ) self.norm2 = norm_layer(output_dim) - self.mlp = Mlp(output_dim, int(output_dim * mlp_ratio), act_layer=act_layer) + self.mlp = HieraMlp(config, output_dim, act_layer=act_layer) - self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity() + self.drop_path = HieraDropPath(drop_path) if drop_path > 0 else nn.Identity() if input_dim != output_dim: self.projection = nn.Linear(input_dim, output_dim) @@ -470,24 +456,25 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor: normalized_embeddings = self.norm1(embeddings) if self.input_dim != self.output_dim: embeddings = do_pool(self.projection(normalized_embeddings), stride=self.attention.q_stride) - embeddings = embeddings + self.drop_path(self.attention(normalized_embeddings)) + attention_output , attention_weights = self.attention(normalized_embeddings) + embeddings = embeddings + self.drop_path(attention_output) # MLP embeddings 
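A shape-only sketch of the mask-unit attention above. The sizes mimic the first pooling block of the default 224x224 configuration (3136 tokens, 64-token mask units, `q_stride` of 4) and are assumptions for illustration, not values read from a checkpoint:

```python
import torch
import torch.nn as nn

batch_size, seq_len, input_dim, output_dim = 2, 3136, 96, 192
heads, q_stride, window_size = 2, 4, 16            # window_size = 64 // q_stride
head_dim = output_dim // heads

qkv_proj = nn.Linear(input_dim, 3 * output_dim)
tokens = torch.randn(batch_size, seq_len, input_dim)

num_windows = seq_len // (q_stride * window_size)  # 49 mask units attend independently
qkv = (
    qkv_proj(tokens)
    .reshape(batch_size, -1, num_windows, 3, heads, head_dim)
    .permute(3, 0, 4, 2, 1, 5)
)
q, k, v = qkv[0], qkv[1], qkv[2]

# Pool the queries by taking the max over each group of q_stride tokens,
# mirroring the maxpool applied by do_pool on the residual branch.
q = q.view(batch_size, heads, num_windows, q_stride, -1, head_dim).max(dim=3).values
attention = (q * head_dim**-0.5) @ k.transpose(-1, -2)
out = (attention.softmax(dim=-1) @ v).transpose(1, 3).reshape(batch_size, -1, output_dim)
print(out.shape)  # torch.Size([2, 784, 192]): seq_len // q_stride tokens, output_dim channels
```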
= embeddings + self.drop_path(self.mlp(self.norm2(embeddings))) - return embeddings + return embeddings, attention_weights -class Head(nn.Module): +class HieraHead(nn.Module): def __init__( self, + config: HieraConfig, input_dim: int, - num_classes: int, - dropout_rate: float = 0.0, act_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: x.softmax(dim=-1), ): super().__init__() - self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - self.projection = nn.Linear(input_dim, num_classes) + self.config = config + self.dropout = nn.Dropout(self.config.head_dropout) if self.config.head_dropout > 0 else nn.Identity() + self.projection = nn.Linear(input_dim, self.config.num_classes) # act_fun for eval and testing only self.act_func = act_func @@ -498,31 +485,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.act_func(x) return x - -@add_start_docstrings( - """ - Patch embedding that supports any number of spatial dimensions (1d, 2d, 3d). - """ -) -class PatchEmbedding(nn.Module): +class HieraPatchEmbedding(nn.Module): def __init__( self, - dim_in: int, - output_dim: int, - kernel: Tuple[int, ...], - stride: Tuple[int, ...], - padding: Tuple[int, ...], + config: HieraConfig, ): super().__init__() - + self.config = config # Support any number of spatial dimensions - self.spatial_dims = len(kernel) + self.spatial_dims = len(self.config.patch_kernel) self.projection = conv_nd(self.spatial_dims)( - dim_in, - output_dim, - kernel_size=kernel, - stride=stride, - padding=padding, + self.config.in_chans, + self.config.embedding_dimension, + kernel_size=self.config.patch_kernel, + stride=self.config.patch_stride, + padding=self.config.patch_padding, ) def forward(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: @@ -604,22 +581,19 @@ def __init__(self, config: HieraConfig): super().__init__(config) self.config = config - norm_layer = partial(nn.LayerNorm, eps=1e-6) # Example, adjust as needed + norm_layer = partial(nn.LayerNorm, eps=1e-6) depth = sum(self.stages) self.tokens_spatial_shape = [i // s for i, s in zip(self.input_size, self.patch_stride)] num_tokens = math.prod(self.tokens_spatial_shape) flat_mu_size = math.prod(self.mask_unit_size) flat_q_stride = math.prod(self.q_stride) - assert self.q_pool < len(self.stages) self.q_pool, self.q_stride = self.q_pool, self.q_stride self.mu_size, self.mask_unit_size = flat_mu_size, self.mask_unit_size self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, self.mask_unit_size)] self.stage_ends = [sum(self.stages[:i]) - 1 for i in range(1, len(self.stages) + 1)] - self.patch_embedding = PatchEmbedding( - self.in_chans, self.embedding_dimension, self.patch_kernel, self.patch_stride, self.patch_padding - ) + self.patch_embedding = HieraPatchEmbedding(config) if self.sep_position_embeddings: self.position_embeddings_spatial = nn.Parameter( @@ -636,14 +610,9 @@ def __init__(self, config: HieraConfig): self.position_embeddings = nn.Parameter(torch.zeros(1, num_tokens, self.embedding_dimension)) # Setup roll and reroll modules - self.unroll = Unroll(self.input_size, self.patch_stride, [self.q_stride] * len(self.stage_ends[:-1])) - self.reroll = Reroll( - self.input_size, - self.patch_stride, - [self.q_stride] * len(self.stage_ends[:-1]), - self.stage_ends, - self.q_pool, - ) + self.unroll = HieraUnroll(config) + self.reroll = HieraReroll(config) + # q_pool locations q_pool_blocks = [x + 1 for x in self.stage_ends[: self.q_pool]] # stochastic depth decay rule @@ 
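A quick sketch of the bookkeeping this constructor performs, using the documented default-style values as assumptions (the drop-path rate is non-default, set here only to make the schedule visible):

```python
import torch

input_size, patch_stride = (224, 224), (4, 4)
mask_unit_size, stages, q_pool = (8, 8), (2, 3, 16, 3), 3
drop_path_rate = 0.1

tokens_spatial_shape = [i // s for i, s in zip(input_size, patch_stride)]            # [56, 56]
mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, mask_unit_size)]  # [7, 7]
stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]                # [1, 4, 20, 23]
q_pool_blocks = [x + 1 for x in stage_ends[:q_pool]]                                 # [2, 5, 21]

# "stochastic depth decay rule": drop-path probability grows linearly with depth
depth = sum(stages)                                                                  # 24
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
print(tokens_spatial_shape, mask_spatial_shape, stage_ends, q_pool_blocks)
print([round(r, 3) for r in dpr[:4]], "...", round(dpr[-1], 3))
```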
-670,10 +639,10 @@ def __init__(self, config: HieraConfig): number_of_heads = self.number_of_heads block = HieraBlock( + config, input_dim=self.embedding_dimension, output_dim=output_dim, number_of_heads=number_of_heads, - mlp_ratio=self.mlp_ratio, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -685,7 +654,7 @@ def __init__(self, config: HieraConfig): self.blocks.append(block) self.norm = norm_layer(self.embedding_dimension) - self.head = Head(self.embedding_dimension, self.num_classes, dropout_rate=self.head_dropout) + self.head = HieraHead(config, self.embedding_dimension) # Initialize everything if self.sep_position_embeddings: @@ -752,7 +721,10 @@ def get_position_embeddings(self) -> torch.Tensor: mask should be a boolean tensor of shape [batch_size , #MUt*#MUy*#MUx] where #MU are the number of mask units in that input_dim. Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch. return_dict (`bool`, optional): Whether to return a dictionary of outputs or a plain tuple. - return_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + output_intermediates (`bool`, optional): Whether to return intermediate features from each stage of the model. + output_attentions (`bool`, optional): Whether to return attention weights + output_hidden_states(`bool`, optional): Whether to return Hidden States + """ ) def forward( @@ -760,12 +732,14 @@ def forward( pixel_values: torch.Tensor, mask: torch.Tensor = None, return_dict: Optional[bool] = True, - return_intermediates: bool = True, + output_intermediates: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + ) -> Union[Tuple[torch.Tensor], HieraModelOutput]: - # Slowfast training passes in a list - if isinstance(pixel_values, list): - pixel_values = pixel_values[0] - intermediates = [] + intermediates = [] if output_intermediates else None + attentions = [] if output_attentions else None + hidden_states = [] if output_hidden_states else None pached_embeddings = self.patch_embedding( pixel_values, @@ -776,6 +750,9 @@ def forward( embeddings = pached_embeddings + self.get_position_embeddings() embeddings = self.unroll(embeddings) + if output_hidden_states: + hidden_states.append(embeddings) + # Discard masked tokens if mask is not None: embeddings = embeddings[mask[..., None].tile(1, self.mu_size, embeddings.shape[2])].view( @@ -783,9 +760,14 @@ def forward( ) for i, block in enumerate(self.blocks): - embeddings = block(embeddings) + embeddings, attention = block(embeddings) + if output_attentions: + attentions.append(attention) + + if output_hidden_states: + hidden_states.append(embeddings) - if return_intermediates and i in self.stage_ends: + if output_intermediates and i in self.stage_ends: intermediates.append(self.reroll(embeddings, i, mask=mask)) if mask is None: @@ -798,9 +780,11 @@ def forward( # q_stride = (2, 2), not all unrolls were consumed, # intermediates[-1] is embeddings in spatial order if not return_dict: - return tuple(v for v in [embeddings, intermediates] if v is not None) + return tuple(v for v in [embeddings, intermediates, attention, hidden_states] if v is not None) return HieraModelOutput( last_hidden_state=embeddings, - intermediates=intermediates if return_intermediates else None, + intermediates=intermediates if output_intermediates else None, + attentions=attentions if output_attentions else None, + hidden_states=hidden_states if 
output_hidden_states else None, ) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 729d1de4247c..38d84d015220 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,68 +34,6 @@ import math -class HieraBlockTester: - def __init__( - self, - parent, - batch_size: int = 1, - input_dim: int = 96, - output_dim: int = 192, - number_of_heads: int = 2, - mlp_ratio: float = 4.0, - drop_path: float = 0.0, - q_stride: int = 4, - window_size: int = 16, - use_mask_unit_attention: bool = True, - num_patches: int = 3136, - ): - self.parent = parent - self.batch_size = batch_size - self.input_dim = input_dim - self.output_dim = output_dim - self.number_of_heads = number_of_heads - self.mlp_ratio = mlp_ratio - self.drop_path = drop_path - self.q_stride = q_stride - self.window_size = window_size - self.use_mask_unit_attention = use_mask_unit_attention - self.num_patches = num_patches - - def create_and_check_block(self): - block = HieraBlock( - input_dim=self.input_dim, - output_dim=self.output_dim, - number_of_heads=self.number_of_heads, - mlp_ratio=self.mlp_ratio, - drop_path=self.drop_path, - q_stride=self.q_stride, - window_size=self.window_size, - use_mask_unit_attention=self.use_mask_unit_attention, - ) - - x = torch.randn(self.batch_size, self.num_patches, self.input_dim) - out = block(x) - - expected_shape = (self.batch_size, self.num_patches // self.q_stride, self.output_dim) - self.parent.assertEqual(out.shape, expected_shape, "Output shape is incorrect") - - -@require_torch -class HieraBlockTest(unittest.TestCase): - def setUp(self): - self.block_tester = HieraBlockTester(self) - - def test_output_shape(self): - self.block_tester.create_and_check_block() - - def test_input_output_dim_equality(self): - self.block_tester.output_dim = self.block_tester.input_dim - self.block_tester.q_stride = 1 - self.block_tester.number_of_heads = 1 - self.block_tester.window_size = 64 - self.block_tester.create_and_check_block() - - class HieraModelTester: all_model_classes = (HieraModel,) if is_torch_available() else () @@ -298,6 +236,6 @@ def test_forward(self): out.last_hidden_state.argmax(dim=-1).item() - out = model(random_tensor, return_intermediates=True) + out = model(random_tensor, output_intermediates=True) for idx, x in enumerate(out.intermediates): self.assertEqual(x.shape, indermediate_shapes[idx], "Invalid Intermediate shape") From 3475b2d9361069e8caf775eff4a0be283667d980 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Thu, 7 Mar 2024 05:29:56 +0000 Subject: [PATCH 056/118] Removed Hiera MAE --- src/transformers/models/hiera/hiera_mae.py | 269 --------------------- 1 file changed, 269 deletions(-) delete mode 100644 src/transformers/models/hiera/hiera_mae.py diff --git a/src/transformers/models/hiera/hiera_mae.py b/src/transformers/models/hiera/hiera_mae.py deleted file mode 100644 index 7c42c22734a1..000000000000 --- a/src/transformers/models/hiera/hiera_mae.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
-# -------------------------------------------------------- -# References: -# mae: https://github.com/facebookresearch/mae -# slowfast: https://github.com/facebookresearch/SlowFast -# -------------------------------------------------------- - - -import math -from functools import partial -from typing import Optional, Tuple - -import torch -import torch.nn as nn - -from .modeling_hiera import HieraBlock, HieraModel, conv_nd, undo_windowing - - -def apply_fusion_head(head: nn.Module, x: torch.Tensor) -> torch.Tensor: - if isinstance(head, nn.Identity): - return x - - batch_size, num_mask_units = x.shape[0:2] - # Apply head, e.g [batch_size , #MUs, My, Mx, C] -> head([batch_size * #MUs, C, My, Mx]) - permute = [0] + [len(x.shape) - 2] + list(range(1, len(x.shape) - 2)) - x = head(x.reshape(batch_size * num_mask_units, *x.shape[2:]).permute(permute)) - - # Restore original layout, e.g. [batch_size * #MUs, C', My', Mx'] -> [batch_size , #MUs, My', Mx', C'] - permute = [0] + list(range(2, len(x.shape))) + [1] - x = x.permute(permute).reshape(batch_size, num_mask_units, *x.shape[2:], x.shape[1]) - return x - - -class MaskedAutoencoderHiera(HieraModel): - """Masked Autoencoder with HieraModel backbone""" - - def __init__( - self, - in_chans: int = 3, - patch_stride: Tuple[int, ...] = (4, 4), - mlp_ratio: float = 4.0, - decoder_embed_dim: int = 512, - decoder_depth: int = 8, - decoder_num_heads: int = 16, - norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), - **kwdargs, - ): - super().__init__( - in_chans=in_chans, - patch_stride=patch_stride, - mlp_ratio=mlp_ratio, - norm_layer=norm_layer, - **kwdargs, - ) - - del self.norm, self.head - encoder_dim_out = self.blocks[-1].dim_out - self.encoder_norm = norm_layer(encoder_dim_out) - self.mask_unit_spatial_shape_final = [ - i // s ** (self.q_pool) for i, s in zip(self.mask_unit_size, self.q_stride) - ] - self.tokens_spatial_shape_final = [ - i // s ** (self.q_pool) for i, s in zip(self.tokens_spatial_shape, self.q_stride) - ] - # -------------------------------------------------------------------------- - # Multi-scale fusion heads - curr_mu_size = self.mask_unit_size - self.multi_scale_fusion_heads = nn.ModuleList() - - for i in self.stage_ends[: self.q_pool]: # resolution constant after q_pool - kernel = [i // s for i, s in zip(curr_mu_size, self.mask_unit_spatial_shape_final)] - curr_mu_size = [i // s for i, s in zip(curr_mu_size, self.q_stride)] - self.multi_scale_fusion_heads.append( - conv_nd(len(self.q_stride))( - self.blocks[i].dim_out, - encoder_dim_out, - kernel_size=kernel, - stride=kernel, - ) - ) - self.multi_scale_fusion_heads.append(nn.Identity()) # final stage, no transform - - # -------------------------------------------------------------------------- - # MAE decoder specifics - self.decoder_embed = nn.Linear(encoder_dim_out, decoder_embed_dim) - - self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) - - self.decoder_pos_embed = nn.Parameter( - torch.zeros(1, math.prod(self.tokens_spatial_shape_final), decoder_embed_dim) - ) - - self.decoder_blocks = nn.ModuleList( - [ - HieraBlock( - dim=decoder_embed_dim, - dim_out=decoder_embed_dim, - heads=decoder_num_heads, - norm_layer=norm_layer, - mlp_ratio=mlp_ratio, - ) - for i in range(decoder_depth) - ] - ) - self.decoder_norm = norm_layer(decoder_embed_dim) - - self.pred_stride = patch_stride[-1] * (self.q_stride[-1] ** self.q_pool) # patch stride of prediction - - self.decoder_pred = nn.Linear( - decoder_embed_dim, - (self.pred_stride ** min(2, len(self.q_stride))) 
* in_chans, - ) # predictor - # -------------------------------------------------------------------------- - - self.initialize_weights() - - def initialize_weights(self): - nn.init.trunc_normal_(self.mask_token, std=0.02) - nn.init.trunc_normal_(self.decoder_pos_embed, std=0.02) - self.apply(self._mae_init_weights) - - # initialize patch_embed like nn.Linear (instead of nn.Conv2d) - w = self.patch_embed.projection.weight.data - nn.init.xavier_uniform_(w.view([w.shape[0], -1])) - - def _mae_init_weights(self, m: nn.Module): - if isinstance(m, nn.Linear): - nn.init.xavier_uniform_(m.weight) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def get_pixel_label_2d(self, input_img: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: - # mask (boolean tensor): True must correspond to *masked* - input_img = input_img.permute(0, 2, 3, 1) - - size = self.pred_stride - label = input_img.unfold(1, size, size).unfold(2, size, size) - label = label.flatten(1, 2).flatten(2) - label = label[mask] - if norm: - mean = label.mean(dim=-1, keepdim=True) - var = label.var(dim=-1, keepdim=True) - label = (label - mean) / (var + 1.0e-6) ** 0.5 - - return label - - def get_pixel_label_3d(self, input_vid: torch.Tensor, mask: torch.Tensor, norm: bool = True) -> torch.Tensor: - # mask (boolean tensor): True must correspond to *masked* - - # We use time strided loss, only take the first frame from each token - input_vid = input_vid[:, :, :: self.patch_stride[0], :, :] - - size = self.pred_stride - label = input_vid.unfold(3, size, size).unfold(4, size, size) - label = label.permute(0, 2, 3, 4, 5, 6, 1) # Different from 2d, mistake during training lol - label = label.flatten(1, 3).flatten(2) - label = label[mask] - - if norm: - mean = label.mean(dim=-1, keepdim=True) - var = label.var(dim=-1, keepdim=True) - label = (label - mean) / (var + 1.0e-6) ** 0.5 - - return label - - def forward_encoder( - self, x: torch.Tensor, mask_ratio: float, mask: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor]: - if mask is None: - mask = self.get_random_mask(x, mask_ratio) # [batch_size , #MUs_all] - - # Get multi-scale representations from encoder - _, intermediates = super().forward(x, mask, return_intermediates=True) - # Resolution unchanged after q_pool stages, so skip those features - intermediates = intermediates[: self.q_pool] + intermediates[-1:] - - # Multi-scale fusion - x = 0.0 - for head, interm_x in zip(self.multi_scale_fusion_heads, intermediates): - x += apply_fusion_head(head, interm_x) - - x = self.encoder_norm(x) - - return x, mask - - def forward_decoder(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - # Embed tokens - x = self.decoder_embed(x) - - # Combine visible and mask tokens - - # x: [batch_size , #MUs, *mask_unit_spatial_shape_final, encoder_dim_out] - # mask: [batch_size , #MUs_all] - x_dec = torch.zeros(*mask.shape, *x.shape[2:], device=x.device, dtype=x.dtype) - mask_tokens = self.mask_token.view((1,) * (len(mask.shape) + len(x.shape[2:-1])) + (-1,)) - mask = mask.reshape(mask.shape + (1,) * len(x.shape[2:])) - mask = mask.expand((-1,) * 2 + x.shape[2:]).bool() - x_dec[mask] = x.flatten() - x_dec = ~mask * mask_tokens + mask * x_dec - - # Get back spatial order - x = undo_windowing( - x_dec, - self.tokens_spatial_shape_final, - self.mask_unit_spatial_shape_final, - ) - mask = undo_windowing( - mask[..., 0:1], - 
self.tokens_spatial_shape_final, - self.mask_unit_spatial_shape_final, - ) - - # Flatten - x = x.reshape(x.shape[0], -1, x.shape[-1]) - mask = mask.view(x.shape[0], -1) - - # Add pos embed - x = x + self.decoder_pos_embed - - # Apply decoder blocks - for blk in self.decoder_blocks: - x = blk(x) - x = self.decoder_norm(x) - - # Predictor projection - x = self.decoder_pred(x) - - return x, mask - - def forward_loss( - self, x: torch.Tensor, pred: torch.Tensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Note: in mask, 0 is *visible*, 1 is *masked* - - x: e.g. [batch_size , 3, H, W] - pred: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - label: [batch_size * num_pred_tokens, num_pixels_in_pred_patch * in_chans] - """ - if len(self.q_stride) == 2: - label = self.get_pixel_label_2d(x, mask) - elif len(self.q_stride) == 3: - label = self.get_pixel_label_3d(x, mask) - else: - raise NotImplementedError - - pred = pred[mask] - loss = (pred - label) ** 2 - - return loss.mean(), pred, label - - def forward( - self, - x: torch.Tensor, - mask_ratio: float = 0.6, - mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - latent, mask = self.forward_encoder(x, mask_ratio, mask=mask) - pred, pred_mask = self.forward_decoder(latent, mask) # pred_mask is mask at resolution of *prediction* - - # Toggle mask, to generate labels for *masked* tokens - return *self.forward_loss(x, pred, ~pred_mask), mask From 5ba0aafba75958cebebfd463f6318bb0bf2cece7 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Fri, 15 Mar 2024 07:00:43 +0000 Subject: [PATCH 057/118] Added test and resolved bug --- .../models/hiera/convert_hiera_to_pytorch.py | 60 ++++++++++--------- .../models/hiera/modeling_hiera.py | 14 ++--- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_pytorch.py b/src/transformers/models/hiera/convert_hiera_to_pytorch.py index f4f82d59a3c9..f85f37dd04bf 100644 --- a/src/transformers/models/hiera/convert_hiera_to_pytorch.py +++ b/src/transformers/models/hiera/convert_hiera_to_pytorch.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,21 +14,17 @@ # limitations under the License. import argparse +from PIL import Image import torch # from transformers import HieraConfig, HieraModel from transformers import HieraConfig, HieraModel -from transformers.models.hiera.hiera_image_processor import HieraImageProcessor - +from transformers import BeitImageProcessor +from transformers.image_utils import PILImageResampling, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +import requests def rename_key(name): - # if "patch_embed.proj" in name: - # name = name.replace("patch_embed.proj", "patch_embed.projection") - # # elif "block.proj" in name: - # # name = name.replace("block.proj", "block.projection") - # elif "attn.proj" in name: - # name = name.replace("attn.proj", "attn.projection") if ".proj." 
in name: name = name.replace(".proj.", ".projection.") if "attn" in name: @@ -109,7 +105,7 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs) checkpoint = pretrained_models_links["hiera_small_224"]["mae_in1k_ft_in1k"] elif "hiera_base_224" in checkpoint_url: - config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3), **kwargs) + config = HieraConfig(embedding_dimension=96, number_of_heads=1, stages=(2, 3, 16, 3)) checkpoints = pretrained_models_links["hiera_base_224"] checkpoint = pretrained_models_links["hiera_base_224"]["mae_in1k_ft_in1k"] @@ -197,29 +193,39 @@ def convert_Hiera_checkpoint(checkpoint_url, pytorch_dump_folder_path, **kwargs) ): strict = False - model.load_state_dict(state_dict["model_state"], strict) - # model.load_state_dict(state_dict["model_state"], strict=strict) - + model.load_state_dict(state_dict["model_state"], strict=strict) + + + image_processor = BeitImageProcessor( + size = {"height":256,"width":256}, + do_rescale=True, + do_center_crop=True, + crop_size = {"height":224,"width":224}, + do_normalize=True, + do_reduce_labels=False, + do_resize=True, + image_std=IMAGENET_DEFAULT_STD, + image_mean=IMAGENET_DEFAULT_MEAN, + resample = PILImageResampling.BICUBIC) + + url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" + image = Image.open(requests.get(url, stream=True).raw) - image_processor = HieraImageProcessor(size=224) - inputs = image_processor.process_image(image_url=url) - - # forward pass - out = model(inputs[None, ...]) - - # 207: golden retriever (imagenet-1k) - out.last_hidden_state.argmax(dim=-1).item() - + processed_image = image_processor(images=image, return_tensors="pt") + model.load_state_dict(state_dict["model_state"], strict=strict) + expected_slice = torch.tensor( + [ 0.1825, 0.8655, 0.5779, 1.1550, 1.1025, 0.6381, 1.0288, -0.0624, 0.1455] + ) # If you also want intermediate feature maps - out = model(inputs[None, ...], return_intermediates=True) + out = model(processed_image.pixel_values) + out.last_hidden_state.argmax(dim=-1).item() + assert torch.allclose(out.last_hidden_state[0, :9], expected_slice, atol=1e-4) - for x in out.intermediates: - print(x.shape) print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path, push_to_hub=True, safe_serialization=False) - + model.save_pretrained(pytorch_dump_folder_path, safe_serialization=False) + if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 3cd0d21c56b8..6c8d6c93cf26 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -624,25 +624,21 @@ def __init__(self, config: HieraConfig): for i in range(depth): output_dim = self.embedding_dimension - # Mask unit or global attention. 
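The key remapping used by the conversion script earlier in this patch boils down to string rewrites applied to the original checkpoint keys before `load_state_dict`. A hedged sketch with an abbreviated, illustrative mapping table (the real script covers more patterns):

```python
import torch

def rename_key(name: str) -> str:
    # Abbreviated, illustrative mapping; the conversion script applies more rules.
    for old, new in ((".proj.", ".projection."), ("attn", "attention")):
        name = name.replace(old, new)
    return name

original_state_dict = {"blocks.0.attn.proj.weight": torch.zeros(96, 96)}
converted = {rename_key(key): value for key, value in original_state_dict.items()}
print(list(converted))  # ['blocks.0.attention.projection.weight']
```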
- # Lag by 1 block, so that global attention, - # applied post pooling on lower resolution use_mask_unit_attention = self.mask_unit_attn[cur_stage] if i - 1 in self.stage_ends: output_dim = int(self.embedding_dimension * self.dim_mul) - number_of_heads = int(self.number_of_heads * self.head_mul) + self.number_of_heads = int(self.number_of_heads * self.head_mul) # Update the class variable cur_stage += 1 if i in q_pool_blocks: flat_mu_size //= flat_q_stride - else: - number_of_heads = self.number_of_heads + block = HieraBlock( config, input_dim=self.embedding_dimension, output_dim=output_dim, - number_of_heads=number_of_heads, + number_of_heads=self.number_of_heads, drop_path=dpr[i], norm_layer=norm_layer, q_stride=(flat_q_stride if i in q_pool_blocks else 1), @@ -650,7 +646,7 @@ def __init__(self, config: HieraConfig): use_mask_unit_attention=use_mask_unit_attention, ) - self.embedding_dimension = output_dim + self.embedding_dimension = output_dim self.blocks.append(block) self.norm = norm_layer(self.embedding_dimension) @@ -787,4 +783,4 @@ def forward( intermediates=intermediates if output_intermediates else None, attentions=attentions if output_attentions else None, hidden_states=hidden_states if output_hidden_states else None, - ) + ) \ No newline at end of file From 3adb788e60956511582536bc8b411a9c622f175c Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Fri, 15 Mar 2024 07:01:48 +0000 Subject: [PATCH 058/118] fixed doc string --- .../models/hiera/configuration_hiera.py | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index dc4e7d554bee..885e647ef260 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,31 +40,26 @@ class HieraConfig(PretrainedConfig): Args: - input_size (Tuple[int, ...], optional, *optional*, defaults to `(224, 224)`): Dimensions of the input image (height, width). - in_chans (int, optional, *optional*, defaults to 3): Number of input channels. Defaults to 3. - embedding_dimension (int, optional, *optional*, defaults to 96): Dimension of the initial embedding. Defaults to 96. - number_of_heads (int, optional, *optional*, defaults to 1): Initial number of attention heads. Defaults to 1. - num_classes (int, optional, *optional*, defaults to 1000): Number of output classes. Defaults to 1000. - stages (Tuple[int, ...], optional, *optional*, defaults to `(2, 3, 16, 3)`): Defines the number of blocks at each stage of the model. - q_pool (int, optional, *optional*, defaults to 3): Number of pooling stages for queries. Defaults to 3. - q_stride (Tuple[int, ...], optional, *optional*, defaults to `(2, 2)`): Stride size for pooling. Defaults to (2, 2). - mask_unit_size (Tuple[int, ...], optional, *optional*, defaults to `(8, 8)`): Dimensions for the mask unit. Must be compatible with q_stride. - mask_unit_attn (Tuple[bool, ...], optional, *optional*, defaults to `(True, True, False, False)`): Specifies which stages use mask unit attention. Defaults to (True, True, False, False). - dim_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the dimensionality through the network. 
Defaults to 2.0. - head_mul (float, optional, *optional*, defaults to 2.0): Factor for increasing the number of heads through the network. Defaults to 2.0. - patch_kernel (Tuple[int, ...], optional, *optional*, defaults to `(7, 7)`): Kernel size for patch embedding. Defaults to (7, 7). - patch_stride (Tuple[int, ...], optional, *optional*, defaults to `(4, 4)`): Stride for patch embedding. Defaults to (4, 4). - patch_padding (Tuple[int, ...], optional, *optional*, defaults to `(3, 3)`): Padding for patch embedding. Defaults to (3, 3). - mlp_ratio (float, optional, *optional*, defaults to 4.0): Ratio of hidden size to feed-forward layer size. Defaults to 4.0. - drop_path_rate (float, optional, *optional*, defaults to 0.0): Dropout rate for stochastic depth. Defaults to 0.0. - head_dropout (float, optional, *optional*, defaults to 0.0): Dropout rate for attention heads. Defaults to 0.0. - head_init_scale (float, optional, *optional*, defaults to 0.001): Initial scaling factor for attention head weights. Defaults to 0.001. - sep_position_embeddings (bool, optional, *optional*, defaults to `False`): Whether to use separate position embeddings. Defaults to False. - - - - - + input_size (Tuple[int, int] or int, , defaults to `(224, 224)`): Dimensions of the input image (height, width). + in_chans (int, optional, , defaults to 3): Number of input channels. + embedding_dimension (int, optional, defaults to 96): Dimension of the initial embedding. + number_of_heads (int, optional, defaults to 1): Initial number of attention heads. + num_classes (int, optional, , defaults to 1000): Number of output classes. + stages (Tuple[int, ...], optional, , defaults to `(2, 3, 16, 3)`): Defines the number of blocks at each stage of the model. + q_pool (int, optional, , defaults to 3): Number of pooling stages for queries. . + q_stride (Tuple[int, ...], optional, , defaults to `(2, 2)`): Stride size for pooling. + mask_unit_size (Tuple[int, ...], optional, , defaults to `(8, 8)`): Dimensions for the mask unit. Must be compatible with q_stride. + mask_unit_attn (Tuple[bool, ...], optional, , defaults to `(True, True, False, False)`): Specifies which stages use mask unit attention. + dim_mul (float, optional, , defaults to 2.0): Factor for increasing the dimensionality through the network. + head_mul (float, optional, , defaults to 2.0): Factor for increasing the number of heads through the network. + patch_kernel (Tuple[int, ...], optional, , defaults to `(7, 7)`): Kernel size for patch embedding. + patch_stride (Tuple[int, ...], optional, , defaults to `(4, 4)`): Stride for patch embedding. + patch_padding (Tuple[int, ...], optional, , defaults to `(3, 3)`): Padding for patch embedding. + mlp_ratio (float, optional, , defaults to 4.0): Ratio of hidden size to feed-forward layer size. + drop_path_rate (float, optional, , defaults to 0.0): Dropout rate for stochastic depth. + head_dropout (float, optional, , defaults to 0.0): Dropout rate for attention heads. + head_init_scale (float, optional, , defaults to 0.001): Initial scaling factor for attention head weights. + sep_position_embeddings (bool, optional, , defaults to `False`): Whether to use separate position embeddings. 
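A small illustration of how the width-related arguments interact: each new stage multiplies the embedding dimension by `dim_mul` and the number of heads by `head_mul`. The values below are the documented defaults, and the cumulative doubling mirrors what the model constructor computes for this configuration:

```python
embedding_dimension, number_of_heads = 96, 1
dim_mul, head_mul, stages = 2.0, 2.0, (2, 3, 16, 3)

dims, heads = [], []
for stage_index, _ in enumerate(stages):
    dims.append(int(embedding_dimension * dim_mul**stage_index))
    heads.append(int(number_of_heads * head_mul**stage_index))
print(dims)   # [96, 192, 384, 768]
print(heads)  # [1, 2, 4, 8]
```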
""" model_type = "hiera" From c69df922789adfa5a872d0088dc4aca41b69aa1f Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Thu, 28 Mar 2024 20:41:25 +0100 Subject: [PATCH 059/118] First commit --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_vi.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/hiera.md | 56 + docs/source/en/tasks/image_classification.md | 2 +- src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/hiera/__init__.py | 59 + .../models/hiera/configuration_hiera.py | 175 +++ .../models/hiera/convert_hiera_to_hf.py | 332 ++++++ .../models/hiera/modeling_hiera.py | 1043 +++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 31 + tests/models/hiera/__init__.py | 0 tests/models/hiera/test_modeling_hiera.py | 317 +++++ 30 files changed, 2056 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/model_doc/hiera.md create mode 100644 src/transformers/models/hiera/__init__.py create mode 100644 src/transformers/models/hiera/configuration_hiera.py create mode 100644 src/transformers/models/hiera/convert_hiera_to_hf.py create mode 100644 src/transformers/models/hiera/modeling_hiera.py create mode 100644 tests/models/hiera/__init__.py create mode 100644 tests/models/hiera/test_modeling_hiera.py diff --git a/README.md b/README.md index 4a3b78756716..783a503237a8 100644 --- a/README.md +++ b/README.md @@ -391,6 +391,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. 
**[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_de.md b/README_de.md index 5c3fa28ccba8..59d347f3438b 100644 --- a/README_de.md +++ b/README_de.md @@ -387,6 +387,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_es.md b/README_es.md index 9a6ea777a790..3c6ef0abe280 100644 --- a/README_es.md +++ b/README_es.md @@ -364,6 +364,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. 
**[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_fr.md b/README_fr.md index 7f7fe2343e27..f8d9cd5f6b3a 100644 --- a/README_fr.md +++ b/README_fr.md @@ -385,6 +385,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_hd.md b/README_hd.md index 12df2d0740c9..90f7145d3811 100644 --- a/README_hd.md +++ b/README_hd.md @@ -338,6 +338,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 
diff --git a/README_ja.md b/README_ja.md index 78cd7b0474be..e053b3409e03 100644 --- a/README_ja.md +++ b/README_ja.md @@ -398,6 +398,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_ko.md b/README_ko.md index 1798760d86e9..d9f6577d154e 100644 --- a/README_ko.md +++ b/README_ko.md @@ -313,6 +313,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_pt-br.md b/README_pt-br.md index 899acaf7f1c4..68bd03da9e13 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -396,6 +396,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 
diff --git a/README_ru.md b/README_ru.md index fdb647996556..ef61a742e51e 100644 --- a/README_ru.md +++ b/README_ru.md @@ -386,6 +386,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_te.md b/README_te.md index 8906438d1fb0..711f016548da 100644 --- a/README_te.md +++ b/README_te.md @@ -388,6 +388,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. 
**[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_vi.md b/README_vi.md index 5aabe6ccc353..60f4201fc385 100644 --- a/README_vi.md +++ b/README_vi.md @@ -387,6 +387,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (từ Microsoft) được phát hành với bài báo [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (từ UCSD, NVIDIA) được phát hành với bài báo [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (từ Allegro.pl, AGH University of Science and Technology) được phát hành với bài báo [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (từ Facebook) được phát hành với bài báo [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (từ Berkeley) được phát hành với bài báo [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (từ HuggingFace) được phát hành với bài báo [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. 
Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hans.md b/README_zh-hans.md index ca3d42eb00b9..0a341a1ffc5f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -337,6 +337,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 +1. **[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/README_zh-hant.md b/README_zh-hant.md index 78278a76a289..c6a1e4075d7c 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -349,6 +349,7 @@ conda install conda-forge::transformers 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. 
**[Hiera](https://huggingface.co/docs/transformers/main/model_doc/hiera)** (from ) released with the paper []() by . 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 92ee8eeda447..8508e693916c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -565,6 +565,8 @@ title: FocalNet - local: model_doc/glpn title: GLPN + - local: model_doc/hiera + title: Hiera - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ffa9ae3f4b0b..d52b5a288cc6 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -156,6 +156,7 @@ Flax), PyTorch, and/or TensorFlow. | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | +| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md new file mode 100644 index 000000000000..233f63b29759 --- /dev/null +++ b/docs/source/en/model_doc/hiera.md @@ -0,0 +1,56 @@ + + +# Hiera + +## Overview + +The Hiera model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
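
As a usage sketch (not part of this patch), image classification with the classes added here would look roughly as follows. The checkpoint name `EduardoPacheco/hiera-tiny-224-in1k` is an assumption that mirrors the naming used by the conversion script in this PR, and `AutoImageProcessor` is assumed to resolve to the image processor saved alongside the converted weights:

```python
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, HieraForImageClassification

# Hypothetical checkpoint name, following the push_to_hub naming in the conversion script below.
checkpoint = "EduardoPacheco/hiera-tiny-224-in1k"

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = HieraForImageClassification.from_pretrained(checkpoint)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_idx = logits.argmax(-1).item()
print(model.config.id2label[predicted_class_idx])
```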
+ + +## HieraConfig + +[[autodoc]] HieraConfig + +## HieraModel + +[[autodoc]] HieraModel + - forward + +## HieraForMaskedImageModeling + +[[autodoc]] HieraForMaskedImageModeling + - forward + +## HieraForImageClassification + +[[autodoc]] HieraForImageClassification + - forward + + + diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 22a568f5e446..3f0eee3d5ff8 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -34,7 +34,7 @@ The task illustrated in this tutorial is supported by the following model archit -[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [CLIP](../model_doc/clip), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [PVTv2](../model_doc/pvt_v2), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SigLIP](../model_doc/siglip), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) +[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [CLIP](../model_doc/clip), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [Hiera](../model_doc/hiera), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [PVTv2](../model_doc/pvt_v2), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SigLIP](../model_doc/siglip), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index da29d77972f4..3d3bb6fcd35e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -499,6 +499,7 @@ "GroupViTVisionConfig", ], "models.herbert": ["HerbertTokenizer"], + "models.hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig"], "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"], "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"], 
"models.idefics": [ @@ -2399,6 +2400,15 @@ "GroupViTVisionModel", ] ) + _import_structure["models.hiera"].extend( + [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraForImageClassification", + "HieraForMaskedImageModeling", + "HieraModel", + "HieraPreTrainedModel", + ] + ) _import_structure["models.hubert"].extend( [ "HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5383,6 +5393,7 @@ GroupViTVisionConfig, ) from .models.herbert import HerbertTokenizer + from .models.hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig from .models.idefics import ( @@ -7110,6 +7121,13 @@ GroupViTTextModel, GroupViTVisionModel, ) + from .models.hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraForImageClassification, + HieraForMaskedImageModeling, + HieraModel, + HieraPreTrainedModel, + ) from .models.hubert import ( HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, HubertForCTC, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0599d3b876e6..5d866b2d51b4 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -107,6 +107,7 @@ graphormer, groupvit, herbert, + hiera, hubert, ibert, idefics, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index bf46066002fe..95bfa104ecb6 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -121,6 +121,7 @@ ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), ("groupvit", "GroupViTConfig"), + ("hiera", "HieraConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), @@ -384,6 +385,7 @@ ("graphormer", "Graphormer"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), + ("hiera", "Hiera"), ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index f8cb55091b02..86992edf49c0 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -60,6 +60,7 @@ ("flava", "FlavaFeatureExtractor"), ("glpn", "GLPNFeatureExtractor"), ("groupvit", "CLIPFeatureExtractor"), + ("hiera", "HieraFeatureExtractor"), ("hubert", "Wav2Vec2FeatureExtractor"), ("imagegpt", "ImageGPTFeatureExtractor"), ("layoutlmv2", "LayoutLMv2FeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 3debf97fea20..971f368c900f 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -69,6 +69,7 @@ ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), ("groupvit", "CLIPImageProcessor"), + ("hiera", "HieraImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 150dea04f374..2a08f4ea2c81 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -116,6 +116,7 @@ ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), ("groupvit", 
"GroupViTModel"), + ("hiera", "HieraModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), @@ -548,6 +549,7 @@ [ ("deit", "DeiTForMaskedImageModeling"), ("focalnet", "FocalNetForMaskedImageModeling"), + ("hiera", "HieraForMaskedImageModeling"), ("swin", "SwinForMaskedImageModeling"), ("swinv2", "Swinv2ForMaskedImageModeling"), ("vit", "ViTForMaskedImageModeling"), @@ -587,6 +589,7 @@ ), ("efficientnet", "EfficientNetForImageClassification"), ("focalnet", "FocalNetForImageClassification"), + ("hiera", "HieraForImageClassification"), ("imagegpt", "ImageGPTForImageClassification"), ( "levit", diff --git a/src/transformers/models/hiera/__init__.py b/src/transformers/models/hiera/__init__.py new file mode 100644 index 000000000000..fb05b30adcb1 --- /dev/null +++ b/src/transformers/models/hiera/__init__.py @@ -0,0 +1,59 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = {"configuration_hiera": ["HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP", "HieraConfig", "HieraOnnxConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_hiera"] = [ + "HIERA_PRETRAINED_MODEL_ARCHIVE_LIST", + "HieraForImageClassification", + "HieraForMaskedImageModeling", + "HieraModel", + "HieraPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_hiera import HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP, HieraConfig, HieraOnnxConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_hiera import ( + HIERA_PRETRAINED_MODEL_ARCHIVE_LIST, + HieraForImageClassification, + HieraForMaskedImageModeling, + HieraModel, + HieraPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py new file mode 100644 index 000000000000..25e309ef1fe8 --- /dev/null +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -0,0 +1,175 @@ +# coding=utf-8 +# Copyright 2024 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Hiera model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "EduardoPacheco/hiera-tiny-224": "https://huggingface.co/EduardoPacheco/hiera-tiny-224/resolve/main/config.json", +} + + +class HieraConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate an Hiera + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Hiera + [google/hiera-base-patch16-224](https://huggingface.co/google/hiera-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + embed_dim (`int`, *optional*, defaults to 96): + Dimensionality of patch embedding. + input_size (`tuple(int)`, *optional*, defaults to `(224, 224)`): + The size (resolution) of input in the format (height, width) for images + and (frames, height, width) for videos. + patch_kernel (`tuple(int)`, *optional*, defaults to `(7, 7)`): + The size (resolution) of each patch. + patch_stride (`tuple(int)`, *optional*, defaults to `(4, 4)`): + The stride of the patch. + patch_padding (`tuple(int)`, *optional*, defaults to `(3, 3)`): + The padding of the patch. + mlp_ratio (`float`, *optional*, defaults to 4.0): + The ratio of mlp hidden dim to embedding dim. + depths (`tuple(int)`, *optional*, defaults to `[2, 3, 16, 3]`): + Depth of each layer in the Transformer encoder. + initial_num_heads (`int`, *optional*, defaults to 1): + Initial number of attention heads in the first layer of the Transformer encoder. + num_head_multiplier (`float`, *optional*, defaults to 2.0): + The multiplier to the number of attention heads in each layer of the Transformer encoder. + embed_dim_multiplier (`float`, *optional*, defaults to 2.0): + The multiplier to the dimensionality of patch embedding in each layer of the Transformer encoder. + num_query_pool (`int`, *optional*, defaults to 3): + The number of query pool stages. + query_stride (`tuple(int)`, *optional*, defaults to `(2, 2)`): + The stride of the query pool. + masked_unit_size (`tuple(int)`, *optional*, defaults to `(8, 8)`): + The size of the masked unit. + masked_unit_attention (`list(bool)`, *optional*, defaults to `[True, True, False, False]`): + Whether to use masked unit attention in each layer of the Transformer encoder. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop path rate. + sep_pos_embed (`bool`, *optional*, defaults to `False`): + Whether to use separate position embedding for temporal and spatial dimensions. Must be `True` for videos. + and `False` for images. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices and + the zero_initializer for initializing all bias vectors. + layer_norm_init (`float`, *optional*, defaults to 1.0): + The initial weight value for layer normalization layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import HieraConfig, HieraModel + + >>> # Initializing a Hiera hiera-base-patch16-224 style configuration + >>> configuration = HieraConfig() + + >>> # Initializing a model (with random weights) from the hiera-base-patch16-224 style configuration + >>> model = HieraModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "hiera" + + def __init__( + self, + embed_dim=96, + input_size=(224, 224), + patch_kernel=(7, 7), + patch_stride=(4, 4), + patch_padding=(3, 3), + mlp_ratio=4.0, + depths=[2, 3, 16, 3], + initial_num_heads=1, + num_head_multiplier=2.0, + embed_dim_multiplier=2.0, + num_query_pool=3, + query_stride=(2, 2), + masked_unit_size=(8, 8), + masked_unit_attention=[True, True, False, False], + drop_path_rate=0.0, + sep_pos_embed=False, + num_channels=3, + hidden_act="gelu", + initializer_range=0.02, + layer_norm_init=1.0, + layer_norm_eps=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + + self.embed_dim = embed_dim + self.input_size = input_size + self.patch_kernel = patch_kernel + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.depths = depths + self.initial_num_heads = initial_num_heads + self.num_head_multiplier = num_head_multiplier + self.embed_dim_multiplier = embed_dim_multiplier + self.num_query_pool = num_query_pool + self.query_stride = query_stride + self.masked_unit_size = masked_unit_size + self.masked_unit_attention = masked_unit_attention + self.drop_path_rate = drop_path_rate + self.sep_pos_embed = sep_pos_embed + self.num_channels = num_channels + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_init = layer_norm_init + self.layer_norm_eps = layer_norm_eps + + self.hidden_size = embed_dim + + +class HieraOnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py new file mode 100644 index 000000000000..e36725baf84f --- /dev/null +++ b/src/transformers/models/hiera/convert_hiera_to_hf.py @@ -0,0 +1,332 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert Hiera checkpoints trained with the DINO method.""" + + +import argparse +from dataclasses import dataclass + +import requests +import torch +from PIL import Image +from torchvision import transforms + +from transformers import BeitImageProcessor, HieraConfig, HieraForImageClassification, HieraModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + # fmt: off + num_stages = len(config.depths) + # embedding dimensions for input and stages + dims = [config.embed_dim] + [int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(num_stages)] + + global_layer_idx = 0 + for stage_idx in range(num_stages): + dim_in = dims[stage_idx] + dim_out = dims[stage_idx + 1] + for layer_idx in range(config.depths[stage_idx]): + rename_keys.append((f"blocks.{global_layer_idx}.norm1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.norm1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.norm2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.norm2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.bias")) + + # projection layer only for the first layer of each stage boundary (except the first stage) + if dim_out != dim_in and layer_idx == 0: + rename_keys.append((f"blocks.{global_layer_idx}.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.weight")) + rename_keys.append((f"blocks.{global_layer_idx}.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.bias")) + + global_layer_idx += 1 + + # projection layer + position embeddings + rename_keys.extend( + [ + ("patch_embed.proj.weight", "hiera.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "hiera.embeddings.patch_embeddings.projection.bias") + ] + ) + + if config.sep_pos_embed: + rename_keys.extend( + [ + ("pos_embed_spatial", "hiera.embeddings.position_embeddings_spatial"), + ("pos_embed_temporal", "hiera.embeddings.position_embeddings_temporal") + ] + ) + else: + 
rename_keys.append(("pos_embed", "hiera.embeddings.position_embeddings")) + + if base_model: + # layernorm + pooler + rename_keys.extend([("norm.weight", "layernorm.weight"), ("norm.bias", "layernorm.bias")]) + + # if just the base model, we should remove "hiera" from all keys that start with "hiera" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("hiera") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "hiera.layernorm.weight"), + ("norm.bias", "hiera.layernorm.bias"), + ("head.projection.weight", "classifier.weight"), + ("head.projection.bias", "classifier.bias"), + ] + ) + # fmt: on + return rename_keys + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.projection.weight", "head.projection.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@dataclass +class HieraInfo: + base_checkpoint_url: str + checkpoint_url: str + config: HieraConfig + + +def get_hiera_config(model_name: str, base_model: bool) -> HieraInfo: + kwargs = {} if base_model else {"num_labels": 400 if model_name.endswith("16x224") else 1000} + + if model_name == "hiera-tiny-224": + config = HieraConfig(depths=[1, 2, 7, 2], **kwargs) + elif model_name == "hiera-small-224": + HieraConfig(depths=[1, 2, 11, 2], **kwargs) + elif model_name == "hiera-base-224": + config = HieraConfig(**kwargs) + elif model_name == "hiera-base-plus-224": + config = HieraConfig(embed_dim=112, initial_num_heads=2, **kwargs) + elif model_name == "hiera-large-224": + config = HieraConfig(embed_dim=144, initial_num_heads=2, depths=[2, 6, 36, 4], **kwargs) + elif model_name == "hiera-huge-224": + config = HieraConfig(embed_dim=256, initial_num_heads=4, depths=[2, 6, 36, 4], **kwargs) + elif model_name == "hiera-base-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + **kwargs, + ) + elif model_name == "hiera-base-plus-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + embed_dim=112, + initial_num_heads=2, + **kwargs, + ) + elif model_name == "hiera-large-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + embed_dim=144, + initial_num_heads=2, + depths=[2, 6, 36, 4], + **kwargs, + ) + elif model_name == "hiera-huge-16x224": + config = HieraConfig( + input_size=(16, 224, 224), + query_stride=(1, 2, 2), + masked_unit_size=(1, 8, 8), + patch_kernel=(3, 7, 7), + patch_stride=(2, 4, 4), + patch_padding=(1, 3, 3), + sep_pos_embed=True, + embed_dim=256, + initial_num_heads=4, + depths=[2, 6, 36, 4], + **kwargs, + ) + else: + raise ValueError(f"Unrecognized model name: {model_name}") + + return config + + +@torch.no_grad() +def convert_hiera_checkpoint(args): + model_name = args.model_name + base_model = args.base_model + pytorch_dump_folder_path = 
args.pytorch_dump_folder_path + verify_logits = args.verify_logits + push_to_hub = args.push_to_hub + IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] + IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] + + config = get_hiera_config(model_name, base_model) + + # Load original hiera model + original_model = torch.hub.load( + "facebookresearch/hiera", + model=model_name.replace("-", "_"), + pretrained=True, + checkpoint="mae_in1k_ft_in1k" if not base_model else "mae_in1k", + ) + + original_model.eval() + original_state_dict = original_model.state_dict() + if base_model: + remove_classification_head_(original_state_dict) + + # # Rename keys + new_state_dict = original_state_dict.copy() + rename_keys = create_rename_keys(config, base_model) + + for src, dest in rename_keys: + rename_key(new_state_dict, src, dest) + + # Load HF hiera model + model = HieraModel(config) if base_model else HieraForImageClassification(config) + model.eval() + + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) + + input_image = prepare_img() + + if model_name.endswith("16x224"): + original_image_preprocessor = None + else: + original_image_preprocessor = transforms.Compose( + [ + transforms.Resize( + int((256 / 224) * 224), interpolation=transforms.functional.InterpolationMode.BICUBIC + ), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ] + ) + + image_processor = BeitImageProcessor( + image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"height": 224, "width": 224} + ) + inputs = image_processor(images=input_image, return_tensors="pt") + + expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) + + assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) + + outputs = model(**inputs) + # original implementation returns logits.softmax(dim=-1) + expected_prob = original_model(input_image) + + if verify_logits and not base_model: + output_prob = outputs.logits.softmax(dim=-1) + assert torch.allclose(output_prob, expected_prob, atol=1e-4) + print("Looks good!") + else: + print("Converted without verifying logits") + + if pytorch_dump_folder_path is not None: + print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print(f"Pushing model and processor for {model_name} to hub") + hub_name = model_name + if not base_model: + hub_name = f"{model_name}-k400" if model_name.endswith("16x224") else f"{model_name}-in1k" + model.push_to_hub(f"EduardoPacheco/{hub_name}") + image_processor.push_to_hub(f"EduardoPacheco/{hub_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="hiera-tiny-224", + type=str, + choices=[ + "hiera-tiny-224", + "hiera-small-224", + "hiera-base-224", + "hiera-base-plus-224", + "hiera-large-224", + "hiera-huge-224", + "hiera-base-16x224", + "hiera-base-plus-16x224", + "hiera-large-16x224", + "hiera-huge-16x224", + ], + help="Name of the Hiera model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
+ ) + parser.add_argument( + "--verify_logits", + action="store_true", + help="Whether or not to verify the logits against the original implementation.", + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + parser.add_argument( + "--base_model", + action="store_true", + help="Whether to only convert the base model (no projection head weights).", + ) + + args = parser.parse_args() + convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py new file mode 100644 index 000000000000..94740cd64a0b --- /dev/null +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -0,0 +1,1043 @@ +# coding=utf-8 +# Copyright 2024 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Hiera model.""" + + +import math +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ImageClassifierOutput, + MaskedImageModelingOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_hiera import HieraConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "HieraConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "EduardoPacheco/hiera-tiny-224" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/hiera-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "EduardoPacheco/hiera-tiny-224", + # See all Hiera models at https://huggingface.co/models?filter=hiera +] + + +# Taken from https://github.com/facebookresearch/hiera/blob/main/hiera/hiera_utils.py#L73 +def conv_nd(n: int) -> nn.Module: + """ + Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3. + If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises) + """ + return [nn.Identity, nn.Conv1d, nn.Conv2d, nn.Conv3d][n] + + +# Taken from https://github.com/facebookresearch/hiera/blob/main/hiera/hiera_utils.py#L81 +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +class HieraPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
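+
+    As a rough shape sketch (illustrative only, assuming the 2d defaults
+    `patch_kernel=(7, 7)`, `patch_stride=(4, 4)` and `patch_padding=(3, 3)`):
+
+    ```python
+    >>> height = width = 224
+    >>> kernel, stride, padding = 7, 4, 3
+    >>> tokens_per_side = (height + 2 * padding - kernel) // stride + 1
+    >>> tokens_per_side, tokens_per_side**2  # grid side and seq_length before pooling
+    (56, 3136)
+    ```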
+ """ + + def __init__(self, config): + super().__init__() + + # Support any number of spatial dimensions + self.spatial_dims = len(config.patch_kernel) + if self.spatial_dims not in (2, 3): + raise ValueError( + f"The number of dimensions of the input image should be 2 or 3, but got {self.spatial_dims}." + ) + self.num_channels = config.num_channels + self.image_size = config.input_size + + self.projection = conv_nd(self.spatial_dims)( + self.num_channels, + config.hidden_size, + kernel_size=config.patch_kernel, + stride=config.patch_stride, + padding=config.patch_padding, + ) + + def masked_conv(self, pixel_values: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if mask is None: + return self.projection(pixel_values) + + target_size = pixel_values.shape[2:] + + if len(mask.shape[2:]) != len(target_size): + raise ValueError( + f"The length of the spatial dimensions of the mask should match the one from input image, but got {len(mask.shape[2:])} and {len(target_size)}." + ) + + if mask.shape[2:] != target_size: + mask = nn.functional.interpolate(mask.float(), size=target_size) + + return self.projection(pixel_values * mask.bool()) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + _, num_channels, _, _ = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + + embeddings = self.masked_conv(pixel_values, bool_masked_pos) + embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1).transpose(2, 1) + + return embeddings + + +class HieraEmbeddings(nn.Module): + """ + Construct position and patch embeddings. 
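+
+    When `config.sep_pos_embed` is `True` (used by the video variants), the position
+    embedding is kept as two factorized tables, a spatial one of shape
+    `(1, height_tokens * width_tokens, hidden_size)` and a temporal one of shape
+    `(1, time_tokens, hidden_size)`; `get_position_embedding` broadcasts and sums them
+    back into a single `(1, num_tokens, hidden_size)` tensor.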
+ """ + + def __init__(self, config: HieraConfig, use_mask_token: bool = False) -> None: + super().__init__() + + self.tokens_spatial_shape = [i // s for i, s in zip(config.input_size, config.patch_stride)] + self.num_tokens = math.prod(self.tokens_spatial_shape) + self.sep_pos_embed = config.sep_pos_embed + self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, config.masked_unit_size)] + + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None + + self.patch_embeddings = HieraPatchEmbeddings(config) + + if self.sep_pos_embed: + self.position_embeddings_spatial = nn.Parameter( + torch.zeros( + 1, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + config.hidden_size, + ) + ) + self.position_embeddings_temporal = nn.Parameter( + torch.zeros(1, self.tokens_spatial_shape[0], config.hidden_size) + ) + else: + self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_tokens, config.hidden_size)) + + def get_position_embedding(self) -> torch.Tensor: + if self.sep_pos_embed: + return self.position_embeddings_spatial.repeat( + 1, self.tokens_spatial_shape[0], 1 + ) + torch.repeat_interleave( + self.position_embeddings_temporal, + self.tokens_spatial_shape[1] * self.tokens_spatial_shape[2], + dim=1, + ) + else: + return self.position_embeddings + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: + if len(self.mask_spatial_shape) == 2: + batch_size, num_channels, height, width = pixel_values.shape + else: + batch_size, num_channels, depth, height, width = pixel_values.shape + + if bool_masked_pos is not None: + bool_masked_pos = bool_masked_pos.view(batch_size, 1, *self.mask_spatial_shape) + + embeddings = self.patch_embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + embeddings = embeddings + self.get_position_embedding() + + return embeddings + + +class HieraMaskUnitAttention(nn.Module): + """ + Computes either Mask Unit or Global Attention. Also is able to perform q pooling. + + Note: this assumes the tokens have already been flattened and unrolled into mask units. 
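+
+    When `use_mask_unit_attn` is `True`, the sequence is split into
+    `seq_len // (query_stride * window_size)` independent windows and attention is
+    computed inside each window; otherwise a single window spans the whole sequence
+    (global attention).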
+ """ + + def __init__( + self, + dim: int, + dim_out: int, + num_heads: int, + query_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.num_heads = num_heads + self.query_stride = query_stride + + self.head_dim = dim_out // num_heads + self.scale = (self.head_dim) ** -0.5 + + self.qkv = nn.Linear(dim, 3 * dim_out) + self.proj = nn.Linear(dim_out, dim_out) + + self.window_size = window_size + self.use_mask_unit_attn = use_mask_unit_attn + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + """Input should be of shape [batch, tokens, channels].""" + batch_size, seq_len, _ = hidden_states.shape + + num_windows = 1 + if self.use_mask_unit_attn: + num_windows = seq_len // (self.q_stride * self.window_size) + + qkv = self.qkv(hidden_states) + qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim) + qkv = qkv.permute(3, 0, 4, 2, 1, 5) + + query, key, value = qkv.unbind(0) + + if self.query_stride > 1: + # Refer to Unroll to see how this performs a maxpool-Nd + query = query.view(batch_size, self.num_heads, num_windows, self.query_stride, -1, self.head_dim) + query = query.max(dim=3).values + + attn_weights = (query * self.scale) @ key.transpose(-1, -2) + attn_weights = attn_weights.softmax(dim=-1) + + attn_output = attn_weights @ value + attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.dim_out) + attn_output = self.proj(attn_output) + + return (attn_output, attn_weights) if output_attentions else (attn_output, None) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Hiera +class HieraDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class HieraMlp(nn.Module): + def __init__(self, config, dim: int): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(dim, int(dim * config.mlp_ratio)) + self.fc2 = nn.Linear(int(dim * config.mlp_ratio), dim) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class HieraLayer(nn.Module): + def __init__( + self, + config, + dim: int, + dim_out: int, + num_heads: int, + drop_path: float = 0.0, + query_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + self.query_stride = query_stride + + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attn = HieraMaskUnitAttention(dim, dim_out, num_heads, query_stride, window_size, use_mask_unit_attn) + + self.layernorm_after = nn.LayerNorm(dim_out, eps=config.layer_norm_eps) + self.mlp = HieraMlp(config, dim_out) + + self.drop_path = HieraDropPath(drop_path) if drop_path > 0 else nn.Identity() + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape + # Attention + Q Pooling + hidden_states_norm = self.layernorm_before(hidden_states) + + if self.dim != self.dim_out: + hidden_states = self.proj(hidden_states_norm) + # Refer to `HieraUnroll` to see how this performs a maxpool-Nd + hidden_states = hidden_states.view(batch_size, self.query_stride, -1, hidden_dim).max(dim=1).values + + (hidden_states_norm, attn_weights) = self.attn(hidden_states_norm, output_attentions=output_attentions) + hidden_states = hidden_states + self.drop_path(hidden_states_norm) + + residual = hidden_states + hidden_states = self.layernorm_after(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.drop_path(hidden_states) + + return (hidden_states, attn_weights) + + +class HieraStage(nn.Module): + def __init__( + self, + config, + depth: int, + dim: int, + dim_out: int, + num_heads: int, + drop_path: List[float], + query_stride: List[int], + window_size: int, + use_mask_unit_attn: bool, + ) -> None: + super().__init__() + self.layers = nn.ModuleList( + [ + HieraLayer( + config=config, + dim=dim if i == 0 else dim_out, + dim_out=dim_out, + num_heads=num_heads, + drop_path=drop_path[i], + query_stride=query_stride[i], + 
window_size=window_size, + use_mask_unit_attn=use_mask_unit_attn, + ) + for i in range(depth) + ] + ) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + for layer_module in self.layers: + (hidden_states, attn_weights) = layer_module(hidden_states, output_attentions=output_attentions) + + return hidden_states, attn_weights + + +class HieraEncoder(nn.Module): + def __init__(self, config: HieraConfig) -> None: + super().__init__() + self.config = config + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + # query strides rule + stage_ends = [sum(config.depths[:i]) - 1 for i in range(1, len(config.depths) + 1)] + query_pool_layer = [stage_end + 1 for stage_end in stage_ends[: config.num_query_pool]] + query_strides = [ + math.prod(config.query_stride) if i in query_pool_layer else 1 for i in range(sum(config.depths)) + ] + + # Transformer blocks + self.stages = nn.ModuleList() + embed_dim = config.embed_dim + + for idx_stage, depth in enumerate(config.depths): + dim_out = int(config.embed_dim * config.embed_dim_multiplier**idx_stage) + + stage = HieraStage( + config=config, + depth=depth, + dim=embed_dim, + dim_out=dim_out, + num_heads=int(config.initial_num_heads * config.num_head_multiplier**idx_stage), + drop_path=dpr[sum(config.depths[:idx_stage]) : sum(config.depths[: idx_stage + 1])], + query_stride=query_strides[sum(config.depths[:idx_stage]) : sum(config.depths[: idx_stage + 1])], + window_size=int(math.prod(config.masked_unit_size) * math.prod(config.query_stride) ** -idx_stage), + use_mask_unit_attn=config.masked_unit_attention[idx_stage], + ) + + embed_dim = dim_out + self.stages.append(stage) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, stage_module in enumerate(self.stages): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + stage_module.__call__, hidden_states, output_attentions + ) + else: + layer_outputs = stage_module(hidden_states, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class HieraUnroll(nn.Module): + """ + Reorders the tokens such that patches are contiguous in memory. + E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as + [B, (Sy, Sx, H // Sy, W // Sx), C] + + This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. 
For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in HxW order, so they + need to be re-rolled if you want to use the intermediate values as a HxW feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + + def __init__(self, config) -> None: + super().__init__() + self.size = [i // s for i, s in zip(config.input_size, config.patch_stride)] + self.schedule = [config.query_stride] * len(config.depths[:-1]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: Flattened patch embeddings [B, N, C] + Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd + """ + B, _, C = x.shape + + cur_size = self.size + x = x.view(*([B] + cur_size + [C])) + + for strides in self.schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [B, H // Sy, Sy, W // Sx, Sx, C] + cur_size = [i // s for i, s in zip(cur_size, strides)] + new_shape = [B] + sum([[i, s] for i, s in zip(cur_size, strides)], []) + [C] + x = x.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [B, Sy, Sx, H // Sy, W // Sx, C] + L = len(new_shape) + permute = [0] + list(range(2, L - 1, 2)) + list(range(1, L - 1, 2)) + [L - 1] + x = x.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + x = x.flatten(0, len(strides)) + B *= math.prod(strides) + + x = x.reshape(-1, math.prod(self.size), C) + return x + + +def undo_windowing(x: torch.Tensor, shape: List[int], mu_shape: List[int]) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C] + shape: current spatial shape, if it were not organized into mask unit + windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C]. + mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx] + Returns: + x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C] + """ + D = len(shape) + B, C = x.shape[0], x.shape[-1] + # [B, #MUy*#MUx, MUy, MUx, C] -> [B, #MUy, #MUx, MUy, MUx, C] + num_MUs = [s // mu for s, mu in zip(shape, mu_shape)] + x = x.view(B, *num_MUs, *mu_shape, C) + + # [B, #MUy, #MUx, MUy, MUx, C] -> [B, #MUy*MUy, #MUx*MUx, C] + permute = ( + [0] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D, 1 + 2 * D))], + [], + ) + + [len(x.shape) - 1] + ) + x = x.permute(permute).reshape(B, *shape, C) + + return x + + +class HieraReroll(nn.Module): + """ + Undos the "unroll" operation so that you can use intermediate features. + """ + + def __init__( + self, + input_size: Tuple[int, ...], + patch_stride: Tuple[int, ...], + unroll_schedule: List[Tuple[int, ...]], + stage_ends: List[int], + q_pool: int, + ): + super().__init__() + self.size = [i // s for i, s in zip(input_size, patch_stride)] + + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. 
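+        # self.schedule maps a block index to (remaining unroll strides, spatial size
+        # at that block); each time a pooling stage end in stage_ends[:q_pool] is
+        # passed, one stride is consumed and the tracked size shrinks accordingly.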
+ self.schedule = {} + size = self.size + for i in range(stage_ends[-1] + 1): + self.schedule[i] = unroll_schedule, size + # schedule unchanged if no pooling at a stage end + if i in stage_ends[:q_pool]: + if len(unroll_schedule) > 0: + size = [n // s for n, s in zip(size, unroll_schedule[0])] + unroll_schedule = unroll_schedule[1:] + + def forward(self, x: torch.Tensor, block_idx: int, mask: torch.Tensor = None) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no mask is provided: + - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc. + If a mask is provided: + - Returns [B, #MUs, MUy, MUx, C] for 2d, etc. + """ + schedule, size = self.schedule[block_idx] + B, N, C = x.shape + + D = len(size) + cur_mu_shape = [1] * D + + for strides in schedule: + # Extract the current patch from N + x = x.view(B, *strides, N // math.prod(strides), *cur_mu_shape, C) + + # Move that patch into the current MU + # Example in 2d: [B, Sy, Sx, N//(Sy*Sx), MUy, MUx, C] -> [B, N//(Sy*Sx), Sy, MUy, Sx, MUx, C] + L = len(x.shape) + permute = ( + [0, 1 + D] + + sum( + [list(p) for p in zip(range(1, 1 + D), range(1 + D + 1, L - 1))], + [], + ) + + [L - 1] + ) + x = x.permute(permute) + + # Reshape to [B, N//(Sy*Sx), *MU, C] + for i in range(D): + cur_mu_shape[i] *= strides[i] + x = x.reshape(B, -1, *cur_mu_shape, C) + N = x.shape[1] + + # Current shape (e.g., 2d: [B, #MUy*#MUx, MUy, MUx, C]) + x = x.view(B, N, *cur_mu_shape, C) + + # If masked, return [B, #MUs, MUy, MUx, C] + if mask is not None: + return x + + # If not masked, we can return [B, H, W, C] + x = undo_windowing(x, size, cur_mu_shape) + + return x + + +class HieraPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = HieraConfig + base_model_prefix = "hiera" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["HieraEmbeddings", "HieraLayer"] + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + std = self.config.initializer_range + + if isinstance(module, HieraEmbeddings): + if self.config.sep_pos_embed: + nn.init.trunc_normal_(module.position_embeddings_spatial, std=std) + nn.init.trunc_normal_(module.position_embeddings_temporal, std=std) + else: + nn.init.trunc_normal_(module.position_embeddings, std=std) + + elif isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + nn.init.trunc_normal_(module.weight, std=std) + if isinstance(module, nn.Linear) and module.bias is not None: + nn.init.constant_(module.bias, std) + + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, std) + nn.init.constant_(module.weight, self.config.layer_norm_init) + + +HIERA_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`HieraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + +HIERA_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Hiera Model transformer outputting raw hidden-states without any specific head on top.", + HIERA_START_DOCSTRING, +) +class HieraModel(HieraPreTrainedModel): + def __init__(self, config: HieraConfig, add_pooling_layer: bool = True, use_mask_token: bool = False): + super().__init__(config) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * config.embed_dim_multiplier ** (self.num_layers - 1)) + + self.embeddings = HieraEmbeddings(config, use_mask_token=use_mask_token) + self.unroll = HieraUnroll(config) + self.encoder = HieraEncoder(config) + + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> HieraPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) + expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output) + pooled_output = self.layernorm(pooled_output) + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """Hiera Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). 
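+
+    The decoder is a single 1x1 convolution from `hidden_size` to
+    `encoder_stride**2 * num_channels` channels followed by `nn.PixelShuffle(encoder_stride)`,
+    so every token is upsampled back to an `encoder_stride x encoder_stride` block of pixels.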
+ + + """, + HIERA_START_DOCSTRING, +) +# Copied from transformers.models.vit.modeling_vit.ViTForMaskedImageModeling with VIT->HIERA,ViT->Hiera,vit->hiera,google/vit-base-patch16-224-in21k->EduardoPacheco/hiera-tiny-224 +class HieraForMaskedImageModeling(HieraPreTrainedModel): + def __init__(self, config: HieraConfig) -> None: + super().__init__(config) + + self.hiera = HieraModel(config, add_pooling_layer=False, use_mask_token=True) + + self.decoder = nn.Sequential( + nn.Conv2d( + in_channels=config.hidden_size, + out_channels=config.encoder_stride**2 * config.num_channels, + kernel_size=1, + ), + nn.PixelShuffle(config.encoder_stride), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedImageModelingOutput]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, HieraForMaskedImageModeling + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/hiera-base-patch16-224-in21k") + >>> model = HieraForMaskedImageModeling.from_pretrained("google/hiera-base-patch16-224-in21k") + + >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2 + >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + >>> # create random boolean mask of shape (batch_size, num_patches) + >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool() + + >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) + >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction + >>> list(reconstructed_pixel_values.shape) + [1, 3, 224, 224] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride): + raise ValueError( + "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that " + "the reconstructed image has the same dimensions as the input. " + f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}." 
+ ) + + outputs = self.hiera( + pixel_values, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # Reshape to (batch_size, num_channels, height, width) + sequence_output = sequence_output[:, 1:] + batch_size, sequence_length, num_channels = sequence_output.shape + height = width = math.floor(sequence_length**0.5) + sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) + + # Reconstruct pixel values + reconstructed_pixel_values = self.decoder(sequence_output) + + masked_im_loss = None + if bool_masked_pos is not None: + size = self.config.image_size // self.config.patch_size + bool_masked_pos = bool_masked_pos.reshape(-1, size, size) + mask = ( + bool_masked_pos.repeat_interleave(self.config.patch_size, 1) + .repeat_interleave(self.config.patch_size, 2) + .unsqueeze(1) + .contiguous() + ) + reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none") + masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels + + if not return_dict: + output = (reconstructed_pixel_values,) + outputs[1:] + return ((masked_im_loss,) + output) if masked_im_loss is not None else output + + return MaskedImageModelingOutput( + loss=masked_im_loss, + reconstruction=reconstructed_pixel_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Hiera Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune Hiera on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """, + HIERA_START_DOCSTRING, +) +class HieraForImageClassification(HieraPreTrainedModel): + def __init__(self, config: HieraConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.hiera = HieraModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(self.hiera.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
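+
+        A minimal usage sketch (the checkpoint id below mirrors the naming used by the
+        conversion script and is a placeholder until the converted weights are on the Hub):
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from transformers import AutoImageProcessor, HieraForImageClassification

+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)

+        >>> processor = AutoImageProcessor.from_pretrained("EduardoPacheco/hiera-tiny-224-in1k")
+        >>> model = HieraForImageClassification.from_pretrained("EduardoPacheco/hiera-tiny-224-in1k")

+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits
+        >>> predicted_class_idx = logits.argmax(-1).item()
+        ```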
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.hiera( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1bdab80a13f6..4b20b73414d0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4267,6 +4267,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +HIERA_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class HieraForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraForMaskedImageModeling(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class HieraPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/hiera/__init__.py b/tests/models/hiera/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py new file mode 100644 index 000000000000..55fbe76a4d56 --- /dev/null +++ b/tests/models/hiera/test_modeling_hiera.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Hiera model. """ + + +import unittest + +from transformers import HieraConfig +from transformers.testing_utils import ( + require_accelerate, + require_torch, + require_torch_accelerator, + require_torch_fp16, + require_vision, + slow, + torch_device, +) +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import HieraForImageClassification, HieraForMaskedImageModeling, HieraModel + from transformers.models.hiera.modeling_hiera import HIERA_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTImageProcessor + + +class HieraModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + scope=None, + encoder_stride=2, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.encoder_stride = encoder_stride + + # in Hiera, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return HieraConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + 
+ def create_and_check_model(self, config, pixel_values, labels): + model = HieraModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): + model = HieraForMaskedImageModeling(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) + ) + + # test greyscale images + config.num_channels = 1 + model = HieraForMaskedImageModeling(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = HieraForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = HieraForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class HieraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Hiera does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + ( + HieraModel, + HieraForImageClassification, + HieraForMaskedImageModeling, + ) + if is_torch_available() + else () + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = HieraModelTester(self) + self.config_tester = ConfigTester(self, config_class=HieraConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Hiera does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_image_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in HIERA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = HieraModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class HieraModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ViTImageProcessor.from_pretrained("google/hiera-base-patch16-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = HieraForImageClassification.from_pretrained("google/hiera-base-patch16-224").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # Hiera models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+ model = HieraModel.from_pretrained("facebook/dino-hieras8").to(torch_device) + + image_processor = ViTImageProcessor.from_pretrained("facebook/dino-hieras8", size=480) + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(pixel_values, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 3601, 384)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + @slow + @require_accelerate + @require_torch_accelerator + @require_torch_fp16 + def test_inference_fp16(self): + r""" + A small test to make sure that inference work in half precision without any problem. + """ + model = HieraModel.from_pretrained("facebook/dino-hieras8", torch_dtype=torch.float16, device_map="auto") + image_processor = self.default_image_processor + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass to make sure inference works in fp16 + with torch.no_grad(): + _ = model(pixel_values) From 2e1f8d4005850260887753b5e430bf402f90ac59 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Fri, 29 Mar 2024 22:04:33 +0100 Subject: [PATCH 060/118] Finished conversion script and model forward working --- .../models/hiera/convert_hiera_to_hf.py | 36 +++++++--- .../models/hiera/modeling_hiera.py | 67 ++++++++++++++----- 2 files changed, 74 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py index e36725baf84f..5c48bb55bb79 100644 --- a/src/transformers/models/hiera/convert_hiera_to_hf.py +++ b/src/transformers/models/hiera/convert_hiera_to_hf.py @@ -206,6 +206,7 @@ def convert_hiera_checkpoint(args): base_model = args.base_model pytorch_dump_folder_path = args.pytorch_dump_folder_path verify_logits = args.verify_logits + verify_pixel_values = args.verify_pixel_values push_to_hub = args.push_to_hub IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] @@ -256,23 +257,31 @@ def convert_hiera_checkpoint(args): ] ) - image_processor = BeitImageProcessor( - image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"height": 224, "width": 224} - ) + image_processor = BeitImageProcessor(image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD) inputs = image_processor(images=input_image, return_tensors="pt") expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) + if verify_pixel_values: + input_image = prepare_img() + + inputs = image_processor(images=input_image, return_tensors="pt") + expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) + assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) + print("Pixel values look good!") + else: + print("Converted without verifying pixel values") + inputs = {"pixel_values": torch.rand((1, 3, 224, 224))} + expected_pixel_values = inputs["pixel_values"] outputs = model(**inputs) # original implementation returns logits.softmax(dim=-1) - expected_prob = 
original_model(input_image) + expected_prob = original_model(expected_pixel_values) if verify_logits and not base_model: output_prob = outputs.logits.softmax(dim=-1) assert torch.allclose(output_prob, expected_prob, atol=1e-4) - print("Looks good!") + print("Logits look good!") else: print("Converted without verifying logits") @@ -294,7 +303,7 @@ def convert_hiera_checkpoint(args): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--model_name", + "--model-name", default="hiera-tiny-224", type=str, choices=[ @@ -312,21 +321,26 @@ def convert_hiera_checkpoint(args): help="Name of the Hiera model you'd like to convert.", ) parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + "--pytorch-dump-folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) parser.add_argument( - "--verify_logits", + "--verify-logits", action="store_true", help="Whether or not to verify the logits against the original implementation.", ) parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + "--push-to-hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." ) parser.add_argument( - "--base_model", + "--base-model", action="store_true", help="Whether to only convert the base model (no projection head weights).", ) + parser.add_argument( + "--verify-pixel-values", + action="store_true", + help="Whether to verify the pixel values of the input image.", + ) args = parser.parse_args() convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 94740cd64a0b..bb86a866abb7 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -227,13 +227,18 @@ def __init__( self.window_size = window_size self.use_mask_unit_attn = use_mask_unit_attn - def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> torch.Tensor: """Input should be of shape [batch, tokens, channels].""" batch_size, seq_len, _ = hidden_states.shape num_windows = 1 if self.use_mask_unit_attn: - num_windows = seq_len // (self.q_stride * self.window_size) + num_windows = seq_len // (self.query_stride * self.window_size) qkv = self.qkv(hidden_states) qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim) @@ -249,6 +254,10 @@ def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) attn_weights = (query * self.scale) @ key.transpose(-1, -2) attn_weights = attn_weights.softmax(dim=-1) + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + attn_output = attn_weights @ value attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.dim_out) attn_output = self.proj(attn_output) @@ -335,17 +344,24 @@ def __init__( if dim != dim_out: self.proj = nn.Linear(dim, dim_out) - def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: - batch_size, seq_len, hidden_dim = hidden_states.shape + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> torch.Tensor: + batch_size, seq_len, _ 
= hidden_states.shape # Attention + Q Pooling hidden_states_norm = self.layernorm_before(hidden_states) if self.dim != self.dim_out: hidden_states = self.proj(hidden_states_norm) # Refer to `HieraUnroll` to see how this performs a maxpool-Nd - hidden_states = hidden_states.view(batch_size, self.query_stride, -1, hidden_dim).max(dim=1).values + hidden_states = hidden_states.view(batch_size, self.query_stride, -1, self.dim_out).max(dim=1).values - (hidden_states_norm, attn_weights) = self.attn(hidden_states_norm, output_attentions=output_attentions) + (hidden_states_norm, attn_weights) = self.attn( + hidden_states_norm, head_mask, output_attentions=output_attentions + ) hidden_states = hidden_states + self.drop_path(hidden_states_norm) residual = hidden_states @@ -368,8 +384,14 @@ def __init__( query_stride: List[int], window_size: int, use_mask_unit_attn: bool, + stage_num: int, ) -> None: super().__init__() + # we need to know if the previous stage used masked attention + # mask unit or global attention. + # lag by 1 layer, so that global attention, + # applied post pooling on lower resolution + previous_stage_used_masked_attention = config.masked_unit_attention[stage_num - 1 if stage_num > 0 else 0] self.layers = nn.ModuleList( [ HieraLayer( @@ -380,15 +402,20 @@ def __init__( drop_path=drop_path[i], query_stride=query_stride[i], window_size=window_size, - use_mask_unit_attn=use_mask_unit_attn, + use_mask_unit_attn=use_mask_unit_attn or (previous_stage_used_masked_attention and i == 0), ) for i in range(depth) ] ) - def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor: - for layer_module in self.layers: - (hidden_states, attn_weights) = layer_module(hidden_states, output_attentions=output_attentions) + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.FloatTensor], output_attentions: bool = False + ) -> torch.Tensor: + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + (hidden_states, attn_weights) = layer_module( + hidden_states, layer_head_mask, output_attentions=output_attentions + ) return hidden_states, attn_weights @@ -424,6 +451,7 @@ def __init__(self, config: HieraConfig) -> None: query_stride=query_strides[sum(config.depths[:idx_stage]) : sum(config.depths[: idx_stage + 1])], window_size=int(math.prod(config.masked_unit_size) * math.prod(config.query_stride) ** -idx_stage), use_mask_unit_attn=config.masked_unit_attention[idx_stage], + stage_num=idx_stage, ) embed_dim = dim_out @@ -434,6 +462,7 @@ def __init__(self, config: HieraConfig) -> None: def forward( self, hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, @@ -442,15 +471,16 @@ def forward( all_self_attentions = () if output_attentions else None for i, stage_module in enumerate(self.stages): + layer_head_mask = head_mask[i] if head_mask is not None else None if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( - stage_module.__call__, hidden_states, output_attentions + stage_module.__call__, hidden_states, layer_head_mask, output_attentions ) else: - layer_outputs = stage_module(hidden_states, output_attentions) + layer_outputs = stage_module(hidden_states, layer_head_mask, output_attentions) hidden_states = layer_outputs[0] @@ -775,19 +805,19 
@@ def forward( # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + head_mask = self.get_head_mask(head_mask, len(self.config.depths)) # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype if pixel_values.dtype != expected_dtype: pixel_values = pixel_values.to(expected_dtype) - embedding_output = self.embeddings( - pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding - ) + embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + hidden_states = self.unroll(embedding_output) encoder_outputs = self.encoder( - embedding_output, + hidden_states, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -796,7 +826,8 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = None if self.pooler is not None: - pooled_output = self.pooler(sequence_output) + pooled_output = self.pooler(sequence_output.transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) pooled_output = self.layernorm(pooled_output) if not return_dict: From 5924b6cfc2f4be02c1f60f035b1080228665a6c5 Mon Sep 17 00:00:00 2001 From: Naman Garg Date: Sun, 31 Mar 2024 10:53:20 +0000 Subject: [PATCH 061/118] Resolved all issues --- docs/source/en/model_doc/hiera.md | 18 +- src/transformers/models/hiera/__init__.py | 2 +- .../models/hiera/configuration_hiera.py | 64 ++++-- .../models/hiera/convert_hiera_to_pytorch.py | 45 ++--- .../models/hiera/hiera_image_processor.py | 59 ------ .../models/hiera/modeling_hiera.py | 94 ++++----- tests/models/hiera/test_modeling_hiera.py | 190 ++++++++---------- 7 files changed, 209 insertions(+), 263 deletions(-) delete mode 100644 src/transformers/models/hiera/hiera_image_processor.py diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index 8cd6dc1a977a..f519f00893cc 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -1,4 +1,4 @@ -