From 03e4321ecbf5ab1a513063488bf977ab7a59fe96 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 25 May 2022 12:23:09 +0000 Subject: [PATCH 001/181] First draft --- docs/source/en/index.mdx | 1 + docs/source/en/model_doc/vitpose.mdx | 39 ++ docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 10 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/vitpose/__init__.py | 59 ++ .../models/vitpose/configuration_vitpose.py | 144 ++++ .../convert_vitpose_timm_to_pytorch.py | 250 +++++++ .../models/vitpose/modeling_vitpose.py | 624 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 17 + tests/models/vitpose/__init__.py | 0 tests/models/vitpose/test_modeling_vitpose.py | 242 +++++++ 15 files changed, 1395 insertions(+) create mode 100644 docs/source/en/model_doc/vitpose.mdx create mode 100644 src/transformers/models/vitpose/__init__.py create mode 100644 src/transformers/models/vitpose/configuration_vitpose.py create mode 100644 src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py create mode 100644 src/transformers/models/vitpose/modeling_vitpose.py create mode 100644 tests/models/vitpose/__init__.py create mode 100644 tests/models/vitpose/test_modeling_vitpose.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index b4e8d5154a12..f9b57c087bcb 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -278,6 +278,7 @@ Flax), PyTorch, and/or TensorFlow. | VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | | ViT | ❌ | ❌ | ✅ | ✅ | ✅ | | ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| ViTPosePose | ❌ | ❌ | ✅ | ❌ | ❌ | | Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | | Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | | WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/vitpose.mdx b/docs/source/en/model_doc/vitpose.mdx new file mode 100644 index 000000000000..97a80c85ae5d --- /dev/null +++ b/docs/source/en/model_doc/vitpose.mdx @@ -0,0 +1,39 @@ + + +# ViTPose + +## Overview + +The ViTPose model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
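+
+Below is a minimal usage sketch for this first draft. It only exercises the classes added in this
+patch (`ViTPoseConfig` and `ViTPoseModel`) with a randomly initialized model and dummy inputs,
+since no pretrained checkpoint name is finalized yet; it simply illustrates the expected output shape.
+
+```python
+import torch
+
+from transformers import ViTPoseConfig, ViTPoseModel
+
+# default configuration: 224x224 images, 16x16 patches, hidden size 768
+configuration = ViTPoseConfig()
+model = ViTPoseModel(configuration)
+model.eval()
+
+# dummy batch containing a single 3-channel 224x224 image
+pixel_values = torch.randn(1, 3, 224, 224)
+
+with torch.no_grad():
+    outputs = model(pixel_values)
+
+# 196 patch tokens + 1 [CLS] token, each of dimension 768
+print(outputs.last_hidden_state.shape)  # torch.Size([1, 197, 768])
+```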
+ + +## ViTPoseConfig + +[[autodoc]] ViTPoseConfig + +## ViTPoseModel + +[[autodoc]] ViTPoseModel + - forward \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 2bb449240bb0..a80839150119 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -75,6 +75,7 @@ Ready-made configurations include the following architectures: - RoFormer - T5 - ViT +- ViTPosePose - XLM-RoBERTa - XLM-RoBERTa-XL diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0afe8588d658..ba21da84ce06 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -323,6 +323,7 @@ "models.visual_bert": ["VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "VisualBertConfig"], "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], "models.vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"], + "models.vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig"], "models.wav2vec2": [ "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", @@ -1699,6 +1700,13 @@ "ViTPreTrainedModel", ] ) + _import_structure["models.vitpose"].extend( + [ + "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTPoseModel", + "ViTPosePreTrainedModel", + ] + ) _import_structure["models.vit_mae"].extend( [ "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2865,6 +2873,7 @@ from .models.visual_bert import VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, VisualBertConfig from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig from .models.vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig + from .models.vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, @@ -4023,6 +4032,7 @@ ViTMAEModel, ViTMAEPreTrainedModel, ) + from .models.vitpose import VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, ViTPoseModel, ViTPosePreTrainedModel from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, Wav2Vec2ForAudioFrameClassification, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index e435265a8378..14548673a5b3 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -131,6 +131,7 @@ visual_bert, vit, vit_mae, + vitpose, wav2vec2, wav2vec2_conformer, wav2vec2_phoneme, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index dbb19c55aa97..6cc78f4be15d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -128,6 +128,7 @@ ("visual_bert", "VisualBertConfig"), ("vit", "ViTConfig"), ("vit_mae", "ViTMAEConfig"), + ("vitpose", "ViTPoseConfig"), ("wav2vec2", "Wav2Vec2Config"), ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"), ("wavlm", "WavLMConfig"), @@ -230,6 +231,7 @@ ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit_mae", "VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("vitpose", "VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2-conformer", "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xglm", "XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -361,6 +363,7 @@ ("visual_bert", "VisualBert"), ("vit", "ViT"), ("vit_mae", "ViTMAE"), + ("vitpose", "ViTPosePose"), ("wav2vec2", "Wav2Vec2"), ("wav2vec2-conformer", "Wav2Vec2-Conformer"), ("wav2vec2_phoneme", 
"Wav2Vec2Phoneme"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index f398efe360ba..2f282d255091 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -64,6 +64,7 @@ ("van", "ConvNextFeatureExtractor"), ("vit", "ViTFeatureExtractor"), ("vit_mae", "ViTFeatureExtractor"), + ("vitpose", "ViTPoseFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), ("yolos", "YolosFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index be7dc5bc9e88..759ab394e634 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -121,6 +121,7 @@ ("visual_bert", "VisualBertModel"), ("vit", "ViTModel"), ("vit_mae", "ViTMAEModel"), + ("vitpose", "ViTPoseModel"), ("wav2vec2", "Wav2Vec2Model"), ("wav2vec2-conformer", "Wav2Vec2ConformerModel"), ("wavlm", "WavLMModel"), @@ -288,6 +289,7 @@ ("deit", "DeiTForMaskedImageModeling"), ("swin", "SwinForMaskedImageModeling"), ("vit", "ViTForMaskedImageModeling"), + ("vitpose", "ViTPoseForMaskedImageModeling"), ] ) @@ -323,6 +325,7 @@ ("swin", "SwinForImageClassification"), ("van", "VanForImageClassification"), ("vit", "ViTForImageClassification"), + ("vitpose", "ViTPoseForImageClassification"), ] ) diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py new file mode 100644 index 000000000000..f55f3da09e3b --- /dev/null +++ b/src/transformers/models/vitpose/__init__.py @@ -0,0 +1,59 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = {"configuration_vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig", "ViTPoseOnnxConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_vitpose"] = [ + "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTPoseModel", + "ViTPosePreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig, ViTPoseOnnxConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_vitpose import ( + VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTPoseModel, + ViTPosePreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py new file mode 100644 index 000000000000..f3bd6a045916 --- /dev/null +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ViTPose model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "unisydney": "https://huggingface.co/unisydney/resolve/main/config.json", +} + + + +class ViTPoseConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTPoseModel`]. It is used to instantiate an + ViTPose model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the ViTPose + [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to `224`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to `16`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to `3`): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + encoder_stride (`int`, `optional`, defaults to 16): + Factor to increase the spatial resolution by in the decoder head for masked image modeling. + + Example: + + ```python + >>> from transformers import ViTPoseModel, ViTPoseConfig + + >>> # Initializing a ViTPose vitpose-base-patch16-224 style configuration + >>> configuration = ViTPoseConfig() + + >>> # Initializing a model from the vitpose-base-patch16-224 style configuration + >>> model = ViTPoseModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "vitpose" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + is_encoder_decoder=False, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + encoder_stride=16, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.encoder_stride = encoder_stride + + +class ViTPoseOnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 diff --git a/src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py b/src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py new file mode 100644 index 000000000000..a535ba782629 --- /dev/null +++ b/src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2022 
The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ViTPose and non-distilled DeiT checkpoints from the timm library.""" + + +import argparse +import json +from pathlib import Path + +import torch +from PIL import Image + +import requests +import timm +from huggingface_hub import hf_hub_download +from transformers import DeiTFeatureExtractor, ViTPoseConfig, ViTFeatureExtractor, ViTPoseForImageClassification, ViTPoseModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"vitpose.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"vitpose.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vitpose.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vitpose.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"vitpose.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"vitpose.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vitpose.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vitpose.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vitpose.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vitpose.encoder.layer.{i}.output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend( + [ + ("cls_token", "vitpose.embeddings.cls_token"), + ("patch_embed.proj.weight", "vitpose.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "vitpose.embeddings.patch_embeddings.projection.bias"), + ("pos_embed", "vitpose.embeddings.position_embeddings"), + ] + ) + + if base_model: + # layernorm + pooler + rename_keys.extend( + [ + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ("pre_logits.fc.weight", "pooler.dense.weight"), + ("pre_logits.fc.bias", "pooler.dense.bias"), + ] + ) + + # if just the base model, we should remove "vitpose" from all keys that start with "vitpose" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vitpose") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "vitpose.layernorm.weight"), + ("norm.bias", "vitpose.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) + + return rename_keys + + +# we split up the matrix of each encoder 
layer into queries, keys and values +def read_in_q_k_v(state_dict, config, base_model=False): + for i in range(config.num_hidden_layers): + if base_model: + prefix = "" + else: + prefix = "vitpose." + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.weight", "head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_vitpose_checkpoint(vitpose_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our ViTPose structure. 
+ """ + + # define default ViTPose configuration + config = ViTPoseConfig() + base_model = False + # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size + if vitpose_name[-5:] == "in21k": + base_model = True + config.patch_size = int(vitpose_name[-12:-10]) + config.image_size = int(vitpose_name[-9:-6]) + else: + config.num_labels = 1000 + repo_id = "datasets/huggingface/label-files" + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + config.patch_size = int(vitpose_name[-6:-4]) + config.image_size = int(vitpose_name[-3:]) + # size of the architecture + if "deit" in vitpose_name: + if vitpose_name[9:].startswith("tiny"): + config.hidden_size = 192 + config.intermediate_size = 768 + config.num_hidden_layers = 12 + config.num_attention_heads = 3 + elif vitpose_name[9:].startswith("small"): + config.hidden_size = 384 + config.intermediate_size = 1536 + config.num_hidden_layers = 12 + config.num_attention_heads = 6 + else: + pass + else: + if vitpose_name[4:].startswith("small"): + config.hidden_size = 768 + config.intermediate_size = 2304 + config.num_hidden_layers = 8 + config.num_attention_heads = 8 + elif vitpose_name[4:].startswith("base"): + pass + elif vitpose_name[4:].startswith("large"): + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif vitpose_name[4:].startswith("huge"): + config.hidden_size = 1280 + config.intermediate_size = 5120 + config.num_hidden_layers = 32 + config.num_attention_heads = 16 + + # load original model from timm + timm_model = timm.create_model(vitpose_name, pretrained=True) + timm_model.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = timm_model.state_dict() + if base_model: + remove_classification_head_(state_dict) + rename_keys = create_rename_keys(config, base_model) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, base_model) + + # load HuggingFace model + if vitpose_name[-5:] == "in21k": + model = ViTPoseModel(config).eval() + else: + model = ViTPoseForImageClassification(config).eval() + model.load_state_dict(state_dict) + + # Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor + if "deit" in vitpose_name: + feature_extractor = DeiTFeatureExtractor(size=config.image_size) + else: + feature_extractor = ViTFeatureExtractor(size=config.image_size) + encoding = feature_extractor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + outputs = model(pixel_values) + + if base_model: + timm_pooled_output = timm_model.forward_features(pixel_values) + assert timm_pooled_output.shape == outputs.pooler_output.shape + assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) + else: + timm_logits = timm_model(pixel_values) + assert timm_logits.shape == outputs.logits.shape + assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {vitpose_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--vitpose_name", + default="vitpose_base_patch16_224", + type=str, + help="Name of the ViTPose timm model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_vitpose_checkpoint(args.vitpose_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py new file mode 100644 index 000000000000..c66e1b3c757c --- /dev/null +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -0,0 +1,624 @@ +# coding=utf-8 +# Copyright 2022 University of Sydney and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch ViTPose model.""" + + +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput, MaskedLMOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_vitpose import ViTPoseConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ViTPoseConfig" +_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "unisydney" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/vitpose-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "unisydney", + # See all ViTPose models at https://huggingface.co/models?filter=vitpose +] + + + +# Inspired by +# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py +# From PyTorch internals +def to_2tuple(x): + if isinstance(x, collections.abc.Iterable): + return x + return (x, x) + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + + +# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings with ViT->ViTPose +class ViTPoseEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. 
+ + """ + + def __init__(self, config: ViTPoseConfig, use_mask_token: bool = False) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = PatchEmbeddings( + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.hidden_size, + ) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + + npatch = embeddings.shape[1] - 1 + N = self.position_embeddings.shape[1] - 1 + if npatch == N and height == width: + return self.position_embeddings + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + h0, w0 = h0 + 0.1, w0 + 0.1 + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), + scale_factor=(h0 / math.sqrt(N), w0 / math.sqrt(N)), + mode="bicubic", + align_corners=False, + ) + assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + batch_size, seq_len, _ = embeddings.size() + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +class PatchEmbeddings(nn.Module): + """ + Image to Patch Embedding. 
+ + """ + + def __init__( + self, + image_size: int = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + num_channels: int = 3, + embed_dim: int = 768, + ): + super().__init__() + image_size = to_2tuple(image_size) + patch_size = to_2tuple(patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + x = self.projection(pixel_values).flatten(2).transpose(1, 2) + return x + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTPose +class ViTPoseSelfAttention(nn.Module): + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTPose +class ViTPoseSelfOutput(nn.Module): + """ + The residual connection is defined in ViTPoseLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTPose +class ViTPoseAttention(nn.Module): + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__() + self.attention = ViTPoseSelfAttention(config) + self.output = ViTPoseSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTPose +class ViTPoseIntermediate(nn.Module): + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with 
ViT->ViTPose +class ViTPoseOutput(nn.Module): + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTPose +class ViTPoseLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ViTPoseAttention(config) + self.intermediate = ViTPoseIntermediate(config) + self.output = ViTPoseOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViTPose, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViTPose, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTPose +class ViTPoseEncoder(nn.Module): + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTPoseLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + layer_head_mask, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + 
(layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +# Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel with ViT->ViTPose,vit->vitpose +class ViTPosePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ViTPoseConfig + base_model_prefix = "vitpose" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module: ViTPoseEncoder, value: bool = False) -> None: + if isinstance(module, ViTPoseEncoder): + module.gradient_checkpointing = value + + +VITPOSE_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTPoseConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VITPOSE_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See + [`ViTFeatureExtractor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare ViTPose Model transformer outputting raw hidden-states without any specific head on top.", + VITPOSE_START_DOCSTRING, +) +# Copied from transformers.models.vit.modeling_vit.ViTModel with VIT->VITPOSE,ViT->ViTPose +class ViTPoseModel(ViTPosePreTrainedModel): + def __init__(self, config: ViTPoseConfig, add_pooling_layer: bool = True, use_mask_token: bool = False): + super().__init__(config) + self.config = config + + self.embeddings = ViTPoseEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = ViTPoseEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = ViTPosePooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> PatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VITPOSE_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + 
attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->ViTPose +class ViTPosePooler(nn.Module): + def __init__(self, config: ViTPoseConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index e25185b1a11c..13db6dffa81e 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4498,6 +4498,23 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class ViTPoseModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ViTPosePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/vitpose/__init__.py b/tests/models/vitpose/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py new file mode 100644 index 000000000000..c3fa1fc1c108 --- /dev/null +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViTPose model. 
""" + + +import inspect +import unittest + +from transformers import ViTPoseConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ViTPoseForImageClassification, ViTPoseForMaskedImageModeling, ViTPoseModel + from transformers.models.vitpose.modeling_vitpose import VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTFeatureExtractor + + +class ViTPoseModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + scope=None, + encoder_stride=2, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.encoder_stride = encoder_stride + + # in ViTPose, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ViTPoseConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = ViTPoseModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = ViTPoseForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ViTPoseModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ViTPose does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + ViTPoseModel, + ViTPoseForImageClassification, + ViTPoseForMaskedImageModeling, + ) + if is_torch_available() + else () + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = ViTPoseModelTester(self) + self.config_tester = ConfigTester(self, config_class=ViTPoseConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="ViTPose does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ViTPoseModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class ViTPoseModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ViTFeatureExtractor.from_pretrained("google/vitpose-base-patch16-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = ViTPoseForImageClassification.from_pretrained("google/vitpose-base-patch16-224").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = 
torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From 84ac7fec5a361db7debcb3919cf5c205312979c0 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 26 May 2022 15:26:23 +0200 Subject: [PATCH 002/181] Make fixup --- docs/source/en/model_doc/vitpose.mdx | 5 + src/transformers/__init__.py | 8 +- src/transformers/models/vitpose/__init__.py | 12 +- .../models/vitpose/configuration_vitpose.py | 1 - .../convert_vitpose_timm_to_pytorch.py | 250 ------------------ .../vitpose/convert_vitpose_to_pytorch.py | 180 +++++++++++++ .../models/vitpose/modeling_vitpose.py | 156 +++++------ src/transformers/utils/dummy_pt_objects.py | 7 + tests/models/vitpose/test_modeling_vitpose.py | 38 +-- utils/check_repo.py | 1 + 10 files changed, 282 insertions(+), 376 deletions(-) delete mode 100644 src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py create mode 100644 src/transformers/models/vitpose/convert_vitpose_to_pytorch.py diff --git a/docs/source/en/model_doc/vitpose.mdx b/docs/source/en/model_doc/vitpose.mdx index 97a80c85ae5d..c1be84b35023 100644 --- a/docs/source/en/model_doc/vitpose.mdx +++ b/docs/source/en/model_doc/vitpose.mdx @@ -36,4 +36,9 @@ The original code can be found [here](). ## ViTPoseModel [[autodoc]] ViTPoseModel + - forward + +## ViTPoseForPoseEstimation + +[[autodoc]] ViTPoseForPoseEstimation - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ba21da84ce06..1211c9689d1c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1705,6 +1705,7 @@ "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", "ViTPoseModel", "ViTPosePreTrainedModel", + "ViTPoseForPoseEstimation", ] ) _import_structure["models.vit_mae"].extend( @@ -4032,7 +4033,12 @@ ViTMAEModel, ViTMAEPreTrainedModel, ) - from .models.vitpose import VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, ViTPoseModel, ViTPosePreTrainedModel + from .models.vitpose import ( + VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTPoseForPoseEstimation, + ViTPoseModel, + ViTPosePreTrainedModel, + ) from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, Wav2Vec2ForAudioFrameClassification, diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py index f55f3da09e3b..d30adaf44cf8 100644 --- a/src/transformers/models/vitpose/__init__.py +++ b/src/transformers/models/vitpose/__init__.py @@ -17,14 +17,12 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available -_import_structure = {"configuration_vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig", "ViTPoseOnnxConfig"]} +_import_structure = { + "configuration_vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig", "ViTPoseOnnxConfig"] +} try: if not is_torch_available(): @@ -36,6 +34,7 @@ "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", "ViTPoseModel", "ViTPosePreTrainedModel", + "ViTPoseForPoseEstimation", ] if TYPE_CHECKING: @@ -49,6 +48,7 @@ else: from .modeling_vitpose import ( VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTPoseForPoseEstimation, ViTPoseModel, ViTPosePreTrainedModel, ) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index f3bd6a045916..757a0efa196d 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -31,7 +31,6 @@ } - class ViTPoseConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ViTPoseModel`]. It is used to instantiate an diff --git a/src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py b/src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py deleted file mode 100644 index a535ba782629..000000000000 --- a/src/transformers/models/vitpose/convert_vitpose_timm_to_pytorch.py +++ /dev/null @@ -1,250 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert ViTPose and non-distilled DeiT checkpoints from the timm library.""" - - -import argparse -import json -from pathlib import Path - -import torch -from PIL import Image - -import requests -import timm -from huggingface_hub import hf_hub_download -from transformers import DeiTFeatureExtractor, ViTPoseConfig, ViTFeatureExtractor, ViTPoseForImageClassification, ViTPoseModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"vitpose.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"vitpose.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vitpose.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vitpose.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"vitpose.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"vitpose.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vitpose.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vitpose.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vitpose.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vitpose.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - ("cls_token", "vitpose.embeddings.cls_token"), - ("patch_embed.proj.weight", "vitpose.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "vitpose.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "vitpose.embeddings.position_embeddings"), - ] - ) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "vitpose" from all keys that start with "vitpose" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vitpose") else pair for pair in rename_keys] - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "vitpose.layernorm.weight"), - ("norm.bias", "vitpose.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "vitpose." 
- # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.weight", "head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_vitpose_checkpoint(vitpose_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our ViTPose structure. - """ - - # define default ViTPose configuration - config = ViTPoseConfig() - base_model = False - # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size - if vitpose_name[-5:] == "in21k": - base_model = True - config.patch_size = int(vitpose_name[-12:-10]) - config.image_size = int(vitpose_name[-9:-6]) - else: - config.num_labels = 1000 - repo_id = "datasets/huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.patch_size = int(vitpose_name[-6:-4]) - config.image_size = int(vitpose_name[-3:]) - # size of the architecture - if "deit" in vitpose_name: - if vitpose_name[9:].startswith("tiny"): - config.hidden_size = 192 - config.intermediate_size = 768 - config.num_hidden_layers = 12 - config.num_attention_heads = 3 - elif vitpose_name[9:].startswith("small"): - config.hidden_size = 384 - config.intermediate_size = 1536 - config.num_hidden_layers = 12 - config.num_attention_heads = 6 - else: - pass - else: - if vitpose_name[4:].startswith("small"): - config.hidden_size = 768 - config.intermediate_size = 2304 - config.num_hidden_layers = 8 - config.num_attention_heads = 8 - elif vitpose_name[4:].startswith("base"): - pass - elif vitpose_name[4:].startswith("large"): - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif vitpose_name[4:].startswith("huge"): - config.hidden_size = 1280 - config.intermediate_size = 5120 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 - - # load original model from timm - timm_model = timm.create_model(vitpose_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, 
remove and rename some keys - state_dict = timm_model.state_dict() - if base_model: - remove_classification_head_(state_dict) - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - # load HuggingFace model - if vitpose_name[-5:] == "in21k": - model = ViTPoseModel(config).eval() - else: - model = ViTPoseForImageClassification(config).eval() - model.load_state_dict(state_dict) - - # Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor - if "deit" in vitpose_name: - feature_extractor = DeiTFeatureExtractor(size=config.image_size) - else: - feature_extractor = ViTFeatureExtractor(size=config.image_size) - encoding = feature_extractor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values) - - if base_model: - timm_pooled_output = timm_model.forward_features(pixel_values) - assert timm_pooled_output.shape == outputs.pooler_output.shape - assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) - else: - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {vitpose_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--vitpose_name", - default="vitpose_base_patch16_224", - type=str, - help="Name of the ViTPose timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_vitpose_checkpoint(args.vitpose_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py new file mode 100644 index 000000000000..bc9eb25c5476 --- /dev/null +++ b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py @@ -0,0 +1,180 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ViTPose checkpoints from the original repository. 
+ +URL: https://github.com/vitae-transformer/vitpose +""" + + +import argparse +from pathlib import Path + +import torch +from PIL import Image + +import requests +from transformers import ViTFeatureExtractor, ViTPoseConfig, ViTPoseForPoseEstimation +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def rename_key(name): + if "backbone" in name: + name = name.replace("backbone", "vitpose") + if "patch_embed" in name: + name = name.replace("patch_embed", "embeddings.patch_embeddings") + if "layers" in name: + name = "encoder." + name + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + + if name == "norm.weight": + name = "layernorm.weight" + if name == "norm.bias": + name = "layernorm.bias" + + if "head" in name: + name = name.replace("head", "classifier") + else: + name = "swin." + name + + return name + + +def convert_state_dict(orig_state_dict): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + # layer_num = int(key_split[1]) + + # # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + # in_proj_weight = orig_state_dict.pop(f"blocks.{i}.attn.qkv.weight") + # in_proj_bias = orig_state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # # next, add query, keys and values (in that order) to the state dict + # orig_state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + # : config.hidden_size, : + # ] + # orig_state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + # orig_state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + # config.hidden_size : config.hidden_size * 2, : + # ] + # orig_state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + # config.hidden_size : config.hidden_size * 2 + # ] + # orig_state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + # -config.hidden_size :, : + # ] + # orig_state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + pass + + else: + orig_state_dict[rename_key(key)] = val + + return orig_state_dict + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our ViTPose structure. 
+ """ + + # define default ViTPose configuration + config = ViTPoseConfig() + + # size of the architecture + if "small" in model_name: + config.hidden_size = 768 + config.intermediate_size = 2304 + config.num_hidden_layers = 8 + config.num_attention_heads = 8 + elif "large" in model_name: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif "huge" in model_name: + config.hidden_size = 1280 + config.intermediate_size = 5120 + config.num_hidden_layers = 32 + config.num_attention_heads = 16 + + # load HuggingFace model + model = ViTPoseForPoseEstimation(config) + model.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] + new_state_dict = convert_state_dict(state_dict, model) + model.load_state_dict(new_state_dict) + + # Check outputs on an image, prepared by ViTFeatureExtractor + feature_extractor = ViTFeatureExtractor(size=config.image_size) + encoding = feature_extractor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + outputs = model(pixel_values) + + # TODO assert logits + print(outputs.keys()) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="vitpose_base", + type=str, + help="Name of the ViTPose model you'd like to convert.", + ) + parser.add_argument( + "--checkpoint_path", + default="/Users/nielsrogge/Documents/ViTPose/Original checkpoints/vitpose-b-simple.pth", + type=str, + help="Path to the original PyTorch checkpoint (.pt file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
+ ) + + args = parser.parse_args() + convert_vitpose_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index c66e1b3c757c..3a471695289f 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -22,19 +22,12 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput, MaskedLMOutput +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_vitpose import ViTPoseConfig @@ -59,7 +52,6 @@ ] - # Inspired by # https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py # From PyTorch internals @@ -73,18 +65,15 @@ def to_2tuple(x): # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings with ViT->ViTPose class ViTPoseEmbeddings(nn.Module): """ - Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + Construct the position and patch embeddings. """ - def __init__(self, config: ViTPoseConfig, use_mask_token: bool = False) -> None: + def __init__(self, config: ViTPoseConfig) -> None: super().__init__() - self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None self.patch_embeddings = PatchEmbeddings( image_size=config.image_size, patch_size=config.patch_size, @@ -96,70 +85,18 @@ def __init__(self, config: ViTPoseConfig, use_mask_token: bool = False) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config - def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: - """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. 
- - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 - """ - - npatch = embeddings.shape[1] - 1 - N = self.position_embeddings.shape[1] - 1 - if npatch == N and height == width: - return self.position_embeddings - class_pos_embed = self.position_embeddings[:, 0] - patch_pos_embed = self.position_embeddings[:, 1:] - dim = embeddings.shape[-1] - h0 = height // self.config.patch_size - w0 = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - h0, w0 = h0 + 0.1, w0 + 0.1 - patch_pos_embed = nn.functional.interpolate( - patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), - scale_factor=(h0 / math.sqrt(N), w0 / math.sqrt(N)), - mode="bicubic", - align_corners=False, - ) - assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1] - patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) - - def forward( - self, - pixel_values: torch.Tensor, - bool_masked_pos: Optional[torch.BoolTensor] = None, - interpolate_pos_encoding: bool = False, - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape - embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) - - batch_size, seq_len, _ = embeddings.size() - if bool_masked_pos is not None: - mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) - # replace the masked visual tokens by mask_tokens - mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) - embeddings = embeddings * (1.0 - mask) + mask_tokens * mask - - # add the [CLS] token to the embedded patch tokens - cls_tokens = self.cls_token.expand(batch_size, -1, -1) - embeddings = torch.cat((cls_tokens, embeddings), dim=1) + embeddings = self.patch_embeddings(pixel_values) # add positional encoding to each token - if interpolate_pos_encoding: - embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) - else: - embeddings = embeddings + self.position_embeddings + embeddings = embeddings + self.position_embeddings embeddings = self.dropout(embeddings) return embeddings -# Based on timm implementation, which can be found here: -# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py class PatchEmbeddings(nn.Module): """ Image to Patch Embedding. @@ -183,14 +120,12 @@ def __init__( self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) - def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape - if not interpolate_pos_encoding: - if height != self.image_size[0] or width != self.image_size[1]: - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." - ) + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." 
+ ) x = self.projection(pixel_values).flatten(2).transpose(1, 2) return x @@ -509,8 +444,6 @@ def _set_gradient_checkpointing(self, module: ViTPoseEncoder, value: bool = Fals output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): - Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -520,13 +453,12 @@ def _set_gradient_checkpointing(self, module: ViTPoseEncoder, value: bool = Fals "The bare ViTPose Model transformer outputting raw hidden-states without any specific head on top.", VITPOSE_START_DOCSTRING, ) -# Copied from transformers.models.vit.modeling_vit.ViTModel with VIT->VITPOSE,ViT->ViTPose class ViTPoseModel(ViTPosePreTrainedModel): - def __init__(self, config: ViTPoseConfig, add_pooling_layer: bool = True, use_mask_token: bool = False): + def __init__(self, config: ViTPoseConfig, add_pooling_layer: bool = True): super().__init__(config) self.config = config - self.embeddings = ViTPoseEmbeddings(config, use_mask_token=use_mask_token) + self.embeddings = ViTPoseEmbeddings(config) self.encoder = ViTPoseEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -558,11 +490,9 @@ class PreTrainedModel def forward( self, pixel_values: Optional[torch.Tensor] = None, - bool_masked_pos: Optional[torch.BoolTensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = None, return_dict: Optional[bool] = None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -581,9 +511,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output = self.embeddings( - pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding - ) + embedding_output = self.embeddings(pixel_values) encoder_outputs = self.encoder( embedding_output, @@ -622,3 +550,57 @@ def forward(self, hidden_states): pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output + + +class ViTPoseForPoseEstimation(ViTPosePreTrainedModel): + def __init__(self, config: ViTPoseConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.vit = ViTPoseModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output[:, 0, :]) + + loss = None + if labels is not None: + raise NotImplementedError("To do") + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 13db6dffa81e..916dc8131259 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4501,6 +4501,13 @@ def __init__(self, *args, **kwargs): VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST = None +class ViTPoseForPoseEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class ViTPoseModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index c3fa1fc1c108..4b27199a43de 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -27,10 +27,9 @@ if is_torch_available(): - import torch from torch import nn - from transformers import ViTPoseForImageClassification, ViTPoseForMaskedImageModeling, ViTPoseModel + from transformers import ViTPoseForPoseEstimation, ViTPoseModel from transformers.models.vitpose.modeling_vitpose import VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST @@ -120,14 +119,6 @@ def create_and_check_model(self, config, pixel_values, labels): result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = ViTPoseForImageClassification(config) - model.to(torch_device) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -149,8 +140,7 @@ class ViTPoseModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( ViTPoseModel, - ViTPoseForImageClassification, - ViTPoseForMaskedImageModeling, + ViTPoseForPoseEstimation, ) if is_torch_available() else () @@ -219,24 +209,10 @@ def prepare_img(): class ViTPoseModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return ViTFeatureExtractor.from_pretrained("google/vitpose-base-patch16-224") if is_vision_available() else None + return ( + ViTFeatureExtractor.from_pretrained("google/vitpose-base-patch16-224") if is_vision_available() else None + ) @slow - def test_inference_image_classification_head(self): - model = ViTPoseForImageClassification.from_pretrained("google/vitpose-base-patch16-224").to(torch_device) - - feature_extractor = self.default_feature_extractor - image = prepare_img() - inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) - - # forward pass - with torch.no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = torch.Size((1, 1000)) - self.assertEqual(outputs.logits.shape, 
expected_shape) - - expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) - - self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) + def test_inference_pose_estimation_head(self): + raise NotImplementedError("To do") diff --git a/utils/check_repo.py b/utils/check_repo.py index 6eaa2d752acd..48ef4aacb8c7 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -120,6 +120,7 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "ViTPoseForPoseEstimation", "DPTForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", From 90018b01ff27fbea75231a3802d623874ec77188 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 27 May 2022 12:06:53 +0200 Subject: [PATCH 003/181] =?UTF-8?q?Make=20forward=20pass=20work=C3=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/vitpose/configuration_vitpose.py | 27 ++++--- .../vitpose/convert_vitpose_to_pytorch.py | 78 ++++++++----------- .../models/vitpose/modeling_vitpose.py | 41 +++++----- 3 files changed, 65 insertions(+), 81 deletions(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 757a0efa196d..9f1d937e9f7d 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -43,6 +43,12 @@ class ViTPoseConfig(PretrainedConfig): Args: + image_size (`int`, *optional*, defaults to `[256, 192]`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to `[16, 16]`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to `3`): + The number of input channels. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -62,16 +68,10 @@ class ViTPoseConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to `224`): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to `16`): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to `3`): - The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - encoder_stride (`int`, `optional`, defaults to 16): - Factor to increase the spatial resolution by in the decoder head for masked image modeling. + num_keypoints (`int`, *optional*, defaults to `17`): + The number of keypoints. 
Example: @@ -91,6 +91,9 @@ class ViTPoseConfig(PretrainedConfig): def __init__( self, + image_size=[256, 192], + patch_size=[16, 16], + num_channels=3, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -100,12 +103,8 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-12, - is_encoder_decoder=False, - image_size=224, - patch_size=16, - num_channels=3, qkv_bias=True, - encoder_stride=16, + num_keypoints=17, **kwargs ): super().__init__(**kwargs) @@ -123,7 +122,7 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.qkv_bias = qkv_bias - self.encoder_stride = encoder_stride + self.num_keypoints = num_keypoints class ViTPoseOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py index bc9eb25c5476..c8572d32fa0e 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py @@ -35,11 +35,13 @@ def rename_key(name): if "backbone" in name: - name = name.replace("backbone", "vitpose") - if "patch_embed" in name: - name = name.replace("patch_embed", "embeddings.patch_embeddings") - if "layers" in name: - name = "encoder." + name + name = name.replace("backbone", "vit") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") + if "pos_embed" in name: + name = name.replace("pos_embed", "embeddings.position_embeddings") + if "blocks" in name: + name = name.replace("blocks", "encoder.layer") if "attn.proj" in name: name = name.replace("attn.proj", "attention.output.dense") if "attn" in name: @@ -52,47 +54,31 @@ def rename_key(name): name = name.replace("mlp.fc1", "intermediate.dense") if "mlp.fc2" in name: name = name.replace("mlp.fc2", "output.dense") - - if name == "norm.weight": - name = "layernorm.weight" - if name == "norm.bias": - name = "layernorm.bias" - - if "head" in name: - name = name.replace("head", "classifier") - else: - name = "swin." + name + if "last_norm" in name: + name = name.replace("last_norm", "layernorm") + if "final_layer." 
in name: + name = name.replace("final_layer.", "") return name -def convert_state_dict(orig_state_dict): +def convert_state_dict(orig_state_dict, dim): for key in orig_state_dict.copy().keys(): val = orig_state_dict.pop(key) if "qkv" in key: - # layer_num = int(key_split[1]) - - # # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - # in_proj_weight = orig_state_dict.pop(f"blocks.{i}.attn.qkv.weight") - # in_proj_bias = orig_state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # # next, add query, keys and values (in that order) to the state dict - # orig_state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - # : config.hidden_size, : - # ] - # orig_state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - # orig_state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - # config.hidden_size : config.hidden_size * 2, : - # ] - # orig_state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - # config.hidden_size : config.hidden_size * 2 - # ] - # orig_state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - # -config.hidden_size :, : - # ] - # orig_state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - pass - + key_split = key.split(".") + layer_num = int(key_split[2]) + if "weight" in key: + orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :] + orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + else: + orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim] + orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] + orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:] else: orig_state_dict[rename_key(key)] = val @@ -138,23 +124,25 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # load state_dict of original model, remove and rename some keys state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] - new_state_dict = convert_state_dict(state_dict, model) + new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size) model.load_state_dict(new_state_dict) # Check outputs on an image, prepared by ViTFeatureExtractor - feature_extractor = ViTFeatureExtractor(size=config.image_size) + feature_extractor = ViTFeatureExtractor(size=config.image_size[::-1]) encoding = feature_extractor(images=prepare_img(), return_tensors="pt") pixel_values = encoding["pixel_values"] + + print("Shape of pixel values:", pixel_values.shape) outputs = model(pixel_values) # TODO assert logits print(outputs.keys()) - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) + # Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + # model.save_pretrained(pytorch_dump_folder_path) + # print(f"Saving feature extractor to {pytorch_dump_folder_path}") + # 
feature_extractor.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 3a471695289f..b4360c840b23 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -52,19 +52,6 @@ ] -# Inspired by -# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py -# From PyTorch internals -def to_2tuple(x): - if isinstance(x, collections.abc.Iterable): - return x - return (x, x) - - -# Based on timm implementation, which can be found here: -# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py - - class ViTPoseEmbeddings(nn.Module): """ Construct the position and patch embeddings. @@ -90,7 +77,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: embeddings = self.patch_embeddings(pixel_values) # add positional encoding to each token - embeddings = embeddings + self.position_embeddings + embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] embeddings = self.dropout(embeddings) @@ -111,14 +98,14 @@ def __init__( embed_dim: int = 768, ): super().__init__() - image_size = to_2tuple(image_size) - patch_size = to_2tuple(patch_size) + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) self.image_size = image_size self.patch_size = patch_size self.num_patches = num_patches - self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=2) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape @@ -559,8 +546,12 @@ def __init__(self, config: ViTPoseConfig) -> None: self.num_labels = config.num_labels self.vit = ViTPoseModel(config, add_pooling_layer=False) - # Classifier head - self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + # Keypoint head + final_conv_kernel = 3 + padding = 1 + self.keypoint_head = nn.Conv2d( + config.hidden_size, config.num_keypoints, kernel_size=final_conv_kernel, stride=1, padding=padding + ) # Initialize weights and apply final processing self.post_init() @@ -572,7 +563,6 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, ImageClassifierOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -582,13 +572,20 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) sequence_output = outputs[0] + batch_size = sequence_output.shape[0] + patch_height = self.config.image_size[0] // self.config.patch_size[0] + patch_width = self.config.image_size[1] // self.config.patch_size[1] + sequence_output = ( + sequence_output.permute(0, 2, 
1).reshape(batch_size, -1, patch_height, patch_width).contiguous() + ) + + print("Shape of sequence output:", sequence_output.shape) - logits = self.classifier(sequence_output[:, 0, :]) + logits = self.keypoint_head(sequence_output) loss = None if labels is not None: From 5ce0b8b9ddc56b2094a9fdd72e56dc46945de396 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 2 Jun 2022 12:36:33 +0200 Subject: [PATCH 004/181] Improve code --- .../vitpose/convert_vitpose_to_pytorch.py | 5 +- .../vitpose/feature_extraction_vitpose.py | 186 ++++++++++++++++++ .../models/vitpose/modeling_vitpose.py | 8 +- 3 files changed, 196 insertions(+), 3 deletions(-) create mode 100644 src/transformers/models/vitpose/feature_extraction_vitpose.py diff --git a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py index c8572d32fa0e..8fb7d1e12099 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py @@ -19,7 +19,6 @@ import argparse -from pathlib import Path import torch from PIL import Image @@ -29,6 +28,8 @@ from transformers.utils import logging +# from pathlib import Path + logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -136,7 +137,7 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ outputs = model(pixel_values) # TODO assert logits - print(outputs.keys()) + print("Shape of logits:", outputs.logits.shape) # Path(pytorch_dump_folder_path).mkdir(exist_ok=True) # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/vitpose/feature_extraction_vitpose.py b/src/transformers/models/vitpose/feature_extraction_vitpose.py new file mode 100644 index 000000000000..89e2cc5f6b2a --- /dev/null +++ b/src/transformers/models/vitpose/feature_extraction_vitpose.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for ViTPose.""" + +from typing import Optional, Union +import math + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +def get_warp_matrix(theta, size_input, size_dst, size_target): + """ + Source: https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py + + Calculate the transformation matrix under the constraint of unbiased. Paper ref: Huang et al. The Devil is in the + Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. 
+ size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + np.ndarray: A matrix for transformation. + """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = math.cos(theta) * scale_x + matrix[0, 1] = -math.sin(theta) * scale_x + matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + + 0.5 * size_input[1] * math.sin(theta) + + 0.5 * size_target[0]) + matrix[1, 0] = math.sin(theta) * scale_y + matrix[1, 1] = math.cos(theta) * scale_y + matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - + 0.5 * size_input[1] * math.cos(theta) + + 0.5 * size_target[1]) + return matrix + + +class ViTPoseFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a ViTPose feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + do_affine_transform (`bool`, *optional*, defaults to `True`): + Whether to apply an affine transformation to the input images. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_affine_transform=True, + do_rescale=True, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.do_affine_transform = do_affine_transform + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + + def affine_transform(self, image): + raise NotImplementedError("To do") + + #transformation = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + + #image = image.transform(transformation, Image.AFFINE, resample=Image.BILINEAR) + + #return image + + def __call__( + self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s). + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): + If set, will return tensors of a particular framework. 
Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). + """ + # Input type checking for clearer error + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + if not is_batched: + images = [images] + + # transformations (affine transformation + rescaling + normalization) + if self.do_affine_transform: + images = [self.affine_transform(image) for image in images] + if self.do_rescale: + images = [self.to_numpy_array(image=image) for image in images] + if self.do_normalize: + images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + + # return as BatchFeature + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs \ No newline at end of file diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index b4360c840b23..3fd891d85c6e 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -575,6 +575,7 @@ def forward( return_dict=return_dict, ) + # Turn output hidden states in tensor of shape (batch_size, num_channels, height, width) sequence_output = outputs[0] batch_size = sequence_output.shape[0] patch_height = self.config.image_size[0] // self.config.patch_size[0] @@ -583,8 +584,13 @@ def forward( sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous() ) - print("Shape of sequence output:", sequence_output.shape) + # ReLu + upsample + sequence_output = nn.functional.relu(sequence_output) + sequence_output = nn.functional.interpolate( + sequence_output, scale_factor=4, mode="bilinear", align_corners=False + ) + # Conv2d logits = self.keypoint_head(sequence_output) loss = None From 8f397734be5dd23982add20a893c4cb1d7936ef2 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 15 Apr 2024 10:17:12 +0200 Subject: [PATCH 005/181] More improvements --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_vi.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.md | 1 + docs/source/en/model_doc/vitpose.md | 40 +++++++++++++ docs/source/en/model_doc/vitpose.mdx | 44 -------------- src/transformers/__init__.py | 32 +++++------ src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 4 +- 
src/transformers/models/auto/modeling_auto.py | 4 +- src/transformers/models/vitpose/__init__.py | 2 +- .../models/vitpose/configuration_vitpose.py | 14 ++--- .../vitpose/convert_vitpose_to_pytorch.py | 34 +++++++---- ...vitpose.py => image_processing_vitpose.py} | 46 +++++++-------- .../models/vitpose/modeling_vitpose.py | 57 ++++++------------- src/transformers/utils/dummy_pt_objects.py | 52 ++++++++--------- 26 files changed, 166 insertions(+), 179 deletions(-) create mode 100644 docs/source/en/model_doc/vitpose.md delete mode 100644 docs/source/en/model_doc/vitpose.mdx rename src/transformers/models/vitpose/{feature_extraction_vitpose.py => image_processing_vitpose.py} (87%) diff --git a/README.md b/README.md index de844848a402..e95654117f82 100644 --- a/README.md +++ b/README.md @@ -538,6 +538,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (from The University of Sydney) released with the paper [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_de.md b/README_de.md index e5bd3522ca3e..7aa386213272 100644 --- a/README_de.md +++ b/README_de.md @@ -534,6 +534,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. 
**[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (from The University of Sydney) released with the paper [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_es.md b/README_es.md index 5f6a2afb7c51..864b5443bed7 100644 --- a/README_es.md +++ b/README_es.md @@ -511,6 +511,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (from The University of Sydney) released with the paper [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. 
**[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_fr.md b/README_fr.md index 9c6f71d324cf..3d362055051a 100644 --- a/README_fr.md +++ b/README_fr.md @@ -532,6 +532,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (de Meta AI) publié dans l'article [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) par Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (de HUST-VL) publié dans l'article [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) par Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (de Meta AI) publié dans l'article [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) par Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (de The University of Sydney) publié dans l'article [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) parYufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (de Kakao Enterprise) publié dans l'article [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) par Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (de Google Research) publié dans l'article [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) par Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (de Facebook AI) publié dans l'article [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) par Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_hd.md b/README_hd.md index 19dfa18b2644..f9064846e50a 100644 --- a/README_hd.md +++ b/README_hd.md @@ -485,6 +485,7 @@ conda install conda-forge::transformers 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (मेटा एआई से) साथ में कागज [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) कैमिंग हे, ज़िनेली चेन, सेनिंग ज़ी, यांगहो ली, पिओट्र डॉलर, रॉस गिर्शिक द्वारा। 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL से) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 
द्वाराअनुसंधान पत्र [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) के साथ जारी किया गया 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (मेटा एआई से) साथ में कागज [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) महमूद असरान, मथिल्डे कैरन, ईशान मिश्रा, पियोट्र बोजानोवस्की, फ्लोरियन बोर्डेस, पास्कल विंसेंट, आर्मंड जौलिन, माइकल रब्बत, निकोलस बल्लास द्वारा। +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (The University of Sydney से) Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. द्वाराअनुसंधान पत्र [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) के साथ जारी किया गया 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise से) Jaehyeon Kim, Jungil Kong, Juhee Son. द्वाराअनुसंधान पत्र [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) के साथ जारी किया गया 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा। diff --git a/README_ja.md b/README_ja.md index 443f650cae45..15413a0e7fcf 100644 --- a/README_ja.md +++ b/README_ja.md @@ -545,6 +545,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI から) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick から公開された研究論文: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL から) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. から公開された研究論文 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI から) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas から公開された研究論文: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (The University of Sydney から) Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. から公開された研究論文 [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise から) Jaehyeon Kim, Jungil Kong, Juhee Son. から公開された研究論文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) diff --git a/README_ko.md b/README_ko.md index dfb271b73748..1d6e9bd1d71e 100644 --- a/README_ko.md +++ b/README_ko.md @@ -460,6 +460,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI 에서) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 의 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 논문과 함께 발표했습니다. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL 에서 제공)은 Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.의 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272)논문과 함께 발표했습니다. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI 에서) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 의 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) 논문과 함께 발표했습니다. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (The University of Sydney 에서 제공)은 Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao.의 [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484)논문과 함께 발표했습니다. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise 에서 제공)은 Jaehyeon Kim, Jungil Kong, Juhee Son.의 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103)논문과 함께 발표했습니다. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI 에서) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 의 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 논문과 함께 발표했습니다. diff --git a/README_pt-br.md b/README_pt-br.md index 8dcdddac0093..60d8272503a1 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -543,6 +543,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) rreleased with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. 
**[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (from The University of Sydney) released with the paper [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_ru.md b/README_ru.md index 7ee48cbcaf71..5eba2fb757a6 100644 --- a/README_ru.md +++ b/README_ru.md @@ -533,6 +533,7 @@ conda install conda-forge::transformers 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) rreleased with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (from The University of Sydney) released with the paper [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. 
**[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_te.md b/README_te.md index 687a97d69780..e661bc28e8bc 100644 --- a/README_te.md +++ b/README_te.md @@ -535,6 +535,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (from The University of Sydney) released with the paper [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_vi.md b/README_vi.md index 1a872b9ce5c0..e417de1e6d97 100644 --- a/README_vi.md +++ b/README_vi.md @@ -534,6 +534,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (từ Meta AI) được phát hành với bài báo [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (từ HUST-VL) được phát hành với bài báo [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. 
**[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (từ Meta AI) được phát hành với bài báo [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (từ The University of Sydney) được phát hành với bài báo [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (từ Kakao Enterprise) được phát hành với bài báo [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (từ Google Research) được phát hành với bài báo [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (từ Facebook AI) được phát hành với bài báo [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_zh-hans.md b/README_zh-hans.md index 7e307c781a28..34bb181dc28a 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -484,6 +484,7 @@ conda install conda-forge::transformers 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (来自 HUST-VL) 伴随论文 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) 由 Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang 发布。 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (来自 The University of Sydney) 伴随论文 [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) 由 Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao 发布。 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (来自 Kakao Enterprise) 伴随论文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) 由 Jaehyeon Kim, Jungil Kong, Juhee Son 发布。 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (来自 Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) 由 Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. 
**[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 1431293df5cc..0e94e60f93de 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -496,6 +496,7 @@ conda install conda-forge::transformers 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[ViTPose](https://huggingface.co/docs/transformers/main/model_doc/vitpose)** (from The University of Sydney) released with the paper [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 701f0a7e6b3a..7e21ecaf285c 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -308,6 +308,7 @@ Flax), PyTorch, and/or TensorFlow. | [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ | | [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | +| [ViTPose](model_doc/vitpose) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md new file mode 100644 index 000000000000..e709cd974646 --- /dev/null +++ b/docs/source/en/model_doc/vitpose.md @@ -0,0 +1,40 @@ + + +# ViTPose + +## Overview + +The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. 
ViTPose employs a standard [Vision Transformer](vit) as backbone for the task of keypoint estimation. + +The abstract from the paper is the following: + +*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.* + + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). + + +## ViTPoseConfig + +[[autodoc]] ViTPoseConfig + +## ViTPoseModel + +[[autodoc]] ViTPoseModel + - forward + +## ViTPoseForPoseEstimation + +[[autodoc]] ViTPoseForPoseEstimation + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/vitpose.mdx b/docs/source/en/model_doc/vitpose.mdx deleted file mode 100644 index c1be84b35023..000000000000 --- a/docs/source/en/model_doc/vitpose.mdx +++ /dev/null @@ -1,44 +0,0 @@ - - -# ViTPose - -## Overview - -The ViTPose model was proposed in []() by . - - -The abstract from the paper is the following: - -** - -Tips: - - - -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). 
- - -## ViTPoseConfig - -[[autodoc]] ViTPoseConfig - -## ViTPoseModel - -[[autodoc]] ViTPoseModel - - forward - -## ViTPoseForPoseEstimation - -[[autodoc]] ViTPoseForPoseEstimation - - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 54a2b7aeebd2..409760a5081e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -926,10 +926,10 @@ "ViTHybridConfig", ], "models.vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"], - "models.vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig"], "models.vit_msn": ["VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMSNConfig"], "models.vitdet": ["VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitDetConfig"], "models.vitmatte": ["VITMATTE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitMatteConfig"], + "models.vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig"], "models.vits": [ "VITS_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitsConfig", @@ -3619,14 +3619,6 @@ "ViTPreTrainedModel", ] ) - _import_structure["models.vitpose"].extend( - [ - "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", - "ViTPoseModel", - "ViTPosePreTrainedModel", - "ViTPoseForPoseEstimation", - ] - ) _import_structure["models.vit_hybrid"].extend( [ "VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3667,6 +3659,14 @@ "VitMattePreTrainedModel", ] ) + _import_structure["models.vitpose"].extend( + [ + "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTPoseForPoseEstimation", + "ViTPoseModel", + "ViTPosePreTrainedModel", + ] + ) _import_structure["models.vits"].extend( [ "VITS_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5832,10 +5832,10 @@ ViTHybridConfig, ) from .models.vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig - from .models.vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig from .models.vit_msn import VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMSNConfig from .models.vitdet import VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP, VitDetConfig from .models.vitmatte import VITMATTE_PRETRAINED_CONFIG_ARCHIVE_MAP, VitMatteConfig + from .models.vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig from .models.vits import ( VITS_PRETRAINED_CONFIG_ARCHIVE_MAP, VitsConfig, @@ -8134,12 +8134,6 @@ ViTMAEModel, ViTMAEPreTrainedModel, ) - from .models.vitpose import ( - VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, - ViTPoseForPoseEstimation, - ViTPoseModel, - ViTPosePreTrainedModel, - ) from .models.vit_msn import ( VIT_MSN_PRETRAINED_MODEL_ARCHIVE_LIST, ViTMSNForImageClassification, @@ -8157,6 +8151,12 @@ VitMatteForImageMatting, VitMattePreTrainedModel, ) + from .models.vitpose import ( + VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTPoseForPoseEstimation, + ViTPoseModel, + ViTPosePreTrainedModel, + ) from .models.vits import ( VITS_PRETRAINED_MODEL_ARCHIVE_LIST, VitsModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 52921b903744..ee6889bf3636 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -244,10 +244,10 @@ vit, vit_hybrid, vit_mae, - vitpose, vit_msn, vitdet, vitmatte, + vitpose, vits, vivit, wav2vec2, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6e67a833c6dd..738e136159fb 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -260,10 +260,10 @@ ("vit", "ViTConfig"), ("vit_hybrid", "ViTHybridConfig"), ("vit_mae", "ViTMAEConfig"), - ("vitpose", 
"ViTPoseConfig"), ("vit_msn", "ViTMSNConfig"), ("vitdet", "VitDetConfig"), ("vitmatte", "VitMatteConfig"), + ("vitpose", "ViTPoseConfig"), ("vits", "VitsConfig"), ("vivit", "VivitConfig"), ("wav2vec2", "Wav2Vec2Config"), @@ -539,10 +539,10 @@ ("vit", "ViT"), ("vit_hybrid", "ViT Hybrid"), ("vit_mae", "ViTMAE"), - ("vitpose", "ViTPose"), ("vit_msn", "ViTMSN"), ("vitdet", "VitDet"), ("vitmatte", "ViTMatte"), + ("vitpose", "ViTPose"), ("vits", "VITS"), ("vivit", "ViViT"), ("wav2vec2", "Wav2Vec2"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1d4d29a49a66..09e42fd7670e 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -238,9 +238,9 @@ ("vit", "ViTModel"), ("vit_hybrid", "ViTHybridModel"), ("vit_mae", "ViTMAEModel"), - ("vitpose", "ViTPoseModel"), ("vit_msn", "ViTMSNModel"), ("vitdet", "VitDetModel"), + ("vitpose", "ViTPoseModel"), ("vits", "VitsModel"), ("vivit", "VivitModel"), ("wav2vec2", "Wav2Vec2Model"), @@ -555,7 +555,6 @@ ("swin", "SwinForMaskedImageModeling"), ("swinv2", "Swinv2ForMaskedImageModeling"), ("vit", "ViTForMaskedImageModeling"), - ("vitpose", "ViTPoseForMaskedImageModeling"), ] ) @@ -622,7 +621,6 @@ ("swinv2", "Swinv2ForImageClassification"), ("van", "VanForImageClassification"), ("vit", "ViTForImageClassification"), - ("vitpose", "ViTPoseForImageClassification"), ("vit_hybrid", "ViTHybridForImageClassification"), ("vit_msn", "ViTMSNForImageClassification"), ] diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py index d30adaf44cf8..0329e2d4d007 100644 --- a/src/transformers/models/vitpose/__init__.py +++ b/src/transformers/models/vitpose/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 9f1d937e9f7d..72ab7f60fe43 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -47,7 +47,7 @@ class ViTPoseConfig(PretrainedConfig): The size (resolution) of each image. patch_size (`int`, *optional*, defaults to `[16, 16]`): The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): The number of input channels. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. @@ -60,9 +60,9 @@ class ViTPoseConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. 
- hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -70,7 +70,7 @@ class ViTPoseConfig(PretrainedConfig): The epsilon used by the layer normalization layers. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - num_keypoints (`int`, *optional*, defaults to `17`): + num_keypoints (`int`, *optional*, defaults to 17): The number of keypoints. Example: @@ -87,6 +87,7 @@ class ViTPoseConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "vitpose" def __init__( @@ -105,7 +106,7 @@ def __init__( layer_norm_eps=1e-12, qkv_bias=True, num_keypoints=17, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -126,7 +127,6 @@ def __init__( class ViTPoseOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py index 8fb7d1e12099..a3521eda48a2 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,11 +19,12 @@ import argparse +from pathlib import Path +import requests import torch from PIL import Image -import requests from transformers import ViTFeatureExtractor, ViTPoseConfig, ViTPoseForPoseEstimation from transformers.utils import logging @@ -94,7 +95,7 @@ def prepare_img(): @torch.no_grad() -def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path): +def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): """ Copy/paste/tweak model's weights to our ViTPose structure. 
""" @@ -129,8 +130,8 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ model.load_state_dict(new_state_dict) # Check outputs on an image, prepared by ViTFeatureExtractor - feature_extractor = ViTFeatureExtractor(size=config.image_size[::-1]) - encoding = feature_extractor(images=prepare_img(), return_tensors="pt") + image_processor = ViTFeatureExtractor(size=config.image_size[::-1]) + encoding = image_processor(images=prepare_img(), return_tensors="pt") pixel_values = encoding["pixel_values"] print("Shape of pixel values:", pixel_values.shape) @@ -139,11 +140,17 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # TODO assert logits print("Shape of logits:", outputs.logits.shape) - # Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - # model.save_pretrained(pytorch_dump_folder_path) - # print(f"Saving feature extractor to {pytorch_dump_folder_path}") - # feature_extractor.save_pretrained(pytorch_dump_folder_path) + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print(f"Pushing model and image processor for {model_name} to hub") + model.push_to_hub(f"nielsr/{model_name}") + image_processor.push_to_hub(f"nielsr/{model_name}") if __name__ == "__main__": @@ -157,13 +164,16 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ ) parser.add_argument( "--checkpoint_path", - default="/Users/nielsrogge/Documents/ViTPose/Original checkpoints/vitpose-b-simple.pth", + default="/Users/nielsrogge/Documents/ViTPose/vitpose-b-simple.pth", type=str, help="Path to the original PyTorch checkpoint (.pt file).", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) args = parser.parse_args() - convert_vitpose_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path) + convert_vitpose_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/vitpose/feature_extraction_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py similarity index 87% rename from src/transformers/models/vitpose/feature_extraction_vitpose.py rename to src/transformers/models/vitpose/image_processing_vitpose.py index 89e2cc5f6b2a..7687dec7a91f 100644 --- a/src/transformers/models/vitpose/feature_extraction_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ # limitations under the License. 
"""Feature extractor class for ViTPose.""" -from typing import Optional, Union import math +from typing import Optional, Union import numpy as np from PIL import Image @@ -37,16 +37,16 @@ def get_warp_matrix(theta, size_input, size_dst, size_target): """ Source: https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py - + Calculate the transformation matrix under the constraint of unbiased. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). - + Args: theta (float): Rotation angle in degrees. size_input (np.ndarray): Size of input image [w, h]. size_dst (np.ndarray): Size of output image [w, h]. size_target (np.ndarray): Size of ROI in input plane [w, h]. - + Returns: np.ndarray: A matrix for transformation. """ @@ -56,20 +56,20 @@ def get_warp_matrix(theta, size_input, size_dst, size_target): scale_y = size_dst[1] / size_target[1] matrix[0, 0] = math.cos(theta) * scale_x matrix[0, 1] = -math.sin(theta) * scale_x - matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + - 0.5 * size_input[1] * math.sin(theta) + - 0.5 * size_target[0]) + matrix[0, 2] = scale_x * ( + -0.5 * size_input[0] * math.cos(theta) + 0.5 * size_input[1] * math.sin(theta) + 0.5 * size_target[0] + ) matrix[1, 0] = math.sin(theta) * scale_y matrix[1, 1] = math.cos(theta) * scale_y - matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - - 0.5 * size_input[1] * math.cos(theta) + - 0.5 * size_target[1]) + matrix[1, 2] = scale_y * ( + -0.5 * size_input[0] * math.sin(theta) - 0.5 * size_input[1] * math.cos(theta) + 0.5 * size_target[1] + ) return matrix -class ViTPoseFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): +class ViTPoseImageProcessor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a ViTPose feature extractor. + Constructs a ViTPose image processor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. 
@@ -90,13 +90,7 @@ class ViTPoseFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixi model_input_names = ["pixel_values"] def __init__( - self, - do_affine_transform=True, - do_rescale=True, - do_normalize=True, - image_mean=None, - image_std=None, - **kwargs + self, do_affine_transform=True, do_rescale=True, do_normalize=True, image_mean=None, image_std=None, **kwargs ): super().__init__(**kwargs) self.do_affine_transform = do_affine_transform @@ -107,13 +101,13 @@ def __init__( def affine_transform(self, image): raise NotImplementedError("To do") - - #transformation = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) - #image = image.transform(transformation, Image.AFFINE, resample=Image.BILINEAR) + # transformation = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + + # image = image.transform(transformation, Image.AFFINE, resample=Image.BILINEAR) + + # return image - #return image - def __call__( self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs ) -> BatchFeature: @@ -183,4 +177,4 @@ def __call__( data = {"pixel_values": images} encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) - return encoded_inputs \ No newline at end of file + return encoded_inputs diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 3fd891d85c6e..bfde2cb1af0a 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 University of Sydney and The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 University of Sydney and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -191,7 +191,6 @@ def __init__(self, config: ViTPoseConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -249,7 +248,6 @@ def __init__(self, config: ViTPoseConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -341,17 +339,11 @@ def forward( layer_head_mask = head_mask[i] if head_mask is not None else None if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, hidden_states, layer_head_mask, + output_attentions, ) else: layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) @@ -373,7 +365,6 @@ def custom_forward(*inputs): ) -# Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel with ViT->ViTPose,vit->vitpose class ViTPosePreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -384,22 +375,27 @@ class ViTPosePreTrainedModel(PreTrainedModel): base_model_prefix = "vitpose" main_input_name = "pixel_values" supports_gradient_checkpointing = True + _no_split_modules = ["ViTPoseEmbeddings", "ViTPoseLayer"] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" if isinstance(module, (nn.Linear, nn.Conv2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module: ViTPoseEncoder, value: bool = False) -> None: - if isinstance(module, ViTPoseEncoder): - module.gradient_checkpointing = value + elif isinstance(module, ViTPoseEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) VITPOSE_START_DOCSTRING = r""" @@ -441,7 +437,7 @@ def _set_gradient_checkpointing(self, module: ViTPoseEncoder, value: bool = Fals VITPOSE_START_DOCSTRING, ) class ViTPoseModel(ViTPosePreTrainedModel): - def __init__(self, config: ViTPoseConfig, add_pooling_layer: bool = True): + def __init__(self, config: ViTPoseConfig): super().__init__(config) self.config = config @@ -449,7 +445,6 @@ def __init__(self, config: ViTPoseConfig, add_pooling_layer: bool = True): self.encoder = ViTPoseEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, 
eps=config.layer_norm_eps) - self.pooler = ViTPosePooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() @@ -523,28 +518,12 @@ def forward( ) -# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->ViTPose -class ViTPosePooler(nn.Module): - def __init__(self, config: ViTPoseConfig): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - class ViTPoseForPoseEstimation(ViTPosePreTrainedModel): def __init__(self, config: ViTPoseConfig) -> None: super().__init__(config) self.num_labels = config.num_labels - self.vit = ViTPoseModel(config, add_pooling_layer=False) + self.vit = ViTPoseModel(config) # Keypoint head final_conv_kernel = 3 diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index af1c41bffeb5..7c9af99c4a24 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -9076,6 +9076,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class ViTPoseForPoseEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ViTPoseModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ViTPosePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + VITS_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -9231,34 +9255,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST = None - - -class ViTPoseForPoseEstimation(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class ViTPoseModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class ViTPosePreTrainedModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = None - - -class Wav2Vec2ForAudioFrameClassification(metaclass=DummyObject): class Wav2Vec2ConformerForCTC(metaclass=DummyObject): _backends = ["torch"] From 067f593de5d15e4b4afddd186e24055276a2d3ed Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 15 Apr 2024 11:54:10 +0200 Subject: [PATCH 006/181] More improvements --- docs/source/en/model_doc/vitpose.md | 6 + src/transformers/__init__.py | 2 + src/transformers/models/vitpose/__init__.py | 26 +- ...to_pytorch.py => convert_vitpose_to_hf.py} | 112 ++++++--- .../vitpose/image_processing_vitpose.py | 238 ++++++++++++++++-- .../models/vitpose/modeling_vitpose.py | 2 +- .../utils/dummy_vision_objects.py | 7 + 7 files changed, 336 insertions(+), 57 deletions(-) rename src/transformers/models/vitpose/{convert_vitpose_to_pytorch.py => convert_vitpose_to_hf.py} (71%) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 
e709cd974646..3915abf754a9 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -25,6 +25,12 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). +## ViTPoseImageProcessor + +[[autodoc]] ViTPoseImageProcessor + - preprocess + + ## ViTPoseConfig [[autodoc]] ViTPoseConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 409760a5081e..18c43bf0db6b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1367,6 +1367,7 @@ _import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"]) _import_structure["models.vit_hybrid"].extend(["ViTHybridImageProcessor"]) _import_structure["models.vitmatte"].append("VitMatteImageProcessor") + _import_structure["models.vitpose"].append("ViTPoseImageProcessor") _import_structure["models.vivit"].append("VivitImageProcessor") _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) @@ -6269,6 +6270,7 @@ from .models.vit import ViTFeatureExtractor, ViTImageProcessor from .models.vit_hybrid import ViTHybridImageProcessor from .models.vitmatte import VitMatteImageProcessor + from .models.vitpose import ViTPoseImageProcessor from .models.vivit import VivitImageProcessor from .models.yolos import YolosFeatureExtractor, YolosImageProcessor diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py index 0329e2d4d007..94d322338f0e 100644 --- a/src/transformers/models/vitpose/__init__.py +++ b/src/transformers/models/vitpose/__init__.py @@ -17,12 +17,20 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available -_import_structure = { - "configuration_vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig", "ViTPoseOnnxConfig"] -} +_import_structure = {"configuration_vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig"]} + + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_vitpose"] = ["ViTPoseImageProcessor"] + try: if not is_torch_available(): @@ -38,7 +46,15 @@ ] if TYPE_CHECKING: - from .configuration_vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig, ViTPoseOnnxConfig + from .configuration_vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_vitpose import ViTPoseImageProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py similarity index 71% rename from src/transformers/models/vitpose/convert_vitpose_to_pytorch.py rename to src/transformers/models/vitpose/convert_vitpose_to_hf.py index a3521eda48a2..03cece3a4568 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_pytorch.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -21,18 +21,35 @@ import argparse from pathlib import Path +import numpy as np import requests import torch +from huggingface_hub import hf_hub_download from PIL 
import Image -from transformers import ViTFeatureExtractor, ViTPoseConfig, ViTPoseForPoseEstimation -from transformers.utils import logging +from transformers import ViTPoseConfig, ViTPoseForPoseEstimation, ViTPoseImageProcessor -# from pathlib import Path +def get_config(model_name): + config = ViTPoseConfig() + # size of the architecture + if "small" in model_name: + config.hidden_size = 768 + config.intermediate_size = 2304 + config.num_hidden_layers = 8 + config.num_attention_heads = 8 + elif "large" in model_name: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif "huge" in model_name: + config.hidden_size = 1280 + config.intermediate_size = 5120 + config.num_hidden_layers = 32 + config.num_attention_heads = 16 -logging.set_verbosity_info() -logger = logging.get_logger(__name__) + return config def rename_key(name): @@ -101,24 +118,7 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ """ # define default ViTPose configuration - config = ViTPoseConfig() - - # size of the architecture - if "small" in model_name: - config.hidden_size = 768 - config.intermediate_size = 2304 - config.num_hidden_layers = 8 - config.num_attention_heads = 8 - elif "large" in model_name: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "huge" in model_name: - config.hidden_size = 1280 - config.intermediate_size = 5120 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 + config = get_config(model_name) # load HuggingFace model model = ViTPoseForPoseEstimation(config) @@ -126,25 +126,77 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # load state_dict of original model, remove and rename some keys state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] + + for name, param in state_dict.items(): + print(name, param.shape) + new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size) model.load_state_dict(new_state_dict) - # Check outputs on an image, prepared by ViTFeatureExtractor - image_processor = ViTFeatureExtractor(size=config.image_size[::-1]) - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] + # TODO create image processor + image_processor = ViTPoseImageProcessor() + # image_processor = ViTImageProcessor(size=config.image_size[::-1]) + # encoding = image_processor(images=prepare_img(), return_tensors="pt") + # pixel_values = encoding["pixel_values"] + + filepath = hf_hub_download(repo_id="nielsr/test-image", filename="vitpose_batch_data.pt", repo_type="dataset") + pixel_values = torch.load(filepath, map_location="cpu")["img"] + img_metas = torch.load(filepath, map_location="cpu")["img_metas"] print("Shape of pixel values:", pixel_values.shape) outputs = model(pixel_values) - # TODO assert logits + # TODO assert logits (output heatmap) print("Shape of logits:", outputs.logits.shape) + print("First values of output heatmp:", outputs.logits[0, 0, :3, :3]) + + # TODO verify postprocessing + batch_size = pixel_values.shape[0] + heatmaps = outputs.logits.cpu().numpy() + + if "bbox_id" in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]["center"] + s[i, :] = 
img_metas[i]["scale"] + image_paths.append(img_metas[i]["image_file"]) + + if "bbox_score" in img_metas[i]: + score[i] = np.array(img_metas[i]["bbox_score"]).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]["bbox_id"]) + + preds, maxvals = image_processor.keypoints_from_heatmaps(heatmaps, center=c, scale=s) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = maxvals + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result["preds"] = all_preds + result["boxes"] = all_boxes + result["image_paths"] = image_paths + result["bbox_ids"] = bbox_ids + + print(result["boxes"]) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 7687dec7a91f..d797383d89cb 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -17,14 +17,14 @@ import math from typing import Optional, Union +import cv2 import numpy as np from PIL import Image -from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_processing_utils import BaseImageProcessor, BatchFeature from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, - ImageFeatureExtractionMixin, ImageInput, is_torch_tensor, ) @@ -34,6 +34,149 @@ logger = logging.get_logger(__name__) +def _get_max_preds(heatmaps): + """Get keypoint predictions from score maps. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + assert isinstance(heatmaps, np.ndarray), "heatmaps should be numpy.ndarray" + assert heatmaps.ndim == 4, "batch_images should be 4-ndim" + + N, K, _, W = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((N, K, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) + maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + preds[:, :, 0] = preds[:, :, 0] % W + preds[:, :, 1] = preds[:, :, 1] // W + + preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1) + return preds, maxvals + + +def post_dark_udp(coords, batch_heatmaps, kernel=3): + """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The + Devil is in the Details: Delving into Unbiased Data Processing for Human + Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). 
+ + Note: + - batch size: B + - num keypoints: K + - num persons: N + - height of heatmaps: H + - width of heatmaps: W + + B=1 for bottom_up paradigm where all persons share the same heatmap. + B=N for top_down paradigm where each person has its own heatmaps. + + Args: + coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. + batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps + kernel (int): Gaussian kernel size (K) for modulation. + + Returns: + np.ndarray([N, K, 2]): Refined coordinates. + """ + if not isinstance(batch_heatmaps, np.ndarray): + batch_heatmaps = batch_heatmaps.cpu().numpy() + B, K, H, W = batch_heatmaps.shape + N = coords.shape[0] + assert B == 1 or B == N + for heatmaps in batch_heatmaps: + for heatmap in heatmaps: + cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) + np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) + np.log(batch_heatmaps, batch_heatmaps) + + batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode="edge").flatten() + + index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) + index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) + index = index.astype(int).reshape(-1, 1) + i_ = batch_heatmaps_pad[index] + ix1 = batch_heatmaps_pad[index + 1] + iy1 = batch_heatmaps_pad[index + W + 2] + ix1y1 = batch_heatmaps_pad[index + W + 3] + ix1_y1_ = batch_heatmaps_pad[index - W - 3] + ix1_ = batch_heatmaps_pad[index - 1] + iy1_ = batch_heatmaps_pad[index - 2 - W] + + dx = 0.5 * (ix1 - ix1_) + dy = 0.5 * (iy1 - iy1_) + derivative = np.concatenate([dx, dy], axis=1) + derivative = derivative.reshape(N, K, 2, 1) + dxx = ix1 - 2 * i_ + ix1_ + dyy = iy1 - 2 * i_ + iy1_ + dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) + hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) + hessian = hessian.reshape(N, K, 2, 2) + hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) + coords -= np.einsum("ijmn,ijnk->ijmk", hessian, derivative).squeeze() + return coords + + +def transform_preds(coords, center, scale, output_size, use_udp=False): + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (np.ndarray[K, ndims]): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + use_udp (bool): Use unbiased data processing + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + assert coords.shape[1] in (2, 4, 5) + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + + # Recover the scale which is normalized by a factor of 200. 
+ scale = scale * 200.0 + + if use_udp: + scale_x = scale[0] / (output_size[0] - 1.0) + scale_y = scale[1] / (output_size[1] - 1.0) + else: + scale_x = scale[0] / output_size[0] + scale_y = scale[1] / output_size[1] + + target_coords = np.ones_like(coords) + target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 + target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 + + return target_coords + + def get_warp_matrix(theta, size_input, size_dst, size_target): """ Source: https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py @@ -67,13 +210,10 @@ def get_warp_matrix(theta, size_input, size_dst, size_target): return matrix -class ViTPoseImageProcessor(FeatureExtractionMixin, ImageFeatureExtractionMixin): +class ViTPoseImageProcessor(BaseImageProcessor): r""" Constructs a ViTPose image processor. - This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users - should refer to this superclass for more information regarding those methods. - Args: do_affine_transform (`bool`, *optional*, defaults to `True`): Whether to apply an affine transformation to the input images. @@ -81,9 +221,9 @@ class ViTPoseImageProcessor(FeatureExtractionMixin, ImageFeatureExtractionMixin) Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the input with mean and standard deviation. - image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`): + image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`, *optional*): The sequence of means for each channel, to be used when normalizing images. - image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`): + image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`, *optional*): The sequence of standard deviations for each channel, to be used when normalizing images. """ @@ -108,24 +248,16 @@ def affine_transform(self, image): # return image - def __call__( + def preprocess( self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs ) -> BatchFeature: """ - Main method to prepare for the model one or several image(s). - - - - NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass - PIL images. - - + Preprocess an image or batch of images. Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): If set, will return tensors of a particular framework. Acceptable values are: @@ -178,3 +310,67 @@ def __call__( encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) return encoded_inputs + + # TODO rename to post_process_keypoint_detection? 
+ def keypoints_from_heatmaps( + self, + heatmaps, + center, + scale, + kernel=11, + use_udp=False, + ): + """Get final keypoint predictions from heatmaps and transform them back to + the image. + + Note: + - batch size: N + - num keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + post_process (str/None): Choice of methods to post-process + heatmaps. Currently supported: None, 'default', 'unbiased', + 'megvii'. + unbiased (bool): Option to use unbiased decoding. Mutually + exclusive with megvii. + Note: this arg is deprecated and unbiased=True can be replaced + by post_process='unbiased' + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + kernel (int): Gaussian kernel size (K) for modulation, which should + match the heatmap gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + valid_radius_factor (float): The radius factor of the positive area + in classification heatmap for UDP. + use_udp (bool): Use unbiased data processing. + target_type (str): 'GaussianHeatmap' or 'CombinedTarget'. + GaussianHeatmap: Classification target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 
+ """ + # Avoid being affected + heatmaps = heatmaps.copy() + + N, K, H, W = heatmaps.shape + preds, maxvals = _get_max_preds(heatmaps) + preds = post_dark_udp(preds, heatmaps, kernel=kernel) + + # Transform back to the image + for i in range(N): + preds[i] = transform_preds(preds[i], center[i], scale[i], [W, H], use_udp=use_udp) + + return preds, maxvals diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index bfde2cb1af0a..c6842a12dcf0 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -504,7 +504,7 @@ def forward( ) sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + pooled_output = None if not return_dict: head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 80b418adc16f..3845385ad138 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -590,6 +590,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ViTPoseImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class VivitImageProcessor(metaclass=DummyObject): _backends = ["vision"] From 7360c2267939533b6074fd7d7636ea54936a87ad Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 15 Apr 2024 15:02:38 +0200 Subject: [PATCH 007/181] Make predictions match --- .../models/vitpose/convert_vitpose_to_hf.py | 41 +++++++--- .../vitpose/image_processing_vitpose.py | 19 +++-- .../models/vitpose/modeling_vitpose.py | 82 +++++++++++++++---- 3 files changed, 109 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 03cece3a4568..6dd68a239039 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -77,6 +77,8 @@ def rename_key(name): name = name.replace("last_norm", "layernorm") if "final_layer." 
in name: name = name.replace("final_layer.", "") + if "keypoint_head" in name: + name = name.replace("keypoint_head", "head.conv") return name @@ -127,15 +129,14 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # load state_dict of original model, remove and rename some keys state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] - for name, param in state_dict.items(): - print(name, param.shape) + # for name, param in state_dict.items(): + # print(name, param.shape) new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size) model.load_state_dict(new_state_dict) - # TODO create image processor + # TODO verify image processor image_processor = ViTPoseImageProcessor() - # image_processor = ViTImageProcessor(size=config.image_size[::-1]) # encoding = image_processor(images=prepare_img(), return_tensors="pt") # pixel_values = encoding["pixel_values"] @@ -144,15 +145,33 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ img_metas = torch.load(filepath, map_location="cpu")["img_metas"] print("Shape of pixel values:", pixel_values.shape) - outputs = model(pixel_values) + with torch.no_grad(): + # first forward pass + output_heatmap = model(pixel_values).logits - # TODO assert logits (output heatmap) - print("Shape of logits:", outputs.logits.shape) - print("First values of output heatmp:", outputs.logits[0, 0, :3, :3]) + # TODO assert logits (output heatmap) + print("Shape of heatmap:", output_heatmap.shape) + print("Mean value of heatmap:", output_heatmap.numpy().mean()) + + print("----------------") + + # second forward pass (flipped) + pixel_values_flipped = torch.flip(pixel_values, [3]) + print("Mean of pixel_values_flipped:", pixel_values_flipped.mean()) + output_flipped_heatmap = model( + pixel_values_flipped, flip_pairs=[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] + ).logits + + print("Shape of flipped heatmap:", output_flipped_heatmap.shape) + print("Mean value of flipped heatmap:", output_flipped_heatmap.mean()) + + output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 + + print("Mean of final output_heatmap:", output_heatmap.mean()) # TODO verify postprocessing batch_size = pixel_values.shape[0] - heatmaps = outputs.logits.cpu().numpy() + heatmaps = output_heatmap.cpu().numpy() if "bbox_id" in img_metas[0]: bbox_ids = [] @@ -173,7 +192,7 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ if bbox_ids is not None: bbox_ids.append(img_metas[i]["bbox_id"]) - preds, maxvals = image_processor.keypoints_from_heatmaps(heatmaps, center=c, scale=s) + preds, maxvals = image_processor.keypoints_from_heatmaps(heatmaps, center=c, scale=s, use_udp=True) all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) all_boxes = np.zeros((batch_size, 6), dtype=np.float32) @@ -191,7 +210,7 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ result["image_paths"] = image_paths result["bbox_ids"] = bbox_ids - print(result["boxes"]) + # print(result) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index d797383d89cb..bd8d20935b7f 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -37,14 +37,13 @@ def _get_max_preds(heatmaps): """Get 
keypoint predictions from score maps. - Note: - batch_size: N - num_keypoints: K - heatmap height: H - heatmap width: W - Args: - heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + heatmaps (np.ndarray of shape [N, K, H, W]): + Model predicted heatmaps. Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W Returns: tuple: A tuple containing aggregated results. @@ -366,7 +365,13 @@ def keypoints_from_heatmaps( heatmaps = heatmaps.copy() N, K, H, W = heatmaps.shape + + print("Mean of heatmaps before _get_max_preds:", np.mean(heatmaps)) + preds, maxvals = _get_max_preds(heatmaps) + + print("Preds after _get_max_preds:", preds) + preds = post_dark_udp(preds, heatmaps, kernel=kernel) # Transform back to the image diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index c6842a12dcf0..50376f4843ed 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -518,6 +518,66 @@ def forward( ) +def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): + """Flip the flipped heatmaps back to the original form. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained + from the flipped images. + flip_pairs (list[tuple()): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + target_type (str): GaussianHeatmap or CombinedTarget + + Returns: + np.ndarray: heatmaps that flipped back to the original image + """ + assert output_flipped.ndim == 4, "output_flipped should be [batch_size, num_keypoints, height, width]" + shape_ori = output_flipped.shape + channels = 1 + if target_type.lower() == "CombinedTarget".lower(): + channels = 3 + output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] + output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, shape_ori[2], shape_ori[3]) + output_flipped_back = output_flipped.copy() + + # Swap left-right parts + for left, right in flip_pairs: + output_flipped_back[:, left, ...] = output_flipped[:, right, ...] + output_flipped_back[:, right, ...] = output_flipped[:, left, ...] 
+ output_flipped_back = output_flipped_back.reshape(shape_ori) + # Flip horizontally + output_flipped_back = output_flipped_back[..., ::-1] + return output_flipped_back + + +class ViTPoseKeyPointsHead(nn.Module): + def __init__(self, config) -> None: + super().__init__() + + self.conv = nn.Conv2d(config.hidden_size, config.num_keypoints, kernel_size=3, stride=1, padding=1) + + def forward(self, hidden_state, flip_pairs) -> torch.Tensor: + # Transform input: ReLu + upsample + hidden_state = nn.functional.relu(hidden_state) + hidden_state = nn.functional.interpolate(hidden_state, scale_factor=4, mode="bilinear", align_corners=False) + + print("Shape after upsampling:", hidden_state.shape) + print("First values after upsampling:", hidden_state[0, 0, :3, :3]) + + output = self.conv(hidden_state) + + if flip_pairs is not None: + output = flip_back(output.detach().cpu().numpy(), flip_pairs) + + return output + + class ViTPoseForPoseEstimation(ViTPosePreTrainedModel): def __init__(self, config: ViTPoseConfig) -> None: super().__init__(config) @@ -525,12 +585,7 @@ def __init__(self, config: ViTPoseConfig) -> None: self.num_labels = config.num_labels self.vit = ViTPoseModel(config) - # Keypoint head - final_conv_kernel = 3 - padding = 1 - self.keypoint_head = nn.Conv2d( - config.hidden_size, config.num_keypoints, kernel_size=final_conv_kernel, stride=1, padding=padding - ) + self.head = ViTPoseKeyPointsHead(config) # Initialize weights and apply final processing self.post_init() @@ -538,6 +593,7 @@ def __init__(self, config: ViTPoseConfig) -> None: def forward( self, pixel_values: Optional[torch.Tensor] = None, + flip_pairs: Optional = None, head_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, @@ -563,26 +619,22 @@ def forward( sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous() ) - # ReLu + upsample - sequence_output = nn.functional.relu(sequence_output) - sequence_output = nn.functional.interpolate( - sequence_output, scale_factor=4, mode="bilinear", align_corners=False - ) + print("Sequence output before head:", sequence_output.shape) + print("First values of sequence output before head:", sequence_output[0, 0, :3, :3]) - # Conv2d - logits = self.keypoint_head(sequence_output) + heatmaps = self.head(sequence_output, flip_pairs=flip_pairs) loss = None if labels is not None: raise NotImplementedError("To do") if not return_dict: - output = (logits,) + outputs[1:] + output = (heatmaps,) + outputs[1:] return ((loss,) + output) if loss is not None else output return ImageClassifierOutput( loss=loss, - logits=logits, + logits=heatmaps, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) From a1b154ad5ccb89470b0c08dfad1f851eaee90705 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 15 Apr 2024 15:42:27 +0200 Subject: [PATCH 008/181] More improvements --- .../models/vitpose/convert_vitpose_to_hf.py | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 6dd68a239039..f0e116089a3d 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -30,6 +30,23 @@ from transformers import ViTPoseConfig, ViTPoseForPoseEstimation, ViTPoseImageProcessor +def _xywh2xyxy(bbox_xywh): + """Transform the bbox format from xywh to x1y1x2y2. 
+ + Args: + bbox_xywh (ndarray): Bounding boxes (with scores), + shaped (n, 4) or (n, 5). (left, top, width, height, [score]) + Returns: + np.ndarray: Bounding boxes (with scores), shaped (n, 4) or + (n, 5). (left, top, right, bottom, [score]) + """ + bbox_xyxy = bbox_xywh.copy() + bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0] - 1 + bbox_xyxy[:, 3] = bbox_xyxy[:, 3] + bbox_xyxy[:, 1] - 1 + + return bbox_xyxy + + def get_config(model_name): config = ViTPoseConfig() # size of the architecture @@ -209,8 +226,40 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ result["boxes"] = all_boxes result["image_paths"] = image_paths result["bbox_ids"] = bbox_ids - - # print(result) + result["output_heatmap"] = None # return_heatmap = False for inference in mmpose + + print(result) + poses, heatmap = result["preds"], result["output_heatmap"] + + # create final results by adding person bbox information + filepath = hf_hub_download(repo_id="nielsr/test-image", filename="vitpose_person_results.pt", repo_type="dataset") + person_results = torch.load(filepath, map_location="cpu") + bboxes = np.array([box["bbox"] for box in person_results]) + bboxes_xyxy = _xywh2xyxy(bboxes) + + pose_results = [] + for pose, person_result, bbox_xyxy in zip(poses, person_results, bboxes_xyxy): + pose_result = person_result.copy() + pose_result["keypoints"] = pose + pose_result["bbox"] = bbox_xyxy + pose_results.append(pose_result) + + print("Pose results:", pose_results) + + # Verify pose_results + # This is a list of dictionaries, containing the bounding box and keypoints per detected person + assert torch.allclose( + torch.from_numpy(pose_results[0]["bbox"]).float(), torch.tensor([412.8, 157.61, 464.85, 294.62]) + ) + assert torch.allclose( + torch.from_numpy(pose_results[1]["bbox"]).float(), torch.tensor([384.43, 172.21, 398.55, 206.95]) + ) + assert pose_results[0]["keypoints"].shape == (17, 3) + assert pose_results[1]["keypoints"].shape == (17, 3) + assert torch.allclose( + torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + torch.tensor([3.98180511e02, 1.81808380e02, 8.66642594e-01]), + ) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) From 4bd07c3855e330c638beeb411897f6592b76d674 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 15 Apr 2024 21:47:41 +0200 Subject: [PATCH 009/181] Improve image processor --- .../vitpose/image_processing_vitpose.py | 154 ++++++++++++++---- .../models/vitpose/test_image_processor.py | 17 ++ 2 files changed, 139 insertions(+), 32 deletions(-) create mode 100644 src/transformers/models/vitpose/test_image_processor.py diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index bd8d20935b7f..6d6f124ad10c 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -12,28 +12,65 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
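For reference, a minimal standalone sketch (not part of the patch) of the xywh → x1y1x2y2 convention implemented by `_xywh2xyxy` in the conversion script above, using the first person box that also appears in the expected `pose_results` asserts; the `- 1` makes the right/bottom corners inclusive:

```python
import numpy as np

# Illustrative only: the first COCO person box used above,
# (left=412.8, top=157.61, width=53.05, height=138.01), in corner format.
bbox_xywh = np.array([[412.8, 157.61, 53.05, 138.01]])

bbox_xyxy = bbox_xywh.copy()
bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0] - 1  # right = left + width - 1
bbox_xyxy[:, 3] = bbox_xyxy[:, 3] + bbox_xyxy[:, 1] - 1  # bottom = top + height - 1

print(bbox_xyxy)  # approximately [[412.8, 157.61, 464.85, 294.62]]
```

This matches the expected bounding box checked against `pose_results[0]["bbox"]` above.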
-"""Feature extractor class for ViTPose.""" +"""Image processor class for ViTPose.""" import math -from typing import Optional, Union +from typing import Dict, List, Optional, Union +# TODO get rid of cv2 import cv2 import numpy as np -from PIL import Image from ...image_processing_utils import BaseImageProcessor, BatchFeature from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, + ChannelDimension, ImageInput, - is_torch_tensor, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, ) -from ...utils import TensorType, logging +from ...utils import TensorType, is_vision_available, logging + + +if is_vision_available(): + import PIL logger = logging.get_logger(__name__) +def _box2cs(box, input_size): + """This encodes a bounding box (x,y,w,h) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. + + - np.ndarray[float32](2,): Center of the bbox (x, y). + - np.ndarray[float32](2,): Scale of the bbox w & h. + """ + + x, y, w, h = box[:4] + aspect_ratio = input_size[0] / input_size[1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + scale = scale * 1.25 + + return center, scale + + def _get_max_preds(heatmaps): """Get keypoint predictions from score maps. @@ -216,8 +253,14 @@ class ViTPoseImageProcessor(BaseImageProcessor): Args: do_affine_transform (`bool`, *optional*, defaults to `True`): Whether to apply an affine transformation to the input images. + size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 192}`): + Resolution of the image after `affine_transform` is applied. Only has an effect if `do_affine_transform` is set to `True`. Can + be overriden by `size` in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess` + method. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the input with mean and standard deviation. 
image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`, *optional*): @@ -229,27 +272,57 @@ class ViTPoseImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, do_affine_transform=True, do_rescale=True, do_normalize=True, image_mean=None, image_std=None, **kwargs + self, + do_affine_transform: bool = True, + size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, ): super().__init__(**kwargs) self.do_affine_transform = do_affine_transform + self.size = size if size is not None else {"height": 256, "width": 192} self.do_rescale = do_rescale + self.rescale_factor = rescale_factor self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - def affine_transform(self, image): - raise NotImplementedError("To do") + def affine_transform( + self, + image: np.array, + center: tuple[float], + scale: tuple[float], + rotation: float, + size: Dict[str, int], + ) -> np.array: + size = (size["width"], size["height"]) - # transformation = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + transformation = get_warp_matrix(rotation, center * 2.0, np.array(size) - 1.0, scale * 200.0) - # image = image.transform(transformation, Image.AFFINE, resample=Image.BILINEAR) + image = cv2.warpAffine(image, transformation, size, flags=cv2.INTER_LINEAR) - # return image + return image def preprocess( - self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs - ) -> BatchFeature: + self, + images: ImageInput, + boxes, + do_affine_transform: bool = None, + size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -272,35 +345,52 @@ def preprocess( - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, width). 
""" - # Input type checking for clearer error - valid_images = False + do_affine_transform = do_affine_transform if do_affine_transform is not None else self.do_affine_transform + size = size if size is not None else self.size + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std - # Check that images has a valid type - if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): - valid_images = True - elif isinstance(images, (list, tuple)): - if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): - valid_images = True + images = make_list_of_images(images) - if not valid_images: + if not valid_images(images): raise ValueError( - "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " - "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." ) - is_batched = bool( - isinstance(images, (list, tuple)) - and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) - ) + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] - if not is_batched: - images = [images] + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) # transformations (affine transformation + rescaling + normalization) + new_images = [] if self.do_affine_transform: - images = [self.affine_transform(image) for image in images] + for image, image_boxes in zip(images, boxes): + for box in image_boxes: + center, scale = _box2cs(box, (size["width"], size["height"])) + transformed_image = self.affine_transform(image, center, scale, rotation=0, size=size) + new_images.append(transformed_image) + + images = new_images + + # TODO each image might have a variable number of boxes => padding? 
+ # create pixel_values of shape (batch_size, num_boxes, num_channels, height, width) + for image in images: + print(image.shape) + if self.do_rescale: - images = [self.to_numpy_array(image=image) for image in images] + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] if self.do_normalize: images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] diff --git a/src/transformers/models/vitpose/test_image_processor.py b/src/transformers/models/vitpose/test_image_processor.py new file mode 100644 index 000000000000..23687e9c1b4f --- /dev/null +++ b/src/transformers/models/vitpose/test_image_processor.py @@ -0,0 +1,17 @@ +import requests +from PIL import Image + +from transformers import ViTPoseImageProcessor + + +url = "http://images.cocodataset.org/val2017/000000000139.jpg" +image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + +image_processor = ViTPoseImageProcessor() + +boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] + +inputs = image_processor(images=image, boxes=boxes, return_tensors="pt") + +print(inputs.pixel_values.shape) +print(inputs.pixel_values.mean()) From 44f694a657ad7f0457842038b722d01099137b16 Mon Sep 17 00:00:00 2001 From: Niels Date: Tue, 16 Apr 2024 20:33:21 +0200 Subject: [PATCH 010/181] Fix model tests --- .../models/vitpose/configuration_vitpose.py | 24 ++------ .../models/vitpose/convert_vitpose_to_hf.py | 4 +- .../models/vitpose/modeling_vitpose.py | 26 +++++++-- tests/models/vitpose/test_modeling_vitpose.py | 57 ++++++++++++++----- 4 files changed, 71 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 72ab7f60fe43..8d7459d62e19 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -70,8 +70,8 @@ class ViTPoseConfig(PretrainedConfig): The epsilon used by the layer normalization layers. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - num_keypoints (`int`, *optional*, defaults to 17): - The number of keypoints. + scale_factor (`int`, *optional*, defaults to 4): + Factor to upscale te feature maps coming from the ViT backbone. 
Example: @@ -105,7 +105,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, qkv_bias=True, - num_keypoints=17, + scale_factor=4, **kwargs, ): super().__init__(**kwargs) @@ -123,20 +123,4 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.qkv_bias = qkv_bias - self.num_keypoints = num_keypoints - - -class ViTPoseOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") - - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("pixel_values", {0: "batch", 1: "sequence"}), - ] - ) - - @property - def atol_for_validation(self) -> float: - return 1e-4 + self.scale_factor = scale_factor diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index f0e116089a3d..321ec6dd56d2 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -48,7 +48,7 @@ def _xywh2xyxy(bbox_xywh): def get_config(model_name): - config = ViTPoseConfig() + config = ViTPoseConfig(num_labels=17) # size of the architecture if "small" in model_name: config.hidden_size = 768 @@ -229,7 +229,7 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ result["output_heatmap"] = None # return_heatmap = False for inference in mmpose print(result) - poses, heatmap = result["preds"], result["output_heatmap"] + poses, _ = result["preds"], result["output_heatmap"] # create final results by adding person bbox information filepath = hf_hub_download(repo_id="nielsr/test-image", filename="vitpose_person_results.pt", repo_type="dataset") diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 50376f4843ed..001be21be733 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -76,6 +76,9 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape embeddings = self.patch_embeddings(pixel_values) + print("Shape of embeddings:", embeddings.shape) + print("Shape of position embeddings:", self.position_embeddings.shape) + # add positional encoding to each token embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] @@ -108,12 +111,17 @@ def __init__( self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=2) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - batch_size, num_channels, height, width = pixel_values.shape + print("Shape of pixel values:", pixel_values.shape) + height, width = pixel_values.shape[-2:] if height != self.image_size[0] or width != self.image_size[1]: raise ValueError( f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." 
) - x = self.projection(pixel_values).flatten(2).transpose(1, 2) + x = self.projection(pixel_values) + + print(x.shape) + + x = x.flatten(2).transpose(1, 2) return x @@ -372,7 +380,7 @@ class ViTPosePreTrainedModel(PreTrainedModel): """ config_class = ViTPoseConfig - base_model_prefix = "vitpose" + base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["ViTPoseEmbeddings", "ViTPoseLayer"] @@ -560,18 +568,23 @@ class ViTPoseKeyPointsHead(nn.Module): def __init__(self, config) -> None: super().__init__() - self.conv = nn.Conv2d(config.hidden_size, config.num_keypoints, kernel_size=3, stride=1, padding=1) + self.scale_factor = config.scale_factor + self.conv = nn.Conv2d(config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1) def forward(self, hidden_state, flip_pairs) -> torch.Tensor: # Transform input: ReLu + upsample hidden_state = nn.functional.relu(hidden_state) - hidden_state = nn.functional.interpolate(hidden_state, scale_factor=4, mode="bilinear", align_corners=False) + hidden_state = nn.functional.interpolate( + hidden_state, scale_factor=self.scale_factor, mode="bilinear", align_corners=False + ) print("Shape after upsampling:", hidden_state.shape) print("First values after upsampling:", hidden_state[0, 0, :3, :3]) output = self.conv(hidden_state) + print("Shape after conv:", output.shape) + if flip_pairs is not None: output = flip_back(output.detach().cpu().numpy(), flip_pairs) @@ -602,6 +615,9 @@ def forward( ) -> Union[tuple, ImageClassifierOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + raise NotImplementedError("Training is not yet supported") + outputs = self.vit( pixel_values, head_mask=head_mask, diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 4b27199a43de..b3d98816dca3 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
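For intuition on the heatmap shapes exercised by the model tests below, a rough sketch of the shape arithmetic (the 256x192 crop size, 16x16 patches, `scale_factor=4` and 17 COCO keypoints are assumptions matching the converted checkpoints, not values read from the test config):

```python
# Assumed values: 256x192 person crop, 16x16 patches, scale_factor=4, 17 keypoints.
# Mirrors how the head upsamples the ViT feature map before the final 3x3 conv.
image_height, image_width = 256, 192
patch_size, scale_factor, num_keypoints = 16, 4, 17

feat_height = image_height // patch_size      # 16 patch tokens vertically
feat_width = image_width // patch_size        # 12 patch tokens horizontally
heatmap_height = feat_height * scale_factor   # 64
heatmap_width = feat_width * scale_factor     # 48

print((num_keypoints, heatmap_height, heatmap_width))  # (17, 64, 48) per person crop
```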
@@ -44,8 +44,8 @@ def __init__( self, parent, batch_size=13, - image_size=30, - patch_size=2, + image_size=[16 * 8, 12 * 8], + patch_size=[8, 8], num_channels=3, is_training=True, use_labels=True, @@ -58,8 +58,9 @@ def __init__( attention_probs_dropout_prob=0.1, type_sequence_label_size=10, initializer_range=0.02, + num_labels=2, + scale_factor=4, scope=None, - encoder_stride=2, ): self.parent = parent self.batch_size = batch_size @@ -77,15 +78,16 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range + self.num_labels = num_labels + self.scale_factor = scale_factor self.scope = scope - self.encoder_stride = encoder_stride - # in ViTPose, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 + # in ViTPose, the seq length equals the number of patches + num_patches = (image_size[0] // patch_size[0]) * (image_size[1] // patch_size[1]) + self.seq_length = num_patches def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size[0], self.image_size[1]]) labels = None if self.use_labels: @@ -107,9 +109,9 @@ def get_config(self): hidden_act=self.hidden_act, hidden_dropout_prob=self.hidden_dropout_prob, attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, initializer_range=self.initializer_range, - encoder_stride=self.encoder_stride, + num_labels=self.num_labels, + scale_factor=self.scale_factor, ) def create_and_check_model(self, config, pixel_values, labels): @@ -119,6 +121,19 @@ def create_and_check_model(self, config, pixel_values, labels): result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + def create_and_check_for_pose_estimation(self, config, pixel_values, labels): + model = ViTPoseForPoseEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + expected_height = (self.image_size[0] // self.patch_size[0]) * self.scale_factor + expected_width = (self.image_size[1] // self.patch_size[1]) * self.scale_factor + + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, expected_height, expected_width) + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -162,6 +177,22 @@ def test_config(self): def test_inputs_embeds(self): pass + @unittest.skip(reason="ViTPose does not support training yet") + def test_training(self): + pass + + @unittest.skip(reason="ViTPose does not support training yet") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="ViTPose does not support training yet") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="ViTPose does not support training yet") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -187,9 +218,9 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_image_classification(self): + def 
test_for_pose_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + self.model_tester.create_and_check_for_pose_estimation(*config_and_inputs) @slow def test_model_from_pretrained(self): From 41c1778f29531d790e62c07d9a4770d063c517ce Mon Sep 17 00:00:00 2001 From: Niels Date: Tue, 16 Apr 2024 21:00:47 +0200 Subject: [PATCH 011/181] Add classic decoder --- .../models/vitpose/configuration_vitpose.py | 9 ++-- .../models/vitpose/convert_vitpose_to_hf.py | 28 ++++++---- .../models/vitpose/modeling_vitpose.py | 51 +++++++++++++++---- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 8d7459d62e19..08c23457674a 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -14,13 +14,8 @@ # limitations under the License. """ ViTPose model configuration""" -from collections import OrderedDict -from typing import Mapping - -from packaging import version from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig from ...utils import logging @@ -72,6 +67,8 @@ class ViTPoseConfig(PretrainedConfig): Whether to add a bias to the queries, keys and values. scale_factor (`int`, *optional*, defaults to 4): Factor to upscale te feature maps coming from the ViT backbone. + use_simple_decoder (`bool`, *optional*, defaults to `True`): + Whether to use a simple decoder to decode the feature maps from the backbone into heatmaps. Example: @@ -106,6 +103,7 @@ def __init__( layer_norm_eps=1e-12, qkv_bias=True, scale_factor=4, + use_simple_decoder=True, **kwargs, ): super().__init__(**kwargs) @@ -124,3 +122,4 @@ def __init__( self.num_channels = num_channels self.qkv_bias = qkv_bias self.scale_factor = scale_factor + self.use_simple_decoder = use_simple_decoder diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 321ec6dd56d2..7b1308867c3e 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -48,7 +48,8 @@ def _xywh2xyxy(bbox_xywh): def get_config(model_name): - config = ViTPoseConfig(num_labels=17) + use_simple_decoder = "simple" in model_name + config = ViTPoseConfig(num_labels=17, use_simple_decoder=use_simple_decoder) # size of the architecture if "small" in model_name: config.hidden_size = 768 @@ -97,6 +98,8 @@ def rename_key(name): if "keypoint_head" in name: name = name.replace("keypoint_head", "head.conv") + # TODO classic decoder weights + return name @@ -130,8 +133,14 @@ def prepare_img(): return im +name_to_path = { + "vitpose-base-simple": "/Users/nielsrogge/Documents/ViTPose/vitpose-b-simple.pth", + "vitpose-base": "/Users/nielsrogge/Documents/ViTPose/vitpose-b.pth", +} + + @torch.no_grad() -def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): +def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): """ Copy/paste/tweak model's weights to our ViTPose structure. 
""" @@ -143,12 +152,14 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ model = ViTPoseForPoseEstimation(config) model.eval() - # load state_dict of original model, remove and rename some keys + # load state_dict of original model + checkpoint_path = name_to_path[model_name] state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] # for name, param in state_dict.items(): # print(name, param.shape) + # rename some keys new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size) model.load_state_dict(new_state_dict) @@ -278,16 +289,11 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # Required parameters parser.add_argument( "--model_name", - default="vitpose_base", + default="vitpose-base-simple", + choices=name_to_path.keys(), type=str, help="Name of the ViTPose model you'd like to convert.", ) - parser.add_argument( - "--checkpoint_path", - default="/Users/nielsrogge/Documents/ViTPose/vitpose-b-simple.pth", - type=str, - help="Path to the original PyTorch checkpoint (.pt file).", - ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) @@ -296,4 +302,4 @@ def convert_vitpose_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ ) args = parser.parse_args() - convert_vitpose_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) + convert_vitpose_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 001be21be733..732e2df08f2f 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -564,7 +564,12 @@ def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): return output_flipped_back -class ViTPoseKeyPointsHead(nn.Module): +class ViTPoseSimpleDecoder(nn.Module): + """ + Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the + feature maps into heatmaps. + """ + def __init__(self, config) -> None: super().__init__() @@ -578,17 +583,45 @@ def forward(self, hidden_state, flip_pairs) -> torch.Tensor: hidden_state, scale_factor=self.scale_factor, mode="bilinear", align_corners=False ) - print("Shape after upsampling:", hidden_state.shape) - print("First values after upsampling:", hidden_state[0, 0, :3, :3]) + heatmaps = self.conv(hidden_state) + + if flip_pairs is not None: + heatmaps = flip_back(heatmaps.detach().cpu().numpy(), flip_pairs) - output = self.conv(hidden_state) + return heatmaps - print("Shape after conv:", output.shape) - if flip_pairs is not None: - output = flip_back(output.detach().cpu().numpy(), flip_pairs) +class ViTPoseClassicDecoder(nn.Module): + """ + Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer, + turning the feature maps into heatmaps. 
+ """ + + def __init__(self, config): + super().__init__() + + self.deconv1 = nn.ConvTranspose2d(config.hidden_size, 256, kernel_size=4, stride=2, padding=1) + self.batchnorm1 = nn.BatchNorm2d(256) + self.relu1 = nn.ReLU() + + self.deconv2 = nn.ConvTranspose2d(config.hidden_size, 256, kernel_size=4, stride=2, padding=1) + self.batchnorm2 = nn.BatchNorm2d(256) + self.relu2 = nn.ReLU() + + self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0) + + def forward(self, hidden_state, flip_pairs): + hidden_state = self.deconv1(hidden_state) + hidden_state = self.batchnorm1(hidden_state) + hidden_state = self.relu1(hidden_state) + + hidden_state = self.deconv2(hidden_state) + hidden_state = self.batchnorm2(hidden_state) + hidden_state = self.relu2(hidden_state) + + heatmaps = self.conv(hidden_state) - return output + return heatmaps class ViTPoseForPoseEstimation(ViTPosePreTrainedModel): @@ -598,7 +631,7 @@ def __init__(self, config: ViTPoseConfig) -> None: self.num_labels = config.num_labels self.vit = ViTPoseModel(config) - self.head = ViTPoseKeyPointsHead(config) + self.head = ViTPoseSimpleDecoder(config) if config.use_simple_decoder else ViTPoseClassicDecoder(config) # Initialize weights and apply final processing self.post_init() From ceb3d3c6b8d6ffe0e46713d080cbb7cda8a71215 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 21 Apr 2024 22:01:03 +0200 Subject: [PATCH 012/181] Convert classic decoder --- .../models/vitpose/convert_vitpose_to_hf.py | 52 ++++++++++++++----- .../models/vitpose/modeling_vitpose.py | 13 +---- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 7b1308867c3e..b7462e83c4e7 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -70,7 +70,7 @@ def get_config(model_name): return config -def rename_key(name): +def rename_key(name, config): if "backbone" in name: name = name.replace("backbone", "vit") if "patch_embed.proj" in name: @@ -93,17 +93,33 @@ def rename_key(name): name = name.replace("mlp.fc2", "output.dense") if "last_norm" in name: name = name.replace("last_norm", "layernorm") - if "final_layer." 
in name: + + # keypoint head + if "keypoint_head" in name and config.use_simple_decoder: name = name.replace("final_layer.", "") - if "keypoint_head" in name: name = name.replace("keypoint_head", "head.conv") - - # TODO classic decoder weights + elif "keypoint_head" in name and not config.use_simple_decoder: + name = name.replace("keypoint_head", "head") + name = name.replace("deconv_layers.0.weight", "deconv1.weight") + name = name.replace("deconv_layers.1.weight", "batchnorm1.weight") + name = name.replace("deconv_layers.1.bias", "batchnorm1.bias") + name = name.replace("deconv_layers.1.running_mean", "batchnorm1.running_mean") + name = name.replace("deconv_layers.1.running_var", "batchnorm1.running_var") + name = name.replace("deconv_layers.1.num_batches_tracked", "batchnorm1.num_batches_tracked") + name = name.replace("deconv_layers.3.weight", "deconv2.weight") + name = name.replace("deconv_layers.4.weight", "batchnorm2.weight") + name = name.replace("deconv_layers.4.bias", "batchnorm2.bias") + name = name.replace("deconv_layers.4.running_mean", "batchnorm2.running_mean") + name = name.replace("deconv_layers.4.running_var", "batchnorm2.running_var") + name = name.replace("deconv_layers.4.num_batches_tracked", "batchnorm2.num_batches_tracked") + + name = name.replace("final_layer.weight", "conv.weight") + name = name.replace("final_layer.bias", "conv.bias") return name -def convert_state_dict(orig_state_dict, dim): +def convert_state_dict(orig_state_dict, dim, config): for key in orig_state_dict.copy().keys(): val = orig_state_dict.pop(key) @@ -121,7 +137,7 @@ def convert_state_dict(orig_state_dict, dim): orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:] else: - orig_state_dict[rename_key(key)] = val + orig_state_dict[rename_key(key, config)] = val return orig_state_dict @@ -152,7 +168,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub model = ViTPoseForPoseEstimation(config) model.eval() - # load state_dict of original model + # load original state_dict checkpoint_path = name_to_path[model_name] state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] @@ -160,7 +176,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub # print(name, param.shape) # rename some keys - new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size) + new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size, config=config) model.load_state_dict(new_state_dict) # TODO verify image processor @@ -239,7 +255,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub result["bbox_ids"] = bbox_ids result["output_heatmap"] = None # return_heatmap = False for inference in mmpose - print(result) + # print(result) poses, _ = result["preds"], result["output_heatmap"] # create final results by adding person bbox information @@ -267,10 +283,18 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub ) assert pose_results[0]["keypoints"].shape == (17, 3) assert pose_results[1]["keypoints"].shape == (17, 3) - assert torch.allclose( - torch.from_numpy(pose_results[1]["keypoints"][0, :3]), - torch.tensor([3.98180511e02, 1.81808380e02, 8.66642594e-01]), - ) + + if model_name == "vitpose-base-simple": + assert torch.allclose( + torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + torch.tensor([3.98180511e02, 
1.81808380e02, 8.66642594e-01]), + ) + elif model_name == "vitpose-base": + # TODO not sure this is right + assert torch.allclose( + torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + torch.tensor([3.9811887e02, 1.8188435e02, 4.5788464e-01]), + ) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 732e2df08f2f..909be584556b 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -76,9 +76,6 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape embeddings = self.patch_embeddings(pixel_values) - print("Shape of embeddings:", embeddings.shape) - print("Shape of position embeddings:", self.position_embeddings.shape) - # add positional encoding to each token embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] @@ -111,7 +108,6 @@ def __init__( self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=2) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - print("Shape of pixel values:", pixel_values.shape) height, width = pixel_values.shape[-2:] if height != self.image_size[0] or width != self.image_size[1]: raise ValueError( @@ -119,8 +115,6 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: ) x = self.projection(pixel_values) - print(x.shape) - x = x.flatten(2).transpose(1, 2) return x @@ -600,11 +594,11 @@ class ViTPoseClassicDecoder(nn.Module): def __init__(self, config): super().__init__() - self.deconv1 = nn.ConvTranspose2d(config.hidden_size, 256, kernel_size=4, stride=2, padding=1) + self.deconv1 = nn.ConvTranspose2d(config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False) self.batchnorm1 = nn.BatchNorm2d(256) self.relu1 = nn.ReLU() - self.deconv2 = nn.ConvTranspose2d(config.hidden_size, 256, kernel_size=4, stride=2, padding=1) + self.deconv2 = nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1, bias=False) self.batchnorm2 = nn.BatchNorm2d(256) self.relu2 = nn.ReLU() @@ -668,9 +662,6 @@ def forward( sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous() ) - print("Sequence output before head:", sequence_output.shape) - print("First values of sequence output before head:", sequence_output[0, 0, :3, :3]) - heatmaps = self.head(sequence_output, flip_pairs=flip_pairs) loss = None From fedf2ccdb88fd60c827e36bcfc18fb9c9a0f9256 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 21 Apr 2024 22:20:42 +0200 Subject: [PATCH 013/181] Verify image processor --- .../models/vitpose/convert_vitpose_to_hf.py | 21 ++++++----- .../vitpose/image_processing_vitpose.py | 36 +++++++++++++++---- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index b7462e83c4e7..9423bdb3e4e7 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -142,11 +142,11 @@ def convert_state_dict(orig_state_dict, dim, config): return orig_state_dict -# We will verify our results on an image of cute cats +# We will verify our results on a COCO image def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = 
Image.open(requests.get(url, stream=True).raw) - return im + url = "http://images.cocodataset.org/val2017/000000000139.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image name_to_path = { @@ -179,13 +179,18 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size, config=config) model.load_state_dict(new_state_dict) - # TODO verify image processor + # create image processor image_processor = ViTPoseImageProcessor() - # encoding = image_processor(images=prepare_img(), return_tensors="pt") - # pixel_values = encoding["pixel_values"] + + # verify image processor + image = prepare_img() + boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] + pixel_values = image_processor(images=image, boxes=boxes, return_tensors="pt").pixel_values filepath = hf_hub_download(repo_id="nielsr/test-image", filename="vitpose_batch_data.pt", repo_type="dataset") - pixel_values = torch.load(filepath, map_location="cpu")["img"] + original_pixel_values = torch.load(filepath, map_location="cpu")["img"] + assert torch.allclose(pixel_values, original_pixel_values) + img_metas = torch.load(filepath, map_location="cpu")["img_metas"] print("Shape of pixel values:", pixel_values.shape) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 6d6f124ad10c..3ee4a16b8d50 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -22,11 +22,13 @@ import numpy as np from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import to_channel_dimension_format from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, ChannelDimension, ImageInput, + infer_channel_dimension_format, is_scaled_image, make_list_of_images, to_numpy_array, @@ -298,13 +300,23 @@ def affine_transform( scale: tuple[float], rotation: float, size: Dict[str, int], + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.array: + data_format = input_data_format if data_format is None else data_format + + print("Data format:", data_format) + size = (size["width"], size["height"]) transformation = get_warp_matrix(rotation, center * 2.0, np.array(size) - 1.0, scale * 200.0) image = cv2.warpAffine(image, transformation, size, flags=cv2.INTER_LINEAR) + # move back to input_data_format + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_data_format) + return image def preprocess( @@ -319,7 +331,7 @@ def preprocess( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, ) -> PIL.Image.Image: @@ -370,21 +382,25 @@ def preprocess( " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + # transformations (affine transformation + rescaling + normalization) new_images = [] if self.do_affine_transform: for image, image_boxes in zip(images, boxes): for box in image_boxes: center, scale = _box2cs(box, (size["width"], size["height"])) - transformed_image = self.affine_transform(image, center, scale, rotation=0, size=size) + transformed_image = self.affine_transform( + image, center, scale, rotation=0, size=size, input_data_format=input_data_format + ) new_images.append(transformed_image) images = new_images # TODO each image might have a variable number of boxes => padding? # create pixel_values of shape (batch_size, num_boxes, num_channels, height, width) - for image in images: - print(image.shape) if self.do_rescale: images = [ @@ -392,9 +408,17 @@ def preprocess( for image in images ] if self.do_normalize: - images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + images = [ + self.normalize( + image=image, mean=self.image_mean, std=self.image_std, input_data_format=input_data_format + ) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] - # return as BatchFeature data = {"pixel_values": images} encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) From 38dedcd3f6299ce18e9f542cd5afd8cfaae61e2a Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 21 Apr 2024 22:33:08 +0200 Subject: [PATCH 014/181] Fix classic decoder logits --- src/transformers/models/vitpose/convert_vitpose_to_hf.py | 2 +- src/transformers/models/vitpose/image_processing_vitpose.py | 5 +---- src/transformers/models/vitpose/modeling_vitpose.py | 3 +++ 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 9423bdb3e4e7..00020e42e62e 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -298,7 +298,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub # TODO not sure this is right assert torch.allclose( torch.from_numpy(pose_results[1]["keypoints"][0, :3]), - torch.tensor([3.9811887e02, 1.8188435e02, 4.5788464e-01]), + torch.tensor([3.9807913e02, 1.8182812e02, 8.8235235e-01]), ) if pytorch_dump_folder_path is not None: diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 3ee4a16b8d50..0cc074d81cfa 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -333,7 +333,6 @@ def preprocess( return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
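A hedged usage sketch of the `preprocess` path being adjusted here, mirroring the verification code in `convert_vitpose_to_hf.py` (same COCO image URL and person boxes as used there; it assumes the in-progress `ViTPoseImageProcessor` from this patch series and its default `size`):

```python
import requests
from PIL import Image

from transformers import ViTPoseImageProcessor

# COCO val2017 image with two person boxes in (x, y, w, h) format, one list of boxes per image
url = "http://images.cocodataset.org/val2017/000000000139.jpg"
image = Image.open(requests.get(url, stream=True).raw)
boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]

image_processor = ViTPoseImageProcessor()
inputs = image_processor(images=image, boxes=boxes, return_tensors="pt")

# one affine-cropped, rescaled and normalized crop per bounding box, channels-first
print(inputs["pixel_values"].shape)  # (num_boxes, num_channels, crop_height, crop_width)
```

Each bounding box is first converted to a (center, scale) pair via `_box2cs` and warped to the fixed input resolution with `cv2.warpAffine`, and only then rescaled and normalized, which is why the per-box loop in `preprocess` runs before the rescale/normalize steps.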
@@ -409,9 +408,7 @@ def preprocess( ] if self.do_normalize: images = [ - self.normalize( - image=image, mean=self.image_mean, std=self.image_std, input_data_format=input_data_format - ) + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) for image in images ] diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 909be584556b..66fbe2a922e6 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -615,6 +615,9 @@ def forward(self, hidden_state, flip_pairs): heatmaps = self.conv(hidden_state) + if flip_pairs is not None: + heatmaps = flip_back(heatmaps.detach().cpu().numpy(), flip_pairs) + return heatmaps From 4cdbc031b44671b365215142f1dc083eaf77c460 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 22 Apr 2024 10:47:50 +0200 Subject: [PATCH 015/181] Clean up --- .../models/vitpose/convert_vitpose_to_hf.py | 20 ++++++------- .../vitpose/image_processing_vitpose.py | 28 +++++++++++-------- .../models/vitpose/modeling_vitpose.py | 5 +--- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 00020e42e62e..faaaf5ed5c86 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -196,7 +196,10 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub print("Shape of pixel values:", pixel_values.shape) with torch.no_grad(): # first forward pass - output_heatmap = model(pixel_values).logits + outputs = model(pixel_values) + output_heatmap = outputs.logits + + print("Type of output_heatmap:", type(output_heatmap)) # TODO assert logits (output heatmap) print("Shape of heatmap:", output_heatmap.shape) @@ -207,9 +210,10 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub # second forward pass (flipped) pixel_values_flipped = torch.flip(pixel_values, [3]) print("Mean of pixel_values_flipped:", pixel_values_flipped.mean()) - output_flipped_heatmap = model( + outputs_flipped = model( pixel_values_flipped, flip_pairs=[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] - ).logits + ) + output_flipped_heatmap = outputs_flipped.logits print("Shape of flipped heatmap:", output_flipped_heatmap.shape) print("Mean value of flipped heatmap:", output_flipped_heatmap.mean()) @@ -222,11 +226,6 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub batch_size = pixel_values.shape[0] heatmaps = output_heatmap.cpu().numpy() - if "bbox_id" in img_metas[0]: - bbox_ids = [] - else: - bbox_ids = None - c = np.zeros((batch_size, 2), dtype=np.float32) s = np.zeros((batch_size, 2), dtype=np.float32) image_paths = [] @@ -238,10 +237,8 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub if "bbox_score" in img_metas[i]: score[i] = np.array(img_metas[i]["bbox_score"]).reshape(-1) - if bbox_ids is not None: - bbox_ids.append(img_metas[i]["bbox_id"]) - preds, maxvals = image_processor.keypoints_from_heatmaps(heatmaps, center=c, scale=s, use_udp=True) + preds, maxvals = image_processor.post_process_pose_estimation(output_heatmap, center=c, scale=s, use_udp=True) all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) all_boxes = np.zeros((batch_size, 6), dtype=np.float32) @@ -257,7 +254,6 @@ def 
convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub result["preds"] = all_preds result["boxes"] = all_boxes result["image_paths"] = image_paths - result["bbox_ids"] = bbox_ids result["output_heatmap"] = None # return_heatmap = False for inference in mmpose # print(result) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 0cc074d81cfa..70c27f9188ba 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -421,8 +421,8 @@ def preprocess( return encoded_inputs - # TODO rename to post_process_keypoint_detection? - def keypoints_from_heatmaps( + # TODO originally called keypoints_from_heatmaps + def post_process_pose_estimation( self, heatmaps, center, @@ -430,7 +430,8 @@ def keypoints_from_heatmaps( kernel=11, use_udp=False, ): - """Get final keypoint predictions from heatmaps and transform them back to + """ + Get final keypoint predictions from heatmaps and transform them back to the image. Note: @@ -469,24 +470,29 @@ def keypoints_from_heatmaps( Returns: tuple: A tuple containing keypoint predictions and scores. - - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. - - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + - preds (np.ndarray[batch_size, num_keypoints, 2]): + Predicted keypoint location in images. + - maxvals (np.ndarray[batch_size, num_keypoints, 1]): + Scores (confidence) of the keypoints. """ # Avoid being affected - heatmaps = heatmaps.copy() + heatmaps = heatmaps.numpy().copy() - N, K, H, W = heatmaps.shape + batch_size, num_keypoints, height, width = heatmaps.shape - print("Mean of heatmaps before _get_max_preds:", np.mean(heatmaps)) + # print("Mean of heatmaps before _get_max_preds:", np.mean(heatmaps)) preds, maxvals = _get_max_preds(heatmaps) - print("Preds after _get_max_preds:", preds) + # print("Preds after _get_max_preds:", preds) preds = post_dark_udp(preds, heatmaps, kernel=kernel) # Transform back to the image - for i in range(N): - preds[i] = transform_preds(preds[i], center[i], scale[i], [W, H], use_udp=use_udp) + for i in range(batch_size): + preds[i] = transform_preds(preds[i], center[i], scale[i], [width, height], use_udp=use_udp) + + print("Shape of preds:", preds.shape) + print("Shape of maxvals:", maxvals.shape) return preds, maxvals diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 66fbe2a922e6..0ed18084fa16 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -645,6 +645,7 @@ def forward( ) -> Union[tuple, ImageClassifierOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict + loss = None if labels is not None: raise NotImplementedError("Training is not yet supported") @@ -667,10 +668,6 @@ def forward( heatmaps = self.head(sequence_output, flip_pairs=flip_pairs) - loss = None - if labels is not None: - raise NotImplementedError("To do") - if not return_dict: output = (heatmaps,) + outputs[1:] return ((loss,) + output) if loss is not None else output From 95aae6d3fd89590126e8f731f83fdf3b443ea5dc Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 22 Apr 2024 11:13:00 +0200 Subject: [PATCH 016/181] Add post_process_pose_estimation --- .../models/vitpose/convert_vitpose_to_hf.py | 48 +++++------- .../vitpose/image_processing_vitpose.py | 77 
++++++++++++++----- .../models/vitpose/modeling_vitpose.py | 45 +++++++++-- 3 files changed, 114 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index faaaf5ed5c86..83ddf6a18240 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -197,63 +197,48 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub with torch.no_grad(): # first forward pass outputs = model(pixel_values) - output_heatmap = outputs.logits - - print("Type of output_heatmap:", type(output_heatmap)) - - # TODO assert logits (output heatmap) - print("Shape of heatmap:", output_heatmap.shape) - print("Mean value of heatmap:", output_heatmap.numpy().mean()) - - print("----------------") + output_heatmap = outputs.heatmaps # second forward pass (flipped) + # this is done since the model uses `flip_test=True` in its test config pixel_values_flipped = torch.flip(pixel_values, [3]) - print("Mean of pixel_values_flipped:", pixel_values_flipped.mean()) outputs_flipped = model( pixel_values_flipped, flip_pairs=[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] ) - output_flipped_heatmap = outputs_flipped.logits - - print("Shape of flipped heatmap:", output_flipped_heatmap.shape) - print("Mean value of flipped heatmap:", output_flipped_heatmap.mean()) + output_flipped_heatmap = outputs_flipped.heatmaps output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 - print("Mean of final output_heatmap:", output_heatmap.mean()) - # TODO verify postprocessing batch_size = pixel_values.shape[0] - heatmaps = output_heatmap.cpu().numpy() - c = np.zeros((batch_size, 2), dtype=np.float32) - s = np.zeros((batch_size, 2), dtype=np.float32) - image_paths = [] + centers = np.zeros((batch_size, 2), dtype=np.float32) + scales = np.zeros((batch_size, 2), dtype=np.float32) score = np.ones(batch_size) for i in range(batch_size): - c[i, :] = img_metas[i]["center"] - s[i, :] = img_metas[i]["scale"] - image_paths.append(img_metas[i]["image_file"]) + centers[i, :] = img_metas[i]["center"] + scales[i, :] = img_metas[i]["scale"] if "bbox_score" in img_metas[i]: score[i] = np.array(img_metas[i]["bbox_score"]).reshape(-1) - preds, maxvals = image_processor.post_process_pose_estimation(output_heatmap, center=c, scale=s, use_udp=True) + preds, maxvals = image_processor.keypoints_from_heatmaps( + output_heatmap, center=centers, scale=scales, use_udp=True + ) all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) all_boxes = np.zeros((batch_size, 6), dtype=np.float32) all_preds[:, :, 0:2] = preds[:, :, 0:2] all_preds[:, :, 2:3] = maxvals - all_boxes[:, 0:2] = c[:, 0:2] - all_boxes[:, 2:4] = s[:, 0:2] - all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 0:2] = centers[:, 0:2] + all_boxes[:, 2:4] = scales[:, 0:2] + all_boxes[:, 4] = np.prod(scales * 200.0, axis=1) all_boxes[:, 5] = score result = {} result["preds"] = all_preds result["boxes"] = all_boxes - result["image_paths"] = image_paths result["output_heatmap"] = None # return_heatmap = False for inference in mmpose # print(result) @@ -272,7 +257,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub pose_result["bbox"] = bbox_xyxy pose_results.append(pose_result) - print("Pose results:", pose_results) + # print("Pose results:", pose_results) # Verify pose_results # This is a list of dictionaries, containing the 
bounding box and keypoints per detected person @@ -291,12 +276,15 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub torch.tensor([3.98180511e02, 1.81808380e02, 8.66642594e-01]), ) elif model_name == "vitpose-base": - # TODO not sure this is right assert torch.allclose( torch.from_numpy(pose_results[1]["keypoints"][0, :3]), torch.tensor([3.9807913e02, 1.8182812e02, 8.8235235e-01]), ) + # test post_process_pose_estimation + results = image_processor.post_process_pose_estimation(outputs, centers=centers, scales=scales, use_udp=True) + print("Shape of results:", results.shape) + if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 70c27f9188ba..46ab5e4be6f2 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -421,8 +421,43 @@ def preprocess( return encoded_inputs + def post_process_pose_estimation(self, outputs, centers, scales, kernel_size=11, use_udp=False): + """ + Transform the heatmaps into keypoint predictions and transform them back to the image. + + Args: + outputs (torch.Tensor): + Model outputs. + centers (torch.Tensor of shape [batch_size, 2]): + Center of each bounding box (x, y). + scales (torch.Tensor of shape [batch_size, 2]): + Scale of each bounding box. + kernel_size (`int`, *optional*, defaults to 11): + Gaussian kernel size (K) for modulation. + use_udp (`bool`, *optional*, defaults to `False`): + Whether to use unbiased data processing. + """ + + # Avoid being affected + heatmaps = outputs.heatmaps.numpy().copy() + + batch_size, num_keypoints, height, width = heatmaps.shape + + preds, maxvals = _get_max_preds(heatmaps) + + preds = post_dark_udp(preds, heatmaps, kernel=kernel_size) + + # Transform back to the image + for i in range(batch_size): + preds[i] = transform_preds(preds[i], centers[i], scales[i], [width, height], use_udp=use_udp) + + # Concatenate along the final dimension + preds = np.concatenate([preds, maxvals], axis=-1) + + return preds + # TODO originally called keypoints_from_heatmaps - def post_process_pose_estimation( + def keypoints_from_heatmaps( self, heatmaps, center, @@ -441,26 +476,29 @@ def post_process_pose_estimation( - heatmap width: W Args: - heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. - center (np.ndarray[N, 2]): Center of the bounding box (x, y). - scale (np.ndarray[N, 2]): Scale of the bounding box - wrt height/width. - post_process (str/None): Choice of methods to post-process - heatmaps. Currently supported: None, 'default', 'unbiased', - 'megvii'. - unbiased (bool): Option to use unbiased decoding. Mutually - exclusive with megvii. + heatmaps (np.ndarray[N, K, H, W]): + Model predicted heatmaps. + center (np.ndarray[N, 2]): + Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): + Scale of the bounding box wrt height/width. + post_process (str/None): + Choice of methods to post-process heatmaps. + Currently supported: None, 'default', 'unbiased', 'megvii'. + unbiased (bool) + Option to use unbiased decoding. Mutually exclusive with megvii. Note: this arg is deprecated and unbiased=True can be replaced - by post_process='unbiased' - Paper ref: Zhang et al. Distribution-Aware Coordinate + by post_process='unbiased'. 
Paper ref: Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020). - kernel (int): Gaussian kernel size (K) for modulation, which should - match the heatmap gaussian sigma when training. + kernel (int): + aussian kernel size (K) for modulation, which should match the heatmap gaussian sigma when training. K=17 for sigma=3 and k=11 for sigma=2. - valid_radius_factor (float): The radius factor of the positive area - in classification heatmap for UDP. - use_udp (bool): Use unbiased data processing. - target_type (str): 'GaussianHeatmap' or 'CombinedTarget'. + valid_radius_factor (float): + The radius factor of the positive area in classification heatmap for UDP. + use_udp (bool): + Use unbiased data processing. + target_type (str): + 'GaussianHeatmap' or 'CombinedTarget'. GaussianHeatmap: Classification target with gaussian distribution. CombinedTarget: The combination of classification target (response map) and regression target (offset map). @@ -492,7 +530,4 @@ def post_process_pose_estimation( for i in range(batch_size): preds[i] = transform_preds(preds[i], center[i], scale[i], [width, height], use_udp=use_udp) - print("Shape of preds:", preds.shape) - print("Shape of maxvals:", maxvals.shape) - return preds, maxvals diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 0ed18084fa16..b52593b3a62f 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -17,6 +17,7 @@ import collections.abc import math +from dataclasses import dataclass from typing import Dict, List, Optional, Set, Tuple, Union import torch @@ -24,10 +25,16 @@ from torch import nn from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) from .configuration_vitpose import ViTPoseConfig @@ -52,6 +59,34 @@ ] +@dataclass +class PoseEstimatorOutput(ModelOutput): + """ + Class for outputs of pose estimation models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`): + Heatmaps. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + heatmaps: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + class ViTPoseEmbeddings(nn.Module): """ Construct the position and patch embeddings. @@ -642,7 +677,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[tuple, ImageClassifierOutput]: + ) -> Union[tuple, PoseEstimatorOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict loss = None @@ -672,9 +707,9 @@ def forward( output = (heatmaps,) + outputs[1:] return ((loss,) + output) if loss is not None else output - return ImageClassifierOutput( + return PoseEstimatorOutput( loss=loss, - logits=heatmaps, + heatmaps=heatmaps, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) From 2531c195413e725402d4651ebd97b462de14f5da Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 22 Apr 2024 11:33:46 +0200 Subject: [PATCH 017/181] Improve post_process_pose_estimation --- .../models/vitpose/convert_vitpose_to_hf.py | 5 +++- .../vitpose/image_processing_vitpose.py | 27 +++++++++++++------ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 83ddf6a18240..6d3e3655f1b5 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -282,7 +282,10 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub ) # test post_process_pose_estimation - results = image_processor.post_process_pose_estimation(outputs, centers=centers, scales=scales, use_udp=True) + target_sizes = [(426, 640)] + results = image_processor.post_process_pose_estimation( + outputs, boxes=boxes[0], target_sizes=target_sizes, use_udp=True + ) print("Shape of results:", results.shape) if pytorch_dump_folder_path is not None: diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 46ab5e4be6f2..89b4e5f9f9df 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -44,7 +44,7 @@ logger = logging.get_logger(__name__) -def _box2cs(box, input_size): +def _box2cs(box, width, height): """This encodes a bounding box (x,y,w,h) into (center, scale) Args: @@ -58,7 +58,7 @@ def _box2cs(box, input_size): """ x, y, w, h = box[:4] - aspect_ratio = input_size[0] / input_size[1] + aspect_ratio = width / height center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) if w > aspect_ratio * h: @@ -390,7 +390,7 @@ def preprocess( if self.do_affine_transform: for image, image_boxes in zip(images, boxes): for box in image_boxes: - center, scale = _box2cs(box, (size["width"], size["height"])) + center, scale = _box2cs(box, size["width"], size["height"]) transformed_image = self.affine_transform( image, center, scale, rotation=0, size=size, input_data_format=input_data_format ) @@ -421,17 +421,17 @@ def preprocess( return encoded_inputs - def post_process_pose_estimation(self, outputs, centers, scales, kernel_size=11, use_udp=False): + def post_process_pose_estimation(self, outputs, boxes, 
target_sizes, kernel_size=11, use_udp=False): """ Transform the heatmaps into keypoint predictions and transform them back to the image. Args: outputs (torch.Tensor): Model outputs. - centers (torch.Tensor of shape [batch_size, 2]): - Center of each bounding box (x, y). - scales (torch.Tensor of shape [batch_size, 2]): - Scale of each bounding box. + boxes (torch.Tensor of shape [batch_size, 4]): + Bounding boxes. + target_sizes (`List[Tuple[int, int]]`, *optional*): + Size of the target heatmaps. kernel_size (`int`, *optional*, defaults to 11): Gaussian kernel size (K) for modulation. use_udp (`bool`, *optional*, defaults to `False`): @@ -447,6 +447,17 @@ def post_process_pose_estimation(self, outputs, centers, scales, kernel_size=11, preds = post_dark_udp(preds, heatmaps, kernel=kernel_size) + centers = np.zeros((batch_size, 2), dtype=np.float32) + scales = np.zeros((batch_size, 2), dtype=np.float32) + + for idx, (box, (height, width)) in enumerate(zip(boxes, target_sizes)): + print("Box:", box) + print("Height:", height) + print("Width:", width) + center, scale = _box2cs(box, width, height) + centers[idx, :] = center + scales[idx, :] = scale + # Transform back to the image for i in range(batch_size): preds[i] = transform_preds(preds[i], centers[i], scales[i], [width, height], use_udp=use_udp) From e06d6788c8fb6f6133ab988ce780639df9fafc34 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 22 Apr 2024 13:51:45 +0200 Subject: [PATCH 018/181] Use AutoBackbone --- docs/source/en/index.md | 1 + docs/source/en/model_doc/vitpose.md | 5 - src/transformers/__init__.py | 17 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/vitpose/__init__.py | 8 +- .../models/vitpose/configuration_vitpose.py | 103 ++-- .../models/vitpose/convert_vitpose_to_hf.py | 54 +- .../models/vitpose/modeling_vitpose.py | 460 +--------------- .../models/vitpose_backbone/__init__.py | 54 ++ .../configuration_vitpose_backbone.py | 133 +++++ .../modeling_vitpose_backbone.py | 513 ++++++++++++++++++ .../models/vitpose_backbone/test.py | 10 + src/transformers/utils/dummy_pt_objects.py | 12 +- tests/models/vitpose/test_modeling_vitpose.py | 29 +- .../test_modeling_vitpose_backbone.py | 203 +++++++ utils/check_copies.py | 1 + utils/check_repo.py | 2 + 19 files changed, 1037 insertions(+), 573 deletions(-) create mode 100644 src/transformers/models/vitpose_backbone/__init__.py create mode 100644 src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py create mode 100644 src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py create mode 100644 src/transformers/models/vitpose_backbone/test.py create mode 100644 tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 26f6e51eb422..25f308f429c2 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -313,6 +313,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | | [ViTPose](model_doc/vitpose) | ✅ | ❌ | ❌ | +| [ViTPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 3915abf754a9..11a2b1b7af77 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -35,11 +35,6 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo [[autodoc]] ViTPoseConfig -## ViTPoseModel - -[[autodoc]] ViTPoseModel - - forward - ## ViTPoseForPoseEstimation [[autodoc]] ViTPoseForPoseEstimation diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7ae2187a5bac..abb7fdfb5ab0 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -933,7 +933,8 @@ "models.vit_msn": ["VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMSNConfig"], "models.vitdet": ["VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitDetConfig"], "models.vitmatte": ["VITMATTE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitMatteConfig"], - "models.vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig"], + "models.vitpose": ["ViTPoseConfig"], + "models.vitpose_backbone": ["ViTPoseBackboneConfig"], "models.vits": [ "VITS_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitsConfig", @@ -3699,12 +3700,16 @@ ) _import_structure["models.vitpose"].extend( [ - "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", "ViTPoseForPoseEstimation", - "ViTPoseModel", "ViTPosePreTrainedModel", ] ) + _import_structure["models.vitpose_backbone"].extend( + [ + "ViTPoseBackbone", + "ViTPoseBackbonePreTrainedModel", + ] + ) _import_structure["models.vits"].extend( [ "VITS_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5885,7 +5890,8 @@ from .models.vit_msn import VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMSNConfig from .models.vitdet import VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP, VitDetConfig from .models.vitmatte import VITMATTE_PRETRAINED_CONFIG_ARCHIVE_MAP, VitMatteConfig - from .models.vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig + from .models.vitpose import ViTPoseConfig + from .models.vitpose_backbone import ViTPoseBackboneConfig from .models.vits import ( VITS_PRETRAINED_CONFIG_ARCHIVE_MAP, VitsConfig, @@ -8227,11 +8233,10 @@ VitMattePreTrainedModel, ) from .models.vitpose import ( - VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, ViTPoseForPoseEstimation, - ViTPoseModel, ViTPosePreTrainedModel, ) + from .models.vitpose_backbone import ViTPoseBackbone, ViTPoseBackbonePreTrainedModel from .models.vits import ( VITS_PRETRAINED_MODEL_ARCHIVE_LIST, VitsModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index d50a78ed70df..913c9f684247 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -252,6 +252,7 @@ vitdet, vitmatte, vitpose, + vitpose_backbone, vits, vivit, wav2vec2, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index df7376aef95c..facd6f59419e 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -268,6 +268,7 @@ ("vitdet", "VitDetConfig"), ("vitmatte", "VitMatteConfig"), ("vitpose", "ViTPoseConfig"), + ("vitpose_backbone", "ViTPoseBackboneConfig"), ("vits", "VitsConfig"), ("vivit", "VivitConfig"), ("wav2vec2", "Wav2Vec2Config"), 
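With the ViT encoder split out into `ViTPoseBackbone` and registered in the auto mappings above, the pose model now composes two configurations. A minimal sketch, assuming the values used by `get_config()` in the conversion-script changes further below (illustrative, not tied to a released checkpoint):

```python
from transformers import ViTPoseBackboneConfig, ViTPoseConfig, ViTPoseForPoseEstimation

# the encoder now lives in its own backbone config (out_indices value taken from the conversion script)
backbone_config = ViTPoseBackboneConfig(out_indices=[12])

# the pose estimation config wraps the backbone config; num_labels=17 matches the COCO keypoints
config = ViTPoseConfig(backbone_config=backbone_config, num_labels=17, use_simple_decoder=True)

# the model builds its encoder through the load_backbone utility imported in modeling_vitpose.py
model = ViTPoseForPoseEstimation(config)
```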
@@ -552,6 +553,7 @@ ("vitdet", "VitDet"), ("vitmatte", "ViTMatte"), ("vitpose", "ViTPose"), + ("vitpose_backbone", "ViTPoseBackbone"), ("vits", "VITS"), ("vivit", "ViViT"), ("wav2vec2", "Wav2Vec2"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 12ffa5802578..6a73141b23a2 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -244,7 +244,6 @@ ("vit_mae", "ViTMAEModel"), ("vit_msn", "ViTMSNModel"), ("vitdet", "VitDetModel"), - ("vitpose", "ViTPoseModel"), ("vits", "VitsModel"), ("vivit", "VivitModel"), ("wav2vec2", "Wav2Vec2Model"), @@ -1235,6 +1234,7 @@ ("swinv2", "Swinv2Backbone"), ("timm_backbone", "TimmBackbone"), ("vitdet", "VitDetBackbone"), + ("vitpose_backbone", "ViTPoseBackbone"), ] ) diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py index 94d322338f0e..9c32191cdcfe 100644 --- a/src/transformers/models/vitpose/__init__.py +++ b/src/transformers/models/vitpose/__init__.py @@ -20,7 +20,7 @@ from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available -_import_structure = {"configuration_vitpose": ["VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTPoseConfig"]} +_import_structure = {"configuration_vitpose": ["ViTPoseConfig"]} try: @@ -39,14 +39,12 @@ pass else: _import_structure["modeling_vitpose"] = [ - "VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST", - "ViTPoseModel", "ViTPosePreTrainedModel", "ViTPoseForPoseEstimation", ] if TYPE_CHECKING: - from .configuration_vitpose import VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTPoseConfig + from .configuration_vitpose import ViTPoseConfig try: if not is_vision_available(): @@ -63,9 +61,7 @@ pass else: from .modeling_vitpose import ( - VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST, ViTPoseForPoseEstimation, - ViTPoseModel, ViTPosePreTrainedModel, ) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 08c23457674a..1725bafe7dab 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -17,6 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..auto.configuration_auto import CONFIG_MAPPING logger = logging.get_logger(__name__) @@ -28,7 +29,7 @@ class ViTPoseConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTPoseModel`]. It is used to instantiate an + This is the configuration class to store the configuration of a [`ViTPoseForPoseEstimation`]. It is used to instantiate an ViTPose model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ViTPose [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. @@ -36,35 +37,23 @@ class ViTPoseConfig(PretrainedConfig): Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: - image_size (`int`, *optional*, defaults to `[256, 192]`): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to `[16, 16]`): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. 
- hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `VitDetConfig()`): + The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. scale_factor (`int`, *optional*, defaults to 4): Factor to upscale te feature maps coming from the ViT backbone. 
use_simple_decoder (`bool`, *optional*, defaults to `True`): @@ -73,13 +62,13 @@ class ViTPoseConfig(PretrainedConfig): Example: ```python - >>> from transformers import ViTPoseModel, ViTPoseConfig + >>> from transformers import ViTPoseConfig, ViTPoseForPoseEstimation - >>> # Initializing a ViTPose vitpose-base-patch16-224 style configuration + >>> # Initializing a ViTPose configuration >>> configuration = ViTPoseConfig() - >>> # Initializing a model from the vitpose-base-patch16-224 style configuration - >>> model = ViTPoseModel(configuration) + >>> # Initializing a model (with random weights) from the configuration + >>> model = ViTPoseForPoseEstimation(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -89,37 +78,41 @@ class ViTPoseConfig(PretrainedConfig): def __init__( self, - image_size=[256, 192], - patch_size=[16, 16], - num_channels=3, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, + backbone_config: PretrainedConfig = None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, initializer_range=0.02, - layer_norm_eps=1e-12, - qkv_bias=True, scale_factor=4, use_simple_decoder=True, **kwargs, ): super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `VitPose` backbone.") + backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias self.scale_factor = scale_factor self.use_simple_decoder = use_simple_decoder diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 6d3e3655f1b5..71f59e665183 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -27,7 +27,7 @@ from huggingface_hub import hf_hub_download from PIL import Image -from transformers import ViTPoseConfig, ViTPoseForPoseEstimation, ViTPoseImageProcessor +from transformers import ViTPoseBackboneConfig, ViTPoseConfig, ViTPoseForPoseEstimation, ViTPoseImageProcessor def _xywh2xyxy(bbox_xywh): @@ -48,31 +48,31 @@ def _xywh2xyxy(bbox_xywh): def get_config(model_name): - use_simple_decoder = "simple" in model_name - config = ViTPoseConfig(num_labels=17, use_simple_decoder=use_simple_decoder) + backbone_config = ViTPoseBackboneConfig(out_indices=[12]) # size of the architecture if "small" in model_name: - config.hidden_size = 768 - config.intermediate_size = 2304 - config.num_hidden_layers = 8 - config.num_attention_heads = 8 + backbone_config.hidden_size = 768 + backbone_config.intermediate_size = 2304 + backbone_config.num_hidden_layers = 8 + backbone_config.num_attention_heads = 8 elif "large" in model_name: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 + backbone_config.hidden_size = 1024 + backbone_config.intermediate_size = 4096 + backbone_config.num_hidden_layers = 24 + backbone_config.num_attention_heads = 16 elif "huge" in model_name: - config.hidden_size = 1280 - config.intermediate_size = 5120 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 + backbone_config.hidden_size = 1280 + backbone_config.intermediate_size = 5120 + backbone_config.num_hidden_layers = 32 + backbone_config.num_attention_heads = 16 + + use_simple_decoder = "simple" in model_name + config = ViTPoseConfig(backbone_config=backbone_config, num_labels=17, use_simple_decoder=use_simple_decoder) return config def rename_key(name, config): - if "backbone" in name: - name = name.replace("backbone", "vit") if "patch_embed.proj" in name: name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") if "pos_embed" in name: @@ -127,15 +127,17 @@ def convert_state_dict(orig_state_dict, dim, config): key_split = key.split(".") layer_num = int(key_split[2]) if "weight" in key: - orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :] - 
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[ + orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :] + orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[ dim : dim * 2, : ] - orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :] else: - orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim] - orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] - orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:] + orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim] + orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[ + dim : dim * 2 + ] + orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:] else: orig_state_dict[rename_key(key, config)] = val @@ -176,7 +178,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub # print(name, param.shape) # rename some keys - new_state_dict = convert_state_dict(state_dict, dim=config.hidden_size, config=config) + new_state_dict = convert_state_dict(state_dict, dim=config.backbone_config.hidden_size, config=config) model.load_state_dict(new_state_dict) # create image processor @@ -257,8 +259,6 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub pose_result["bbox"] = bbox_xyxy pose_results.append(pose_result) - # print("Pose results:", pose_results) - # Verify pose_results # This is a list of dictionaries, containing the bounding box and keypoints per detected person assert torch.allclose( @@ -282,7 +282,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub ) # test post_process_pose_estimation - target_sizes = [(426, 640)] + target_sizes = [image.size[::-1]] results = image_processor.post_process_pose_estimation( outputs, boxes=boxes[0], target_sizes=target_sizes, use_udp=True ) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index b52593b3a62f..17d9f528c0b7 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -14,27 +14,19 @@ # limitations under the License. 
""" PyTorch ViTPose model.""" - -import collections.abc -import math from dataclasses import dataclass -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn -from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, logging, ) +from ...utils.backbone_utils import load_backbone from .configuration_vitpose import ViTPoseConfig @@ -42,21 +34,6 @@ # General docstring _CONFIG_FOR_DOC = "ViTPoseConfig" -_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor" - -# Base docstring -_CHECKPOINT_FOR_DOC = "unisydney" -_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] - -# Image classification docstring -_IMAGE_CLASS_CHECKPOINT = "google/vitpose-base-patch16-224" -_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" - - -VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "unisydney", - # See all ViTPose models at https://huggingface.co/models?filter=vitpose -] @dataclass @@ -87,321 +64,6 @@ class PoseEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -class ViTPoseEmbeddings(nn.Module): - """ - Construct the position and patch embeddings. - - """ - - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - - self.patch_embeddings = PatchEmbeddings( - image_size=config.image_size, - patch_size=config.patch_size, - num_channels=config.num_channels, - embed_dim=config.hidden_size, - ) - num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.config = config - - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - batch_size, num_channels, height, width = pixel_values.shape - embeddings = self.patch_embeddings(pixel_values) - - # add positional encoding to each token - embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] - - embeddings = self.dropout(embeddings) - - return embeddings - - -class PatchEmbeddings(nn.Module): - """ - Image to Patch Embedding. - - """ - - def __init__( - self, - image_size: int = 224, - patch_size: Union[int, Tuple[int, int]] = 16, - num_channels: int = 3, - embed_dim: int = 768, - ): - super().__init__() - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=2) - - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - height, width = pixel_values.shape[-2:] - if height != self.image_size[0] or width != self.image_size[1]: - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." 
- ) - x = self.projection(pixel_values) - - x = x.flatten(2).transpose(1, 2) - return x - - -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTPose -class ViTPoseSelfAttention(nn.Module): - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " - f"heads {config.num_attention_heads}." - ) - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - mixed_query_layer = self.query(hidden_states) - - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - query_layer = self.transpose_for_scores(mixed_query_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs - - -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTPose -class ViTPoseSelfOutput(nn.Module): - """ - The residual connection is defined in ViTPoseLayer instead of here (as is the case with other models), due to the - layernorm applied before each block. 
- """ - - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - - return hidden_states - - -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTPose -class ViTPoseAttention(nn.Module): - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - self.attention = ViTPoseSelfAttention(config) - self.output = ViTPoseSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads: Set[int]) -> None: - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.attention.query = prune_linear_layer(self.attention.query, index) - self.attention.key = prune_linear_layer(self.attention.key, index) - self.attention.value = prune_linear_layer(self.attention.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) - self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - self_outputs = self.attention(hidden_states, head_mask, output_attentions) - - attention_output = self.output(self_outputs[0], hidden_states) - - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTPose -class ViTPoseIntermediate(nn.Module): - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - - return hidden_states - - -# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->ViTPose -class ViTPoseOutput(nn.Module): - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - - hidden_states = hidden_states + input_tensor - - return hidden_states - - -# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTPose -class ViTPoseLayer(nn.Module): - """This corresponds to the Block class in the timm implementation.""" - - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward 
- self.seq_len_dim = 1 - self.attention = ViTPoseAttention(config) - self.intermediate = ViTPoseIntermediate(config) - self.output = ViTPoseOutput(config) - self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - self_attention_outputs = self.attention( - self.layernorm_before(hidden_states), # in ViTPose, layernorm is applied before self-attention - head_mask, - output_attentions=output_attentions, - ) - attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - # first residual connection - hidden_states = attention_output + hidden_states - - # in ViTPose, layernorm is also applied after self-attention - layer_output = self.layernorm_after(hidden_states) - layer_output = self.intermediate(layer_output) - - # second residual connection is done here - layer_output = self.output(layer_output, hidden_states) - - outputs = (layer_output,) + outputs - - return outputs - - -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTPose -class ViTPoseEncoder(nn.Module): - def __init__(self, config: ViTPoseConfig) -> None: - super().__init__() - self.config = config - self.layer = nn.ModuleList([ViTPoseLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_head_mask = head_mask[i] if head_mask is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.__call__, - hidden_states, - layer_head_mask, - output_attentions, - ) - else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - class ViTPosePreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -412,7 +74,6 @@ class ViTPosePreTrainedModel(PreTrainedModel): base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True - _no_split_modules = ["ViTPoseEmbeddings", "ViTPoseLayer"] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" @@ -427,12 +88,6 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No 
elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - elif isinstance(module, ViTPoseEmbeddings): - module.position_embeddings.data = nn.init.trunc_normal_( - module.position_embeddings.data.to(torch.float32), - mean=0.0, - std=self.config.initializer_range, - ).to(module.position_embeddings.dtype) VITPOSE_START_DOCSTRING = r""" @@ -449,8 +104,8 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No VITPOSE_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See - [`ViTFeatureExtractor.__call__`] for details. + Pixel values. Pixel values can be obtained using [`ViTPoseImageProcessor`]. See + [`ViTPoseImageProcessor.__call__`] for details. head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: @@ -469,92 +124,6 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No """ -@add_start_docstrings( - "The bare ViTPose Model transformer outputting raw hidden-states without any specific head on top.", - VITPOSE_START_DOCSTRING, -) -class ViTPoseModel(ViTPosePreTrainedModel): - def __init__(self, config: ViTPoseConfig): - super().__init__(config) - self.config = config - - self.embeddings = ViTPoseEmbeddings(config) - self.encoder = ViTPoseEncoder(config) - - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> PatchEmbeddings: - return self.embeddings.patch_embeddings - - def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(VITPOSE_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_FEAT_EXTRACTOR_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPooling, - config_class=_CONFIG_FOR_DOC, - modality="vision", - expected_output=_EXPECTED_OUTPUT_SHAPE, - ) - def forward( - self, - pixel_values: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - embedding_output = self.embeddings(pixel_values) - - encoder_outputs = self.encoder( - embedding_output, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - sequence_output = self.layernorm(sequence_output) - pooled_output = None - - if not return_dict: - head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): """Flip the flipped heatmaps back to the original form. 
@@ -574,7 +143,8 @@ def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): Returns: np.ndarray: heatmaps that flipped back to the original image """ - assert output_flipped.ndim == 4, "output_flipped should be [batch_size, num_keypoints, height, width]" + if output_flipped.ndim != 4: + raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]") shape_ori = output_flipped.shape channels = 1 if target_type.lower() == "CombinedTarget".lower(): @@ -603,7 +173,7 @@ def __init__(self, config) -> None: super().__init__() self.scale_factor = config.scale_factor - self.conv = nn.Conv2d(config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1) + self.conv = nn.Conv2d(config.backbone_hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1) def forward(self, hidden_state, flip_pairs) -> torch.Tensor: # Transform input: ReLu + upsample @@ -660,9 +230,11 @@ class ViTPoseForPoseEstimation(ViTPosePreTrainedModel): def __init__(self, config: ViTPoseConfig) -> None: super().__init__(config) - self.num_labels = config.num_labels - self.vit = ViTPoseModel(config) - + self.backbone = load_backbone(config) + # add backbone attributes + config.backbone_hidden_size = self.backbone.config.hidden_size + config.image_size = self.backbone.config.image_size + config.patch_size = self.backbone.config.patch_size self.head = ViTPoseSimpleDecoder(config) if config.use_simple_decoder else ViTPoseClassicDecoder(config) # Initialize weights and apply final processing @@ -672,7 +244,6 @@ def forward( self, pixel_values: Optional[torch.Tensor] = None, flip_pairs: Optional = None, - head_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -684,16 +255,15 @@ def forward( if labels is not None: raise NotImplementedError("Training is not yet supported") - outputs = self.vit( + outputs = self.backbone.forward_with_filtered_kwargs( pixel_values, - head_mask=head_mask, - output_attentions=output_attentions, output_hidden_states=output_hidden_states, + output_attentions=output_attentions, return_dict=return_dict, ) # Turn output hidden states in tensor of shape (batch_size, num_channels, height, width) - sequence_output = outputs[0] + sequence_output = outputs.feature_maps[-1] batch_size = sequence_output.shape[0] patch_height = self.config.image_size[0] // self.config.patch_size[0] patch_width = self.config.image_size[1] // self.config.patch_size[1] diff --git a/src/transformers/models/vitpose_backbone/__init__.py b/src/transformers/models/vitpose_backbone/__init__.py new file mode 100644 index 000000000000..d45d242d6c6e --- /dev/null +++ b/src/transformers/models/vitpose_backbone/__init__.py @@ -0,0 +1,54 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = {"configuration_vitpose_backbone": ["ViTPoseBackboneConfig"]} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_vitpose_backbone"] = [ + "ViTPoseBackbonePreTrainedModel", + "ViTPoseBackbone", + ] + +if TYPE_CHECKING: + from .configuration_vitpose_backbone import ViTPoseBackboneConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_vitpose_backbone import ( + ViTPoseBackbone, + ViTPoseBackbonePreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py new file mode 100644 index 000000000000..71b707ca7a6d --- /dev/null +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ViTPose backbone configuration""" + + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + +VITPOSE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "unisydney": "https://huggingface.co/unisydney/resolve/main/config.json", +} + + +class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTPoseBackbone`]. It is used to instantiate an + ViTPose model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the ViTPose + [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to `[256, 192]`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to `[16, 16]`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. 
+ num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. 
+ + Example: + + ```python + >>> from transformers import ViTPoseBackboneConfig, ViTPoseBackbone + + >>> # Initializing a ViTPose configuration + >>> configuration = ViTPoseBackboneConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = ViTPoseBackbone(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vitpose_backbone" + + def __init__( + self, + image_size=[256, 192], + patch_size=[16, 16], + num_channels=3, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + qkv_bias=True, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py new file mode 100644 index 000000000000..2f552ffe2ca5 --- /dev/null +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -0,0 +1,513 @@ +# coding=utf-8 +# Copyright 2024 University of Sydney and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch ViTPose backbone model.""" + + +import collections.abc +import math +from typing import Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_vitpose_backbone import ViTPoseBackboneConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ViTPoseBackboneConfig" + + +class ViTPoseBackboneEmbeddings(nn.Module): + """ + Construct the position and patch embeddings. 
+ + """ + + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + + self.patch_embeddings = ViTPoseBackbonePatchEmbeddings( + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.hidden_size, + ) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + embeddings = self.patch_embeddings(pixel_values) + + # add positional encoding to each token + embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] + + embeddings = self.dropout(embeddings) + + return embeddings + + +class ViTPoseBackbonePatchEmbeddings(nn.Module): + """Image to Patch Embedding.""" + + def __init__( + self, + image_size: int = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + num_channels: int = 3, + embed_dim: int = 768, + ): + super().__init__() + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=2) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + height, width = pixel_values.shape[-2:] + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + x = self.projection(pixel_values) + + x = x.flatten(2).transpose(1, 2) + return x + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTPoseBackbone +class ViTPoseBackboneSelfAttention(nn.Module): + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTPoseBackbone +class ViTPoseBackboneSelfOutput(nn.Module): + """ + The residual connection is defined in ViTPoseBackboneLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
+ """ + + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTPoseBackbone +class ViTPoseBackboneAttention(nn.Module): + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + self.attention = ViTPoseBackboneSelfAttention(config) + self.output = ViTPoseBackboneSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTPoseBackbone +class ViTPoseBackboneIntermediate(nn.Module): + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->ViTPoseBackbone +class ViTPoseBackboneOutput(nn.Module): + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTPoseBackbone +class ViTPoseBackboneLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, 
config: ViTPoseBackboneConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ViTPoseBackboneAttention(config) + self.intermediate = ViTPoseBackboneIntermediate(config) + self.output = ViTPoseBackboneOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViTPoseBackbone, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViTPoseBackbone, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTPoseBackbone +class ViTPoseBackboneEncoder(nn.Module): + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTPoseBackboneLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ViTPoseBackbonePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = ViTPoseBackboneConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["ViTPoseEmbeddings", "ViTPoseLayer"] + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, ViTPoseBackboneEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + +VITPOSE_BACKBONE_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTPoseConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VITPOSE_BACKBONE_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The ViTPose backbone useful for downstream tasks.", + VITPOSE_BACKBONE_START_DOCSTRING, +) +class ViTPoseBackbone(ViTPoseBackbonePreTrainedModel, BackboneMixin): + def __init__(self, config: ViTPoseBackboneConfig): + super().__init__(config) + super()._init_backbone(config) + + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = ViTPoseBackboneEmbeddings(config) + self.encoder = ViTPoseBackboneEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VITPOSE_BACKBONE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224") + >>> model = AutoBackbone.from_pretrained("facebook/convnext-tiny-224") + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=return_dict, + ) + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + hidden_state = self.layernorm(hidden_state) + feature_maps += (hidden_state,) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (hidden_states,) + if output_attentions: + output += (outputs.attentions,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/vitpose_backbone/test.py b/src/transformers/models/vitpose_backbone/test.py new file mode 100644 index 000000000000..75daeee3cd52 --- /dev/null +++ b/src/transformers/models/vitpose_backbone/test.py @@ -0,0 +1,10 @@ +import torch + +from transformers import ViTPoseBackbone, ViTPoseBackboneConfig + + +model = 
ViTPoseBackbone(ViTPoseBackboneConfig()) + +pixel_values = torch.randn(1, 3, 256, 192) + +feature_maps = model(pixel_values) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1352375de274..08ee0c63c62e 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -9177,24 +9177,28 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST = None +class ViTPoseForPoseEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) -class ViTPoseForPoseEstimation(metaclass=DummyObject): +class ViTPosePreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTPoseModel(metaclass=DummyObject): +class ViTPoseBackbone(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTPosePreTrainedModel(metaclass=DummyObject): +class ViTPoseBackbonePreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index b3d98816dca3..250fe918766d 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -29,8 +29,7 @@ if is_torch_available(): from torch import nn - from transformers import ViTPoseForPoseEstimation, ViTPoseModel - from transformers.models.vitpose.modeling_vitpose import VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers import ViTPoseForPoseEstimation if is_vision_available(): @@ -114,13 +113,6 @@ def get_config(self): scale_factor=self.scale_factor, ) - def create_and_check_model(self, config, pixel_values, labels): - model = ViTPoseModel(config=config) - model.to(torch_device) - model.eval() - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_pose_estimation(self, config, pixel_values, labels): model = ViTPoseForPoseEstimation(config) model.to(torch_device) @@ -152,14 +144,7 @@ class ViTPoseModelTest(ModelTesterMixin, unittest.TestCase): attention_mask and seq_length. 
""" - all_model_classes = ( - ( - ViTPoseModel, - ViTPoseForPoseEstimation, - ) - if is_torch_available() - else () - ) + all_model_classes = (ViTPoseForPoseEstimation,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -214,19 +199,15 @@ def test_forward_signature(self): expected_arg_names = ["pixel_values"] self.assertListEqual(arg_names[:1], expected_arg_names) - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_pose_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_pose_estimation(*config_and_inputs) @slow def test_model_from_pretrained(self): - for model_name in VITPOSE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = ViTPoseModel.from_pretrained(model_name) - self.assertIsNotNone(model) + model_name = "" + model = ViTPoseForPoseEstimation.from_pretrained(model_name) + self.assertIsNotNone(model) # We will verify our results on an image of cute cats diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py new file mode 100644 index 000000000000..c86c2120a5f6 --- /dev/null +++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViTPose backbone model. 
""" + + +import inspect +import unittest + +from transformers import ViTPoseConfig +from transformers.testing_utils import require_torch +from transformers.utils import is_torch_available, is_vision_available + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + from torch import nn + + from transformers import ViTPoseBackbone + + +if is_vision_available(): + from PIL import Image + + +class ViTPoseBackboneModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=[16 * 8, 12 * 8], + patch_size=[8, 8], + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=2, + scale_factor=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scale_factor = scale_factor + self.scope = scope + + # in ViTPose, the seq length equals the number of patches + num_patches = (image_size[0] // patch_size[0]) * (image_size[1] // patch_size[1]) + self.seq_length = num_patches + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size[0], self.image_size[1]]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ViTPoseConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + num_labels=self.num_labels, + scale_factor=self.scale_factor, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ViTPoseModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ViTPose does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (ViTPoseBackbone,) if is_torch_available() else () + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = ViTPoseBackboneModelTester(self) + self.config_tester = ConfigTester(self, config_class=ViTPoseConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="ViTPose does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="ViTPose does not support training yet") + def test_training(self): + pass + + @unittest.skip(reason="ViTPose does not support training yet") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="ViTPose does not support training yet") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="ViTPose does not support training yet") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +class ViTPoseBackboneTest(unittest.TestCase, BackboneTesterMixin): + all_model_classes = (ViTPoseBackbone,) if is_torch_available() else () + config_class = ViTPoseConfig + + has_attentions = False + + def setUp(self): + self.model_tester = ViTPoseBackboneModelTester(self) diff --git a/utils/check_copies.py b/utils/check_copies.py index 60a2fac4c8f5..0dd68d8b169f 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -1151,6 +1151,7 @@ def check_model_list_copy(overwrite: bool = False): "CLIPVisionModel", "SiglipVisionModel", "ChineseCLIPVisionModel", + "ViTPoseBackbone", ] # Template for new entries to add in the main README when we have missing models. diff --git a/utils/check_repo.py b/utils/check_repo.py index a9223ae5a723..9e89f657cc1e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -977,6 +977,8 @@ def find_all_documented_objects() -> List[str]: "logging", # External module "requires_backends", # Internal function "AltRobertaModel", # Internal module + "ViTPoseBackbone", # Internal module + "ViTPoseBackboneConfig", # Internal module ] # This list should be empty. Objects in it should get their own doc page. 
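A minimal inference sketch, assembled from the calls exercised in `convert_vitpose_checkpoint` above, may help reviewers see the intended end-to-end flow. The hub checkpoint id, the `ViTPoseImageProcessor.__call__` signature with `boxes=` and the box format are assumptions (the final repos and processor API are not fixed at this point in the series); only `post_process_pose_estimation(outputs, boxes=..., target_sizes=..., use_udp=True)` is taken verbatim from the conversion script.

```python
# Hedged sketch, not the final documented API: the checkpoint id, the processor call
# with `boxes=` and the (x, y, w, h) box format are illustrative assumptions.
import requests
import torch
from PIL import Image

from transformers import ViTPoseForPoseEstimation, ViTPoseImageProcessor

# any image containing people; the COCO image used elsewhere in this series is reused here
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# person boxes, e.g. produced by an object detector (box format assumed to be COCO x, y, w, h)
boxes = [[[100.0, 50.0, 120.0, 230.0]]]

image_processor = ViTPoseImageProcessor()
model = ViTPoseForPoseEstimation.from_pretrained("<org>/vitpose-base-simple")  # hypothetical id

inputs = image_processor(image, boxes=boxes, return_tensors="pt")  # assumed signature

with torch.no_grad():
    outputs = model(inputs["pixel_values"])

# outputs.heatmaps has shape (batch_size, num_keypoints, height, width);
# post-processing maps heatmap maxima back to coordinates in the original image
pose_results = image_processor.post_process_pose_estimation(
    outputs, boxes=boxes[0], target_sizes=[image.size[::-1]], use_udp=True
)
```

The boxes are needed twice: the processor presumably uses them to crop each detected person to the model resolution, and post-processing uses them (via `_box2cs`) to rescale the per-person heatmaps back to image coordinates.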
From c4a7df16e611c7563575fc64c53fcf16d1b4be22 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 22 Apr 2024 15:22:58 +0200 Subject: [PATCH 019/181] Add support for MoE models --- .../models/vitpose/convert_vitpose_to_hf.py | 38 ++++++-- .../vitpose/image_processing_vitpose.py | 3 - .../models/vitpose/modeling_vitpose.py | 11 ++- .../configuration_vitpose_backbone.py | 16 +++- .../modeling_vitpose_backbone.py | 92 +++++++++++-------- 5 files changed, 106 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 71f59e665183..792d390d5374 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -67,7 +67,16 @@ def get_config(model_name): backbone_config.num_attention_heads = 16 use_simple_decoder = "simple" in model_name - config = ViTPoseConfig(backbone_config=backbone_config, num_labels=17, use_simple_decoder=use_simple_decoder) + num_experts = 6 if model_name == "vitpose-base-coco-aic-mpii" else None + part_features = 192 if model_name == "vitpose-base-coco-aic-mpii" else None + + config = ViTPoseConfig( + backbone_config=backbone_config, + num_labels=17, + use_simple_decoder=use_simple_decoder, + num_experts=num_experts, + part_features=part_features, + ) return config @@ -87,10 +96,6 @@ def rename_key(name, config): name = name.replace("norm1", "layernorm_before") if "norm2" in name: name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") if "last_norm" in name: name = name.replace("last_norm", "layernorm") @@ -154,6 +159,7 @@ def prepare_img(): name_to_path = { "vitpose-base-simple": "/Users/nielsrogge/Documents/ViTPose/vitpose-b-simple.pth", "vitpose-base": "/Users/nielsrogge/Documents/ViTPose/vitpose-b.pth", + "vitpose-base-coco-aic-mpii": "/Users/nielsrogge/Documents/ViTPose/vitpose_base_coco_aic_mpii.pth", } @@ -179,7 +185,16 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub # rename some keys new_state_dict = convert_state_dict(state_dict, dim=config.backbone_config.hidden_size, config=config) - model.load_state_dict(new_state_dict) + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + + # TODO add associate_heads to the MoE models + if model_name in ["vitpose-base", "vitpose-base-simple"]: + assert missing_keys == [] + assert unexpected_keys == [] + elif model_name == "vitpose-base-coco-aic-mpii": + for key in unexpected_keys: + if key != "backbone.cls_token": + assert "associate_heads" in key # create image processor image_processor = ViTPoseImageProcessor() @@ -198,14 +213,16 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub print("Shape of pixel values:", pixel_values.shape) with torch.no_grad(): # first forward pass - outputs = model(pixel_values) + outputs = model(pixel_values, dataset_index=0) output_heatmap = outputs.heatmaps # second forward pass (flipped) # this is done since the model uses `flip_test=True` in its test config pixel_values_flipped = torch.flip(pixel_values, [3]) outputs_flipped = model( - pixel_values_flipped, flip_pairs=[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] + pixel_values_flipped, + dataset_index=0, + flip_pairs=[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]], ) 
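        # Added note (not in the original script): the pairs above are the left/right COCO
        # keypoint indices (eyes, ears, shoulders, elbows, wrists, hips, knees and ankles).
        # Passing them lets the heatmaps predicted on the mirrored image be flipped back to
        # the original orientation (presumably via `flip_back` in modeling_vitpose.py),
        # mirroring the `flip_test=True` evaluation of the original ViTPose code.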
output_flipped_heatmap = outputs_flipped.heatmaps @@ -280,6 +297,11 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub torch.from_numpy(pose_results[1]["keypoints"][0, :3]), torch.tensor([3.9807913e02, 1.8182812e02, 8.8235235e-01]), ) + elif model_name == "vitpose-base-coco-aic-mpii": + assert torch.allclose( + torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + torch.tensor([3.98305542e02, 1.81741592e02, 8.69966745e-01]), + ) # test post_process_pose_estimation target_sizes = [image.size[::-1]] diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 89b4e5f9f9df..b3914e460649 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -451,9 +451,6 @@ def post_process_pose_estimation(self, outputs, boxes, target_sizes, kernel_size scales = np.zeros((batch_size, 2), dtype=np.float32) for idx, (box, (height, width)) in enumerate(zip(boxes, target_sizes)): - print("Box:", box) - print("Height:", height) - print("Width:", width) center, scale = _box2cs(box, width, height) centers[idx, :] = center scales[idx, :] = scale diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 17d9f528c0b7..be975516b188 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -199,7 +199,9 @@ class ViTPoseClassicDecoder(nn.Module): def __init__(self, config): super().__init__() - self.deconv1 = nn.ConvTranspose2d(config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False) + self.deconv1 = nn.ConvTranspose2d( + config.backbone_hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False + ) self.batchnorm1 = nn.BatchNorm2d(256) self.relu1 = nn.ReLU() @@ -242,8 +244,10 @@ def __init__(self, config: ViTPoseConfig) -> None: def forward( self, - pixel_values: Optional[torch.Tensor] = None, - flip_pairs: Optional = None, + pixel_values: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, + # TODO flip_pairs must be a tensor instead of a lists of lists + flip_pairs: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -257,6 +261,7 @@ def forward( outputs = self.backbone.forward_with_filtered_kwargs( pixel_values, + dataset_index=dataset_index, output_hidden_states=output_hidden_states, output_attentions=output_attentions, return_dict=return_dict, diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 71b707ca7a6d..7ac3773cf6e6 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -50,8 +50,12 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ mlp_ratio (`int`, *optional*, defaults to 4): + The ratio of the hidden size in the feedforward network to the hidden size in the attention layers. + num_experts (`int`, *optional*): + The number of experts in the MoE layer. + part_features (`int`, *optional* + The number of part features to output. Only used in case `num_experts` is greater than 1. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. @@ -101,7 +105,9 @@ def __init__( hidden_size=768, num_hidden_layers=12, num_attention_heads=12, - intermediate_size=3072, + mlp_ratio=4, + num_experts=1, + part_features=None, hidden_act="gelu", hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, @@ -117,7 +123,9 @@ def __init__( self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size + self.mlp_ratio = mlp_ratio + self.num_experts = num_experts + self.part_features = part_features self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 2f552ffe2ca5..b09b2e567e68 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -46,7 +46,6 @@ class ViTPoseBackboneEmbeddings(nn.Module): """ Construct the position and patch embeddings. - """ def __init__(self, config: ViTPoseBackboneConfig) -> None: @@ -226,56 +225,74 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTPoseBackbone -class ViTPoseBackboneIntermediate(nn.Module): +class ViTPoseBackboneMoEMLP(nn.Module): def __init__(self, config: ViTPoseBackboneConfig) -> None: super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) + num_experts = config.num_experts + hidden_features = config.intermediate_size + part_features = config.part_features - return hidden_states + self.part_features = part_features + self.fc1 = nn.Linear(config.hidden_size, hidden_features) + self.act = ACT2FN[config.hidden_act] + self.fc2 = nn.Linear(hidden_features, config.hidden_size - part_features) + self.drop = nn.Dropout(config.hidden_dropout_prob) + self.num_experts = num_experts + experts = [nn.Linear(hidden_features, part_features) for _ in range(num_experts)] + self.experts = nn.ModuleList(experts) -# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->ViTPoseBackbone -class ViTPoseBackboneOutput(nn.Module): - def __init__(self, config: ViTPoseBackboneConfig) -> None: - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) + def forward(self, x, indices): + expert_x = torch.zeros_like(x[:, :, -self.part_features :], device=x.device, dtype=x.dtype) - def forward(self, 
hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) + x = self.fc1(x) + x = self.act(x) + shared_x = self.fc2(x) + indices = indices.view(-1, 1, 1) - hidden_states = hidden_states + input_tensor + # to support ddp training + for i in range(self.num_experts): + selectedIndex = indices == i + current_x = self.experts[i](x) * selectedIndex + expert_x = expert_x + current_x - return hidden_states + x = torch.cat([shared_x, expert_x], dim=-1) + return x -# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTPoseBackbone -class ViTPoseBackboneLayer(nn.Module): - """This corresponds to the Block class in the timm implementation.""" +class ViTPoseBackboneMLP(nn.Module): + def __init__(self, config: ViTPoseBackboneConfig) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor, indices=None) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class ViTPoseBackboneLayer(nn.Module): def __init__(self, config: ViTPoseBackboneConfig) -> None: super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 self.attention = ViTPoseBackboneAttention(config) - self.intermediate = ViTPoseBackboneIntermediate(config) - self.output = ViTPoseBackboneOutput(config) + self.mlp = ViTPoseBackboneMLP(config) if config.num_experts == 1 else ViTPoseBackboneMoEMLP(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward( self, hidden_states: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: @@ -290,12 +307,11 @@ def forward( # first residual connection hidden_states = attention_output + hidden_states - # in ViTPoseBackbone, layernorm is also applied after self-attention layer_output = self.layernorm_after(hidden_states) - layer_output = self.intermediate(layer_output) + layer_output = self.mlp(layer_output, dataset_index) - # second residual connection is done here - layer_output = self.output(layer_output, hidden_states) + # second residual connection + layer_output = layer_output + hidden_states outputs = (layer_output,) + outputs @@ -310,9 +326,11 @@ def __init__(self, config: ViTPoseBackboneConfig) -> None: self.layer = nn.ModuleList([ViTPoseBackboneLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + # Ignore copy def forward( self, hidden_states: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, @@ -335,7 +353,7 @@ def forward( output_attentions, ) else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + layer_outputs = layer_module(hidden_states, dataset_index, 
layer_head_mask, output_attentions) hidden_states = layer_outputs[0] @@ -442,7 +460,8 @@ def __init__(self, config: ViTPoseBackboneConfig): @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values: Optional[torch.Tensor] = None, + pixel_values: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -485,6 +504,7 @@ def forward( outputs = self.encoder( embedding_output, + dataset_index=dataset_index, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=True, From b09592c19b18a8ff30acc286d475671134deeb19 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 22 Apr 2024 18:37:22 +0200 Subject: [PATCH 020/181] Fix tests, improve num_experts% --- .../models/vitpose/convert_vitpose_to_hf.py | 16 ++++++----- .../vitpose/image_processing_vitpose.py | 2 -- .../models/vitpose/modeling_vitpose.py | 18 +++++++++--- .../configuration_vitpose_backbone.py | 2 +- .../modeling_vitpose_backbone.py | 25 +++++++++++------ tests/models/vitpose/test_modeling_vitpose.py | 28 ++++++++++++------- 6 files changed, 59 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 792d390d5374..d7f0e63c924d 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -48,6 +48,11 @@ def _xywh2xyxy(bbox_xywh): def get_config(model_name): + + # TODO it's unclear whether this checkpoint uses an MoE, it looks like no? + # num_experts = 6 if model_name == "vitpose-base-coco-aic-mpii" else None + # part_features = 192 if model_name == "vitpose-base-coco-aic-mpii" else None + backbone_config = ViTPoseBackboneConfig(out_indices=[12]) # size of the architecture if "small" in model_name: @@ -67,15 +72,11 @@ def get_config(model_name): backbone_config.num_attention_heads = 16 use_simple_decoder = "simple" in model_name - num_experts = 6 if model_name == "vitpose-base-coco-aic-mpii" else None - part_features = 192 if model_name == "vitpose-base-coco-aic-mpii" else None config = ViTPoseConfig( backbone_config=backbone_config, num_labels=17, use_simple_decoder=use_simple_decoder, - num_experts=num_experts, - part_features=part_features, ) return config @@ -209,11 +210,12 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub assert torch.allclose(pixel_values, original_pixel_values) img_metas = torch.load(filepath, map_location="cpu")["img_metas"] + dataset_index = torch.tensor([0]) print("Shape of pixel values:", pixel_values.shape) with torch.no_grad(): # first forward pass - outputs = model(pixel_values, dataset_index=0) + outputs = model(pixel_values, dataset_index=dataset_index) output_heatmap = outputs.heatmaps # second forward pass (flipped) @@ -221,8 +223,8 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub pixel_values_flipped = torch.flip(pixel_values, [3]) outputs_flipped = model( pixel_values_flipped, - dataset_index=0, - flip_pairs=[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]], + dataset_index=dataset_index, + flip_pairs=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]), ) output_flipped_heatmap = outputs_flipped.heatmaps diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py 
b/src/transformers/models/vitpose/image_processing_vitpose.py index b3914e460649..9aaf86f8e525 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -305,8 +305,6 @@ def affine_transform( ) -> np.array: data_format = input_data_format if data_format is None else data_format - print("Data format:", data_format) - size = (size["width"], size["height"]) transformation = get_warp_matrix(rotation, center * 2.0, np.array(size) - 1.0, scale * 200.0) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index be975516b188..c59361942c38 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -154,7 +154,7 @@ def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): output_flipped_back = output_flipped.copy() # Swap left-right parts - for left, right in flip_pairs: + for left, right in flip_pairs.tolist(): output_flipped_back[:, left, ...] = output_flipped[:, right, ...] output_flipped_back[:, right, ...] = output_flipped[:, left, ...] output_flipped_back = output_flipped_back.reshape(shape_ori) @@ -242,11 +242,14 @@ def __init__(self, config: ViTPoseConfig) -> None: # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self) -> nn.Module: + return self.backbone.get_input_embeddings() + def forward( self, pixel_values: torch.Tensor, + # TODO dataset_index must be a tensor instead of a lists of lists dataset_index: Optional[torch.Tensor] = None, - # TODO flip_pairs must be a tensor instead of a lists of lists flip_pairs: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, @@ -254,6 +257,10 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[tuple, PoseEstimatorOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions loss = None if labels is not None: @@ -268,7 +275,7 @@ def forward( ) # Turn output hidden states in tensor of shape (batch_size, num_channels, height, width) - sequence_output = outputs.feature_maps[-1] + sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1] batch_size = sequence_output.shape[0] patch_height = self.config.image_size[0] // self.config.patch_size[0] patch_width = self.config.image_size[1] // self.config.patch_size[1] @@ -279,7 +286,10 @@ def forward( heatmaps = self.head(sequence_output, flip_pairs=flip_pairs) if not return_dict: - output = (heatmaps,) + outputs[1:] + if output_hidden_states: + output = (heatmaps,) + outputs[1:] + else: + output = (heatmaps,) + outputs[2:] return ((loss,) + output) if loss is not None else output return PoseEstimatorOutput( diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 7ac3773cf6e6..368e90091194 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -52,7 +52,7 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): Number of attention heads for each 
attention layer in the Transformer encoder. mlp_ratio (`int`, *optional*, defaults to 4): The ratio of the hidden size in the feedforward network to the hidden size in the attention layers. - num_experts (`int`, *optional*): + num_experts (`int`, *optional*, defaults to 1): The number of experts in the MoE layer. part_features (`int`, *optional* The number of part features to output. Only used in case `num_experts` is greater than 1. diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index b09b2e567e68..5add90c53072 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -12,7 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViTPose backbone model.""" +""" PyTorch ViTPose backbone model. + +This code is the same as the original Vision Transformer (ViT) with 2 modifications: +- use of padding=2 in the patch embedding layer +- addition of a mixture-of-experts MLP layer +""" import collections.abc @@ -229,14 +234,16 @@ class ViTPoseBackboneMoEMLP(nn.Module): def __init__(self, config: ViTPoseBackboneConfig) -> None: super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + num_experts = config.num_experts - hidden_features = config.intermediate_size part_features = config.part_features self.part_features = part_features self.fc1 = nn.Linear(config.hidden_size, hidden_features) self.act = ACT2FN[config.hidden_act] - self.fc2 = nn.Linear(hidden_features, config.hidden_size - part_features) + self.fc2 = nn.Linear(hidden_features, out_features - part_features) self.drop = nn.Dropout(config.hidden_dropout_prob) self.num_experts = num_experts @@ -308,7 +315,7 @@ def forward( hidden_states = attention_output + hidden_states layer_output = self.layernorm_after(hidden_states) - layer_output = self.mlp(layer_output, dataset_index) + layer_output = self.mlp(layer_output, indices=dataset_index) # second residual connection layer_output = layer_output + hidden_states @@ -456,6 +463,9 @@ def __init__(self, config: ViTPoseBackboneConfig): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self) -> nn.Module: + return self.embeddings + @add_start_docstrings_to_model_forward(VITPOSE_BACKBONE_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -519,11 +529,10 @@ def forward( feature_maps += (hidden_state,) if not return_dict: - output = (feature_maps,) if output_hidden_states: - output += (hidden_states,) - if output_attentions: - output += (outputs.attentions,) + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] return output return BackboneOutput( diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 250fe918766d..84477b124e06 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -18,7 +18,7 @@ import inspect import unittest -from transformers import ViTPoseConfig +from transformers import ViTPoseBackboneConfig, ViTPoseConfig from transformers.testing_utils import require_torch, require_vision, slow, torch_device from 
transformers.utils import cached_property, is_torch_available, is_vision_available @@ -59,6 +59,7 @@ def __init__( initializer_range=0.02, num_labels=2, scale_factor=4, + out_indices=[-1], scope=None, ): self.parent = parent @@ -79,6 +80,7 @@ def __init__( self.initializer_range = initializer_range self.num_labels = num_labels self.scale_factor = scale_factor + self.out_indices = out_indices self.scope = scope # in ViTPose, the seq length equals the number of patches @@ -98,19 +100,20 @@ def prepare_config_and_inputs(self): def get_config(self): return ViTPoseConfig( + backbone_config=self.get_backbone_config(), + ) + + def get_backbone_config(self): + return ViTPoseBackboneConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, - hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, + hidden_size=self.hidden_size, intermediate_size=self.intermediate_size, + num_attention_heads=self.num_attention_heads, hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - num_labels=self.num_labels, - scale_factor=self.scale_factor, + out_indices=self.out_indices, ) def create_and_check_for_pose_estimation(self, config, pixel_values, labels): @@ -123,7 +126,7 @@ def create_and_check_for_pose_estimation(self, config, pixel_values, labels): expected_width = (self.image_size[1] // self.patch_size[1]) * self.scale_factor self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, expected_height, expected_width) + result.heatmaps.shape, (self.batch_size, self.num_labels, expected_height, expected_width) ) def prepare_config_and_inputs_for_common(self): @@ -156,7 +159,12 @@ def setUp(self): self.config_tester = ConfigTester(self, config_class=ViTPoseConfig, has_text_modality=False, hidden_size=37) def test_config(self): - self.config_tester.run_common_tests() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() @unittest.skip(reason="ViTPose does not use inputs_embeds") def test_inputs_embeds(self): From 04930ec36782f3fb10bd53571542cd2e9a61068d Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 22 Apr 2024 18:43:03 +0200 Subject: [PATCH 021/181] Improve variable names --- .../models/vitpose/convert_vitpose_to_hf.py | 1 - .../vitpose/image_processing_vitpose.py | 23 ++++++++++--------- .../modeling_vitpose_backbone.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index d7f0e63c924d..a564d3af10ec 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -48,7 +48,6 @@ def _xywh2xyxy(bbox_xywh): def get_config(model_name): - # TODO it's unclear whether this checkpoint uses an MoE, it looks like no? 
# num_experts = 6 if model_name == "vitpose-base-coco-aic-mpii" else None # part_features = 192 if model_name == "vitpose-base-coco-aic-mpii" else None diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 9aaf86f8e525..d4ce6ba26fcd 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -132,9 +132,10 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): """ if not isinstance(batch_heatmaps, np.ndarray): batch_heatmaps = batch_heatmaps.cpu().numpy() - B, K, H, W = batch_heatmaps.shape - N = coords.shape[0] - assert B == 1 or B == N + batch_size, num_keypoints, height, width = batch_heatmaps.shape + num_coords = coords.shape[0] + if not (batch_size == 1 or batch_size == num_coords): + raise ValueError("The batch size of heatmaps should be 1 or equal to the batch size of coordinates.") for heatmaps in batch_heatmaps: for heatmap in heatmaps: cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) @@ -143,26 +144,26 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode="edge").flatten() - index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) - index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) + index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (width + 2) + index += (width + 2) * (height + 2) * np.arange(0, batch_size * num_keypoints).reshape(-1, num_keypoints) index = index.astype(int).reshape(-1, 1) i_ = batch_heatmaps_pad[index] ix1 = batch_heatmaps_pad[index + 1] - iy1 = batch_heatmaps_pad[index + W + 2] - ix1y1 = batch_heatmaps_pad[index + W + 3] - ix1_y1_ = batch_heatmaps_pad[index - W - 3] + iy1 = batch_heatmaps_pad[index + width + 2] + ix1y1 = batch_heatmaps_pad[index + width + 3] + ix1_y1_ = batch_heatmaps_pad[index - width - 3] ix1_ = batch_heatmaps_pad[index - 1] - iy1_ = batch_heatmaps_pad[index - 2 - W] + iy1_ = batch_heatmaps_pad[index - 2 - width] dx = 0.5 * (ix1 - ix1_) dy = 0.5 * (iy1 - iy1_) derivative = np.concatenate([dx, dy], axis=1) - derivative = derivative.reshape(N, K, 2, 1) + derivative = derivative.reshape(num_coords, num_keypoints, 2, 1) dxx = ix1 - 2 * i_ + ix1_ dyy = iy1 - 2 * i_ + iy1_ dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) - hessian = hessian.reshape(N, K, 2, 2) + hessian = hessian.reshape(num_coords, num_keypoints, 2, 2) hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) coords -= np.einsum("ijmn,ijnk->ijmk", hessian, derivative).squeeze() return coords diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 5add90c53072..8b024b0fadd0 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -241,7 +241,7 @@ def __init__(self, config: ViTPoseBackboneConfig) -> None: part_features = config.part_features self.part_features = part_features - self.fc1 = nn.Linear(config.hidden_size, hidden_features) + self.fc1 = nn.Linear(in_features, hidden_features) self.act = ACT2FN[config.hidden_act] self.fc2 = nn.Linear(hidden_features, out_features - part_features) self.drop = nn.Dropout(config.hidden_dropout_prob) From 676aa5cb803e52dbaa0b41e803666b9aa6e6e03a Mon Sep 17 00:00:00 2001 From: 
Niels Date: Sun, 28 Apr 2024 12:11:31 +0200 Subject: [PATCH 022/181] Make fixup --- .../speech-recognition/run_speech_recognition_seq2seq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index f352954d80ae..943dff1894ed 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -122,7 +122,8 @@ class ModelArguments: metadata={"help": "Deprecated. Please use the `language` and `task` arguments instead."}, ) suppress_tokens: List[int] = field( - default=None, metadata={ + default=None, + metadata={ "help": ( "Deprecated. The use of `suppress_tokens` should not be required for the majority of fine-tuning examples." "Should you need to use `suppress_tokens`, please manually update them in the fine-tuning script directly." From 547d0dae0f882e009b6e646ec3eccfed8b0a69d0 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 28 Apr 2024 13:23:28 +0200 Subject: [PATCH 023/181] More improvements --- docs/source/en/model_doc/vitpose.md | 3 +-- .../models/vitpose/configuration_vitpose.py | 6 +----- .../models/vitpose/modeling_vitpose.py | 17 ++++++----------- .../configuration_vitpose_backbone.py | 4 ---- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 11a2b1b7af77..7c1c3b4af208 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -1,4 +1,4 @@ - add it theoretical_code_blocks[f"_ignored_new_block_{ignored_new_block_index}"] = code - name_mappings_1[ + name_mappings_1[f"_ignored_new_block_{ignored_new_block_index}"] = ( f"_ignored_new_block_{ignored_new_block_index}" - ] = f"_ignored_new_block_{ignored_new_block_index}" + ) del observed_code_blocks[name] observed_code_blocks[f"_ignored_new_block_{ignored_new_block_index}"] = code diff --git a/utils/check_repo.py b/utils/check_repo.py index 9e89f657cc1e..1d353ade7fc3 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -30,6 +30,7 @@ It has no auto-fix mode. 
""" + import inspect import os import re From 7aedeff1c010cb3144c9506b07c395d192f3093f Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 27 May 2024 17:39:47 +0200 Subject: [PATCH 048/181] Address comments --- .../models/vitpose/image_processing_vitpose.py | 2 ++ src/transformers/models/vitpose/modeling_vitpose.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 69928c256ad5..56ad0331cdd0 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -135,6 +135,7 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode="edge").flatten() + # calculate indices for coordinates index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (width + 2) index += (width + 2) * (height + 2) * np.arange(0, batch_size * num_keypoints).reshape(-1, num_keypoints) index = index.astype(int).reshape(-1, 1) @@ -146,6 +147,7 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): ix1_ = batch_heatmaps_pad[index - 1] iy1_ = batch_heatmaps_pad[index - 2 - width] + # calculate refined coordinates using Newton's method dx = 0.5 * (ix1 - ix1_) dy = 0.5 * (iy1 - iy1_) derivative = np.concatenate([dx, dy], axis=1) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 03de7d9b27db..f56589097a01 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -243,7 +243,15 @@ def __init__(self, config: ViTPoseConfig) -> None: super().__init__(config) self.backbone = load_backbone(config) + # add backbone attributes + if not hasattr(self.backbone.config, "hidden_size"): + raise ValueError("The backbone should have a hidden_size attribute") + if not hasattr(self.backbone.config, "image_size"): + raise ValueError("The backbone should have an image_size attribute") + if not hasattr(self.backbone.config, "patch_size"): + raise ValueError("The backbone should have a patch_size attribute") + config.backbone_hidden_size = self.backbone.config.hidden_size config.image_size = self.backbone.config.image_size config.patch_size = self.backbone.config.patch_size From 65ee99563753ff16691c2fb7a24371e0d8a13ad5 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 27 May 2024 17:52:46 +0200 Subject: [PATCH 049/181] Fix test --- .../vitpose/test_image_processing_vitpose.py | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index 7f4b4bf0e9fb..d6dd6c68b767 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -190,6 +190,40 @@ def test_call_pytorch(self): tuple(encoded_images.shape), (self.image_processor_tester.batch_size * 2, *expected_output_image_shape) ) - @unittest.skip(reason="ViTPoseImageProcessor does not support 4 channels for now") def test_call_numpy_4_channels(self): - pass + # Test that can process images which have an arbitrary number of channels + # Initialize image_processing + image_processor = self.image_processing_class(**self.image_processor_dict) + + # create random numpy tensors + self.image_processor_tester.num_channels = 4 + image_inputs = 
self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + + # Test not batched input + boxes = [[[0, 0, 1, 1], [0.5, 0.5, 0.5, 0.5]]] + encoded_images = image_processor( + image_inputs[0], + boxes=boxes, + return_tensors="pt", + input_data_format="channels_first", + image_mean=0, + image_std=1, + ).pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (len(boxes[0]), *expected_output_image_shape)) + + # Test batched + boxes = [[[0, 0, 1, 1], [0.5, 0.5, 0.5, 0.5]]] * self.image_processor_tester.batch_size + encoded_images = image_processor( + image_inputs, + boxes=boxes, + return_tensors="pt", + input_data_format="channels_first", + image_mean=0, + image_std=1, + ).pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size * len(boxes[0]), *expected_output_image_shape), + ) From f75119a059ac0c8dd736493da1ade955fa8ce035 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 27 May 2024 18:01:38 +0200 Subject: [PATCH 050/181] Address comment --- .../models/vitpose/image_processing_vitpose.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 56ad0331cdd0..6d109a28d7af 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -182,7 +182,7 @@ def transform_preds(coords, center, scale, output_size, use_udp=False): scale (`np.ndarray[2,]`): Scale of the bounding box wrt [width, height]. output_size (`np.ndarray[2,] or `List(2,)`): - Size of the destination heatmaps. + Size of the destination heatmaps in (height, width) format. use_udp (`bool`, *optional*, defaults to `False`): Whether to use unbiased data processing. @@ -196,17 +196,17 @@ def transform_preds(coords, center, scale, output_size, use_udp=False): if len(scale) != 2: raise ValueError("Scale needs to consist of a width and height") if len(output_size) != 2: - raise ValueError("Output size needs to consist of a width and height") + raise ValueError("Output size needs to consist of a height and width") # Recover the scale which is normalized by a factor of 200. 
scale = scale * 200.0 if use_udp: - scale_x = scale[0] / (output_size[0] - 1.0) - scale_y = scale[1] / (output_size[1] - 1.0) + scale_y = scale[1] / (output_size[0] - 1.0) + scale_x = scale[0] / (output_size[1] - 1.0) else: - scale_x = scale[0] / output_size[0] - scale_y = scale[1] / output_size[1] + scale_y = scale[1] / output_size[0] + scale_x = scale[0] / output_size[1] target_coords = np.ones_like(coords) target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 @@ -514,7 +514,7 @@ def keypoints_from_heatmaps( # Transform back to the image for i in range(batch_size): - preds[i] = transform_preds(preds[i], center[i], scale[i], [width, height], use_udp=use_udp) + preds[i] = transform_preds(preds[i], center=center[i], scale=scale[i], output_size=[height, width], use_udp=use_udp) return preds, scores From 8588a0cf922edf9f0e2b4e4d6b827ee9a7416a67 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 3 Jun 2024 10:25:13 +0200 Subject: [PATCH 051/181] Remove udp --- .../models/vitpose/convert_vitpose_to_hf.py | 4 +-- .../vitpose/image_processing_vitpose.py | 26 +++++-------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 1076d3eb1056..d3c0d73bb1b8 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -39,7 +39,7 @@ def get_original_pose_results(pixel_values, img_metas, output_heatmap, image_pro centers[i, :] = img_metas[i]["center"] scales[i, :] = img_metas[i]["scale"] - preds, scores = image_processor.keypoints_from_heatmaps(output_heatmap, center=centers, scale=scales, use_udp=True) + preds, scores = image_processor.keypoints_from_heatmaps(output_heatmap, center=centers, scale=scales) all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) all_boxes = np.zeros((batch_size, 6), dtype=np.float32) @@ -285,7 +285,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub # test post_process_pose_estimation # results are slightly different due to no flip augmentation - hf_pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes[0], use_udp=True) + hf_pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes[0]) if model_name == "vitpose-base-simple": assert torch.allclose( torch.tensor(hf_pose_results[1]["keypoints"][0, :3]), diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 6d109a28d7af..4bbb2c2a2bf8 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -162,7 +162,7 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): return coords -def transform_preds(coords, center, scale, output_size, use_udp=False): +def transform_preds(coords, center, scale, output_size): """Get final keypoint predictions from heatmaps and apply scaling and translation to map them back to the image. @@ -183,8 +183,6 @@ def transform_preds(coords, center, scale, output_size, use_udp=False): Scale of the bounding box wrt [width, height]. output_size (`np.ndarray[2,] or `List(2,)`): Size of the destination heatmaps in (height, width) format. - use_udp (`bool`, *optional*, defaults to `False`): - Whether to use unbiased data processing. Returns: np.ndarray: Predicted coordinates in the images. 
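The unbiased rescaling above maps heatmap-space keypoints back to the original image: the box scale (stored divided by 200) is recovered, each axis is rescaled by `scale / (heatmap_size - 1)`, and the result is re-centred on the box. A small numpy sketch of that mapping follows; it is not the library function, the `y` line is filled in by analogy with the `x` line visible in the hunk above, and the numbers in the check are made up.

```python
import numpy as np


def transform_preds_sketch(coords, center, scale, output_size):
    # coords: (num_keypoints, 2) keypoints in heatmap (x, y) space
    # center / scale: box center and scale (scale stored divided by 200, as above)
    # output_size: heatmap size in (height, width) format
    scale = scale * 200.0  # recover the scale normalized by a factor of 200
    scale_y = scale[1] / (output_size[0] - 1.0)
    scale_x = scale[0] / (output_size[1] - 1.0)
    target = np.ones_like(coords, dtype=np.float32)
    target[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5
    target[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5  # assumed symmetric to x
    return target


# corners of a 48 (w) x 64 (h) heatmap land on the corners of a 200 x 300 pixel box
# centred at (100, 200) in the original image
corners = np.array([[0.0, 0.0], [47.0, 63.0]])
mapped = transform_preds_sketch(
    corners, center=np.array([100.0, 200.0]), scale=np.array([1.0, 1.5]), output_size=(64, 48)
)
assert np.allclose(mapped, [[0.0, 50.0], [200.0, 350.0]])
```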
@@ -201,12 +199,9 @@ def transform_preds(coords, center, scale, output_size, use_udp=False): # Recover the scale which is normalized by a factor of 200. scale = scale * 200.0 - if use_udp: - scale_y = scale[1] / (output_size[0] - 1.0) - scale_x = scale[0] / (output_size[1] - 1.0) - else: - scale_y = scale[1] / output_size[0] - scale_x = scale[0] / output_size[1] + # We use unbiased data processing + scale_y = scale[1] / (output_size[0] - 1.0) + scale_x = scale[0] / (output_size[1] - 1.0) target_coords = np.ones_like(coords) target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 @@ -476,7 +471,6 @@ def keypoints_from_heatmaps( center, scale, kernel=11, - use_udp=False, ): """ Get final keypoint predictions from heatmaps and transform them back to @@ -492,8 +486,6 @@ def keypoints_from_heatmaps( kernel (int): Gaussian kernel size (K) for modulation, which should match the heatmap gaussian sigma when training. K=17 for sigma=3 and k=11 for sigma=2. - use_udp (`bool`, *optional*, defaults to `False`): - Use unbiased data processing. Returns: tuple: A tuple containing keypoint predictions and scores. @@ -514,11 +506,11 @@ def keypoints_from_heatmaps( # Transform back to the image for i in range(batch_size): - preds[i] = transform_preds(preds[i], center=center[i], scale=scale[i], output_size=[height, width], use_udp=use_udp) + preds[i] = transform_preds(preds[i], center=center[i], scale=scale[i], output_size=[height, width]) return preds, scores - def post_process_pose_estimation(self, outputs, boxes, kernel_size=11, use_udp=True): + def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): """ Transform the heatmaps into keypoint predictions and transform them back to the image. @@ -529,8 +521,6 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11, use_udp=T Bounding boxes. kernel_size (`int`, *optional*, defaults to 11): Gaussian kernel size (K) for modulation. - use_udp (`bool`, *optional*, defaults to `False`): - Whether to use unbiased data processing. """ # First compute centers and scales for each bounding box @@ -543,9 +533,7 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11, use_udp=T centers[i, :] = center scales[i, :] = scale - preds, scores = self.keypoints_from_heatmaps( - outputs.heatmaps, centers, scales, kernel=kernel_size, use_udp=use_udp - ) + preds, scores = self.keypoints_from_heatmaps(outputs.heatmaps, centers, scales, kernel=kernel_size) all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) all_boxes = np.zeros((batch_size, 6), dtype=np.float32) From 62382771d6652d0c936ac7cee235aa62c67c7f49 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 3 Jun 2024 14:41:17 +0200 Subject: [PATCH 052/181] Remove comment --- src/transformers/models/vitpose/image_processing_vitpose.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 4bbb2c2a2bf8..c03b25d2639b 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -440,7 +440,6 @@ def preprocess( new_images.append(transformed_image) images = new_images - # TODO each image might have a variable number of boxes => padding? 
# since the number of boxes can differ per image, the image processor takes a list # rather than a numpy array of boxes # it currently create pixel_values of shape (batch_size*num_persons, num_channels, height, width) @@ -498,7 +497,7 @@ def keypoints_from_heatmaps( # Avoid mutation heatmaps = heatmaps.numpy().copy() - batch_size, num_keypoints, height, width = heatmaps.shape + batch_size, _, height, width = heatmaps.shape preds, scores = _get_max_preds(heatmaps) From 3c3aa6786f4930fbcde8af858eea8c22e1252b94 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Thu, 11 Jul 2024 02:09:35 +0000 Subject: [PATCH 053/181] [WIP] need to check if the numpy function is same as cv --- .../vitpose/image_processing_vitpose.py | 35 ++++++++++++++----- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index c03b25d2639b..cc2f29f6064a 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -32,16 +32,14 @@ to_numpy_array, valid_images, ) -from ...utils import TensorType, is_cv2_available, is_vision_available, logging +from ...utils import TensorType, is_scipy_available, is_vision_available, logging if is_vision_available(): import PIL -if is_cv2_available(): - # TODO get rid of cv2? - import cv2 - +if is_scipy_available(): + from scipt.ndimage import gaussian_filter logger = logging.get_logger(__name__) @@ -127,9 +125,10 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): num_coords = coords.shape[0] if not (batch_size == 1 or batch_size == num_coords): raise ValueError("The batch size of heatmaps should be 1 or equal to the batch size of coordinates.") + radius = int((kernel - 1) // 2) for heatmaps in batch_heatmaps: for heatmap in heatmaps: - cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) + gaussian_filter(heatmap, sigma=0.8, output=heatmap, radius=(radius, radius), axes=(0, 1)) np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) np.log(batch_heatmaps, batch_heatmaps) @@ -247,6 +246,26 @@ def get_warp_matrix(theta: float, size_input: np.ndarray, size_dst: np.ndarray, return matrix +def warp_affine(src, M, borderValue=0): + new_src = np.full_like(src, borderValue, dtype=src.dtype) + h, w = src.shape[:2] + y, x = np.indices((h, w)) + x = x.flatten() + y = y.flatten() + ones = np.ones((h * w, 1)) + coords = np.vstack([x, y, ones.T]) + coords_transformed = M.dot(coords) + coords_transformed = np.round(coords_transformed).astype(int) + mask = ( + (coords_transformed[0, :] >= 0) + & (coords_transformed[0, :] < w) + & (coords_transformed[1, :] >= 0) + & (coords_transformed[1, :] < h) + ) + new_src[coords_transformed[1, mask], coords_transformed[0, mask]] = src[y[mask], x[mask]] + return new_src + + class ViTPoseImageProcessor(BaseImageProcessor): r""" Constructs a ViTPose image processor. 
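The cv2-free `warp_affine` added above (and the `scipy.ndimage.affine_transform` version it becomes in the patches that follow) stands in for `cv2.warpAffine`. The subtle point is the convention change: the cv2-style 2x3 matrix is a "push" transform in (x, y) order, while scipy expects the inverse "pull" transform in (row, col) order, hence the inversion and element swap seen in this series. Below is a minimal sketch of that conversion, checked on a toy translation; it is not the image processor code and uses a single-channel image for brevity.

```python
import numpy as np
from scipy.ndimage import affine_transform


def warp_like_cv2(image_2d, matrix_2x3, output_shape):
    # promote the 2x3 "push" matrix (x, y order) to homogeneous form and invert it
    homogeneous = np.vstack([matrix_2x3, [0.0, 0.0, 1.0]])
    pull = np.linalg.inv(homogeneous)
    # swap x/y entries so the matrix acts on (row, col) coordinates, as scipy expects
    pull_rc = np.array(
        [
            [pull[1, 1], pull[1, 0], pull[1, 2]],
            [pull[0, 1], pull[0, 0], pull[0, 2]],
            [0.0, 0.0, 1.0],
        ]
    )
    return affine_transform(image_2d, pull_rc, output_shape=output_shape, order=1)


image = np.zeros((5, 5), dtype=np.float32)
image[1, 2] = 1.0                                     # bright pixel at (row=1, col=2)
shift = np.array([[1.0, 0.0, 1.0], [0.0, 1.0, 2.0]])  # +1 in x (cols), +2 in y (rows)
warped = warp_like_cv2(image, shift, output_shape=(5, 5))
assert np.isclose(warped[3, 3], 1.0)                  # the pixel moved to (row=3, col=3)
```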
@@ -330,12 +349,12 @@ def affine_transform( transformation = get_warp_matrix(rotation, center * 2.0, np.array(size) - 1.0, scale * 200.0) # cv2 requires channels last format - cv2_image = ( + image = ( image if input_data_format == ChannelDimension.LAST else to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) ) - image = cv2.warpAffine(cv2_image, transformation, size, flags=cv2.INTER_LINEAR) + image = warp_affine(src=image, M=transformation) image = to_channel_dimension_format(image, data_format, ChannelDimension.LAST) From 97961ee35e76b20ba77e7495f8ff02826c76f2e8 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 16 Jul 2024 05:18:45 +0000 Subject: [PATCH 054/181] add scipy affine_transform --- .../vitpose/image_processing_vitpose.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index cc2f29f6064a..48fe2c50b37c 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -39,7 +39,8 @@ import PIL if is_scipy_available(): - from scipt.ndimage import gaussian_filter + from scipy.linalg import inv + from scipy.ndimage import affine_transform, gaussian_filter logger = logging.get_logger(__name__) @@ -246,23 +247,24 @@ def get_warp_matrix(theta: float, size_input: np.ndarray, size_dst: np.ndarray, return matrix -def warp_affine(src, M, borderValue=0): - new_src = np.full_like(src, borderValue, dtype=src.dtype) - h, w = src.shape[:2] - y, x = np.indices((h, w)) - x = x.flatten() - y = y.flatten() - ones = np.ones((h * w, 1)) - coords = np.vstack([x, y, ones.T]) - coords_transformed = M.dot(coords) - coords_transformed = np.round(coords_transformed).astype(int) - mask = ( - (coords_transformed[0, :] >= 0) - & (coords_transformed[0, :] < w) - & (coords_transformed[1, :] >= 0) - & (coords_transformed[1, :] < h) +def warp_affine(src, M): + channels = [src[..., i] for i in range(src.shape[-1])] + + # Convert to a 3x3 matrix used by SciPy + M_scipy = np.vstack([M, [0, 0, 1]]) + # If you have a matrix for the ‘push’ transformation, use its inverse (numpy.linalg.inv) in this function. 
+ M_inv = inv(M_scipy) + M_inv[0, 0], M_inv[0, 1], M_inv[1, 0], M_inv[1, 1], M_inv[0, 2], M_inv[1, 2] = ( + M_inv[1, 1], + M_inv[1, 0], + M_inv[0, 1], + M_inv[0, 0], + M_inv[1, 2], + M_inv[0, 2], ) - new_src[coords_transformed[1, mask], coords_transformed[0, mask]] = src[y[mask], x[mask]] + + new_src = [affine_transform(channel, M_inv, order=1) for channel in channels] + new_src = np.stack(new_src, axis=-1) return new_src From c3973846f22792adb999f8f8aa3dd2571e63b8fa Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:05:25 +0900 Subject: [PATCH 055/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/vitpose/image_processing_vitpose.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 48fe2c50b37c..474deda0a911 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -247,7 +247,12 @@ def get_warp_matrix(theta: float, size_input: np.ndarray, size_dst: np.ndarray, return matrix -def warp_affine(src, M): +def scipy_warp_affine(src, M): +""" +This function implements cv2.warpAffine used in the original implementation using scipy. + +Note: the original implementation uses cv2.INTER_LINEAR. +""" channels = [src[..., i] for i in range(src.shape[-1])] # Convert to a 3x3 matrix used by SciPy From 64ff8de06938bb6b4644dcc0b1ff473abf27ab01 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 16 Jul 2024 07:41:15 +0000 Subject: [PATCH 056/181] refactor convert --- .../models/vitpose/convert_vitpose_to_hf.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index d3c0d73bb1b8..47e0d7959707 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -176,10 +176,10 @@ def prepare_img(): name_to_path = { - "vitpose-base-simple": "/Users/nielsrogge/Documents/ViTPose/vitpose-b-simple.pth", - "vitpose-base": "/Users/nielsrogge/Documents/ViTPose/vitpose-b.pth", - "vitpose-base-coco-aic-mpii": "/Users/nielsrogge/Documents/ViTPose/vitpose_base_coco_aic_mpii.pth", - "vitpose+-base": "/Users/nielsrogge/Documents/ViTPose/vitpose+_base.pth", + "vitpose-base-simple": "vitpose-b-simple.pth", + "vitpose-base": "vitpose-b.pth", + "vitpose-base-coco-aic-mpii": "vitpose_base_coco_aic_mpii.pth", + "vitpose+-base": "vitpose+_base.pth", } @@ -200,9 +200,6 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub checkpoint_path = name_to_path[model_name] state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] - # for name, param in state_dict.items(): - # print(name, param.shape) - # rename some keys new_state_dict = convert_state_dict(state_dict, dim=config.backbone_config.hidden_size, config=config) missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) From daad34af57e3e708f463f0bd58421e36aa9a6517 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 17 Jul 2024 00:20:05 +0000 Subject: [PATCH 057/181] add output_shape --- .../models/vitpose/image_processing_vitpose.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 
deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 474deda0a911..542e04caeb98 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -247,12 +247,12 @@ def get_warp_matrix(theta: float, size_input: np.ndarray, size_dst: np.ndarray, return matrix -def scipy_warp_affine(src, M): -""" -This function implements cv2.warpAffine used in the original implementation using scipy. +def scipy_warp_affine(src, M, size): + """ + This function implements cv2.warpAffine used in the original implementation using scipy. -Note: the original implementation uses cv2.INTER_LINEAR. -""" + Note: the original implementation uses cv2.INTER_LINEAR. + """ channels = [src[..., i] for i in range(src.shape[-1])] # Convert to a 3x3 matrix used by SciPy @@ -268,7 +268,7 @@ def scipy_warp_affine(src, M): M_inv[0, 2], ) - new_src = [affine_transform(channel, M_inv, order=1) for channel in channels] + new_src = [affine_transform(channel, M_inv, output_shape=size, order=1) for channel in channels] new_src = np.stack(new_src, axis=-1) return new_src @@ -361,7 +361,7 @@ def affine_transform( if input_data_format == ChannelDimension.LAST else to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) ) - image = warp_affine(src=image, M=transformation) + image = scipy_warp_affine(src=image, M=transformation, size=(size[1], size[0])) image = to_channel_dimension_format(image, data_format, ChannelDimension.LAST) From b0a488ea0b3f3184008a50421c8a144bed3514d7 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 17 Jul 2024 06:28:22 +0000 Subject: [PATCH 058/181] add atol 5e-2 --- src/transformers/models/vitpose/convert_vitpose_to_hf.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 47e0d7959707..2f24f6e2f367 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -223,7 +223,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub filepath = hf_hub_download(repo_id="nielsr/test-image", filename="vitpose_batch_data.pt", repo_type="dataset") original_pixel_values = torch.load(filepath, map_location="cpu")["img"] - assert torch.allclose(pixel_values, original_pixel_values) + assert torch.allclose(pixel_values, original_pixel_values, atol=1e-1) img_metas = torch.load(filepath, map_location="cpu")["img_metas"] dataset_index = torch.tensor([0]) @@ -260,21 +260,25 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub assert torch.allclose( torch.from_numpy(pose_results[1]["keypoints"][0, :3]), torch.tensor([3.98180511e02, 1.81808380e02, 8.66642594e-01]), + atol=5e-2, ) elif model_name == "vitpose-base": assert torch.allclose( torch.from_numpy(pose_results[1]["keypoints"][0, :3]), torch.tensor([3.9807913e02, 1.8182812e02, 8.8235235e-01]), + atol=5e-2, ) elif model_name == "vitpose-base-coco-aic-mpii": assert torch.allclose( torch.from_numpy(pose_results[1]["keypoints"][0, :3]), torch.tensor([3.98305542e02, 1.81741592e02, 8.69966745e-01]), + atol=5e-2, ) elif model_name == "vitpose+-base": assert torch.allclose( torch.from_numpy(pose_results[1]["keypoints"][0, :3]), torch.tensor([3.98201294e02, 1.81728302e02, 8.75046968e-01]), + atol=5e-2, ) else: raise 
ValueError("Model not supported") @@ -287,6 +291,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub assert torch.allclose( torch.tensor(hf_pose_results[1]["keypoints"][0, :3]), torch.tensor([3.9813846e02, 1.8180725e02, 8.7446749e-01]), + atol=5e-2, ) assert hf_pose_results[0]["keypoints"].shape == (17, 3) assert hf_pose_results[1]["keypoints"].shape == (17, 3) From be6955a0d9dcf7e58a36d1b0cecc6af24dc89ec6 Mon Sep 17 00:00:00 2001 From: Niels Date: Tue, 23 Jul 2024 12:38:54 +0200 Subject: [PATCH 059/181] Use hf_hub_download in conversion script --- src/transformers/models/vitpose/convert_vitpose_to_hf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 2f24f6e2f367..34e7645c2556 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -175,7 +175,7 @@ def prepare_img(): return image -name_to_path = { +model_name_to_file_name = { "vitpose-base-simple": "vitpose-b-simple.pth", "vitpose-base": "vitpose-b.pth", "vitpose-base-coco-aic-mpii": "vitpose_base_coco_aic_mpii.pth", @@ -197,7 +197,10 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub model.eval() # load original state_dict - checkpoint_path = name_to_path[model_name] + filename = model_name_to_file_name[model_name] + checkpoint_path = hf_hub_download( + repo_id="nielsr/vitpose-original-checkpoints", filename=filename, repo_type="model" + ) state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] # rename some keys @@ -314,7 +317,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub parser.add_argument( "--model_name", default="vitpose-base-simple", - choices=name_to_path.keys(), + choices=model_name_to_file_name.keys(), type=str, help="Name of the ViTPose model you'd like to convert.", ) From 1b439e20be2c07527ea89a54e4e0a749facad6be Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 2 Aug 2024 01:21:48 +0000 Subject: [PATCH 060/181] make box_to_center more applicable --- src/transformers/image_transforms.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 39b81aa93e7e..9aea3e8f0cef 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -539,7 +539,10 @@ def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor": return bboxes_corners -def box_to_center_and_scale(box: Union[Tuple, List], image_width: int, image_height: int): +# inspired by https://github.com/ViTAE-Transformer/ViTPose/blob/d5216452796c90c6bc29f5c5ec0bdba94366768a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py#L132 +def box_to_center_and_scale( + box: Union[Tuple, List], image_width: int, image_height: int, pixel_std: float = 200.0, padding: float = 1.25 +): """ Encodes a bounding box in COCO format into (center, scale). @@ -550,6 +553,10 @@ def box_to_center_and_scale(box: Union[Tuple, List], image_width: int, image_hei Image width. image_height (`int`): Image height. + pixel_std (`float`): + Width and height scale factor. + padding (`float`): + Bounding box padding factor. Returns: tuple: A tuple containing center and scale. 
@@ -568,8 +575,8 @@ def box_to_center_and_scale(box: Union[Tuple, List], image_width: int, image_hei width = height * aspect_ratio # pixel std is 200.0 - scale = np.array([width / 200.0, height / 200.0], dtype=np.float32) - scale = scale * 1.25 + scale = np.array([width / pixel_std, height / pixel_std], dtype=np.float32) + scale = scale * padding return center, scale From e576c8ec5f5177f75d40e8f21b62cc2e98ac2da9 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 2 Aug 2024 04:44:22 +0000 Subject: [PATCH 061/181] skipt test_get_set_embedding --- .../models/vitpose_backbone/test_modeling_vitpose_backbone.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py index de844b73b2db..e96798833083 100644 --- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py +++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py @@ -147,6 +147,10 @@ def test_model_common_attributes(self): def test_inputs_embeds(self): pass + @unittest.skip(reason="ViTPoseBackbone does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + @unittest.skip(reason="ViTPoseBackbone does not support feedforward chunking") def test_feed_forward_chunking(self): pass From a55a95564c9d84770fb3dcffe2bc2a1f9f5c6a8b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 2 Aug 2024 05:40:41 +0000 Subject: [PATCH 062/181] fix to accept array and fix CI --- src/transformers/image_transforms.py | 8 ++++++-- .../models/vitpose/image_processing_vitpose.py | 6 +++--- tests/models/vitpose/test_modeling_vitpose.py | 4 ++++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 9aea3e8f0cef..7234d1062dd4 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -541,13 +541,17 @@ def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor": # inspired by https://github.com/ViTAE-Transformer/ViTPose/blob/d5216452796c90c6bc29f5c5ec0bdba94366768a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py#L132 def box_to_center_and_scale( - box: Union[Tuple, List], image_width: int, image_height: int, pixel_std: float = 200.0, padding: float = 1.25 + box: Union[Tuple, List, np.ndarray], + image_width: int, + image_height: int, + pixel_std: float = 200.0, + padding: float = 1.25, ): """ Encodes a bounding box in COCO format into (center, scale). Args: - box (`Tuple` or `List`): + box (`Tuple`, `List`, or `np.ndarray`): Bounding box in COCO format (top_left_x, top_left_y, width, height). image_width (`int`): Image width. diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 542e04caeb98..3f2f7db83855 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -370,7 +370,7 @@ def affine_transform( def preprocess( self, images: ImageInput, - boxes: List[List[float]], + boxes: Union[List[List[float]], np.ndarray], do_affine_transform: bool = None, size: Dict[str, int] = None, do_rescale: bool = None, @@ -390,8 +390,8 @@ def preprocess( Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. 
- boxes (`List[List[float]]`): - List of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding + boxes (`List[List[float]]` or `np.ndarray`): + List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding box coordinates in COCO format (x, y, w, h). do_affine_transform (`bool`, *optional*, defaults to `self.do_affine_transform`): diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 92e6d7458b47..52840da87726 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -173,6 +173,10 @@ def test_model_common_attributes(self): def test_inputs_embeds(self): pass + @unittest.skip(reason="ViTPoseBackbone does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + @unittest.skip(reason="ViTPose does not support training yet") def test_training(self): pass From e621b80303ee4a76c4333bd446d586b8030b1f37 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 2 Aug 2024 05:57:48 +0000 Subject: [PATCH 063/181] add co-contributor --- docs/source/en/model_doc/vitpose.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index b6371a3f4e96..fc6e844e4fad 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -21,7 +21,7 @@ The abstract from the paper is the following: *Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.* -This model was contributed by [nielsr](https://huggingface.co/nielsr). +This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi). The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). 
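For review purposes, a short usage sketch of the `boxes` contract clarified above: one inner list of COCO-format boxes per image, so the number of detected persons can differ between images. The `nielsr/vitpose-base-simple` checkpoint name and the box values are the ones used by the tests and docs at this stage of the series; shapes in the comments follow the default 256x192 input size and the 17 COCO keypoints.

```python
import requests
import torch
from PIL import Image

from transformers import ViTPoseForPoseEstimation, ViTPoseImageProcessor

url = "http://images.cocodataset.org/val2017/000000000139.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# One inner list of COCO-format (top_left_x, top_left_y, width, height) boxes per image.
boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]

image_processor = ViTPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple")
model = ViTPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple")

# Every box becomes one affine-warped crop, so pixel_values is flattened to
# (total number of boxes, 3, 256, 192) rather than nested per image.
pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values

with torch.no_grad():
    outputs = model(pixel_values)  # heatmaps of shape (num_boxes, 17, 64, 48)

# Post-processing takes the boxes of a single image and returns one dict per person.
pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes[0])
```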
From 255ddf55a2c37409ba603e2cb8ab034a20f059d9 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 03:09:33 +0000 Subject: [PATCH 064/181] make it to tensor type output --- .../vitpose/image_processing_vitpose.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 3f2f7db83855..1bcc27d49810 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -45,7 +45,7 @@ logger = logging.get_logger(__name__) -def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: +def coco_to_pascal_voc(bboxes: torch.Tensor) -> torch.Tensor: """ Converts bounding boxes from the COCO format to the Pascal VOC format. @@ -53,11 +53,11 @@ def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: to (top_left_x, top_left_y, bottom_right_x, bottom_right_y). Args: - bboxes (`ndarray` of shape `(batch_size, 4)): + bboxes (`torch.Tensor` of shape `(batch_size, 4)): Bounding boxes in COCO format. Returns: - `np.ndarray` of shape `(batch_size, 4) in Pascal VOC format. + `torch.Tensor` of shape `(batch_size, 4) in Pascal VOC format. """ bbox_xyxy = bboxes.copy() bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0] - 1 @@ -546,12 +546,15 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): Bounding boxes. kernel_size (`int`, *optional*, defaults to 11): Gaussian kernel size (K) for modulation. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints and boxes for an image + in the batch as predicted by the model. """ # First compute centers and scales for each bounding box batch_size = len(outputs.heatmaps) - centers = np.zeros((batch_size, 2), dtype=np.float32) - scales = np.zeros((batch_size, 2), dtype=np.float32) + centers = torch.zeros((batch_size, 2), dtype=torch.float32) + scales = torch.zeros((batch_size, 2), dtype=torch.float32) for i in range(batch_size): width, height = self.size["width"], self.size["height"] center, scale = box_to_center_and_scale(boxes[i], image_width=width, image_height=height) @@ -560,20 +563,19 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): preds, scores = self.keypoints_from_heatmaps(outputs.heatmaps, centers, scales, kernel=kernel_size) - all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) - all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds = torch.zeros((batch_size, preds.shape[1], 3), dtype=torch.float32) + all_boxes = torch.zeros((batch_size, 6), dtype=torch.float32) all_preds[:, :, 0:2] = preds[:, :, 0:2] all_preds[:, :, 2:3] = scores all_boxes[:, 0:2] = centers[:, 0:2] all_boxes[:, 2:4] = scales[:, 0:2] - all_boxes[:, 4] = np.prod(scales * 200.0, axis=1) + all_boxes[:, 4] = torch.prod(scales * 200.0, axis=1) poses = all_preds - bboxes = np.array(boxes) - bboxes_xyxy = coco_to_pascal_voc(bboxes) + bboxes_xyxy = coco_to_pascal_voc(boxes) - pose_results = [] + pose_results: List[Dict[str, TensorType]] = [] for pose, bbox_xyxy in zip(poses, bboxes_xyxy): pose_result = {} pose_result["keypoints"] = pose From f0f9d618eed80517d95a4a3b82a50d45a6b8cb2f Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 03:13:15 +0000 Subject: [PATCH 065/181] add torch --- src/transformers/models/vitpose/image_processing_vitpose.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 1bcc27d49810..7f8edeaf9a3d 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -32,9 +32,13 @@ to_numpy_array, valid_images, ) -from ...utils import TensorType, is_scipy_available, is_vision_available, logging +from ...utils import TensorType, is_scipy_available, is_torch_available, is_vision_available, logging +if is_torch_available(): + import torch + from torch import nn + if is_vision_available(): import PIL From e38b20720562fa0cbe0fb80ca9659f5e538fefd7 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 03:22:14 +0000 Subject: [PATCH 066/181] change to torch tensor --- src/transformers/models/vitpose/image_processing_vitpose.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 7f8edeaf9a3d..61eac1a4d23f 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -562,8 +562,8 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): for i in range(batch_size): width, height = self.size["width"], self.size["height"] center, scale = box_to_center_and_scale(boxes[i], image_width=width, image_height=height) - centers[i, :] = center - scales[i, :] = scale + centers[i, :] = torch.Tensor(center) + scales[i, :] = torch.Tensor(scale) preds, scores = self.keypoints_from_heatmaps(outputs.heatmaps, centers, scales, kernel=kernel_size) From bbd534c53a9afc335875d5a142372a3728303d7d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 04:10:22 +0000 Subject: [PATCH 067/181] add more test --- .../vitpose/image_processing_vitpose.py | 61 ++++++++++--------- tests/models/vitpose/test_modeling_vitpose.py | 32 ++++++++-- 2 files changed, 60 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 61eac1a4d23f..1ad36678d776 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -37,7 +37,6 @@ if is_torch_available(): import torch - from torch import nn if is_vision_available(): import PIL @@ -63,44 +62,45 @@ def coco_to_pascal_voc(bboxes: torch.Tensor) -> torch.Tensor: Returns: `torch.Tensor` of shape `(batch_size, 4) in Pascal VOC format. """ - bbox_xyxy = bboxes.copy() - bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0] - 1 - bbox_xyxy[:, 3] = bbox_xyxy[:, 3] + bbox_xyxy[:, 1] - 1 + bboxes[:, 2] = bboxes[:, 2] + bboxes[:, 0] - 1 + bboxes[:, 3] = bboxes[:, 3] + bboxes[:, 1] - 1 - return bbox_xyxy + return bboxes def _get_max_preds(heatmaps): """Get keypoint predictions from score maps. Args: - heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`): + heatmaps (`torch.Tensor` or `np.ndarray` of shape `(batch_size, num_keypoints, height, width)`): Model predicted heatmaps. Returns: tuple: A tuple containing aggregated results. - - preds (np.ndarray[N, K, 2]): + - coords (torch.Tensor[N, K, 2]): Predicted keypoint location. - - scores (np.ndarray[N, K, 1]): + - scores (torch.Tensor[N, K, 1]): Scores (confidence) of the keypoints. 
""" - if not isinstance(heatmaps, np.ndarray): - raise ValueError("Heatmaps should be numpy.ndarray") + if isinstance(heatmaps, np.ndarray): + heatmaps = torch.Tensor(heatmaps) + if not isinstance(heatmaps, torch.Tensor): + raise ValueError("Heatmaps should be torch.Tensor") if heatmaps.ndim != 4: raise ValueError("Heatmaps should be 4-dimensional") batch_size, num_keypoints, _, width = heatmaps.shape - heatmaps_reshaped = heatmaps.reshape((batch_size, num_keypoints, -1)) - idx = np.argmax(heatmaps_reshaped, 2).reshape((batch_size, num_keypoints, 1)) - scores = np.amax(heatmaps_reshaped, 2).reshape((batch_size, num_keypoints, 1)) + heatmaps_reshaped = heatmaps.view(batch_size, num_keypoints, -1) + idx = torch.argmax(heatmaps_reshaped, dim=2).view(batch_size, num_keypoints, 1) + scores = torch.amax(heatmaps_reshaped, dim=2).view(batch_size, num_keypoints, 1) - preds = np.tile(idx, (1, 1, 2)).astype(np.float32) - preds[:, :, 0] = preds[:, :, 0] % width - preds[:, :, 1] = preds[:, :, 1] // width + coords = idx.repeat(1, 1, 2).float() + coords[:, :, 0] = coords[:, :, 0] % width + coords[:, :, 1] = coords[:, :, 1] // width - preds = np.where(np.tile(scores, (1, 1, 2)) > 0.0, preds, -1) - return preds, scores + coords = torch.where(scores.repeat(1, 1, 2) > 0.0, coords, -1) + return coords, scores def post_dark_udp(coords, batch_heatmaps, kernel=3): @@ -121,9 +121,11 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): Gaussian kernel size (K) for modulation. Returns: - `np.ndarray` of shape `(num_persons, num_keypoints, 2)` ): + `torch.Tensor` of shape `(num_persons, num_keypoints, 2)` ): Refined coordinates. """ + if not isinstance(coords, np.ndarray): + coords = coords.cpu().numpy() if not isinstance(batch_heatmaps, np.ndarray): batch_heatmaps = batch_heatmaps.cpu().numpy() batch_size, num_keypoints, height, width = batch_heatmaps.shape @@ -163,6 +165,7 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): hessian = hessian.reshape(num_coords, num_keypoints, 2, 2) hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) coords -= np.einsum("ijmn,ijnk->ijmk", hessian, derivative).squeeze() + coords = torch.Tensor(coords) return coords @@ -174,22 +177,22 @@ def transform_preds(coords, center, scale, output_size): num_keypoints: K Args: - coords (`np.ndarray[K, ndims]`): + coords (`torch.Tensor[K, ndims]`): * If ndims=2, corrds are predicted keypoint location. * If ndims=4, corrds are composed of (x, y, scores, tags) * If ndims=5, corrds are composed of (x, y, scores, tags, flipped_tags) - center (`np.ndarray[2,]`): + center (`torch.Tensor[2,]`): Center of the bounding box (x, y). - scale (`np.ndarray[2,]`): + scale (`torch.Tensor[2,]`): Scale of the bounding box wrt [width, height]. - output_size (`np.ndarray[2,] or `List(2,)`): + output_size (`torch.Tensor[2,]): Size of the destination heatmaps in (height, width) format. Returns: - np.ndarray: Predicted coordinates in the images. + torch.Tensor: Predicted coordinates in the images. 
""" if coords.shape[1] not in (2, 4, 5): raise ValueError("Coordinates need to have either 2, 4 or 5 dimensions.") @@ -207,7 +210,7 @@ def transform_preds(coords, center, scale, output_size): scale_y = scale[1] / (output_size[0] - 1.0) scale_x = scale[0] / (output_size[1] - 1.0) - target_coords = np.ones_like(coords) + target_coords = torch.ones_like(coords) target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 @@ -529,9 +532,9 @@ def keypoints_from_heatmaps( batch_size, _, height, width = heatmaps.shape - preds, scores = _get_max_preds(heatmaps) + coords, scores = _get_max_preds(heatmaps) - preds = post_dark_udp(preds, heatmaps, kernel=kernel) + preds = post_dark_udp(coords, heatmaps, kernel=kernel) # Transform back to the image for i in range(batch_size): @@ -577,9 +580,9 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): poses = all_preds - bboxes_xyxy = coco_to_pascal_voc(boxes) + bboxes_xyxy = coco_to_pascal_voc(all_boxes) - pose_results: List[Dict[str, TensorType]] = [] + pose_results: List[Dict[str, torch.Tensor]] = [] for pose, bbox_xyxy in zip(poses, bboxes_xyxy): pose_result = {} pose_result["keypoints"] = pose diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 52840da87726..8755bd107764 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -17,6 +17,8 @@ import inspect import unittest +import requests + from transformers import ViTPoseBackboneConfig, ViTPoseConfig from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available @@ -217,9 +219,10 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) -# We will verify our results on an image of cute cats +# We will verify our results on an image of people in house def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + url = "http://images.cocodataset.org/val2017/000000000139.jpg" + image = Image.open(requests.get(url, stream=True).raw) return image @@ -232,7 +235,7 @@ def default_image_processor(self): return ViTPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") if is_vision_available() else None @slow - def test_inference(self): + def test_inference_pose_estimation(self): image_processor = self.default_image_processor # TODO update organization model = ViTPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") @@ -247,10 +250,31 @@ def test_inference(self): assert heatmaps.shape == (2, 17, 64, 48) - expected_slice = torch.tensor([[0.0003, 0.0003, 0.0003], [0.0005, 0.0007, 0.0007], [0.0006, 0.0007, 0.0007]]) + expected_slice = torch.tensor( + [ + [9.9330e-06, 9.9330e-06, 9.9330e-06], + [9.9330e-06, 9.9330e-06, 9.9330e-06], + [9.9330e-06, 9.9330e-06, 9.9330e-06], + ] + ) assert torch.allclose(heatmaps[0, 0, :3, :3], expected_slice, atol=1e-4) + pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes) + + expected_bbox = torch.tensor([439.3250, 226.6150, 438.9719, 226.4776, 22320.4219, 0.0000]).to(torch_device) + expected_keypoints = torch.tensor( + [ + [3.9813e02, 1.8184e02, 8.7529e-01], + [3.9828e02, 1.7981e02, 8.4315e-01], + [3.9596e02, 1.7948e02, 9.2678e-01], + ] + ).to(torch_device) + + self.assertEqual(len(pose_results), 2) + 
self.assertTrue(torch.allclose(pose_results[0]["bbox"], expected_bbox, atol=1e-4)) + self.assertTrue(torch.allclose(pose_results[0]["keypoints"], expected_keypoints, atol=1e-4)) + @slow def test_batched_inference(self): raise NotImplementedError("To do") From dcf14854abcd137e92a8731e84f6b542178a6c4d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 04:31:04 +0000 Subject: [PATCH 068/181] minor change --- tests/models/vitpose/test_image_processing_vitpose.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index d6dd6c68b767..7ef8ef36bd0d 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -18,7 +18,7 @@ import numpy as np -from transformers.testing_utils import require_cv2, require_torch, require_vision +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -93,11 +93,11 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision -@require_cv2 class ViTPoseImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = ViTPoseImageProcessor if is_vision_available() else None def setUp(self): + super().setUp() self.image_processor_tester = ViTPoseImageProcessingTester(self) @property From d10fb30c6ed22f141973ec88d99dffc7d58022ad Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 04:58:39 +0000 Subject: [PATCH 069/181] CI test change --- tests/models/vitpose/test_image_processing_vitpose.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index 7ef8ef36bd0d..47477c0ba06d 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -198,14 +198,13 @@ def test_call_numpy_4_channels(self): # create random numpy tensors self.image_processor_tester.num_channels = 4 image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - # Test not batched input boxes = [[[0, 0, 1, 1], [0.5, 0.5, 0.5, 0.5]]] encoded_images = image_processor( image_inputs[0], boxes=boxes, return_tensors="pt", - input_data_format="channels_first", + input_data_format="channels_last", image_mean=0, image_std=1, ).pixel_values @@ -218,7 +217,7 @@ def test_call_numpy_4_channels(self): image_inputs, boxes=boxes, return_tensors="pt", - input_data_format="channels_first", + input_data_format="channels_last", image_mean=0, image_std=1, ).pixel_values From 741f07bc0f55427a1ac8cb5b95e4c74d2597a90b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 07:49:46 +0000 Subject: [PATCH 070/181] import torch should be above ImageProcessor --- tests/models/vitpose/test_image_processing_vitpose.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index 47477c0ba06d..0fb450e6bd5c 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -24,14 +24,15 @@ from ...test_image_processing_common import 
ImageProcessingTestMixin, prepare_image_inputs +if is_torch_available(): + import torch + + if is_vision_available(): from PIL import Image from transformers import ViTPoseImageProcessor -if is_torch_available(): - import torch - class ViTPoseImageProcessingTester(unittest.TestCase): def __init__( From 2a3d79226915612f13ea8a4d37bcec05f6d0fbe6 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 07:52:30 +0000 Subject: [PATCH 071/181] make style --- tests/models/vitpose/test_image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index 0fb450e6bd5c..aa4198081448 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -26,7 +26,7 @@ if is_torch_available(): import torch - + if is_vision_available(): from PIL import Image From 9cdbf5f22e23a3b5580069f07d2627f88b355055 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 3 Aug 2024 08:13:47 +0000 Subject: [PATCH 072/181] try not use torch in def --- .../vitpose/image_processing_vitpose.py | 67 +++++++++---------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 1ad36678d776..640961faea98 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -48,7 +48,7 @@ logger = logging.get_logger(__name__) -def coco_to_pascal_voc(bboxes: torch.Tensor) -> torch.Tensor: +def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: """ Converts bounding boxes from the COCO format to the Pascal VOC format. @@ -56,11 +56,11 @@ def coco_to_pascal_voc(bboxes: torch.Tensor) -> torch.Tensor: to (top_left_x, top_left_y, bottom_right_x, bottom_right_y). Args: - bboxes (`torch.Tensor` of shape `(batch_size, 4)): + bboxes (`np.ndarray` of shape `(batch_size, 4)): Bounding boxes in COCO format. Returns: - `torch.Tensor` of shape `(batch_size, 4) in Pascal VOC format. + `np.ndarray` of shape `(batch_size, 4) in Pascal VOC format. """ bboxes[:, 2] = bboxes[:, 2] + bboxes[:, 0] - 1 bboxes[:, 3] = bboxes[:, 3] + bboxes[:, 1] - 1 @@ -72,35 +72,33 @@ def _get_max_preds(heatmaps): """Get keypoint predictions from score maps. Args: - heatmaps (`torch.Tensor` or `np.ndarray` of shape `(batch_size, num_keypoints, height, width)`): + heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`): Model predicted heatmaps. Returns: tuple: A tuple containing aggregated results. - - coords (torch.Tensor[N, K, 2]): + - coords (np.ndarray[N, K, 2]): Predicted keypoint location. - - scores (torch.Tensor[N, K, 1]): + - scores (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 
""" - if isinstance(heatmaps, np.ndarray): - heatmaps = torch.Tensor(heatmaps) - if not isinstance(heatmaps, torch.Tensor): - raise ValueError("Heatmaps should be torch.Tensor") + if not isinstance(heatmaps, np.ndarray): + raise ValueError("Heatmaps should be np.ndarray") if heatmaps.ndim != 4: raise ValueError("Heatmaps should be 4-dimensional") batch_size, num_keypoints, _, width = heatmaps.shape - heatmaps_reshaped = heatmaps.view(batch_size, num_keypoints, -1) - idx = torch.argmax(heatmaps_reshaped, dim=2).view(batch_size, num_keypoints, 1) - scores = torch.amax(heatmaps_reshaped, dim=2).view(batch_size, num_keypoints, 1) + heatmaps_reshaped = heatmaps.reshape((batch_size, num_keypoints, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((batch_size, num_keypoints, 1)) + scores = np.amax(heatmaps_reshaped, 2).reshape((batch_size, num_keypoints, 1)) - coords = idx.repeat(1, 1, 2).float() - coords[:, :, 0] = coords[:, :, 0] % width - coords[:, :, 1] = coords[:, :, 1] // width + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + preds[:, :, 0] = preds[:, :, 0] % width + preds[:, :, 1] = preds[:, :, 1] // width - coords = torch.where(scores.repeat(1, 1, 2) > 0.0, coords, -1) - return coords, scores + preds = np.where(np.tile(scores, (1, 1, 2)) > 0.0, preds, -1) + return preds, scores def post_dark_udp(coords, batch_heatmaps, kernel=3): @@ -121,7 +119,7 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): Gaussian kernel size (K) for modulation. Returns: - `torch.Tensor` of shape `(num_persons, num_keypoints, 2)` ): + `np.ndarray` of shape `(num_persons, num_keypoints, 2)` ): Refined coordinates. """ if not isinstance(coords, np.ndarray): @@ -165,7 +163,6 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): hessian = hessian.reshape(num_coords, num_keypoints, 2, 2) hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) coords -= np.einsum("ijmn,ijnk->ijmk", hessian, derivative).squeeze() - coords = torch.Tensor(coords) return coords @@ -177,22 +174,22 @@ def transform_preds(coords, center, scale, output_size): num_keypoints: K Args: - coords (`torch.Tensor[K, ndims]`): + coords (`np.ndarray[K, ndims]`): * If ndims=2, corrds are predicted keypoint location. * If ndims=4, corrds are composed of (x, y, scores, tags) * If ndims=5, corrds are composed of (x, y, scores, tags, flipped_tags) - center (`torch.Tensor[2,]`): + center (`np.ndarray[2,]`): Center of the bounding box (x, y). - scale (`torch.Tensor[2,]`): + scale (`np.ndarray[2,]`): Scale of the bounding box wrt [width, height]. - output_size (`torch.Tensor[2,]): + output_size (`np.ndarray[2,]): Size of the destination heatmaps in (height, width) format. Returns: - torch.Tensor: Predicted coordinates in the images. + np.ndarray: Predicted coordinates in the images. 
""" if coords.shape[1] not in (2, 4, 5): raise ValueError("Coordinates need to have either 2, 4 or 5 dimensions.") @@ -210,7 +207,7 @@ def transform_preds(coords, center, scale, output_size): scale_y = scale[1] / (output_size[0] - 1.0) scale_x = scale[0] / (output_size[1] - 1.0) - target_coords = torch.ones_like(coords) + target_coords = np.ones_like(coords) target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 @@ -560,27 +557,27 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): # First compute centers and scales for each bounding box batch_size = len(outputs.heatmaps) - centers = torch.zeros((batch_size, 2), dtype=torch.float32) - scales = torch.zeros((batch_size, 2), dtype=torch.float32) + centers = np.zeros((batch_size, 2), dtype=np.float32) + scales = np.zeros((batch_size, 2), dtype=np.float32) for i in range(batch_size): width, height = self.size["width"], self.size["height"] center, scale = box_to_center_and_scale(boxes[i], image_width=width, image_height=height) - centers[i, :] = torch.Tensor(center) - scales[i, :] = torch.Tensor(scale) + centers[i, :] = center + scales[i, :] = scale preds, scores = self.keypoints_from_heatmaps(outputs.heatmaps, centers, scales, kernel=kernel_size) - all_preds = torch.zeros((batch_size, preds.shape[1], 3), dtype=torch.float32) - all_boxes = torch.zeros((batch_size, 6), dtype=torch.float32) + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) all_preds[:, :, 0:2] = preds[:, :, 0:2] all_preds[:, :, 2:3] = scores all_boxes[:, 0:2] = centers[:, 0:2] all_boxes[:, 2:4] = scales[:, 0:2] - all_boxes[:, 4] = torch.prod(scales * 200.0, axis=1) + all_boxes[:, 4] = np.prod(scales * 200.0, axis=1) - poses = all_preds + poses = torch.Tensor(all_preds) - bboxes_xyxy = coco_to_pascal_voc(all_boxes) + bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) pose_results: List[Dict[str, torch.Tensor]] = [] for pose, bbox_xyxy in zip(poses, bboxes_xyxy): From 7f7e9ecb3c1083fbdf72b79be7bfdb5f9d67fd45 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Mon, 12 Aug 2024 23:33:22 +0900 Subject: [PATCH 073/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 640961faea98..a7dbb67127c1 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -472,7 +472,7 @@ def preprocess( # since the number of boxes can differ per image, the image processor takes a list # rather than a numpy array of boxes - # it currently create pixel_values of shape (batch_size*num_persons, num_channels, height, width) + # it currently creates pixel_values of shape (batch_size*num_persons, num_channels, height, width) if self.do_rescale: images = [ From de22f650a1f30208add56862f3d676ea62f32914 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Mon, 12 Aug 2024 23:33:48 +0900 Subject: [PATCH 074/181] Update 
src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/vitpose_backbone/configuration_vitpose_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 944eff4e0cf5..c3b6dbc99e94 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -24,7 +24,7 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTPoseBackbone`]. It is used to instantiate an + This is the configuration class to store the configuration of a [`ViTPoseBackbone`]. It is used to instantiate a ViTPose model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ViTPose [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. From cf0743207d37cce819ee3f577a65ee0e64da3743 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Mon, 12 Aug 2024 23:33:57 +0900 Subject: [PATCH 075/181] Update src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 4befe1ce9e41..c445f2ee21f7 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -412,7 +412,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No behavior. Parameters: - config ([`ViTPoseConfig`]): Model configuration class with all the parameters of the model. + config ([`ViTPoseBackboneConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ From 2aef46bdcc85add9f423c02db029b2250a5e4e7b Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:09:30 +0900 Subject: [PATCH 076/181] Update src/transformers/models/vitpose/modeling_vitpose.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/vitpose/modeling_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index f56589097a01..842a906f79aa 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -116,7 +116,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No This corresponds to the dataset index used during training, e.g. index 0 refers to COCO. 
flip_pairs (`torch.tensor`, *optional*): - Pairs of keypoints which are mirrored (for example, left ear -- right ear). + Whether to mirror pairs of keypoints (for example, left ear -- right ear). output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned From 3cc8d2a418e6a82bc78c043f7788d396bb864b40 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 13 Aug 2024 00:42:22 +0000 Subject: [PATCH 077/181] fix --- docs/source/en/model_doc/vitpose.md | 49 +++++++++++++++++++ src/transformers/image_transforms.py | 1 - .../models/vitpose/modeling_vitpose.py | 10 ++-- .../configuration_vitpose_backbone.py | 4 +- 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index fc6e844e4fad..12aeee91d4d7 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -24,6 +24,55 @@ The abstract from the paper is the following: This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi). The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). +## Usage Tips + +The current model utilizes a 2-step inference pipeline. The first step involves placing a bounding box around the region corresponding to the person. After that, the second step uses ViTPose to predict the keypoints. + +```py +>>> import torch +>>> import requests + +>>> from PIL import Image +>>> from transformers import ViTPoseImageProcessor, ViTPoseForPoseEstimation + +>>> url = 'http://images.cocodataset.org/val2017/000000000139.jpg' +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image_processor = ViTPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") +>>> model = ViTPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") + +>>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] + +>>> pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values + +>>> with torch.no_grad(): +... outputs = model(pixel_values) + +>>> pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes[0]) + +>>> for pose_result in pose_results: +... for keypoint in pose_result['keypoints']: +... x, y, score = keypoint +... 
print(f"coordinate : [{x}, {y}], score : {score}") +coordinate : [428.25335693359375, 170.24496459960938], score : 0.8717536330223083 +coordinate : [429.13037109375, 167.39605712890625], score : 0.8820509910583496 +coordinate : [428.23681640625, 167.72825622558594], score : 0.7663289308547974 +coordinate : [433.1866455078125, 167.2566680908203], score : 0.933370053768158 +coordinate : [440.34075927734375, 166.58522033691406], score : 0.8911094069480896 +coordinate : [439.90283203125, 177.54049682617188], score : 0.9118685722351074 +coordinate : [445.50372314453125, 178.04055786132812], score : 0.751734733581543 +coordinate : [436.45819091796875, 199.42474365234375], score : 0.8745120167732239 +coordinate : [433.68255615234375, 200.17333984375], score : 0.5155676603317261 +coordinate : [430.5008544921875, 218.7760009765625], score : 0.8757728338241577 +coordinate : [420.5921630859375, 213.15621948242188], score : 0.9036439657211304 +coordinate : [445.17218017578125, 222.87921142578125], score : 0.8029380440711975 +coordinate : [452.07672119140625, 222.17730712890625], score : 0.8517846465110779 +coordinate : [441.92657470703125, 255.0374755859375], score : 0.8607744574546814 +coordinate : [451.2308349609375, 254.36398315429688], score : 0.8495950698852539 +coordinate : [443.9051513671875, 287.5822448730469], score : 0.703719437122345 +coordinate : [455.88482666015625, 285.6434631347656], score : 0.8391701579093933 +``` + ## ViTPoseImageProcessor diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7234d1062dd4..2cc014c3e93b 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -578,7 +578,6 @@ def box_to_center_and_scale( elif width < aspect_ratio * height: width = height * aspect_ratio - # pixel std is 200.0 scale = np.array([width / pixel_std, height / pixel_std], dtype=np.float32) scale = scale * padding diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index f56589097a01..81c5bfa6866c 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -151,19 +151,19 @@ def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): if output_flipped.ndim != 4: raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]") - shape_ori = output_flipped.shape + original_shape = output_flipped.shape channels = 1 if target_type.lower() == "CombinedTarget".lower(): channels = 3 output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] - output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, shape_ori[2], shape_ori[3]) + output_flipped = output_flipped.reshape(original_shape[0], -1, channels, original_shape[2], original_shape[3]) output_flipped_back = output_flipped.clone() # Swap left-right parts for left, right in flip_pairs.tolist(): output_flipped_back[:, left, ...] = output_flipped[:, right, ...] output_flipped_back[:, right, ...] = output_flipped[:, left, ...] 
- output_flipped_back = output_flipped_back.reshape(shape_ori) + output_flipped_back = output_flipped_back.reshape(original_shape) # Flip horizontally output_flipped_back = output_flipped_back.flip(-1) return output_flipped_back @@ -316,8 +316,8 @@ def forward( # Turn output hidden states in tensor of shape (batch_size, num_channels, height, width) sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1] batch_size = sequence_output.shape[0] - patch_height = self.config.image_size[0] // self.config.patch_size[0] - patch_width = self.config.image_size[1] // self.config.patch_size[1] + patch_height = self.config.image_size[0] // self.config.patch_size + patch_width = self.config.image_size[1] // self.config.patch_size sequence_output = ( sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous() ) diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 944eff4e0cf5..98fe9694cbe9 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -35,7 +35,7 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): Args: image_size (`int`, *optional*, defaults to `[256, 192]`): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to `[16, 16]`): + patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. @@ -95,7 +95,7 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): def __init__( self, image_size=[256, 192], - patch_size=[16, 16], + patch_size=16, num_channels=3, hidden_size=768, num_hidden_layers=12, From 5bdd62e6d523b10bcd9062c8d37442f7c1bf3e4c Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 13 Aug 2024 01:20:57 +0000 Subject: [PATCH 078/181] fix --- src/transformers/models/vitpose/image_processing_vitpose.py | 4 ++-- src/transformers/models/vitpose/modeling_vitpose.py | 4 ++-- .../models/vitpose_backbone/configuration_vitpose_backbone.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index a7dbb67127c1..ca31438960c9 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -68,7 +68,7 @@ def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: return bboxes -def _get_max_preds(heatmaps): +def get_keypoint_predictions(heatmaps): """Get keypoint predictions from score maps. 
Args: @@ -529,7 +529,7 @@ def keypoints_from_heatmaps( batch_size, _, height, width = heatmaps.shape - coords, scores = _get_max_preds(heatmaps) + coords, scores = get_keypoint_predictions(heatmaps) preds = post_dark_udp(coords, heatmaps, kernel=kernel) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 41494774ca54..78a160ec47ce 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -316,8 +316,8 @@ def forward( # Turn output hidden states in tensor of shape (batch_size, num_channels, height, width) sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1] batch_size = sequence_output.shape[0] - patch_height = self.config.image_size[0] // self.config.patch_size - patch_width = self.config.image_size[1] // self.config.patch_size + patch_height = self.config.image_size[0] // self.config.patch_size[0] + patch_width = self.config.image_size[1] // self.config.patch_size[1] sequence_output = ( sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous() ) diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 5994ed09ceb4..705b72c890e0 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -35,7 +35,7 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): Args: image_size (`int`, *optional*, defaults to `[256, 192]`): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): + patch_size (`List[int]`, *optional*, defaults to `[16, 16]`): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. @@ -95,7 +95,7 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): def __init__( self, image_size=[256, 192], - patch_size=16, + patch_size=[16, 16], num_channels=3, hidden_size=768, num_hidden_layers=12, From 2f40861fe2a2336993781ad597b0ddf850d5023d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 19 Aug 2024 00:40:40 +0000 Subject: [PATCH 079/181] add caution --- src/transformers/models/vitpose/configuration_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 55e54fb0067e..13baecf6b2db 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -34,7 +34,7 @@ class ViTPoseConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `VitPoseBackboneConfig()`): - The configuration of the backbone model. + The configuration of the backbone model. Currently backbone_config with `vitpose_backbone` model_type is only supported. backbone (`str`, *optional*): Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this will load the corresponding pretrained weights from the timm or transformers library. 
If `use_pretrained_backbone` From 5e8b89e3d1026f104f5468bd31e4cd8b38e7585d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 19 Aug 2024 01:06:59 +0000 Subject: [PATCH 080/181] make more detail about dataset_index --- src/transformers/models/vitpose/modeling_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 78a160ec47ce..47be7d62da72 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -113,7 +113,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No dataset_index (`torch.Tensor` of shape `(batch_size,)`): Index to use in the Mixture-of-Experts (MoE) blocks of the backbone. - This corresponds to the dataset index used during training, e.g. index 0 refers to COCO. + This corresponds to the dataset index used during training, e.g. For the single dataset index 0 refers to the corresponding dataset. For the multiple datasets index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose). flip_pairs (`torch.tensor`, *optional*): Whether to mirror pairs of keypoints (for example, left ear -- right ear). From c19c97add7d0aec47f881a4997be93f1db82d5a0 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:28:01 +0200 Subject: [PATCH 081/181] Update src/transformers/models/vitpose/modeling_vitpose.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> --- src/transformers/models/vitpose/modeling_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 47be7d62da72..02b41f01e5d5 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -46,7 +46,7 @@ class PoseEstimatorOutput(ModelOutput): Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Classification (or regression if config.num_labels==1) loss. + Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail. heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`): Heatmaps as predicted by the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): From f064009944cc048fdc0184ee53c14d67e35b7450 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:28:08 +0200 Subject: [PATCH 082/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index ca31438960c9..ed5016549d60 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -396,7 +396,7 @@ def preprocess( boxes (`List[List[float]]` or `np.ndarray`): List or array of bounding boxes for each image. 
Each box should be a list of 4 floats representing the bounding - box coordinates in COCO format (x, y, w, h). + box coordinates in COCO format (top_left_x, top_left_y, width, height). do_affine_transform (`bool`, *optional*, defaults to `self.do_affine_transform`): Whether to apply an affine transformation to the input images. From e9c6b1e75270bccefa35d7310ba226b6ee554e87 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 20 Aug 2024 15:06:11 +0000 Subject: [PATCH 083/181] add docs --- docs/source/en/model_doc/vitpose.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 12aeee91d4d7..0e8855ae7a11 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -26,7 +26,10 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo ## Usage Tips -The current model utilizes a 2-step inference pipeline. The first step involves placing a bounding box around the region corresponding to the person. After that, the second step uses ViTPose to predict the keypoints. +- To enable MoE(Mixture or Expert) function in backbone, user have to give appropriate input indices into the backbone model. + However, it is not used in default parameters. +- The current model utilizes a 2-step inference pipeline. The first step involves placing a bounding box around the region corresponding to the person. + After that, the second step uses ViTPose to predict the keypoints. ```py >>> import torch From 80e05453766dc52a3dba230c8a7f0e04234e3054 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Thu, 29 Aug 2024 14:20:29 +0900 Subject: [PATCH 084/181] Update docs/source/en/model_doc/vitpose.md --- docs/source/en/model_doc/vitpose.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 0e8855ae7a11..fcecf107370c 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](vit) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. +The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](https://arxiv.org/pdf/2010.11929v2) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. 
The abstract from the paper is the following: From 533d298e59d1274af1c543763f4ee52a29f08c33 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 3 Sep 2024 09:42:50 +0900 Subject: [PATCH 085/181] Update src/transformers/models/vitpose/configuration_vitpose.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/vitpose/configuration_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 13baecf6b2db..bb9496564e87 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -34,7 +34,7 @@ class ViTPoseConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `VitPoseBackboneConfig()`): - The configuration of the backbone model. Currently backbone_config with `vitpose_backbone` model_type is only supported. + The configuration of the backbone model. Currently, only `backbone_config` with `vitpose_backbone` as `model_type` is supported. backbone (`str`, *optional*): Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` From 7ffa504450bb9dbccf9c7ea668441b98a1939d5c Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 3 Sep 2024 09:44:27 +0900 Subject: [PATCH 086/181] Update src/transformers/__init__.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f8e82121bae9..ebf2791c8503 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -775,8 +775,8 @@ "models.vit_msn": ["ViTMSNConfig"], "models.vitdet": ["VitDetConfig"], "models.vitmatte": ["VitMatteConfig"], - "models.vitpose": ["ViTPoseConfig"], - "models.vitpose_backbone": ["ViTPoseBackboneConfig"], + "models.vitpose": ["VitPoseConfig"], + "models.vitpose_backbone": ["VitPoseBackboneConfig"], "models.vits": [ "VitsConfig", "VitsTokenizer", From 68da46a6699618fabe2119edbd470c6e9dfa28c5 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 3 Sep 2024 02:23:12 +0000 Subject: [PATCH 087/181] Revert "Update src/transformers/__init__.py" This reverts commit 7ffa504450bb9dbccf9c7ea668441b98a1939d5c. 
--- src/transformers/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ebf2791c8503..f8e82121bae9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -775,8 +775,8 @@ "models.vit_msn": ["ViTMSNConfig"], "models.vitdet": ["VitDetConfig"], "models.vitmatte": ["VitMatteConfig"], - "models.vitpose": ["VitPoseConfig"], - "models.vitpose_backbone": ["VitPoseBackboneConfig"], + "models.vitpose": ["ViTPoseConfig"], + "models.vitpose_backbone": ["ViTPoseBackboneConfig"], "models.vits": [ "VitsConfig", "VitsTokenizer", From 72f8fcbaabe85caf8d4e7f8cfa2595a177dfcd98 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 3 Sep 2024 02:23:43 +0000 Subject: [PATCH 088/181] change name --- .../modeling_vitpose_backbone.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index c445f2ee21f7..45d7af50872f 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -244,23 +244,23 @@ def __init__(self, config: ViTPoseBackboneConfig) -> None: experts = [nn.Linear(hidden_features, part_features) for _ in range(num_experts)] self.experts = nn.ModuleList(experts) - def forward(self, x, indices): - expert_x = torch.zeros_like(x[:, :, -self.part_features :]) + def forward(self, hidden_state, indices): + expert_hidden_state = torch.zeros_like(hidden_state[:, :, -self.part_features :]) - x = self.fc1(x) - x = self.act(x) - shared_x = self.fc2(x) + hidden_state = self.fc1(hidden_state) + hidden_state = self.act(hidden_state) + shared_hidden_state = self.fc2(hidden_state) indices = indices.view(-1, 1, 1) # to support ddp training for i in range(self.num_experts): selectedIndex = indices == i - current_x = self.experts[i](x) * selectedIndex - expert_x = expert_x + current_x + current_hidden_state = self.experts[i](hidden_state) * selectedIndex + expert_hidden_state = expert_hidden_state + current_hidden_state - x = torch.cat([shared_x, expert_x], dim=-1) + hidden_state = torch.cat([shared_hidden_state, expert_hidden_state], dim=-1) - return x + return hidden_state class ViTPoseBackboneMLP(nn.Module): From 7d82ba68eba248d85bc164ceea057fa6f3d246e4 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 3 Sep 2024 22:58:44 +0900 Subject: [PATCH 089/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index ed5016549d60..e9a34be7c090 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -512,7 +512,7 @@ def keypoints_from_heatmaps( Center of the bounding box (x, y). scale (np.ndarray[N, 2]): Scale of the bounding box wrt height/width. - kernel (int): + kernel (int, *optional*, defaults to 11): Gaussian kernel size (K) for modulation, which should match the heatmap gaussian sigma when training. K=17 for sigma=3 and k=11 for sigma=2. 
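For reference, the Gaussian modulation that this kernel size controls can be sketched in a few lines with scipy (the sigma=0.8 value and the radius derivation mirror the image-processing code in this series; note that the `radius` keyword of `gaussian_filter` requires SciPy >= 1.10):

```py
import numpy as np
from scipy.ndimage import gaussian_filter

kernel = 11                 # should match the training sigma (k=11 for sigma=2)
radius = (kernel - 1) // 2  # same radius derivation as in the image processor

heatmap = np.random.rand(64, 48).astype(np.float32)

# Smooth the heatmap before the log/derivative refinement step, clipping to avoid log(0).
smoothed = gaussian_filter(heatmap, sigma=0.8, radius=radius)
smoothed = np.log(np.clip(smoothed, 0.001, 50))
print(smoothed.shape)  # (64, 48)
```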
From 26ee67f3dc3519bce136d61cd9f4a5bbed676116 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 3 Sep 2024 23:01:35 +0900 Subject: [PATCH 090/181] Update tests/models/vitpose/test_modeling_vitpose.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- tests/models/vitpose/test_modeling_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 8755bd107764..8a28bef8f122 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -175,7 +175,7 @@ def test_model_common_attributes(self): def test_inputs_embeds(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support input and output embeddings") + @unittest.skip(reason="ViTPose does not support input and output embeddings") def test_model_get_set_embeddings(self): pass From 50d65ead600c676ad12e9ec0c95bf25b8e687dcf Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 3 Sep 2024 23:02:53 +0900 Subject: [PATCH 091/181] Update docs/source/en/model_doc/vitpose.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/vitpose.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index fcecf107370c..864874873130 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -26,7 +26,7 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo ## Usage Tips -- To enable MoE(Mixture or Expert) function in backbone, user have to give appropriate input indices into the backbone model. +- To enable MoE (Mixture of Experts) function in the backbone, the user has to give appropriate input indices to the backbone model. However, it is not used in default parameters. - The current model utilizes a 2-step inference pipeline. The first step involves placing a bounding box around the region corresponding to the person. After that, the second step uses ViTPose to predict the keypoints. From 292733414f849c85a0a69d41b6e4b6ed610fb974 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 3 Sep 2024 23:06:14 +0900 Subject: [PATCH 092/181] Update src/transformers/models/vitpose/modeling_vitpose.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/models/vitpose/modeling_vitpose.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 02b41f01e5d5..5a450f9e98ad 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -151,19 +151,19 @@ def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): if output_flipped.ndim != 4: raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]") - original_shape = output_flipped.shape + batch_size, num_keypoints, height, width = output_flipped.shape channels = 1 if target_type.lower() == "CombinedTarget".lower(): channels = 3 output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] 
- output_flipped = output_flipped.reshape(original_shape[0], -1, channels, original_shape[2], original_shape[3]) + output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width) output_flipped_back = output_flipped.clone() # Swap left-right parts for left, right in flip_pairs.tolist(): output_flipped_back[:, left, ...] = output_flipped[:, right, ...] output_flipped_back[:, right, ...] = output_flipped[:, left, ...] - output_flipped_back = output_flipped_back.reshape(original_shape) + output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width)) # Flip horizontally output_flipped_back = output_flipped_back.flip(-1) return output_flipped_back From 48ce1b4bb70f71809bc3d7d867ac0440d0b02e46 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 3 Sep 2024 23:19:32 +0900 Subject: [PATCH 093/181] Update src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 45d7af50872f..f678ee897c0c 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -91,7 +91,6 @@ def __init__(self, config: ViTPoseBackboneConfig) -> None: num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.config = config def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: embeddings = self.patch_embeddings(pixel_values) From 8132991372ca3aac897400f025f0d3879784bc2e Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:12:13 +0900 Subject: [PATCH 094/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/vitpose/image_processing_vitpose.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index e9a34be7c090..d135ef57fa89 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -131,9 +131,10 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): if not (batch_size == 1 or batch_size == num_coords): raise ValueError("The batch size of heatmaps should be 1 or equal to the batch size of coordinates.") radius = int((kernel - 1) // 2) - for heatmaps in batch_heatmaps: - for heatmap in heatmaps: - gaussian_filter(heatmap, sigma=0.8, output=heatmap, radius=(radius, radius), axes=(0, 1)) + batch_heatmaps = np.array([ + [gaussian_filter(heatmap, sigma=0.8, radius=(radius, radius), axes=(0, 1)) for heatmap in heatmaps] + for heatmap in batch_heatmaps + ]) np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) np.log(batch_heatmaps, batch_heatmaps) From eeb4a6f829f117603f45c39a5152ca9e61211034 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 00:26:09 +0000 Subject: [PATCH 095/181] move vitpose only function to image_processor --- 
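As an illustration of the flip-back logic touched above, a simplified PyTorch sketch of the `gaussian-heatmap` case only (the mirrored keypoint pairs shown are the usual COCO ones and serve as an example; the real function also handles the combined-target layout):

```py
import torch

# Heatmaps predicted on a horizontally flipped image: (batch, num_keypoints, height, width).
flipped = torch.rand(1, 17, 64, 48)

# Mirrored keypoint pairs, e.g. the usual COCO ones: eyes, ears, shoulders, elbows,
# wrists, hips, knees, ankles.
flip_pairs = torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]])

restored = flipped.clone()
for left, right in flip_pairs.tolist():
    # Swap the channels of mirrored keypoints ...
    restored[:, left] = flipped[:, right]
    restored[:, right] = flipped[:, left]

# ... then undo the horizontal flip along the width axis.
restored = restored.flip(-1)
print(restored.shape)  # torch.Size([1, 17, 64, 48])
```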
src/transformers/image_transforms.py | 45 ------------------ .../vitpose/image_processing_vitpose.py | 47 ++++++++++++++++++- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7c18974fe7be..baf5ec95c4b8 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -539,51 +539,6 @@ def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor": return bboxes_corners -# inspired by https://github.com/ViTAE-Transformer/ViTPose/blob/d5216452796c90c6bc29f5c5ec0bdba94366768a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py#L132 -def box_to_center_and_scale( - box: Union[Tuple, List, np.ndarray], - image_width: int, - image_height: int, - pixel_std: float = 200.0, - padding: float = 1.25, -): - """ - Encodes a bounding box in COCO format into (center, scale). - - Args: - box (`Tuple`, `List`, or `np.ndarray`): - Bounding box in COCO format (top_left_x, top_left_y, width, height). - image_width (`int`): - Image width. - image_height (`int`): - Image height. - pixel_std (`float`): - Width and height scale factor. - padding (`float`): - Bounding box padding factor. - - Returns: - tuple: A tuple containing center and scale. - - - `np.ndarray` [float32](2,): Center of the bbox (x, y). - - `np.ndarray` [float32](2,): Scale of the bbox width & height. - """ - - top_left_x, top_left_y, width, height = box[:4] - aspect_ratio = image_width / image_height - center = np.array([top_left_x + width * 0.5, top_left_y + height * 0.5], dtype=np.float32) - - if width > aspect_ratio * height: - height = width * 1.0 / aspect_ratio - elif width < aspect_ratio * height: - width = height * aspect_ratio - - scale = np.array([width / pixel_std, height / pixel_std], dtype=np.float32) - scale = scale * padding - - return center, scale - - # 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py def center_to_corners_format(bboxes_center: TensorType) -> TensorType: """ diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index e9a34be7c090..d0c30420a627 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -20,7 +20,7 @@ import numpy as np from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import box_to_center_and_scale, to_channel_dimension_format +from ...image_transforms import to_channel_dimension_format from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -48,6 +48,51 @@ logger = logging.get_logger(__name__) +# inspired by https://github.com/ViTAE-Transformer/ViTPose/blob/d5216452796c90c6bc29f5c5ec0bdba94366768a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py#L132 +def box_to_center_and_scale( + box: Union[Tuple, List, np.ndarray], + image_width: int, + image_height: int, + pixel_std: float = 200.0, + padding: float = 1.25, +): + """ + Encodes a bounding box in COCO format into (center, scale). + + Args: + box (`Tuple`, `List`, or `np.ndarray`): + Bounding box in COCO format (top_left_x, top_left_y, width, height). + image_width (`int`): + Image width. + image_height (`int`): + Image height. + pixel_std (`float`): + Width and height scale factor. + padding (`float`): + Bounding box padding factor. + + Returns: + tuple: A tuple containing center and scale. 
+ + - `np.ndarray` [float32](2,): Center of the bbox (x, y). + - `np.ndarray` [float32](2,): Scale of the bbox width & height. + """ + + top_left_x, top_left_y, width, height = box[:4] + aspect_ratio = image_width / image_height + center = np.array([top_left_x + width * 0.5, top_left_y + height * 0.5], dtype=np.float32) + + if width > aspect_ratio * height: + height = width * 1.0 / aspect_ratio + elif width < aspect_ratio * height: + width = height * aspect_ratio + + scale = np.array([width / pixel_std, height / pixel_std], dtype=np.float32) + scale = scale * padding + + return center, scale + + def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: """ Converts bounding boxes from the COCO format to the Pascal VOC format. From c1172a3e35cfe03aa066dd766da38850c054a9a1 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 00:28:58 +0000 Subject: [PATCH 096/181] raise valueerror when using timm backbone --- src/transformers/models/vitpose/configuration_vitpose.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index bb9496564e87..60e55bf9ed4c 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -91,6 +91,9 @@ def __init__( if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") + if use_timm_backbone: + raise ValueError("Currently using timm backbone is not supported yet.") + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `VitPose` backbone.") backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_features=["stage4"]) From 16b19035c08b195a0669a3f33fd06b071aadb20b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 00:33:11 +0000 Subject: [PATCH 097/181] use out_indices --- src/transformers/models/vitpose/configuration_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 60e55bf9ed4c..880c9287a76b 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -96,7 +96,7 @@ def __init__( if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `VitPose` backbone.") - backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_features=["stage4"]) + backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_indices=["stage4"]) elif isinstance(backbone_config, dict): backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] From b4465638dec09aa7fb6992e3977ad484fd236865 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:34:10 +0900 Subject: [PATCH 098/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/models/vitpose/image_processing_vitpose.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index d135ef57fa89..8e91ca72f968 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -122,10 +122,6 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): `np.ndarray` of shape `(num_persons, num_keypoints, 2)` ): Refined coordinates. """ - if not isinstance(coords, np.ndarray): - coords = coords.cpu().numpy() - if not isinstance(batch_heatmaps, np.ndarray): - batch_heatmaps = batch_heatmaps.cpu().numpy() batch_size, num_keypoints, height, width = batch_heatmaps.shape num_coords = coords.shape[0] if not (batch_size == 1 or batch_size == num_coords): From 97ffaa7cda0cf70835e00888d3246157b919a54b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 00:36:33 +0000 Subject: [PATCH 099/181] remove camel-case of def flip_back --- .../models/vitpose/modeling_vitpose.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 5a450f9e98ad..44ba5a096097 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -129,7 +129,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No """ -def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): +def flip_back(output_flipped, flip_pairs, target_type="gaussian-heatmap"): """Flip the flipped heatmaps back to the original form. Args: @@ -137,23 +137,23 @@ def flip_back(output_flipped, flip_pairs, target_type="GaussianHeatmap"): The output heatmaps obtained from the flipped images. flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`): Pairs of keypoints which are mirrored (for example, left ear -- right ear). - target_type (`str`, *optional*, defaults to `"GaussianHeatmap"`): - Target type to use. Can be GaussianHeatmap or CombinedTarget. - GaussianHeatmap: Classification target with gaussian distribution. - CombinedTarget: The combination of classification target (response map) and regression target (offset map). + target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`): + Target type to use. Can be gaussian-heatmap or combined-target. + gaussian-heatmap: Classification target with gaussian distribution. + combined-target: The combination of classification target (response map) and regression target (offset map). Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
Returns: torch.Tensor: heatmaps that flipped back to the original image """ - if target_type not in ["GaussianHeatmap", "CombinedTarget"]: - raise ValueError("target_type should be GaussianHeatmap or CombinedTarget") + if target_type not in ["gaussian-heatmap", "combined-target"]: + raise ValueError("target_type should be gaussian-heatmap or combined-target") if output_flipped.ndim != 4: raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]") batch_size, num_keypoints, height, width = output_flipped.shape channels = 1 - if target_type.lower() == "CombinedTarget".lower(): + if target_type == "combined-target": channels = 3 output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width) From 82934faed6946327e09b28594921da71e4b6e25a Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 00:38:55 +0000 Subject: [PATCH 100/181] rename vitposeEstimatorOutput --- src/transformers/models/vitpose/modeling_vitpose.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 44ba5a096097..89a2d17022a9 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -40,7 +40,7 @@ @dataclass -class PoseEstimatorOutput(ModelOutput): +class VitPoseEstimatorOutput(ModelOutput): """ Class for outputs of pose estimation models. @@ -261,7 +261,7 @@ def __init__(self, config: ViTPoseConfig) -> None: self.post_init() @add_start_docstrings_to_model_forward(VITPOSE_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=PoseEstimatorOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=VitPoseEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.Tensor, @@ -271,7 +271,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[tuple, PoseEstimatorOutput]: + ) -> Union[tuple, VitPoseEstimatorOutput]: """ Returns: @@ -331,7 +331,7 @@ def forward( output = (heatmaps,) + outputs[2:] return ((loss,) + output) if loss is not None else output - return PoseEstimatorOutput( + return VitPoseEstimatorOutput( loss=loss, heatmaps=heatmaps, hidden_states=outputs.hidden_states, From f839073412a2b0d3c7c0af3c1084daeb4f204113 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:39:39 +0900 Subject: [PATCH 101/181] Update src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index f678ee897c0c..8536315b0ea3 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -223,7 +223,7 @@ def forward( return outputs -class ViTPoseBackboneMoEMLP(nn.Module): +class ViTPoseBackboneMoeMLP(nn.Module): def __init__(self, config: ViTPoseBackboneConfig) -> None: super().__init__() From d81e3f8207f6c9ecad3bed1de866a829a96c3c7f Mon Sep 17 00:00:00 2001 From: 
sangbumchoi Date: Wed, 4 Sep 2024 00:41:01 +0000 Subject: [PATCH 102/181] fix confused camelcase of MLP --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 8536315b0ea3..19971e42cf82 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -285,7 +285,7 @@ class ViTPoseBackboneLayer(nn.Module): def __init__(self, config: ViTPoseBackboneConfig) -> None: super().__init__() self.attention = ViTPoseBackboneAttention(config) - self.mlp = ViTPoseBackboneMLP(config) if config.num_experts == 1 else ViTPoseBackboneMoEMLP(config) + self.mlp = ViTPoseBackboneMLP(config) if config.num_experts == 1 else ViTPoseBackboneMoeMLP(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) From 0e40dc7e2997c166f218fe641a265c13864d790b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 00:50:47 +0000 Subject: [PATCH 103/181] remove in-place logic --- src/transformers/models/vitpose/image_processing_vitpose.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index b5a874fce159..c05a144fc563 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -176,8 +176,8 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): [gaussian_filter(heatmap, sigma=0.8, radius=(radius, radius), axes=(0, 1)) for heatmap in heatmaps] for heatmap in batch_heatmaps ]) - np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) - np.log(batch_heatmaps, batch_heatmaps) + batch_heatmaps = np.clip(batch_heatmaps, 0.001, 50) + batch_heatmaps = np.log(batch_heatmaps) batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode="edge").flatten() From 33a0040029af938d5e641b756a819420cccc7b33 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 01:10:31 +0000 Subject: [PATCH 104/181] clear scale description --- .../models/vitpose/image_processing_vitpose.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index c05a144fc563..b015ba5ece79 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -53,8 +53,8 @@ def box_to_center_and_scale( box: Union[Tuple, List, np.ndarray], image_width: int, image_height: int, - pixel_std: float = 200.0, - padding: float = 1.25, + normalize_factor: float = 200.0, + padding_factor: float = 1.25, ): """ Encodes a bounding box in COCO format into (center, scale). @@ -66,9 +66,9 @@ def box_to_center_and_scale( Image width. image_height (`int`): Image height. - pixel_std (`float`): + normalize_factor (`float`): Width and height scale factor. - padding (`float`): + padding_factor (`float`): Bounding box padding factor. 
Returns: @@ -87,8 +87,8 @@ def box_to_center_and_scale( elif width < aspect_ratio * height: width = height * aspect_ratio - scale = np.array([width / pixel_std, height / pixel_std], dtype=np.float32) - scale = scale * padding + scale = np.array([width / normalize_factor, height / normalize_factor], dtype=np.float32) + scale = scale * padding_factor return center, scale @@ -226,7 +226,7 @@ def transform_preds(coords, center, scale, output_size): center (`np.ndarray[2,]`): Center of the bounding box (x, y). scale (`np.ndarray[2,]`): - Scale of the bounding box wrt [width, height]. + Scale of the bounding box wrt original image of width and height. output_size (`np.ndarray[2,]): Size of the destination heatmaps in (height, width) format. @@ -553,7 +553,7 @@ def keypoints_from_heatmaps( center (np.ndarray[N, 2]): Center of the bounding box (x, y). scale (np.ndarray[N, 2]): - Scale of the bounding box wrt height/width. + Scale of the bounding box wrt original images of width and height. kernel (int, *optional*, defaults to 11): Gaussian kernel size (K) for modulation, which should match the heatmap gaussian sigma when training. K=17 for sigma=3 and k=11 for sigma=2. From 8b9d9f74220d5eda371d2d47d819cbb4566f2c80 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 11:33:32 +0000 Subject: [PATCH 105/181] make consistent batch format --- .../models/vitpose/configuration_vitpose.py | 6 ++- .../vitpose/image_processing_vitpose.py | 50 +++++++++++++------ 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 880c9287a76b..3fae857b9575 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -96,7 +96,11 @@ def __init__( if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `VitPose` backbone.") - backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_indices=["stage4"]) + backbone_config = CONFIG_MAPPING["vitpose_backbone"]( + out_indices=[ + 4, + ] + ) elif isinstance(backbone_config, dict): backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index b015ba5ece79..1ec83e0306b6 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -14,6 +14,7 @@ # limitations under the License. 
"""Image processor class for ViTPose.""" +import itertools import math from typing import Dict, List, Optional, Tuple, Union @@ -172,10 +173,12 @@ def post_dark_udp(coords, batch_heatmaps, kernel=3): if not (batch_size == 1 or batch_size == num_coords): raise ValueError("The batch size of heatmaps should be 1 or equal to the batch size of coordinates.") radius = int((kernel - 1) // 2) - batch_heatmaps = np.array([ - [gaussian_filter(heatmap, sigma=0.8, radius=(radius, radius), axes=(0, 1)) for heatmap in heatmaps] - for heatmap in batch_heatmaps - ]) + batch_heatmaps = np.array( + [ + [gaussian_filter(heatmap, sigma=0.8, radius=(radius, radius), axes=(0, 1)) for heatmap in heatmaps] + for heatmaps in batch_heatmaps + ] + ) batch_heatmaps = np.clip(batch_heatmaps, 0.001, 50) batch_heatmaps = np.log(batch_heatmaps) @@ -436,7 +439,7 @@ def preprocess( Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. - boxes (`List[List[float]]` or `np.ndarray`): + boxes (`List[List[List[float]]]` or `np.ndarray`): List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding box coordinates in COCO format (top_left_x, top_left_y, width, height). @@ -485,6 +488,13 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." ) + if isinstance(boxes, list): + if len(images) != len(boxes): + raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {len(boxes)}") + else: + if len(images) != boxes.shape[0]: + raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {boxes.shape[0]}") + # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] @@ -588,12 +598,13 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): Args: outputs (torch.Tensor): Model outputs. - boxes (torch.Tensor of shape [batch_size, 4]): - Bounding boxes. + boxes (`List[List[List[float]]]` or `np.ndarray`): + List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding + box coordinates in COCO format (top_left_x, top_left_y, width, height). kernel_size (`int`, *optional*, defaults to 11): Gaussian kernel size (K) for modulation. Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints and boxes for an image + `List[List[Dict]]`: A list of dictionaries, each dictionary containing the keypoints and boxes for an image in the batch as predicted by the model. 
""" @@ -601,9 +612,10 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): batch_size = len(outputs.heatmaps) centers = np.zeros((batch_size, 2), dtype=np.float32) scales = np.zeros((batch_size, 2), dtype=np.float32) + flattened_boxes = list(itertools.chain(*boxes)) for i in range(batch_size): width, height = self.size["width"], self.size["height"] - center, scale = box_to_center_and_scale(boxes[i], image_width=width, image_height=height) + center, scale = box_to_center_and_scale(flattened_boxes[i], image_width=width, image_height=height) centers[i, :] = center scales[i, :] = scale @@ -621,11 +633,17 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) - pose_results: List[Dict[str, torch.Tensor]] = [] - for pose, bbox_xyxy in zip(poses, bboxes_xyxy): - pose_result = {} - pose_result["keypoints"] = pose - pose_result["bbox"] = bbox_xyxy - pose_results.append(pose_result) + results: List[List[Dict[str, torch.Tensor]]] = [] + + pose_bbox_pairs = zip(poses, bboxes_xyxy) + + for batch_bbox in boxes: + batch_results: List[Dict[str, torch.Tensor]] = [] + for _ in batch_bbox: + # Unpack the next pose and bbox_xyxy from the iterator + pose, bbox_xyxy = next(pose_bbox_pairs) + pose_result = {"keypoints": pose, "bbox": bbox_xyxy} + batch_results.append(pose_result) + results.append(batch_results) - return pose_results + return results From 20df85b4a01ff1d5baba488f9fa17e8b010c0221 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 11:40:49 +0000 Subject: [PATCH 106/181] docs update --- docs/source/en/model_doc/vitpose.md | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 864874873130..1dde9485b148 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -51,30 +51,15 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo >>> with torch.no_grad(): ... outputs = model(pixel_values) ->>> pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes[0]) +>>> pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] >>> for pose_result in pose_results: ... for keypoint in pose_result['keypoints']: ... x, y, score = keypoint ... 
print(f"coordinate : [{x}, {y}], score : {score}") -coordinate : [428.25335693359375, 170.24496459960938], score : 0.8717536330223083 -coordinate : [429.13037109375, 167.39605712890625], score : 0.8820509910583496 -coordinate : [428.23681640625, 167.72825622558594], score : 0.7663289308547974 -coordinate : [433.1866455078125, 167.2566680908203], score : 0.933370053768158 -coordinate : [440.34075927734375, 166.58522033691406], score : 0.8911094069480896 -coordinate : [439.90283203125, 177.54049682617188], score : 0.9118685722351074 -coordinate : [445.50372314453125, 178.04055786132812], score : 0.751734733581543 -coordinate : [436.45819091796875, 199.42474365234375], score : 0.8745120167732239 -coordinate : [433.68255615234375, 200.17333984375], score : 0.5155676603317261 -coordinate : [430.5008544921875, 218.7760009765625], score : 0.8757728338241577 -coordinate : [420.5921630859375, 213.15621948242188], score : 0.9036439657211304 -coordinate : [445.17218017578125, 222.87921142578125], score : 0.8029380440711975 -coordinate : [452.07672119140625, 222.17730712890625], score : 0.8517846465110779 -coordinate : [441.92657470703125, 255.0374755859375], score : 0.8607744574546814 -coordinate : [451.2308349609375, 254.36398315429688], score : 0.8495950698852539 -coordinate : [443.9051513671875, 287.5822448730469], score : 0.703719437122345 -coordinate : [455.88482666015625, 285.6434631347656], score : 0.8391701579093933 ``` +drawing + ## ViTPoseImageProcessor From 2f3d6df45447a84c9c26377c3ef5fc8ef9daffdc Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 11:46:33 +0000 Subject: [PATCH 107/181] formatting docstring --- .../vitpose/image_processing_vitpose.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 1ec83e0306b6..d3b2b0ab3005 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -124,9 +124,9 @@ def get_keypoint_predictions(heatmaps): Returns: tuple: A tuple containing aggregated results. - - coords (np.ndarray[N, K, 2]): + - coords (`np.ndarray` of shape `(batch_size, num_keypoints, 2)`): Predicted keypoint location. - - scores (np.ndarray[N, K, 1]): + - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`): Scores (confidence) of the keypoints. """ if not isinstance(heatmaps, np.ndarray): @@ -219,18 +219,18 @@ def transform_preds(coords, center, scale, output_size): num_keypoints: K Args: - coords (`np.ndarray[K, ndims]`): + coords (`np.ndarray` of shape `(num_keypoints, ndims)`): * If ndims=2, corrds are predicted keypoint location. * If ndims=4, corrds are composed of (x, y, scores, tags) * If ndims=5, corrds are composed of (x, y, scores, tags, flipped_tags) - center (`np.ndarray[2,]`): + center (`np.ndarray` of shape `(2,)`): Center of the bounding box (x, y). - scale (`np.ndarray[2,]`): + scale (`np.ndarray` of shape `(2,)`): Scale of the bounding box wrt original image of width and height. - output_size (`np.ndarray[2,]): + output_size (`np.ndarray` of shape `(2,)`): Size of the destination heatmaps in (height, width) format. Returns: @@ -560,9 +560,9 @@ def keypoints_from_heatmaps( Args: heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width])`): Model predicted heatmaps. - center (np.ndarray[N, 2]): + center (`np.ndarray` of shape `(batch_size, 2)`): Center of the bounding box (x, y). 
- scale (np.ndarray[N, 2]): + scale (`np.ndarray` of shape `(batch_size, 2)`): Scale of the bounding box wrt original images of width and height. kernel (int, *optional*, defaults to 11): Gaussian kernel size (K) for modulation, which should match the heatmap gaussian sigma when training. @@ -571,9 +571,9 @@ def keypoints_from_heatmaps( Returns: tuple: A tuple containing keypoint predictions and scores. - - preds (np.ndarray[batch_size, num_keypoints, 2]): + - preds (`np.ndarray` of shape `(batch_size, num_keypoints, 2)`): Predicted keypoint location in images. - - scores (np.ndarray[batch_size, num_keypoints, 1]): + - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`): Scores (confidence) of the keypoints. """ # Avoid mutation From 3b04bc77aea35ac32b96926c1643a82aae30f97d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 4 Sep 2024 11:55:50 +0000 Subject: [PATCH 108/181] add batch tests --- tests/models/vitpose/test_modeling_vitpose.py | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 8a28bef8f122..a1a86af64845 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -260,7 +260,7 @@ def test_inference_pose_estimation(self): assert torch.allclose(heatmaps[0, 0, :3, :3], expected_slice, atol=1e-4) - pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes) + pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] expected_bbox = torch.tensor([439.3250, 226.6150, 438.9719, 226.4776, 22320.4219, 0.0000]).to(torch_device) expected_keypoints = torch.tensor( @@ -277,4 +277,45 @@ def test_inference_pose_estimation(self): @slow def test_batched_inference(self): - raise NotImplementedError("To do") + image_processor = self.default_image_processor + # TODO update organization + model = ViTPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") + + image = prepare_img() + boxes = [ + [[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]], + [[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]], + ] + + inputs = image_processor(images=[image, image], boxes=boxes, return_tensors="pt") + + outputs = model(**inputs) + heatmaps = outputs.heatmaps + + assert heatmaps.shape == (4, 17, 64, 48) + + expected_slice = torch.tensor( + [ + [9.9330e-06, 9.9330e-06, 9.9330e-06], + [9.9330e-06, 9.9330e-06, 9.9330e-06], + [9.9330e-06, 9.9330e-06, 9.9330e-06], + ] + ) + + assert torch.allclose(heatmaps[0, 0, :3, :3], expected_slice, atol=1e-4) + + pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes) + + expected_bbox = torch.tensor([439.3250, 226.6150, 438.9719, 226.4776, 22320.4219, 0.0000]).to(torch_device) + expected_keypoints = torch.tensor( + [ + [3.9813e02, 1.8184e02, 8.7529e-01], + [3.9828e02, 1.7981e02, 8.4315e-01], + [3.9596e02, 1.7948e02, 9.2678e-01], + ] + ).to(torch_device) + + self.assertEqual(len(pose_results), 2) + self.assertEqual(len(pose_results[0]), 2) + self.assertTrue(torch.allclose(pose_results[0][0]["bbox"], expected_bbox, atol=1e-4)) + self.assertTrue(torch.allclose(pose_results[0][0]["keypoints"], expected_keypoints, atol=1e-4)) From c880093498987968d28c3d7d089390c4233fa802 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Thu, 5 Sep 2024 00:18:41 +0000 Subject: [PATCH 109/181] test docs change --- docs/source/en/model_doc/vitpose.md | 1 - 1 file changed, 1 deletion(-) diff 
--git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 1dde9485b148..9899e81bcaf8 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -61,7 +61,6 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo drawing - ## ViTPoseImageProcessor [[autodoc]] ViTPoseImageProcessor From cbbb966163f1f7f2ca26e71895a8ae3244985607 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Fri, 6 Sep 2024 15:24:45 +0900 Subject: [PATCH 110/181] Update src/transformers/models/vitpose/image_processing_vitpose.py --- src/transformers/models/vitpose/image_processing_vitpose.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index d3b2b0ab3005..044930d5d860 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -298,9 +298,9 @@ def get_warp_matrix(theta: float, size_input: np.ndarray, size_dst: np.ndarray, def scipy_warp_affine(src, M, size): """ - This function implements cv2.warpAffine used in the original implementation using scipy. + This function implements cv2.warpAffine function using affine_transform in scipy. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.affine_transform.html and https://docs.opencv.org/4.x/d4/d61/tutorial_warp_affine.html for more details. - Note: the original implementation uses cv2.INTER_LINEAR. + Note: the original implementation of cv2.warpAffine uses cv2.INTER_LINEAR. """ channels = [src[..., i] for i in range(src.shape[-1])] From f0e0f790adcd3a9d74723c5c1bb9009bc37aade1 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Fri, 6 Sep 2024 15:24:55 +0900 Subject: [PATCH 111/181] Update src/transformers/models/vitpose/configuration_vitpose.py --- src/transformers/models/vitpose/configuration_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 3fae857b9575..c29538e86653 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -52,7 +52,7 @@ class ViTPoseConfig(PretrainedConfig): scale_factor (`int`, *optional*, defaults to 4): Factor to upscale the feature maps coming from the ViT backbone. use_simple_decoder (`bool`, *optional*, defaults to `True`): - Whether to use a simple decoder to decode the feature maps from the backbone into heatmaps. + Whether to use a `VitPoseSimpleDecoder` to decode the feature maps from the backbone into heatmaps. Otherwise it uses `VitPoseClassicDecoder`. 
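A minimal sketch of toggling between the two decoder heads via the config (using the `VitPose*` class names that the following patch settles on; defaults are assumed for the backbone):

```py
from transformers import VitPoseBackboneConfig, VitPoseConfig, VitPoseForPoseEstimation

# Default plain-ViT backbone configuration; adjust hidden size / depth as needed.
backbone_config = VitPoseBackboneConfig()

# use_simple_decoder=True  -> simple head (upsample + single 3x3 conv)
# use_simple_decoder=False -> classic head (two deconvolution blocks + 1x1 conv)
config = VitPoseConfig(backbone_config=backbone_config, use_simple_decoder=False)

model = VitPoseForPoseEstimation(config)  # randomly initialized
```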
Example: From 50294f6d0b242919762d589c37b53156fb4541e6 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 6 Sep 2024 06:44:50 +0000 Subject: [PATCH 112/181] chagne ViT to Vit --- docs/source/en/model_doc/vitpose.md | 24 ++--- src/transformers/__init__.py | 26 +++--- .../models/auto/configuration_auto.py | 8 +- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/vitpose/__init__.py | 16 ++-- .../models/vitpose/configuration_vitpose.py | 18 ++-- .../models/vitpose/convert_vitpose_to_hf.py | 18 ++-- .../vitpose/image_processing_vitpose.py | 6 +- .../models/vitpose/modeling_vitpose.py | 32 +++---- .../models/vitpose_backbone/__init__.py | 12 +-- .../configuration_vitpose_backbone.py | 18 ++-- .../modeling_vitpose_backbone.py | 90 +++++++++---------- src/transformers/utils/dummy_pt_objects.py | 8 +- .../utils/dummy_vision_objects.py | 2 +- .../vitpose/test_image_processing_vitpose.py | 10 +-- tests/models/vitpose/test_modeling_vitpose.py | 52 +++++------ .../test_modeling_vitpose_backbone.py | 48 +++++----- utils/check_copies.py | 2 +- utils/check_repo.py | 6 +- 19 files changed, 199 insertions(+), 199 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 9899e81bcaf8..14b123542b25 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -10,11 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# ViTPose +# VitPose ## Overview -The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](https://arxiv.org/pdf/2010.11929v2) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. +The VitPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. VitPose employs a standard, non-hierarchical [Vision Transformer](https://arxiv.org/pdf/2010.11929v2) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. The abstract from the paper is the following: @@ -29,20 +29,20 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo - To enable MoE (Mixture of Experts) function in the backbone, the user has to give appropriate input indices to the backbone model. However, it is not used in default parameters. - The current model utilizes a 2-step inference pipeline. The first step involves placing a bounding box around the region corresponding to the person. - After that, the second step uses ViTPose to predict the keypoints. + After that, the second step uses VitPose to predict the keypoints. 
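For the first step of that pipeline, any off-the-shelf person detector can supply the boxes that the example below hard-codes; one possible sketch, using DETR purely as an illustration (the detector choice is not part of this PR):

```py
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, DetrForObjectDetection

url = "http://images.cocodataset.org/val2017/000000000139.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Any person detector works for step 1; DETR is used here only as an illustration.
detector_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
detector = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

detector_inputs = detector_processor(images=image, return_tensors="pt")
with torch.no_grad():
    detections = detector_processor.post_process_object_detection(
        detector(**detector_inputs), target_sizes=[image.size[::-1]], threshold=0.9
    )[0]

# Keep "person" detections and convert (x1, y1, x2, y2) -> COCO (x, y, w, h),
# the box format expected by the pose image processor.
person_boxes = []
for label, box in zip(detections["labels"], detections["boxes"]):
    if detector.config.id2label[label.item()] == "person":
        x1, y1, x2, y2 = box.tolist()
        person_boxes.append([x1, y1, x2 - x1, y2 - y1])

boxes = [person_boxes]  # one list of person boxes per image
```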
```py >>> import torch >>> import requests >>> from PIL import Image ->>> from transformers import ViTPoseImageProcessor, ViTPoseForPoseEstimation +>>> from transformers import VitPoseImageProcessor, VitPoseForPoseEstimation >>> url = 'http://images.cocodataset.org/val2017/000000000139.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) ->>> image_processor = ViTPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") ->>> model = ViTPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") +>>> image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") +>>> model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] @@ -61,16 +61,16 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo drawing -## ViTPoseImageProcessor +## VitPoseImageProcessor -[[autodoc]] ViTPoseImageProcessor +[[autodoc]] VitPoseImageProcessor - preprocess -## ViTPoseConfig +## VitPoseConfig -[[autodoc]] ViTPoseConfig +[[autodoc]] VitPoseConfig -## ViTPoseForPoseEstimation +## VitPoseForPoseEstimation -[[autodoc]] ViTPoseForPoseEstimation +[[autodoc]] VitPoseForPoseEstimation - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5effe9de16d8..8a3e4eab6f42 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -791,8 +791,8 @@ "models.vit_msn": ["ViTMSNConfig"], "models.vitdet": ["VitDetConfig"], "models.vitmatte": ["VitMatteConfig"], - "models.vitpose": ["ViTPoseConfig"], - "models.vitpose_backbone": ["ViTPoseBackboneConfig"], + "models.vitpose": ["VitPoseConfig"], + "models.vitpose_backbone": ["VitPoseBackboneConfig"], "models.vits": [ "VitsConfig", "VitsTokenizer", @@ -1212,7 +1212,7 @@ _import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"]) _import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"]) _import_structure["models.vitmatte"].append("VitMatteImageProcessor") - _import_structure["models.vitpose"].append("ViTPoseImageProcessor") + _import_structure["models.vitpose"].append("VitPoseImageProcessor") _import_structure["models.vivit"].append("VivitImageProcessor") _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) _import_structure["models.zoedepth"].append("ZoeDepthImageProcessor") @@ -3547,14 +3547,14 @@ ) _import_structure["models.vitpose"].extend( [ - "ViTPoseForPoseEstimation", - "ViTPosePreTrainedModel", + "VitPoseForPoseEstimation", + "VitPosePreTrainedModel", ] ) _import_structure["models.vitpose_backbone"].extend( [ - "ViTPoseBackbone", - "ViTPoseBackbonePreTrainedModel", + "VitPoseBackbone", + "VitPoseBackbonePreTrainedModel", ] ) _import_structure["models.vits"].extend( @@ -5622,8 +5622,8 @@ from .models.vit_msn import ViTMSNConfig from .models.vitdet import VitDetConfig from .models.vitmatte import VitMatteConfig - from .models.vitpose import ViTPoseConfig - from .models.vitpose_backbone import ViTPoseBackboneConfig + from .models.vitpose import VitPoseConfig + from .models.vitpose_backbone import VitPoseBackboneConfig from .models.vits import ( VitsConfig, VitsTokenizer, @@ -6049,7 +6049,7 @@ from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor from .models.vit import ViTFeatureExtractor, ViTImageProcessor from .models.vitmatte import VitMatteImageProcessor - from .models.vitpose import 
ViTPoseImageProcessor + from .models.vitpose import VitPoseImageProcessor from .models.vivit import VivitImageProcessor from .models.yolos import YolosFeatureExtractor, YolosImageProcessor from .models.zoedepth import ZoeDepthImageProcessor @@ -7916,10 +7916,10 @@ VitMattePreTrainedModel, ) from .models.vitpose import ( - ViTPoseForPoseEstimation, - ViTPosePreTrainedModel, + VitPoseForPoseEstimation, + VitPosePreTrainedModel, ) - from .models.vitpose_backbone import ViTPoseBackbone, ViTPoseBackbonePreTrainedModel + from .models.vitpose_backbone import VitPoseBackbone, VitPoseBackbonePreTrainedModel from .models.vits import ( VitsModel, VitsPreTrainedModel, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 65e2327c9eb2..f6e60b384db7 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -285,8 +285,8 @@ ("vit_msn", "ViTMSNConfig"), ("vitdet", "VitDetConfig"), ("vitmatte", "VitMatteConfig"), - ("vitpose", "ViTPoseConfig"), - ("vitpose_backbone", "ViTPoseBackboneConfig"), + ("vitpose", "VitPoseConfig"), + ("vitpose_backbone", "VitPoseBackboneConfig"), ("vits", "VitsConfig"), ("vivit", "VivitConfig"), ("wav2vec2", "Wav2Vec2Config"), @@ -593,8 +593,8 @@ ("vit_msn", "ViTMSN"), ("vitdet", "VitDet"), ("vitmatte", "ViTMatte"), - ("vitpose", "ViTPose"), - ("vitpose_backbone", "ViTPoseBackbone"), + ("vitpose", "VitPose"), + ("vitpose_backbone", "VitPoseBackbone"), ("vits", "VITS"), ("vivit", "ViViT"), ("wav2vec2", "Wav2Vec2"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 655594acdd23..18df8918418e 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1298,7 +1298,7 @@ ("swinv2", "Swinv2Backbone"), ("timm_backbone", "TimmBackbone"), ("vitdet", "VitDetBackbone"), - ("vitpose_backbone", "ViTPoseBackbone"), + ("vitpose_backbone", "VitPoseBackbone"), ] ) diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py index 9c32191cdcfe..3ca6860da1b5 100644 --- a/src/transformers/models/vitpose/__init__.py +++ b/src/transformers/models/vitpose/__init__.py @@ -20,7 +20,7 @@ from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available -_import_structure = {"configuration_vitpose": ["ViTPoseConfig"]} +_import_structure = {"configuration_vitpose": ["VitPoseConfig"]} try: @@ -29,7 +29,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["image_processing_vitpose"] = ["ViTPoseImageProcessor"] + _import_structure["image_processing_vitpose"] = ["VitPoseImageProcessor"] try: @@ -39,12 +39,12 @@ pass else: _import_structure["modeling_vitpose"] = [ - "ViTPosePreTrainedModel", - "ViTPoseForPoseEstimation", + "VitPosePreTrainedModel", + "VitPoseForPoseEstimation", ] if TYPE_CHECKING: - from .configuration_vitpose import ViTPoseConfig + from .configuration_vitpose import VitPoseConfig try: if not is_vision_available(): @@ -52,7 +52,7 @@ except OptionalDependencyNotAvailable: pass else: - from .image_processing_vitpose import ViTPoseImageProcessor + from .image_processing_vitpose import VitPoseImageProcessor try: if not is_torch_available(): @@ -61,8 +61,8 @@ pass else: from .modeling_vitpose import ( - ViTPoseForPoseEstimation, - ViTPosePreTrainedModel, + VitPoseForPoseEstimation, + VitPosePreTrainedModel, ) else: diff --git 
a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index c29538e86653..7950c608b2a2 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""ViTPose model configuration""" +"""VitPose model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -22,11 +22,11 @@ logger = logging.get_logger(__name__) -class ViTPoseConfig(PretrainedConfig): +class VitPoseConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTPoseForPoseEstimation`]. It is used to instantiate a - ViTPose model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the ViTPose + This is the configuration class to store the configuration of a [`VitPoseForPoseEstimation`]. It is used to instantiate a + VitPose model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the VitPose [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -57,13 +57,13 @@ class ViTPoseConfig(PretrainedConfig): Example: ```python - >>> from transformers import ViTPoseConfig, ViTPoseForPoseEstimation + >>> from transformers import VitPoseConfig, VitPoseForPoseEstimation - >>> # Initializing a ViTPose configuration - >>> configuration = ViTPoseConfig() + >>> # Initializing a VitPose configuration + >>> configuration = VitPoseConfig() >>> # Initializing a model (with random weights) from the configuration - >>> model = ViTPoseForPoseEstimation(configuration) + >>> model = VitPoseForPoseEstimation(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 34e7645c2556..d20f709f1b85 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert ViTPose checkpoints from the original repository. +"""Convert VitPose checkpoints from the original repository. 
URL: https://github.com/vitae-transformer/vitpose """ @@ -26,7 +26,7 @@ from huggingface_hub import hf_hub_download from PIL import Image -from transformers import ViTPoseBackboneConfig, ViTPoseConfig, ViTPoseForPoseEstimation, ViTPoseImageProcessor +from transformers import VitPoseBackboneConfig, VitPoseConfig, VitPoseForPoseEstimation, VitPoseImageProcessor from transformers.models.vitpose.image_processing_vitpose import coco_to_pascal_voc @@ -71,7 +71,7 @@ def get_config(model_name): num_experts = 6 if "+" in model_name else 1 part_features = 192 if "+" in model_name else 0 - backbone_config = ViTPoseBackboneConfig(out_indices=[12], num_experts=num_experts, part_features=part_features) + backbone_config = VitPoseBackboneConfig(out_indices=[12], num_experts=num_experts, part_features=part_features) # size of the architecture if "small" in model_name: backbone_config.hidden_size = 768 @@ -91,7 +91,7 @@ def get_config(model_name): use_simple_decoder = "simple" in model_name - config = ViTPoseConfig( + config = VitPoseConfig( backbone_config=backbone_config, num_labels=17, use_simple_decoder=use_simple_decoder, @@ -186,14 +186,14 @@ def prepare_img(): @torch.no_grad() def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): """ - Copy/paste/tweak model's weights to our ViTPose structure. + Copy/paste/tweak model's weights to our VitPose structure. """ - # define default ViTPose configuration + # define default VitPose configuration config = get_config(model_name) # load HuggingFace model - model = ViTPoseForPoseEstimation(config) + model = VitPoseForPoseEstimation(config) model.eval() # load original state_dict @@ -217,7 +217,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub assert "associate_heads" in key # create image processor - image_processor = ViTPoseImageProcessor() + image_processor = VitPoseImageProcessor() # verify image processor image = prepare_img() @@ -319,7 +319,7 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub default="vitpose-base-simple", choices=model_name_to_file_name.keys(), type=str, - help="Name of the ViTPose model you'd like to convert.", + help="Name of the VitPose model you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 044930d5d860..bc56ee2a161c 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Image processor class for ViTPose.""" +"""Image processor class for VitPose.""" import itertools import math @@ -322,9 +322,9 @@ def scipy_warp_affine(src, M, size): return new_src -class ViTPoseImageProcessor(BaseImageProcessor): +class VitPoseImageProcessor(BaseImageProcessor): r""" - Constructs a ViTPose image processor. + Constructs a VitPose image processor. 
Args: do_affine_transform (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 89a2d17022a9..e721bc1a0433 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch ViTPose model.""" +"""PyTorch VitPose model.""" from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -30,13 +30,13 @@ replace_return_docstrings, ) from ...utils.backbone_utils import load_backbone -from .configuration_vitpose import ViTPoseConfig +from .configuration_vitpose import VitPoseConfig logger = logging.get_logger(__name__) # General docstring -_CONFIG_FOR_DOC = "ViTPoseConfig" +_CONFIG_FOR_DOC = "VitPoseConfig" @dataclass @@ -67,13 +67,13 @@ class VitPoseEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -class ViTPosePreTrainedModel(PreTrainedModel): +class VitPosePreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = ViTPoseConfig + config_class = VitPoseConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True @@ -99,7 +99,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No behavior. Parameters: - config ([`ViTPoseConfig`]): Model configuration class with all the parameters of the model. + config ([`VitPoseConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -107,8 +107,8 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No VITPOSE_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`ViTPoseImageProcessor`]. See - [`ViTPoseImageProcessor.__call__`] for details. + Pixel values. Pixel values can be obtained using [`VitPoseImageProcessor`]. See + [`VitPoseImageProcessor.__call__`] for details. dataset_index (`torch.Tensor` of shape `(batch_size,)`): Index to use in the Mixture-of-Experts (MoE) blocks of the backbone. @@ -169,7 +169,7 @@ def flip_back(output_flipped, flip_pairs, target_type="gaussian-heatmap"): return output_flipped_back -class ViTPoseSimpleDecoder(nn.Module): +class VitPoseSimpleDecoder(nn.Module): """ Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the feature maps into heatmaps. @@ -196,7 +196,7 @@ def forward(self, hidden_state, flip_pairs) -> torch.Tensor: return heatmaps -class ViTPoseClassicDecoder(nn.Module): +class VitPoseClassicDecoder(nn.Module): """ Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer, turning the feature maps into heatmaps. 
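The two decoder heads renamed in the hunks above are described only in words by their docstrings. Purely as an illustration (not the module defined in this patch; the class name, argument names and default sizes below are assumptions), the simple head's "ReLU activation, 4x upsampling and a 3x3 convolution" can be sketched as follows:

```python
import torch
from torch import nn


class SimpleDecoderSketch(nn.Module):
    """Illustrative sketch only: hidden size and keypoint count are assumed values."""

    def __init__(self, backbone_hidden_size: int = 768, num_keypoints: int = 17):
        super().__init__()
        self.activation = nn.ReLU()
        # 4x bilinear upsampling of the backbone's patch feature map
        self.upsampling = nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False)
        # 3x3 convolution producing one heatmap per keypoint
        self.conv = nn.Conv2d(backbone_hidden_size, num_keypoints, kernel_size=3, padding=1)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        # hidden_state: (batch_size, hidden_size, height, width)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.upsampling(hidden_state)
        return self.conv(hidden_state)


# a 16x12 patch grid becomes 17 heatmaps of size 64x48
print(SimpleDecoderSketch()(torch.randn(1, 768, 16, 12)).shape)  # torch.Size([1, 17, 64, 48])
```

The classic variant swaps the fixed upsampling for two learned deconvolution blocks before a final 1x1 convolution, as its docstring above states.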
@@ -235,11 +235,11 @@ def forward(self, hidden_state, flip_pairs): @add_start_docstrings( - "The ViTPose model with a pose estimation head on top.", + "The VitPose model with a pose estimation head on top.", VITPOSE_START_DOCSTRING, ) -class ViTPoseForPoseEstimation(ViTPosePreTrainedModel): - def __init__(self, config: ViTPoseConfig) -> None: +class VitPoseForPoseEstimation(VitPosePreTrainedModel): + def __init__(self, config: VitPoseConfig) -> None: super().__init__(config) self.backbone = load_backbone(config) @@ -255,7 +255,7 @@ def __init__(self, config: ViTPoseConfig) -> None: config.backbone_hidden_size = self.backbone.config.hidden_size config.image_size = self.backbone.config.image_size config.patch_size = self.backbone.config.patch_size - self.head = ViTPoseSimpleDecoder(config) if config.use_simple_decoder else ViTPoseClassicDecoder(config) + self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config) # Initialize weights and apply final processing self.post_init() @@ -278,13 +278,13 @@ def forward( Examples: ```python - >>> from transformers import AutoImageProcessor, ViTPoseForPoseEstimation + >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation >>> import torch >>> from PIL import Image >>> import requests >>> processor = AutoImageProcessor.from_pretrained("") - >>> model = ViTPoseForPoseEstimation.from_pretrained("") + >>> model = VitPoseForPoseEstimation.from_pretrained("") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) diff --git a/src/transformers/models/vitpose_backbone/__init__.py b/src/transformers/models/vitpose_backbone/__init__.py index d45d242d6c6e..f69ff8762af0 100644 --- a/src/transformers/models/vitpose_backbone/__init__.py +++ b/src/transformers/models/vitpose_backbone/__init__.py @@ -20,7 +20,7 @@ from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available -_import_structure = {"configuration_vitpose_backbone": ["ViTPoseBackboneConfig"]} +_import_structure = {"configuration_vitpose_backbone": ["VitPoseBackboneConfig"]} try: @@ -30,12 +30,12 @@ pass else: _import_structure["modeling_vitpose_backbone"] = [ - "ViTPoseBackbonePreTrainedModel", - "ViTPoseBackbone", + "VitPoseBackbonePreTrainedModel", + "VitPoseBackbone", ] if TYPE_CHECKING: - from .configuration_vitpose_backbone import ViTPoseBackboneConfig + from .configuration_vitpose_backbone import VitPoseBackboneConfig try: if not is_torch_available(): @@ -44,8 +44,8 @@ pass else: from .modeling_vitpose_backbone import ( - ViTPoseBackbone, - ViTPoseBackbonePreTrainedModel, + VitPoseBackbone, + VitPoseBackbonePreTrainedModel, ) else: diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 705b72c890e0..962dc6055764 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""ViTPose backbone configuration""" +"""VitPose backbone configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -22,11 +22,11 @@ logger = logging.get_logger(__name__) -class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): +class VitPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTPoseBackbone`]. It is used to instantiate a - ViTPose model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the ViTPose + This is the configuration class to store the configuration of a [`VitPoseBackbone`]. It is used to instantiate a + VitPose model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the VitPose [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -78,13 +78,13 @@ class ViTPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): Example: ```python - >>> from transformers import ViTPoseBackboneConfig, ViTPoseBackbone + >>> from transformers import VitPoseBackboneConfig, VitPoseBackbone - >>> # Initializing a ViTPose configuration - >>> configuration = ViTPoseBackboneConfig() + >>> # Initializing a VitPose configuration + >>> configuration = VitPoseBackboneConfig() >>> # Initializing a model (with random weights) from the configuration - >>> model = ViTPoseBackbone(configuration) + >>> model = VitPoseBackbone(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 19971e42cf82..18f036811762 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch ViTPose backbone model. +"""PyTorch VitPose backbone model. This code is the same as the original Vision Transformer (ViT) with 2 modifications: - use of padding=2 in the patch embedding layer @@ -38,16 +38,16 @@ replace_return_docstrings, ) from ...utils.backbone_utils import BackboneMixin -from .configuration_vitpose_backbone import ViTPoseBackboneConfig +from .configuration_vitpose_backbone import VitPoseBackboneConfig logger = logging.get_logger(__name__) # General docstring -_CONFIG_FOR_DOC = "ViTPoseBackboneConfig" +_CONFIG_FOR_DOC = "VitPoseBackboneConfig" -class ViTPoseBackbonePatchEmbeddings(nn.Module): +class VitPoseBackbonePatchEmbeddings(nn.Module): """Image to Patch Embedding.""" def __init__(self, config): @@ -79,15 +79,15 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -class ViTPoseBackboneEmbeddings(nn.Module): +class VitPoseBackboneEmbeddings(nn.Module): """ Construct the position and patch embeddings. 
""" - def __init__(self, config: ViTPoseBackboneConfig) -> None: + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() - self.patch_embeddings = ViTPoseBackbonePatchEmbeddings(config) + self.patch_embeddings = VitPoseBackbonePatchEmbeddings(config) num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -103,9 +103,9 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTPoseBackbone -class ViTPoseBackboneSelfAttention(nn.Module): - def __init__(self, config: ViTPoseBackboneConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->VitPoseBackbone +class VitPoseBackboneSelfAttention(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -164,14 +164,14 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTPoseBackbone -class ViTPoseBackboneSelfOutput(nn.Module): +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->VitPoseBackbone +class VitPoseBackboneSelfOutput(nn.Module): """ - The residual connection is defined in ViTPoseBackboneLayer instead of here (as is the case with other models), due to the + The residual connection is defined in VitPoseBackboneLayer instead of here (as is the case with other models), due to the layernorm applied before each block. """ - def __init__(self, config: ViTPoseBackboneConfig) -> None: + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -183,12 +183,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTPoseBackbone -class ViTPoseBackboneAttention(nn.Module): - def __init__(self, config: ViTPoseBackboneConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->VitPoseBackbone +class VitPoseBackboneAttention(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() - self.attention = ViTPoseBackboneSelfAttention(config) - self.output = ViTPoseBackboneSelfOutput(config) + self.attention = VitPoseBackboneSelfAttention(config) + self.output = VitPoseBackboneSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads: Set[int]) -> None: @@ -223,8 +223,8 @@ def forward( return outputs -class ViTPoseBackboneMoeMLP(nn.Module): - def __init__(self, config: ViTPoseBackboneConfig) -> None: +class VitPoseBackboneMoeMLP(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() in_features = out_features = config.hidden_size @@ -262,8 +262,8 @@ def forward(self, hidden_state, indices): return hidden_state -class ViTPoseBackboneMLP(nn.Module): - def __init__(self, config: ViTPoseBackboneConfig) -> None: +class VitPoseBackboneMLP(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() in_features = out_features = config.hidden_size hidden_features = int(config.hidden_size * config.mlp_ratio) @@ -281,11 
+281,11 @@ def forward(self, hidden_state: torch.Tensor, indices=None) -> torch.Tensor: return hidden_state -class ViTPoseBackboneLayer(nn.Module): - def __init__(self, config: ViTPoseBackboneConfig) -> None: +class VitPoseBackboneLayer(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() - self.attention = ViTPoseBackboneAttention(config) - self.mlp = ViTPoseBackboneMLP(config) if config.num_experts == 1 else ViTPoseBackboneMoeMLP(config) + self.attention = VitPoseBackboneAttention(config) + self.mlp = VitPoseBackboneMLP(config) if config.num_experts == 1 else VitPoseBackboneMoeMLP(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -297,7 +297,7 @@ def forward( output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: self_attention_outputs = self.attention( - self.layernorm_before(hidden_states), # in ViTPoseBackbone, layernorm is applied before self-attention + self.layernorm_before(hidden_states), # in VitPoseBackbone, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, ) @@ -318,12 +318,12 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTPoseBackbone -class ViTPoseBackboneEncoder(nn.Module): - def __init__(self, config: ViTPoseBackboneConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->VitPoseBackbone +class VitPoseBackboneEncoder(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() self.config = config - self.layer = nn.ModuleList([ViTPoseBackboneLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([VitPoseBackboneLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False # Ignore copy @@ -372,17 +372,17 @@ def forward( ) -class ViTPoseBackbonePreTrainedModel(PreTrainedModel): +class VitPoseBackbonePreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = ViTPoseBackboneConfig + config_class = VitPoseBackboneConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True - _no_split_modules = ["ViTPoseEmbeddings", "ViTPoseLayer"] + _no_split_modules = ["VitPoseEmbeddings", "VitPoseLayer"] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" @@ -397,7 +397,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - elif isinstance(module, ViTPoseBackboneEmbeddings): + elif isinstance(module, VitPoseBackboneEmbeddings): module.position_embeddings.data = nn.init.trunc_normal_( module.position_embeddings.data.to(torch.float32), mean=0.0, @@ -411,7 +411,7 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No behavior. Parameters: - config ([`ViTPoseBackboneConfig`]): Model configuration class with all the parameters of the model. + config ([`VitPoseBackboneConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -444,17 +444,17 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No @add_start_docstrings( - "The ViTPose backbone useful for downstream tasks.", + "The VitPose backbone useful for downstream tasks.", VITPOSE_BACKBONE_START_DOCSTRING, ) -class ViTPoseBackbone(ViTPoseBackbonePreTrainedModel, BackboneMixin): - def __init__(self, config: ViTPoseBackboneConfig): +class VitPoseBackbone(VitPoseBackbonePreTrainedModel, BackboneMixin): + def __init__(self, config: VitPoseBackboneConfig): super().__init__(config) super()._init_backbone(config) self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] - self.embeddings = ViTPoseBackboneEmbeddings(config) - self.encoder = ViTPoseBackboneEncoder(config) + self.embeddings = VitPoseBackboneEmbeddings(config) + self.encoder = VitPoseBackboneEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -478,11 +478,11 @@ def forward( Examples: ```python - >>> from transformers import ViTPoseBackboneConfig, ViTPoseBackbone + >>> from transformers import VitPoseBackboneConfig, VitPoseBackbone >>> import torch - >>> config = ViTPoseBackboneConfig(out_indices=[-1]) - >>> model = ViTPoseBackbone(config) + >>> config = VitPoseBackboneConfig(out_indices=[-1]) + >>> model = VitPoseBackbone(config) >>> pixel_values = torch.randn(1, 3, 256, 192) >>> dataset_index = torch.tensor([1]) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7c64e0df99fe..c192146f94b1 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -9295,28 +9295,28 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTPoseForPoseEstimation(metaclass=DummyObject): +class VitPoseForPoseEstimation(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTPosePreTrainedModel(metaclass=DummyObject): +class VitPosePreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTPoseBackbone(metaclass=DummyObject): +class VitPoseBackbone(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTPoseBackbonePreTrainedModel(metaclass=DummyObject): +class VitPoseBackbonePreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index ed2b4209f8ab..4d8ba6e8f3b0 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -646,7 +646,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ViTPoseImageProcessor(metaclass=DummyObject): +class VitPoseImageProcessor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index aa4198081448..5edf27e6a69a 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -31,10 +31,10 @@ if is_vision_available(): from PIL import Image - from transformers import ViTPoseImageProcessor + from 
transformers import VitPoseImageProcessor -class ViTPoseImageProcessingTester(unittest.TestCase): +class VitPoseImageProcessingTester(unittest.TestCase): def __init__( self, parent, @@ -94,12 +94,12 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision -class ViTPoseImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = ViTPoseImageProcessor if is_vision_available() else None +class VitPoseImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = VitPoseImageProcessor if is_vision_available() else None def setUp(self): super().setUp() - self.image_processor_tester = ViTPoseImageProcessingTester(self) + self.image_processor_tester = VitPoseImageProcessingTester(self) @property def image_processor_dict(self): diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index a1a86af64845..a6d87fbc6fda 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -12,14 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Testing suite for the PyTorch ViTPose model.""" +"""Testing suite for the PyTorch VitPose model.""" import inspect import unittest import requests -from transformers import ViTPoseBackboneConfig, ViTPoseConfig +from transformers import VitPoseBackboneConfig, VitPoseConfig from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available @@ -30,16 +30,16 @@ if is_torch_available(): import torch - from transformers import ViTPoseForPoseEstimation + from transformers import VitPoseForPoseEstimation if is_vision_available(): from PIL import Image - from transformers import ViTPoseImageProcessor + from transformers import VitPoseImageProcessor -class ViTPoseModelTester: +class VitPoseModelTester: def __init__( self, parent, @@ -84,7 +84,7 @@ def __init__( self.out_indices = out_indices self.scope = scope - # in ViTPose, the seq length equals the number of patches + # in VitPose, the seq length equals the number of patches num_patches = (image_size[0] // patch_size[0]) * (image_size[1] // patch_size[1]) self.seq_length = num_patches @@ -100,12 +100,12 @@ def prepare_config_and_inputs(self): return config, pixel_values, labels def get_config(self): - return ViTPoseConfig( + return VitPoseConfig( backbone_config=self.get_backbone_config(), ) def get_backbone_config(self): - return ViTPoseBackboneConfig( + return VitPoseBackboneConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, @@ -118,7 +118,7 @@ def get_backbone_config(self): ) def create_and_check_for_pose_estimation(self, config, pixel_values, labels): - model = ViTPoseForPoseEstimation(config) + model = VitPoseForPoseEstimation(config) model.to(torch_device) model.eval() result = model(pixel_values) @@ -142,13 +142,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class ViTPoseModelTest(ModelTesterMixin, unittest.TestCase): +class VitPoseModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViTPose does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as VitPose does 
not use input_ids, inputs_embeds, attention_mask and seq_length. """ - all_model_classes = (ViTPoseForPoseEstimation,) if is_torch_available() else () + all_model_classes = (VitPoseForPoseEstimation,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -156,8 +156,8 @@ class ViTPoseModelTest(ModelTesterMixin, unittest.TestCase): test_head_masking = False def setUp(self): - self.model_tester = ViTPoseModelTester(self) - self.config_tester = ConfigTester(self, config_class=ViTPoseConfig, has_text_modality=False, hidden_size=37) + self.model_tester = VitPoseModelTester(self) + self.config_tester = ConfigTester(self, config_class=VitPoseConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.create_and_test_config_to_json_string() @@ -167,31 +167,31 @@ def test_config(self): self.config_tester.check_config_can_be_init_without_params() self.config_tester.check_config_arguments_init() - @unittest.skip(reason="ViTPose does not support input and output embeddings") + @unittest.skip(reason="VitPose does not support input and output embeddings") def test_model_common_attributes(self): pass - @unittest.skip(reason="ViTPose does not support input and output embeddings") + @unittest.skip(reason="VitPose does not support input and output embeddings") def test_inputs_embeds(self): pass - @unittest.skip(reason="ViTPose does not support input and output embeddings") + @unittest.skip(reason="VitPose does not support input and output embeddings") def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="ViTPose does not support training yet") + @unittest.skip(reason="VitPose does not support training yet") def test_training(self): pass - @unittest.skip(reason="ViTPose does not support training yet") + @unittest.skip(reason="VitPose does not support training yet") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="ViTPose does not support training yet") + @unittest.skip(reason="VitPose does not support training yet") def test_training_gradient_checkpointing_use_reentrant(self): pass - @unittest.skip(reason="ViTPose does not support training yet") + @unittest.skip(reason="VitPose does not support training yet") def test_training_gradient_checkpointing_use_reentrant_false(self): pass @@ -215,7 +215,7 @@ def test_for_pose_estimation(self): def test_model_from_pretrained(self): # TODO update organization model_name = "nielsr/vitpose-base-simple" - model = ViTPoseForPoseEstimation.from_pretrained(model_name) + model = VitPoseForPoseEstimation.from_pretrained(model_name) self.assertIsNotNone(model) @@ -228,17 +228,17 @@ def prepare_img(): @require_torch @require_vision -class ViTPoseModelIntegrationTest(unittest.TestCase): +class VitPoseModelIntegrationTest(unittest.TestCase): @cached_property def default_image_processor(self): # TODO update organization - return ViTPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") if is_vision_available() else None + return VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") if is_vision_available() else None @slow def test_inference_pose_estimation(self): image_processor = self.default_image_processor # TODO update organization - model = ViTPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") + model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") image = prepare_img() boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] @@ -279,7 +279,7 @@ def test_inference_pose_estimation(self): def 
test_batched_inference(self): image_processor = self.default_image_processor # TODO update organization - model = ViTPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") + model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") image = prepare_img() boxes = [ diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py index e96798833083..7d527149cad7 100644 --- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py +++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py @@ -12,12 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Testing suite for the PyTorch ViTPose backbone model.""" +"""Testing suite for the PyTorch VitPose backbone model.""" import inspect import unittest -from transformers import ViTPoseBackboneConfig +from transformers import VitPoseBackboneConfig from transformers.testing_utils import require_torch from transformers.utils import is_torch_available, is_vision_available @@ -27,14 +27,14 @@ if is_torch_available(): - from transformers import ViTPoseBackbone + from transformers import VitPoseBackbone if is_vision_available(): from PIL import Image -class ViTPoseBackboneModelTester: +class VitPoseBackboneModelTester: def __init__( self, parent, @@ -75,7 +75,7 @@ def __init__( self.num_labels = num_labels self.scope = scope - # in ViTPoseBackbone, the seq length equals the number of patches + # in VitPoseBackbone, the seq length equals the number of patches num_patches = (image_size[0] // patch_size[0]) * (image_size[1] // patch_size[1]) self.seq_length = num_patches @@ -91,7 +91,7 @@ def prepare_config_and_inputs(self): return config, pixel_values, labels def get_config(self): - return ViTPoseBackboneConfig( + return VitPoseBackboneConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, @@ -118,60 +118,60 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class ViTPoseBackboneModelTest(ModelTesterMixin, unittest.TestCase): +class VitPoseBackboneModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViTPoseBackbone does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as VitPoseBackbone does not use input_ids, inputs_embeds, attention_mask and seq_length. 
""" - all_model_classes = (ViTPoseBackbone,) if is_torch_available() else () + all_model_classes = (VitPoseBackbone,) if is_torch_available() else () fx_compatible = False test_pruning = False test_resize_embeddings = False test_head_masking = False def setUp(self): - self.model_tester = ViTPoseBackboneModelTester(self) + self.model_tester = VitPoseBackboneModelTester(self) self.config_tester = ConfigTester( - self, config_class=ViTPoseBackboneConfig, has_text_modality=False, hidden_size=37 + self, config_class=VitPoseBackboneConfig, has_text_modality=False, hidden_size=37 ) def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="ViTPoseBackbone does not support input and output embeddings") + @unittest.skip(reason="VitPoseBackbone does not support input and output embeddings") def test_model_common_attributes(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support input and output embeddings") + @unittest.skip(reason="VitPoseBackbone does not support input and output embeddings") def test_inputs_embeds(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support input and output embeddings") + @unittest.skip(reason="VitPoseBackbone does not support input and output embeddings") def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support feedforward chunking") + @unittest.skip(reason="VitPoseBackbone does not support feedforward chunking") def test_feed_forward_chunking(self): pass - @unittest.skip(reason="ViTPoseBackbone does not output a loss") + @unittest.skip(reason="VitPoseBackbone does not output a loss") def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support training yet") + @unittest.skip(reason="VitPoseBackbone does not support training yet") def test_training(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support training yet") + @unittest.skip(reason="VitPoseBackbone does not support training yet") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support training yet") + @unittest.skip(reason="VitPoseBackbone does not support training yet") def test_training_gradient_checkpointing_use_reentrant(self): pass - @unittest.skip(reason="ViTPoseBackbone does not support training yet") + @unittest.skip(reason="VitPoseBackbone does not support training yet") def test_training_gradient_checkpointing_use_reentrant_false(self): pass @@ -195,11 +195,11 @@ def prepare_img(): @require_torch -class ViTPoseBackboneTest(unittest.TestCase, BackboneTesterMixin): - all_model_classes = (ViTPoseBackbone,) if is_torch_available() else () - config_class = ViTPoseBackboneConfig +class VitPoseBackboneTest(unittest.TestCase, BackboneTesterMixin): + all_model_classes = (VitPoseBackbone,) if is_torch_available() else () + config_class = VitPoseBackboneConfig has_attentions = False def setUp(self): - self.model_tester = ViTPoseBackboneModelTester(self) + self.model_tester = VitPoseBackboneModelTester(self) diff --git a/utils/check_copies.py b/utils/check_copies.py index 02bf256c4eda..1c3f98dbeaba 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -1088,7 +1088,7 @@ def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> Tup "CLIPVisionModel", "SiglipVisionModel", "ChineseCLIPVisionModel", - "ViTPoseBackbone", + "VitPoseBackbone", ] # Template for new entries to add in the main README when we have missing models. 
diff --git a/utils/check_repo.py b/utils/check_repo.py index 9d9ed8443aed..d97ef9e345a1 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -328,7 +328,7 @@ "SiglipVisionModel", "SiglipTextModel", "ChameleonVQVAE", # no autoclass for VQ-VAE models - "ViTPoseForPoseEstimation", + "VitPoseForPoseEstimation", ] # DO NOT edit this list! @@ -990,8 +990,8 @@ def find_all_documented_objects() -> List[str]: "logging", # External module "requires_backends", # Internal function "AltRobertaModel", # Internal module - "ViTPoseBackbone", # Internal module - "ViTPoseBackboneConfig", # Internal module + "VitPoseBackbone", # Internal module + "VitPoseBackboneConfig", # Internal module ] # This list should be empty. Objects in it should get their own doc page. From 5911010c6e0ac6b6765c521e375278bb683340b4 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 6 Sep 2024 07:04:54 +0000 Subject: [PATCH 113/181] change to enable MoE --- docs/source/en/model_doc/vitpose.md | 16 ++++++++++++++-- .../configuration_vitpose_backbone.py | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 14b123542b25..71438f2d4143 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -26,8 +26,20 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo ## Usage Tips -- To enable MoE (Mixture of Experts) function in the backbone, the user has to give appropriate input indices to the backbone model. - However, it is not used in default parameters. +- To enable MoE (Mixture of Experts) function in the backbone, user has to give appropriate configuration such as `num_experts` and input value `dataset_index` to the backbone model. + However, it is not used in default parameters. Below is the code snippet for usage of MoE function. +```py +>>> from transformers import VitPoseBackboneConfig, VitPoseBackbone +>>> import torch + +>>> config = VitPoseBackboneConfig(num_experts=3, out_indices=[-1]) +>>> model = VitPoseBackbone(config) + +>>> pixel_values = torch.randn(3, 3, 256, 192) +>>> dataset_index = torch.tensor([1, 2, 3]) +>>> outputs = model(pixel_values, dataset_index) +``` + - The current model utilizes a 2-step inference pipeline. The first step involves placing a bounding box around the region corresponding to the person. After that, the second step uses VitPose to predict the keypoints. diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 962dc6055764..d627b9141183 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -102,7 +102,7 @@ def __init__( num_attention_heads=12, mlp_ratio=4, num_experts=1, - part_features=None, + part_features=256, hidden_act="gelu", hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, From cb6d45f9f35da855da485b5891200904be1e089f Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 6 Sep 2024 07:10:22 +0000 Subject: [PATCH 114/181] make fix-copies --- docs/source/en/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index e39440124487..f1a166a8152c 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -332,8 +332,8 @@ Flax), PyTorch, and/or TensorFlow. 
| [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ | | [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | -| [ViTPose](model_doc/vitpose) | ✅ | ❌ | ❌ | -| [ViTPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | +| [VitPose](model_doc/vitpose) | ✅ | ❌ | ❌ | +| [VitPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | From 5197549711c9e206bdcb014e2c5e1ad9b08f24da Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:58:54 +0900 Subject: [PATCH 115/181] Update docs/source/en/model_doc/vitpose.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/vitpose.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 71438f2d4143..9c9f1a2ee255 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -40,8 +40,7 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo >>> outputs = model(pixel_values, dataset_index) ``` -- The current model utilizes a 2-step inference pipeline. The first step involves placing a bounding box around the region corresponding to the person. - After that, the second step uses VitPose to predict the keypoints. +- ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt-detr), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. ```py >>> import torch From 22fc70533d4e91466d5e29ac7f6af0e0a36e8624 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 10 Sep 2024 03:04:33 +0000 Subject: [PATCH 116/181] extract udp --- src/transformers/models/vitpose/image_processing_vitpose.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index bc56ee2a161c..0f284764ef6c 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -147,8 +147,8 @@ def get_keypoint_predictions(heatmaps): return preds, scores -def post_dark_udp(coords, batch_heatmaps, kernel=3): - """DARK post-pocessing. Implemented by udp. +def post_dark_unbiased_data_processing(coords, batch_heatmaps, kernel=3): + """DARK post-pocessing. Implemented by unbiased_data_processing. Paper references: - Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
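For background on the function renamed above: the DARK scheme cited in its docstring refines each integer argmax location on a smoothed heatmap with a second-order Taylor expansion of the log-heatmap. The snippet below is a simplified, self-contained sketch of that refinement step, not the `post_dark_unbiased_data_processing` implementation itself, and it assumes the peak does not lie on the heatmap border:

```python
import numpy as np


def refine_coordinate(heatmap: np.ndarray, x: int, y: int) -> np.ndarray:
    """Refine an integer (x, y) peak of a single 2D heatmap to sub-pixel precision.

    Assumes 1 <= x < width - 1 and 1 <= y < height - 1.
    """
    eps = 1e-10
    logmap = np.log(np.maximum(heatmap, eps))
    # gradient via central differences
    dx = 0.5 * (logmap[y, x + 1] - logmap[y, x - 1])
    dy = 0.5 * (logmap[y + 1, x] - logmap[y - 1, x])
    # Hessian via second differences
    dxx = logmap[y, x + 1] - 2 * logmap[y, x] + logmap[y, x - 1]
    dyy = logmap[y + 1, x] - 2 * logmap[y, x] + logmap[y - 1, x]
    dxy = 0.25 * (
        logmap[y + 1, x + 1] - logmap[y + 1, x - 1] - logmap[y - 1, x + 1] + logmap[y - 1, x - 1]
    )
    hessian = np.array([[dxx, dxy], [dxy, dyy]])
    gradient = np.array([dx, dy])
    if abs(np.linalg.det(hessian)) < eps:
        return np.array([x, y], dtype=np.float32)
    # Newton step towards the maximum of the local quadratic approximation
    return np.array([x, y], dtype=np.float32) - np.linalg.solve(hessian, gradient)


# exact on a Gaussian blob centred at x=20.3, y=30.7
heatmap = np.exp(-(((np.arange(48) - 20.3) ** 2)[None, :] / 8) - (((np.arange(64) - 30.7) ** 2)[:, None] / 8))
y, x = np.unravel_index(heatmap.argmax(), heatmap.shape)
print(refine_coordinate(heatmap, x, y))  # approximately [20.3, 30.7]
```

The batched function in this patch applies this kind of refinement to all keypoint heatmaps at once, after modulating them with the configurable Gaussian kernel.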
@@ -583,7 +583,7 @@ def keypoints_from_heatmaps( coords, scores = get_keypoint_predictions(heatmaps) - preds = post_dark_udp(coords, heatmaps, kernel=kernel) + preds = post_dark_unbiased_data_processing(coords, heatmaps, kernel=kernel) # Transform back to the image for i in range(batch_size): From 0e5549fa6241afc5e72d5167ff64b33763fc2205 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 10 Sep 2024 06:34:31 +0000 Subject: [PATCH 117/181] add more described docs --- docs/source/en/model_doc/vitpose.md | 213 ++++++++++++++++++++++++---- 1 file changed, 188 insertions(+), 25 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 9c9f1a2ee255..1c8b6dbbae9c 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -43,31 +43,194 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo - ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt-detr), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. ```py ->>> import torch ->>> import requests - ->>> from PIL import Image ->>> from transformers import VitPoseImageProcessor, VitPoseForPoseEstimation - ->>> url = 'http://images.cocodataset.org/val2017/000000000139.jpg' ->>> image = Image.open(requests.get(url, stream=True).raw) - ->>> image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") ->>> model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") - ->>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] - ->>> pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values - ->>> with torch.no_grad(): -... outputs = model(pixel_values) - ->>> pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] - ->>> for pose_result in pose_results: -... for keypoint in pose_result['keypoints']: -... x, y, score = keypoint -... print(f"coordinate : [{x}, {y}], score : {score}") +import torch +import requests +import numpy as np +import cv2 +import math + +from typing import Union +from PIL import Image +from transformers import RTDetrImageProcessor, RTDetrForObjectDetection +from transformers import VitPoseImageProcessor, VitPoseForPoseEstimation + +url = 'http://images.cocodataset.org/val2017/000000000139.jpg' +image = Image.open(requests.get(url, stream=True).raw) + +# Stage 1. Run Object Detector +# User can replace this object_detector part +person_image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") +person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") +inputs = person_image_processor(images=image, return_tensors="pt") + +with torch.no_grad(): + outputs = person_model(**inputs) + +results = person_image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3) + +def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray: + """ + Converts bounding boxes from the Pascal VOC format to the COCO format. + + In other words, converts from (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format + to (top_left_x, top_left_y, width, height). + + Args: + bboxes (`np.ndarray` of shape `(batch_size, 4)): + Bounding boxes in Pascal VOC format. + + Returns: + `np.ndarray` of shape `(batch_size, 4) in COCO format. 
+ """ + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + + return bboxes + +# 0 index indicates human label in COCO +boxes = results[0]['boxes'][results[0]['labels'] == 0] +boxes = [pascal_voc_to_coco(boxes.cpu().numpy())] + +image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") +model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") + +# Stage 2. Run ViTPose +pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values + +with torch.no_grad(): + outputs = model(pixel_values) + +pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] + +for pose_result in pose_results: + for keypoint in pose_result['keypoints']: + x, y, score = keypoint + print(f"coordinate : [{x}, {y}], score : {score}") + +def visualize_keypoints(img, + pose_result, + skeleton=None, + kpt_score_thr=0.3, + pose_kpt_color=None, + pose_link_color=None, + radius=4, + thickness=1, + show_keypoint_weight=False): + """Draw keypoints and links on an image. + + Args: + img (str or Tensor): The image to draw poses on. If an image array + is given, id will be modified in-place. + pose_result (list[kpts]): The poses to draw. Each element kpts is + a set of K keypoints as an Kx3 numpy.ndarray, where each + keypoint is represented as x, y, score. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None, + the keypoint will not be drawn. + pose_link_color (np.array[Mx3]): Color of M links. If None, the + links will not be drawn. + thickness (int): Thickness of lines. + """ + img = img.copy() + img_h, img_w, _ = img.shape + + for kpts in pose_result: + kpts = np.array(kpts, copy=False) + + # draw each point on image + if pose_kpt_color is not None: + assert len(pose_kpt_color) == len(kpts) + for kid, kpt in enumerate(kpts): + x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] + if kpt_score > kpt_score_thr: + color = tuple(int(c) for c in pose_kpt_color[kid]) + if show_keypoint_weight: + img_copy = img.copy() + cv2.circle(img_copy, (int(x_coord), int(y_coord)), + radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + else: + cv2.circle(img, (int(x_coord), int(y_coord)), radius, + color, -1) + + # draw links + if skeleton is not None and pose_link_color is not None: + assert len(pose_link_color) == len(skeleton) + for sk_id, sk in enumerate(skeleton): + pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) + pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) + if (pos1[0] > 0 and pos1[0] < img_w and pos1[1] > 0 + and pos1[1] < img_h and pos2[0] > 0 and pos2[0] < img_w + and pos2[1] > 0 and pos2[1] < img_h + and kpts[sk[0], 2] > kpt_score_thr + and kpts[sk[1], 2] > kpt_score_thr): + color = tuple(int(c) for c in pose_link_color[sk_id]) + if show_keypoint_weight: + img_copy = img.copy() + X = (pos1[0], pos2[0]) + Y = (pos1[1], pos2[1]) + mX = np.mean(X) + mY = np.mean(Y) + length = ((Y[0] - Y[1])**2 + (X[0] - X[1])**2)**0.5 + angle = math.degrees( + math.atan2(Y[0] - Y[1], X[0] - X[1])) + stickwidth = 2 + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), + (int(length / 2), int(stickwidth)), int(angle), 0, + 360, 1) + cv2.fillConvexPoly(img_copy, polygon, color) + transparency = max( + 0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) + cv2.addWeighted( + img_copy, + transparency, + img, + 
1 - transparency, + 0, + dst=img) + else: + cv2.line(img, pos1, pos2, color, thickness=thickness) + + return img + +# Note: skeleton and color palette are dataset-specific +skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + +palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], + [230, 230, 0], [255, 153, 255], [153, 204, 255], + [255, 102, 255], [255, 51, 255], [102, 178, 255], + [51, 153, 255], [255, 153, 153], [255, 102, 102], + [255, 51, 51], [153, 255, 153], [102, 255, 102], + [51, 255, 51], [0, 255, 0], [0, 0, 255], + [255, 0, 0], [255, 255, 255]]) + +pose_link_color = palette[[ + 0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16 + ]] +pose_kpt_color = palette[[ + 16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0 +]] + +pose_results = [result["keypoints"] for result in pose_results] + +result = visualize_keypoints(np.array(image), pose_results, skeleton=skeleton, kpt_score_thr=0.3, + pose_kpt_color=pose_kpt_color, pose_link_color=pose_link_color, + radius=4, thickness=1) + +pose_image = Image.fromarray(result) +pose_image ``` drawing From 12a7b8c08d254cfa8da265c9e5830b3ffb582dbc Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 10 Sep 2024 09:05:26 +0000 Subject: [PATCH 118/181] simple fix --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 0f284764ef6c..33bf39cdd225 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -404,7 +404,7 @@ def affine_transform( # one uses a pixel standard deviation of 200 pixels transformation = get_warp_matrix(rotation, center * 2.0, np.array(size) - 1.0, scale * 200.0) - # cv2 requires channels last format + # input image requires channels last format image = ( image if input_data_format == ChannelDimension.LAST From 220859d1eebae98a9f4ddcf47f803d3873950a57 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 11 Sep 2024 03:21:43 +0000 Subject: [PATCH 119/181] change to accept target_size --- .../vitpose/image_processing_vitpose.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 33bf39cdd225..89357f681b88 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -591,7 +591,9 @@ def keypoints_from_heatmaps( return preds, scores - def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): + def post_process_pose_estimation( + self, outputs, boxes, kernel_size=11, target_sizes: Union[TensorType, List[Tuple]] = None + ): """ Transform the heatmaps into keypoint predictions and transform them back to the image. @@ -603,6 +605,9 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): box coordinates in COCO format (top_left_x, top_left_y, width, height). kernel_size (`int`, *optional*, defaults to 11): Gaussian kernel size (K) for modulation. 
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will be resize with the default value. Returns: `List[List[Dict]]`: A list of dictionaries, each dictionary containing the keypoints and boxes for an image in the batch as predicted by the model. @@ -610,10 +615,21 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): # First compute centers and scales for each bounding box batch_size = len(outputs.heatmaps) + + if target_sizes is not None: + if batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + centers = np.zeros((batch_size, 2), dtype=np.float32) scales = np.zeros((batch_size, 2), dtype=np.float32) flattened_boxes = list(itertools.chain(*boxes)) for i in range(batch_size): + if target_sizes is not None: + img_w, img_h = target_sizes[i][0], target_sizes[i][1] + scale_fct = np.array([img_w, img_h, img_w, img_h]) + flattened_boxes[i] = flattened_boxes[i] * scale_fct width, height = self.size["width"], self.size["height"] center, scale = box_to_center_and_scale(flattened_boxes[i], image_width=width, image_height=height) centers[i, :] = center @@ -630,7 +646,6 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): all_boxes[:, 4] = np.prod(scales * 200.0, axis=1) poses = torch.Tensor(all_preds) - bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) results: List[List[Dict[str, torch.Tensor]]] = [] @@ -646,4 +661,4 @@ def post_process_pose_estimation(self, outputs, boxes, kernel_size=11): batch_results.append(pose_result) results.append(batch_results) - return results + r \ No newline at end of file From 1afd34726298481df292f0be69ad48c0dda344d3 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 11 Sep 2024 03:22:17 +0000 Subject: [PATCH 120/181] make style --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 89357f681b88..5d7ef564348c 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -661,4 +661,4 @@ def post_process_pose_estimation( batch_results.append(pose_result) results.append(batch_results) - r \ No newline at end of file + return results From f9ae5243c3591fc2725bb2582234c155aeee876e Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Mon, 23 Sep 2024 20:12:34 +0900 Subject: [PATCH 121/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/vitpose/image_processing_vitpose.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 5d7ef564348c..d7059b5c451e 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -488,12 +488,10 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." 
) - if isinstance(boxes, list): - if len(images) != len(boxes): - raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {len(boxes)}") - else: - if len(images) != boxes.shape[0]: - raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {boxes.shape[0]}") + if isinstance(boxes, list)and len(images) != len(boxes): + raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {len(boxes)}") + elif len(images) != boxes.shape[0]: + raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {boxes.shape[0]}") # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] From 60fb1c1aa01cef43f817e773bd1b20950c61b49c Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Mon, 23 Sep 2024 20:12:43 +0900 Subject: [PATCH 122/181] Update src/transformers/models/vitpose/configuration_vitpose.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/models/vitpose/configuration_vitpose.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 7950c608b2a2..5b9d759bc6aa 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -96,11 +96,7 @@ def __init__( if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `VitPose` backbone.") - backbone_config = CONFIG_MAPPING["vitpose_backbone"]( - out_indices=[ - 4, - ] - ) + backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_indices=[4]) elif isinstance(backbone_config, dict): backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] From b922d7e501ce1496ac88223f7278036810ceae02 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:20:55 +0000 Subject: [PATCH 123/181] change to `verify_backbone_config_arguments` --- .../models/vitpose/configuration_vitpose.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 5b9d759bc6aa..236818ff0673 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments from ..auto.configuration_auto import CONFIG_MAPPING @@ -88,12 +89,6 @@ def __init__( if use_pretrained_backbone: raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: - raise ValueError("You can't specify both `backbone` and `backbone_config`.") - - if use_timm_backbone: - raise ValueError("Currently using timm backbone is not supported yet.") - if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `VitPose` backbone.") backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_indices=[4]) @@ -102,8 +97,13 @@ def __init__( config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) self.backbone_config = backbone_config self.backbone = backbone From 6431ec4e23691456148c2ce31878d0f07bc21531 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Mon, 23 Sep 2024 20:22:05 +0900 Subject: [PATCH 124/181] Update docs/source/en/model_doc/vitpose.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/vitpose.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 1c8b6dbbae9c..3830977d8034 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -107,7 +107,8 @@ for pose_result in pose_results: x, y, score = keypoint print(f"coordinate : [{x}, {y}], score : {score}") -def visualize_keypoints(img, +def visualize_keypoints( + img, pose_result, skeleton=None, kpt_score_thr=0.3, From 7a06e3840cf95cae7ffba2b222c0ce09af84ad79 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:23:44 +0000 Subject: [PATCH 125/181] remove unnecessary copy --- src/transformers/models/vitpose/image_processing_vitpose.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index d7059b5c451e..17216fee2e7f 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -574,9 +574,6 @@ def keypoints_from_heatmaps( - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`): Scores (confidence) of the keypoints. 
""" - # Avoid mutation - heatmaps = heatmaps.numpy().copy() - batch_size, _, height, width = heatmaps.shape coords, scores = get_keypoint_predictions(heatmaps) From edc53205a458d5017e97b08285306c55f373b978 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:29:11 +0000 Subject: [PATCH 126/181] make config immutable --- src/transformers/models/vitpose/modeling_vitpose.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index e721bc1a0433..7f92f2992be9 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -179,7 +179,7 @@ def __init__(self, config) -> None: super().__init__() self.scale_factor = config.scale_factor - self.conv = nn.Conv2d(config.backbone_hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1) + self.conv = nn.Conv2d(config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1) def forward(self, hidden_state, flip_pairs) -> torch.Tensor: # Transform input: ReLu + upsample @@ -206,7 +206,7 @@ def __init__(self, config): super().__init__() self.deconv1 = nn.ConvTranspose2d( - config.backbone_hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False + config.backbone_config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False ) self.batchnorm1 = nn.BatchNorm2d(256) self.relu1 = nn.ReLU() @@ -252,9 +252,6 @@ def __init__(self, config: VitPoseConfig) -> None: if not hasattr(self.backbone.config, "patch_size"): raise ValueError("The backbone should have a patch_size attribute") - config.backbone_hidden_size = self.backbone.config.hidden_size - config.image_size = self.backbone.config.image_size - config.patch_size = self.backbone.config.patch_size self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config) # Initialize weights and apply final processing @@ -316,8 +313,8 @@ def forward( # Turn output hidden states in tensor of shape (batch_size, num_channels, height, width) sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1] batch_size = sequence_output.shape[0] - patch_height = self.config.image_size[0] // self.config.patch_size[0] - patch_width = self.config.image_size[1] // self.config.patch_size[1] + patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0] + patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1] sequence_output = ( sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous() ) From 01a532b2a9f866950bb096aaa12dcaf1d17aae77 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:31:46 +0000 Subject: [PATCH 127/181] enable gradient checkpointing --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 18f036811762..4dc0c7f51581 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -349,6 +349,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.__call__, hidden_states, + dataset_index, layer_head_mask, output_attentions, ) From 
1cbba25a79d8dbf6e038813d2c62c1df655752d2 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:34:41 +0000 Subject: [PATCH 128/181] update inappropriate docstring --- src/transformers/models/vitpose/image_processing_vitpose.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 17216fee2e7f..e1d9303f8df5 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -520,9 +520,9 @@ def preprocess( new_images.append(transformed_image) images = new_images - # since the number of boxes can differ per image, the image processor takes a list - # rather than a numpy array of boxes - # it currently creates pixel_values of shape (batch_size*num_persons, num_channels, height, width) + # For batch processing, the number of boxes must be consistent across all images in the batch. + # When using a list input, the number of boxes can vary dynamically per image. + # The image processor creates pixel_values of shape (batch_size*num_persons, num_channels, height, width) if self.do_rescale: images = [ From 2ac4c6774180dae0382f134497bd45a4cc63c61d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:48:59 +0000 Subject: [PATCH 129/181] linting docs --- docs/source/en/model_doc/vitpose.md | 226 +++++++++++++++++----------- 1 file changed, 136 insertions(+), 90 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 1c8b6dbbae9c..d02957625636 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -43,18 +43,23 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo - ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt-detr), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. ```py -import torch -import requests -import numpy as np -import cv2 import math -from typing import Union +import cv2 +import numpy as np +import requests +import torch from PIL import Image -from transformers import RTDetrImageProcessor, RTDetrForObjectDetection -from transformers import VitPoseImageProcessor, VitPoseForPoseEstimation -url = 'http://images.cocodataset.org/val2017/000000000139.jpg' +from transformers import ( + RTDetrForObjectDetection, + RTDetrImageProcessor, + VitPoseForPoseEstimation, + VitPoseImageProcessor, +) + + +url = "http://images.cocodataset.org/val2017/000000000139.jpg" image = Image.open(requests.get(url, stream=True).raw) # Stage 1. 
Run Object Detector @@ -64,9 +69,11 @@ person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_co inputs = person_image_processor(images=image, return_tensors="pt") with torch.no_grad(): - outputs = person_model(**inputs) + outputs = person_model(**inputs) -results = person_image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3) +results = person_image_processor.post_process_object_detection( + outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3 +) def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray: """ @@ -88,7 +95,7 @@ def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray: return bboxes # 0 index indicates human label in COCO -boxes = results[0]['boxes'][results[0]['labels'] == 0] +boxes = results[0]["boxes"][results[0]["labels"] == 0] boxes = [pascal_voc_to_coco(boxes.cpu().numpy())] image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") @@ -98,39 +105,51 @@ model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values with torch.no_grad(): - outputs = model(pixel_values) + outputs = model(pixel_values) pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] for pose_result in pose_results: - for keypoint in pose_result['keypoints']: - x, y, score = keypoint - print(f"coordinate : [{x}, {y}], score : {score}") - -def visualize_keypoints(img, - pose_result, - skeleton=None, - kpt_score_thr=0.3, - pose_kpt_color=None, - pose_link_color=None, - radius=4, - thickness=1, - show_keypoint_weight=False): + for keypoint in pose_result["keypoints"]: + x, y, score = keypoint + print(f"coordinate : [{x}, {y}], score : {score}") + +def visualize_keypoints( + img, + pose_result, + skeleton=None, + kpt_score_thr=0.3, + pose_kpt_color=None, + pose_link_color=None, + radius=4, + thickness=1, + show_keypoint_weight=False, +): """Draw keypoints and links on an image. Args: - img (str or Tensor): The image to draw poses on. If an image array - is given, id will be modified in-place. - pose_result (list[kpts]): The poses to draw. Each element kpts is - a set of K keypoints as an Kx3 numpy.ndarray, where each - keypoint is represented as x, y, score. - kpt_score_thr (float, optional): Minimum score of keypoints - to be shown. Default: 0.3. - pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None, - the keypoint will not be drawn. - pose_link_color (np.array[Mx3]): Color of M links. If None, the - links will not be drawn. - thickness (int): Thickness of lines. + img (`numpy.ndarray`): + The image to draw poses on. It will be modified in-place. + pose_result (`List[numpy.ndarray]`): + The poses to draw. Each element is a set of K keypoints as a Kx3 numpy.ndarray, where each keypoint + is represented as x, y, score. + skeleton (`List[tuple]`, *optional*): + Skeleton definition. + kpt_score_thr (`float`, *optional*, defaults to 0.3): + Minimum score of keypoints to be shown. + pose_kpt_color (`numpy.ndarray`, *optional*): + Color of N keypoints. If None, the keypoints will not be drawn. + pose_link_color (`numpy.ndarray`, *optional*): + Color of M links. If None, the links will not be drawn. + radius (`int`, *optional*, defaults to 4): + Radius of keypoint circles. + thickness (`int`, *optional*, defaults to 1): + Thickness of lines. 
+ show_keypoint_weight (`bool`, *optional*, defaults to False): + Whether to adjust keypoint and link visibility based on the keypoint scores. + + Returns: + `numpy.ndarray`: Image with drawn keypoints and links. """ img = img.copy() img_h, img_w, _ = img.shape @@ -147,19 +166,11 @@ def visualize_keypoints(img, color = tuple(int(c) for c in pose_kpt_color[kid]) if show_keypoint_weight: img_copy = img.copy() - cv2.circle(img_copy, (int(x_coord), int(y_coord)), - radius, color, -1) + cv2.circle(img_copy, (int(x_coord), int(y_coord)), radius, color, -1) transparency = max(0, min(1, kpt_score)) - cv2.addWeighted( - img_copy, - transparency, - img, - 1 - transparency, - 0, - dst=img) + cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) else: - cv2.circle(img, (int(x_coord), int(y_coord)), radius, - color, -1) + cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) # draw links if skeleton is not None and pose_link_color is not None: @@ -167,11 +178,18 @@ def visualize_keypoints(img, for sk_id, sk in enumerate(skeleton): pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) - if (pos1[0] > 0 and pos1[0] < img_w and pos1[1] > 0 - and pos1[1] < img_h and pos2[0] > 0 and pos2[0] < img_w - and pos2[1] > 0 and pos2[1] < img_h - and kpts[sk[0], 2] > kpt_score_thr - and kpts[sk[1], 2] > kpt_score_thr): + if ( + pos1[0] > 0 + and pos1[0] < img_w + and pos1[1] > 0 + and pos1[1] < img_h + and pos2[0] > 0 + and pos2[0] < img_w + and pos2[1] > 0 + and pos2[1] < img_h + and kpts[sk[0], 2] > kpt_score_thr + and kpts[sk[1], 2] > kpt_score_thr + ): color = tuple(int(c) for c in pose_link_color[sk_id]) if show_keypoint_weight: img_copy = img.copy() @@ -179,55 +197,83 @@ def visualize_keypoints(img, Y = (pos1[1], pos2[1]) mX = np.mean(X) mY = np.mean(Y) - length = ((Y[0] - Y[1])**2 + (X[0] - X[1])**2)**0.5 - angle = math.degrees( - math.atan2(Y[0] - Y[1], X[0] - X[1])) + length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5 + angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) stickwidth = 2 polygon = cv2.ellipse2Poly( - (int(mX), int(mY)), - (int(length / 2), int(stickwidth)), int(angle), 0, - 360, 1) + (int(mX), int(mY)), (int(length / 2), int(stickwidth)), int(angle), 0, 360, 1 + ) cv2.fillConvexPoly(img_copy, polygon, color) - transparency = max( - 0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) - cv2.addWeighted( - img_copy, - transparency, - img, - 1 - transparency, - 0, - dst=img) + transparency = max(0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) + cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) else: cv2.line(img, pos1, pos2, color, thickness=thickness) return img # Note: skeleton and color palette are dataset-specific -skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], - [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], - [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], - [3, 5], [4, 6]] - -palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], - [230, 230, 0], [255, 153, 255], [153, 204, 255], - [255, 102, 255], [255, 51, 255], [102, 178, 255], - [51, 153, 255], [255, 153, 153], [255, 102, 102], - [255, 51, 51], [153, 255, 153], [102, 255, 102], - [51, 255, 51], [0, 255, 0], [0, 0, 255], - [255, 0, 0], [255, 255, 255]]) - -pose_link_color = palette[[ - 0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16 - ]] -pose_kpt_color = palette[[ - 16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0 -]] +skeleton = [ + [15, 13], + [13, 11], + [16, 
14], + [14, 12], + [11, 12], + [5, 11], + [6, 12], + [5, 6], + [5, 7], + [6, 8], + [7, 9], + [8, 10], + [1, 2], + [0, 1], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], +] + +palette = np.array( + [ + [255, 128, 0], + [255, 153, 51], + [255, 178, 102], + [230, 230, 0], + [255, 153, 255], + [153, 204, 255], + [255, 102, 255], + [255, 51, 255], + [102, 178, 255], + [51, 153, 255], + [255, 153, 153], + [255, 102, 102], + [255, 51, 51], + [153, 255, 153], + [102, 255, 102], + [51, 255, 51], + [0, 255, 0], + [0, 0, 255], + [255, 0, 0], + [255, 255, 255], + ] +) + +pose_link_color = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]] +pose_kpt_color = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] pose_results = [result["keypoints"] for result in pose_results] -result = visualize_keypoints(np.array(image), pose_results, skeleton=skeleton, kpt_score_thr=0.3, - pose_kpt_color=pose_kpt_color, pose_link_color=pose_link_color, - radius=4, thickness=1) +result = visualize_keypoints( + np.array(image), + pose_results, + skeleton=skeleton, + kpt_score_thr=0.3, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + radius=4, + thickness=1, +) pose_image = Image.fromarray(result) pose_image From eca096d40e6486f08ed131e9d1652275a5d9cf78 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:54:25 +0000 Subject: [PATCH 130/181] split function for visibility --- docs/source/en/model_doc/vitpose.md | 102 +++++++++++++++------------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index d02957625636..504ca31ddebe 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -114,6 +114,58 @@ for pose_result in pose_results: x, y, score = keypoint print(f"coordinate : [{x}, {y}], score : {score}") +def draw_points(pose_kpt_color, kpts, img): + if pose_kpt_color is not None: + assert len(pose_kpt_color) == len(kpts) + for kid, kpt in enumerate(kpts): + x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] + if kpt_score > kpt_score_thr: + color = tuple(int(c) for c in pose_kpt_color[kid]) + if show_keypoint_weight: + img_copy = img.copy() + cv2.circle(img_copy, (int(x_coord), int(y_coord)), radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) + else: + cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) + +def draw_links(skeleton, pose_link_color, img): + if skeleton is not None and pose_link_color is not None: + assert len(pose_link_color) == len(skeleton) + for sk_id, sk in enumerate(skeleton): + pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) + pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) + if ( + pos1[0] > 0 + and pos1[0] < img_w + and pos1[1] > 0 + and pos1[1] < img_h + and pos2[0] > 0 + and pos2[0] < img_w + and pos2[1] > 0 + and pos2[1] < img_h + and kpts[sk[0], 2] > kpt_score_thr + and kpts[sk[1], 2] > kpt_score_thr + ): + color = tuple(int(c) for c in pose_link_color[sk_id]) + if show_keypoint_weight: + img_copy = img.copy() + X = (pos1[0], pos2[0]) + Y = (pos1[1], pos2[1]) + mX = np.mean(X) + mY = np.mean(Y) + length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5 + angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) + stickwidth = 2 + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), (int(length / 2), int(stickwidth)), int(angle), 0, 360, 1 + ) + cv2.fillConvexPoly(img_copy, polygon, 
color) + transparency = max(0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) + cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) + else: + cv2.line(img, pos1, pos2, color, thickness=thickness) + def visualize_keypoints( img, pose_result, @@ -158,56 +210,10 @@ def visualize_keypoints( kpts = np.array(kpts, copy=False) # draw each point on image - if pose_kpt_color is not None: - assert len(pose_kpt_color) == len(kpts) - for kid, kpt in enumerate(kpts): - x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] - if kpt_score > kpt_score_thr: - color = tuple(int(c) for c in pose_kpt_color[kid]) - if show_keypoint_weight: - img_copy = img.copy() - cv2.circle(img_copy, (int(x_coord), int(y_coord)), radius, color, -1) - transparency = max(0, min(1, kpt_score)) - cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) - else: - cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) + draw_points(pose_kpt_color, kpts, img) # draw links - if skeleton is not None and pose_link_color is not None: - assert len(pose_link_color) == len(skeleton) - for sk_id, sk in enumerate(skeleton): - pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) - pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) - if ( - pos1[0] > 0 - and pos1[0] < img_w - and pos1[1] > 0 - and pos1[1] < img_h - and pos2[0] > 0 - and pos2[0] < img_w - and pos2[1] > 0 - and pos2[1] < img_h - and kpts[sk[0], 2] > kpt_score_thr - and kpts[sk[1], 2] > kpt_score_thr - ): - color = tuple(int(c) for c in pose_link_color[sk_id]) - if show_keypoint_weight: - img_copy = img.copy() - X = (pos1[0], pos2[0]) - Y = (pos1[1], pos2[1]) - mX = np.mean(X) - mY = np.mean(Y) - length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5 - angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) - stickwidth = 2 - polygon = cv2.ellipse2Poly( - (int(mX), int(mY)), (int(length / 2), int(stickwidth)), int(angle), 0, 360, 1 - ) - cv2.fillConvexPoly(img_copy, polygon, color) - transparency = max(0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) - cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) - else: - cv2.line(img, pos1, pos2, color, thickness=thickness) + draw_links(skeleton, pose_link_color, img) return img From a04714526f2eabba53ed7ceea8f8244362b34841 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 11:54:39 +0000 Subject: [PATCH 131/181] make style --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- src/transformers/models/vitpose/modeling_vitpose.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index e1d9303f8df5..a8a5d0475d56 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -488,7 +488,7 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." 
) - if isinstance(boxes, list)and len(images) != len(boxes): + if isinstance(boxes, list) and len(images) != len(boxes): raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {len(boxes)}") elif len(images) != boxes.shape[0]: raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {boxes.shape[0]}") diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 7f92f2992be9..5a9a2a4cc471 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -179,7 +179,9 @@ def __init__(self, config) -> None: super().__init__() self.scale_factor = config.scale_factor - self.conv = nn.Conv2d(config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1) + self.conv = nn.Conv2d( + config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1 + ) def forward(self, hidden_state, flip_pairs) -> torch.Tensor: # Transform input: ReLu + upsample From 4dd3aabfb611942e916f89ba77429350efb923b8 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 12:06:28 +0000 Subject: [PATCH 132/181] check isinstances --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index a8a5d0475d56..945bf08b4c4f 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -490,7 +490,7 @@ def preprocess( if isinstance(boxes, list) and len(images) != len(boxes): raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {len(boxes)}") - elif len(images) != boxes.shape[0]: + elif isinstance(boxes, np.ndarray) and len(images) != boxes.shape[0]: raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {boxes.shape[0]}") # All transformations expect numpy arrays. From d98171447efcb91f1205b1e676e48f2c1277ca3a Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 12:22:14 +0000 Subject: [PATCH 133/181] change to acceptable use_pretrained_backbone --- src/transformers/models/vitpose/configuration_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 236818ff0673..804d467ddaa2 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -87,7 +87,7 @@ def __init__( super().__init__(**kwargs) if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") + logger.info("`use_pretrained_backbone` is `True`. For the pure inference purpose of VitPose weight do not set this value.") if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `VitPose` backbone.") From 2ad9ded1e2afb30dee6c2d0d74b19a204cc7fa95 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 23 Sep 2024 23:42:47 +0000 Subject: [PATCH 134/181] make style --- src/transformers/models/vitpose/configuration_vitpose.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 804d467ddaa2..796cbf927129 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -87,7 +87,9 @@ def __init__( super().__init__(**kwargs) if use_pretrained_backbone: - logger.info("`use_pretrained_backbone` is `True`. For the pure inference purpose of VitPose weight do not set this value.") + logger.info( + "`use_pretrained_backbone` is `True`. For the pure inference purpose of VitPose weight do not set this value." + ) if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `VitPose` backbone.") From fe49a840f7b7200b98a17be648028d149a1193cb Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 24 Sep 2024 14:55:20 +0000 Subject: [PATCH 135/181] remove copy in docs --- docs/source/en/model_doc/vitpose.md | 27 +++++++++---------- .../vitpose/image_processing_vitpose.py | 2 +- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 504ca31ddebe..db20e7b08e90 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -114,22 +114,22 @@ for pose_result in pose_results: x, y, score = keypoint print(f"coordinate : [{x}, {y}], score : {score}") -def draw_points(pose_kpt_color, kpts, img): +def draw_points(pose_kpt_color, kpts, img, kpt_score_thr, radius, show_keypoint_weight): if pose_kpt_color is not None: - assert len(pose_kpt_color) == len(kpts) + assert len(pose_kpt_color) == len(kpts) for kid, kpt in enumerate(kpts): x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] if kpt_score > kpt_score_thr: color = tuple(int(c) for c in pose_kpt_color[kid]) if show_keypoint_weight: - img_copy = img.copy() - cv2.circle(img_copy, (int(x_coord), int(y_coord)), radius, color, -1) + cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) transparency = max(0, min(1, kpt_score)) - cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) + cv2.addWeighted(img, transparency, img, 1 - transparency, 0, dst=img) else: cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) -def draw_links(skeleton, pose_link_color, img): +def draw_links(skeleton, pose_link_color, kpts, img, kpt_score_thr, thickness, show_keypoint_weight): + img_h, img_w, _ = img.shape if skeleton is not None and pose_link_color is not None: assert len(pose_link_color) == len(skeleton) for sk_id, sk in enumerate(skeleton): @@ -149,7 +149,6 @@ def draw_links(skeleton, pose_link_color, img): ): color = tuple(int(c) for c in pose_link_color[sk_id]) if show_keypoint_weight: - img_copy = img.copy() X = (pos1[0], pos2[0]) Y = (pos1[1], pos2[1]) mX = np.mean(X) @@ -160,9 +159,9 @@ def draw_links(skeleton, pose_link_color, img): polygon = cv2.ellipse2Poly( (int(mX), int(mY)), (int(length / 2), int(stickwidth)), int(angle), 0, 360, 1 ) - cv2.fillConvexPoly(img_copy, polygon, color) + cv2.fillConvexPoly(img, polygon, color) transparency = max(0, min(1, 0.5 * 
(kpts[sk[0], 2] + kpts[sk[1], 2]))) - cv2.addWeighted(img_copy, transparency, img, 1 - transparency, 0, dst=img) + cv2.addWeighted(img, transparency, img, 1 - transparency, 0, dst=img) else: cv2.line(img, pos1, pos2, color, thickness=thickness) @@ -203,17 +202,14 @@ def visualize_keypoints( Returns: `numpy.ndarray`: Image with drawn keypoints and links. """ - img = img.copy() - img_h, img_w, _ = img.shape - for kpts in pose_result: kpts = np.array(kpts, copy=False) # draw each point on image - draw_points(pose_kpt_color, kpts, img) + draw_points(pose_kpt_color, kpts, img, kpt_score_thr, radius, show_keypoint_weight) # draw links - draw_links(skeleton, pose_link_color, img) + draw_links(skeleton, pose_link_color, kpts, img, kpt_score_thr, thickness, show_keypoint_weight) return img @@ -272,13 +268,14 @@ pose_results = [result["keypoints"] for result in pose_results] result = visualize_keypoints( np.array(image), - pose_results, + pose_result, skeleton=skeleton, kpt_score_thr=0.3, pose_kpt_color=pose_kpt_color, pose_link_color=pose_link_color, radius=4, thickness=1, + show_keypoint_weight=False, ) pose_image = Image.fromarray(result) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 945bf08b4c4f..faf81ab5871b 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -630,7 +630,7 @@ def post_process_pose_estimation( centers[i, :] = center scales[i, :] = scale - preds, scores = self.keypoints_from_heatmaps(outputs.heatmaps, centers, scales, kernel=kernel_size) + preds, scores = self.keypoints_from_heatmaps(outputs.heatmaps.numpy(), centers, scales, kernel=kernel_size) all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) all_boxes = np.zeros((batch_size, 6), dtype=np.float32) From bc4ae9a22ea1dad81716c5486a395ddd4f1f368a Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Sun, 6 Oct 2024 16:10:39 +0900 Subject: [PATCH 136/181] Update src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py Co-authored-by: Pavel Iakubovskii --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 4dc0c7f51581..a671378b6964 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -224,7 +224,7 @@ def forward( class VitPoseBackboneMoeMLP(nn.Module): - def __init__(self, config: VitPoseBackboneConfig) -> None: + def __init__(self, config: VitPoseBackboneConfig): super().__init__() in_features = out_features = config.hidden_size From fee25829fb9dd0c103f223ee53a290d644a5e974 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Sun, 6 Oct 2024 16:10:55 +0900 Subject: [PATCH 137/181] Update docs/source/en/model_doc/vitpose.md Co-authored-by: Pavel Iakubovskii --- docs/source/en/model_doc/vitpose.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index db20e7b08e90..412df7e67788 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -72,7 +72,7 @@ with 
torch.no_grad(): outputs = person_model(**inputs) results = person_image_processor.post_process_object_detection( - outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3 + outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3 ) def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray: From 8cb5f9ce75f25f61c098c935692b9a6757ab6f63 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Sun, 6 Oct 2024 16:11:22 +0900 Subject: [PATCH 138/181] Update src/transformers/models/vitpose/modeling_vitpose.py Co-authored-by: Pavel Iakubovskii --- src/transformers/models/vitpose/modeling_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 5a9a2a4cc471..f140799da40a 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -204,7 +204,7 @@ class VitPoseClassicDecoder(nn.Module): turning the feature maps into heatmaps. """ - def __init__(self, config): + def __init__(self, config: VitPoseConfig): super().__init__() self.deconv1 = nn.ConvTranspose2d( From 42be42b28b7d1d0a095bdbbe73b938140bed4aa6 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sun, 6 Oct 2024 10:08:18 +0000 Subject: [PATCH 139/181] simple fix + make style --- src/transformers/models/vitpose/__init__.py | 4 -- .../vitpose/image_processing_vitpose.py | 38 ++++++++++++------- .../models/vitpose/modeling_vitpose.py | 16 ++++---- .../modeling_vitpose_backbone.py | 4 +- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py index 3ca6860da1b5..b6fd47c37056 100644 --- a/src/transformers/models/vitpose/__init__.py +++ b/src/transformers/models/vitpose/__init__.py @@ -1,7 +1,3 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - # Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index faf81ab5871b..4b51807d0d75 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -114,7 +114,7 @@ def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: return bboxes -def get_keypoint_predictions(heatmaps): +def get_keypoint_predictions(heatmaps: np.ndarray) -> tuple: """Get keypoint predictions from score maps. Args: @@ -147,7 +147,7 @@ def get_keypoint_predictions(heatmaps): return preds, scores -def post_dark_unbiased_data_processing(coords, batch_heatmaps, kernel=3): +def post_dark_unbiased_data_processing(coords: np.ndarray, batch_heatmaps: np.ndarray, kernel: int = 3): """DARK post-pocessing. Implemented by unbiased_data_processing. Paper references: @@ -211,7 +211,7 @@ def post_dark_unbiased_data_processing(coords, batch_heatmaps, kernel=3): return coords -def transform_preds(coords, center, scale, output_size): +def transform_preds(coords: np.ndarray, center: np.ndarray, scale: np.ndarray, output_size: np.ndarray) -> np.ndarray: """Get final keypoint predictions from heatmaps and apply scaling and translation to map them back to the image. 
@@ -366,6 +366,7 @@ def __init__( self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.normalize_factor = 200.0 def affine_transform( self, @@ -512,7 +513,10 @@ def preprocess( for image, image_boxes in zip(images, boxes): for box in image_boxes: center, scale = box_to_center_and_scale( - box, image_width=size["width"], image_height=size["height"] + box, + image_width=size["width"], + image_height=size["height"], + normalize_factor=self.normalize_factor, ) transformed_image = self.affine_transform( image, center, scale, rotation=0, size=size, input_data_format=input_data_format @@ -546,10 +550,10 @@ def preprocess( def keypoints_from_heatmaps( self, - heatmaps, - center, - scale, - kernel=11, + heatmaps: np.ndarray, + center: np.ndarray, + scale: np.ndarray, + kernel: int = 11, ): """ Get final keypoint predictions from heatmaps and transform them back to @@ -587,7 +591,11 @@ def keypoints_from_heatmaps( return preds, scores def post_process_pose_estimation( - self, outputs, boxes, kernel_size=11, target_sizes: Union[TensorType, List[Tuple]] = None + self, + outputs: torch.Tensor, + boxes: Union[List[List[List[float]]], np.ndarray], + kernel_size: int = 11, + target_sizes: Union[TensorType, List[Tuple]] = None, ): """ Transform the heatmaps into keypoint predictions and transform them back to the image. @@ -622,15 +630,17 @@ def post_process_pose_estimation( flattened_boxes = list(itertools.chain(*boxes)) for i in range(batch_size): if target_sizes is not None: - img_w, img_h = target_sizes[i][0], target_sizes[i][1] - scale_fct = np.array([img_w, img_h, img_w, img_h]) - flattened_boxes[i] = flattened_boxes[i] * scale_fct + image_width, image_height = target_sizes[i][0], target_sizes[i][1] + scale_factor = np.array([image_width, image_height, image_width, image_height]) + flattened_boxes[i] = flattened_boxes[i] * scale_factor width, height = self.size["width"], self.size["height"] center, scale = box_to_center_and_scale(flattened_boxes[i], image_width=width, image_height=height) centers[i, :] = center scales[i, :] = scale - preds, scores = self.keypoints_from_heatmaps(outputs.heatmaps.numpy(), centers, scales, kernel=kernel_size) + preds, scores = self.keypoints_from_heatmaps( + outputs.heatmaps.cpu().numpy(), centers, scales, kernel=kernel_size + ) all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) all_boxes = np.zeros((batch_size, 6), dtype=np.float32) @@ -638,7 +648,7 @@ def post_process_pose_estimation( all_preds[:, :, 2:3] = scores all_boxes[:, 0:2] = centers[:, 0:2] all_boxes[:, 2:4] = scales[:, 0:2] - all_boxes[:, 4] = np.prod(scales * 200.0, axis=1) + all_boxes[:, 4] = np.prod(scales * self.normalize_factor, axis=1) poses = torch.Tensor(all_preds) bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index f140799da40a..0a5663b0e62c 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -178,18 +178,16 @@ class VitPoseSimpleDecoder(nn.Module): def __init__(self, config) -> None: super().__init__() - self.scale_factor = config.scale_factor + self.activation = nn.ReLU() + self.upsampling = nn.Upsample(scale_factor=config.scale_factor, mode="bilinear", align_corners=False) self.conv = nn.Conv2d( config.backbone_config.hidden_size, 
config.num_labels, kernel_size=3, stride=1, padding=1 ) - def forward(self, hidden_state, flip_pairs) -> torch.Tensor: - # Transform input: ReLu + upsample - hidden_state = nn.functional.relu(hidden_state) - hidden_state = nn.functional.interpolate( - hidden_state, scale_factor=self.scale_factor, mode="bilinear", align_corners=False - ) - + def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None) -> torch.Tensor: + # Transform input: ReLU + upsample + hidden_state = self.activation(hidden_state) + hidden_state = self.upsampling(hidden_state) heatmaps = self.conv(hidden_state) if flip_pairs is not None: @@ -219,7 +217,7 @@ def __init__(self, config: VitPoseConfig): self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0) - def forward(self, hidden_state, flip_pairs): + def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None): hidden_state = self.deconv1(hidden_state) hidden_state = self.batchnorm1(hidden_state) hidden_state = self.relu1(hidden_state) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index a671378b6964..a8e9df2bc1c5 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -253,8 +253,8 @@ def forward(self, hidden_state, indices): # to support ddp training for i in range(self.num_experts): - selectedIndex = indices == i - current_hidden_state = self.experts[i](hidden_state) * selectedIndex + selected_index = indices == i + current_hidden_state = self.experts[i](hidden_state) * selected_index expert_hidden_state = expert_hidden_state + current_hidden_state hidden_state = torch.cat([shared_hidden_state, expert_hidden_state], dim=-1) From f835be5ab730f78f73c702f22f5eba5c2e0b8b84 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sun, 6 Oct 2024 10:11:55 +0000 Subject: [PATCH 140/181] change input config of activation function to string --- .../vitpose_backbone/configuration_vitpose_backbone.py | 4 ++-- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index d627b9141183..08768eaae6d1 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -51,8 +51,8 @@ class VitPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): The number of experts in the MoE layer. part_features (`int`, *optional*): The number of part features to output. Only used in case `num_experts` is greater than 1. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 
diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index a8e9df2bc1c5..ed932c88ced6 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -268,10 +268,7 @@ def __init__(self, config: VitPoseBackboneConfig) -> None: in_features = out_features = config.hidden_size hidden_features = int(config.hidden_size * config.mlp_ratio) self.fc1 = nn.Linear(in_features, hidden_features, bias=True) - if isinstance(config.hidden_act, str): - self.activation = ACT2FN[config.hidden_act] - else: - self.activation = config.hidden_act + self.activation = ACT2FN[config.hidden_act] self.fc2 = nn.Linear(hidden_features, out_features, bias=True) def forward(self, hidden_state: torch.Tensor, indices=None) -> torch.Tensor: From b6699c9e90e2980cb58396da70f377c77c7c02e0 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Sun, 6 Oct 2024 21:26:38 +0900 Subject: [PATCH 141/181] Update docs/source/en/model_doc/vitpose.md Co-authored-by: Pavel Iakubovskii --- docs/source/en/model_doc/vitpose.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 412df7e67788..b1e3c073927e 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -268,7 +268,7 @@ pose_results = [result["keypoints"] for result in pose_results] result = visualize_keypoints( np.array(image), - pose_result, + pose_results, skeleton=skeleton, kpt_score_thr=0.3, pose_kpt_color=pose_kpt_color, From ebdf2df253cb423ea2210c09d260c53753c7749d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sun, 6 Oct 2024 12:54:49 +0000 Subject: [PATCH 142/181] tmp docs --- docs/source/en/model_doc/vitpose.md | 127 +++++++++--------- .../models/vitpose/configuration_vitpose.py | 21 +++ 2 files changed, 84 insertions(+), 64 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 412df7e67788..803e07fb0a1e 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -114,64 +114,63 @@ for pose_result in pose_results: x, y, score = keypoint print(f"coordinate : [{x}, {y}], score : {score}") -def draw_points(pose_kpt_color, kpts, img, kpt_score_thr, radius, show_keypoint_weight): - if pose_kpt_color is not None: - assert len(pose_kpt_color) == len(kpts) - for kid, kpt in enumerate(kpts): - x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] - if kpt_score > kpt_score_thr: - color = tuple(int(c) for c in pose_kpt_color[kid]) +def draw_points(image, keypoints, keypoint_colors, keypoint_score_threshold, radius, show_keypoint_weight): + if keypoint_colors is not None: + assert len(keypoint_colors) == len(keypoints) + for id, keypoint in enumerate(keypoints): + x_coord, y_coord, keypoint_score = int(keypoint[0]), int(keypoint[1]), keypoint[2] + if keypoint_score > keypoint_score_threshold: + color = tuple(int(c) for c in keypoint_colors[id]) if show_keypoint_weight: - cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) - transparency = max(0, min(1, kpt_score)) - cv2.addWeighted(img, transparency, img, 1 - transparency, 0, dst=img) + cv2.circle(image, (x_coord, y_coord), radius, color, -1) + transparency = max(0, min(1, keypoint_score)) + cv2.addWeighted(image, transparency, image, 1 - transparency, 
0, dst=image) else: - cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) - -def draw_links(skeleton, pose_link_color, kpts, img, kpt_score_thr, thickness, show_keypoint_weight): - img_h, img_w, _ = img.shape - if skeleton is not None and pose_link_color is not None: - assert len(pose_link_color) == len(skeleton) - for sk_id, sk in enumerate(skeleton): - pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) - pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) + cv2.circle(image, (x_coord, y_coord), radius, color, -1) + +def draw_links(image, keypoints, keypoint_connections, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): + height, width, _ = image.shape + if keypoint_connections is not None and link_colors is not None: + assert len(link_colors) == len(keypoint_connections) + for sk_id, sk in enumerate(keypoint_connections): + x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), keypoints[sk[0], 2]) + x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), keypoints[sk[1], 2]) if ( - pos1[0] > 0 - and pos1[0] < img_w - and pos1[1] > 0 - and pos1[1] < img_h - and pos2[0] > 0 - and pos2[0] < img_w - and pos2[1] > 0 - and pos2[1] < img_h - and kpts[sk[0], 2] > kpt_score_thr - and kpts[sk[1], 2] > kpt_score_thr + x1 > 0 + and x1 < width + and y1 > 0 + and y1 < height + and x2 > 0 + and x2 < width + and y2 > 0 + and y2 < height + and score1 > keypoint_score_threshold + and score2 > keypoint_score_threshold ): - color = tuple(int(c) for c in pose_link_color[sk_id]) + color = tuple(int(c) for c in link_colors[sk_id]) if show_keypoint_weight: - X = (pos1[0], pos2[0]) - Y = (pos1[1], pos2[1]) - mX = np.mean(X) - mY = np.mean(Y) + X = (x1, x2) + Y = (y1, y2) + mean_x = np.mean(X) + mean_y = np.mean(Y) length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) - stickwidth = 2 polygon = cv2.ellipse2Poly( - (int(mX), int(mY)), (int(length / 2), int(stickwidth)), int(angle), 0, 360, 1 + (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1 ) - cv2.fillConvexPoly(img, polygon, color) - transparency = max(0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) - cv2.addWeighted(img, transparency, img, 1 - transparency, 0, dst=img) + cv2.fillConvexPoly(image, polygon, color) + transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2]))) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) else: - cv2.line(img, pos1, pos2, color, thickness=thickness) + cv2.line(image, pos1, pos2, color, thickness=thickness) def visualize_keypoints( - img, + image, pose_result, - skeleton=None, - kpt_score_thr=0.3, - pose_kpt_color=None, - pose_link_color=None, + keypoint_connections=None, + keypoint_score_threshold=0.3, + keypoint_colors=None, + link_colors=None, radius=4, thickness=1, show_keypoint_weight=False, @@ -179,18 +178,18 @@ def visualize_keypoints( """Draw keypoints and links on an image. Args: - img (`numpy.ndarray`): + image (`numpy.ndarray`): The image to draw poses on. It will be modified in-place. pose_result (`List[numpy.ndarray]`): The poses to draw. Each element is a set of K keypoints as a Kx3 numpy.ndarray, where each keypoint is represented as x, y, score. - skeleton (`List[tuple]`, *optional*): - Skeleton definition. - kpt_score_thr (`float`, *optional*, defaults to 0.3): + keypoint_connections (`List[tuple]`, *optional*): + Mapping index of the keypoint_connections links. 
+ keypoint_score_threshold (`float`, *optional*, defaults to 0.3): Minimum score of keypoints to be shown. - pose_kpt_color (`numpy.ndarray`, *optional*): + keypoint_colors (`numpy.ndarray`, *optional*): Color of N keypoints. If None, the keypoints will not be drawn. - pose_link_color (`numpy.ndarray`, *optional*): + link_colors (`numpy.ndarray`, *optional*): Color of M links. If None, the links will not be drawn. radius (`int`, *optional*, defaults to 4): Radius of keypoint circles. @@ -202,19 +201,19 @@ def visualize_keypoints( Returns: `numpy.ndarray`: Image with drawn keypoints and links. """ - for kpts in pose_result: - kpts = np.array(kpts, copy=False) + for keypoints in pose_result: + keypoints = np.array(keypoints, copy=False) # draw each point on image - draw_points(pose_kpt_color, kpts, img, kpt_score_thr, radius, show_keypoint_weight) + draw_points(image, keypoints, keypoint_colors, keypoint_score_threshold, radius, show_keypoint_weight) # draw links - draw_links(skeleton, pose_link_color, kpts, img, kpt_score_thr, thickness, show_keypoint_weight) + draw_links(image, keypoints, keypoint_connections, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight) - return img + return image -# Note: skeleton and color palette are dataset-specific -skeleton = [ +# Note: keypoint_connections and color palette are dataset-specific +keypoint_connections = [ [15, 13], [13, 11], [16, 14], @@ -261,18 +260,18 @@ palette = np.array( ] ) -pose_link_color = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]] -pose_kpt_color = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] +link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]] +keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] pose_results = [result["keypoints"] for result in pose_results] result = visualize_keypoints( np.array(image), pose_result, - skeleton=skeleton, - kpt_score_thr=0.3, - pose_kpt_color=pose_kpt_color, - pose_link_color=pose_link_color, + keypoint_connections=keypoint_connections, + keypoint_score_threshold=0.3, + keypoint_colors=keypoint_colors, + link_colors=link_colors, radius=4, thickness=1, show_keypoint_weight=False, diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 796cbf927129..c9f52de99127 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -82,6 +82,27 @@ def __init__( initializer_range: float = 0.02, scale_factor: int = 4, use_simple_decoder: bool = True, + skeleton: List = [ + [15, 13], + [13, 11], + [16, 14], + [14, 12], + [11, 12], + [5, 11], + [6, 12], + [5, 6], + [5, 7], + [6, 8], + [7, 9], + [8, 10], + [1, 2], + [0, 1], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + ], **kwargs, ): super().__init__(**kwargs) From 95629565e0caeb053cc88a79be54c1fe2d570bfb Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 19 Oct 2024 08:18:47 +0000 Subject: [PATCH 143/181] delete index.md --- docs/source/en/index.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 4ecf608cc2b2..85a91942b7d2 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -343,7 +343,6 @@ Flax), PyTorch, and/or TensorFlow. 
| [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | | [VitPose](model_doc/vitpose) | ✅ | ❌ | ❌ | -| [VitPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | From bb5cc969a1db3e2d6d3bc4ee7f4aaa8f46ab661b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 19 Oct 2024 08:25:56 +0000 Subject: [PATCH 144/181] make fix-copies --- docs/source/en/index.md | 1 + src/transformers/models/glm/modeling_glm.py | 41 +++---------------- .../models/vitpose/configuration_vitpose.py | 4 +- 3 files changed, 8 insertions(+), 38 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 85a91942b7d2..4ecf608cc2b2 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -343,6 +343,7 @@ Flax), PyTorch, and/or TensorFlow. | [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | | [VitPose](model_doc/vitpose) | ✅ | ❌ | ❌ | +| [VitPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 9815dbc78992..a458c02a6fed 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -25,7 +25,6 @@ import torch import torch.nn as nn import torch.utils.checkpoint -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -921,6 +920,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1071,18 +1071,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1186,27 +1175,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1289,8 +1259,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 8fa519ec3111..ea0820cff667 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -54,9 +54,9 @@ class VitPoseConfig(PretrainedConfig): Factor to upscale the feature maps coming from the ViT backbone. use_simple_decoder (`bool`, *optional*, defaults to `True`): Whether to use a `VitPoseSimpleDecoder` to decode the feature maps from the backbone into heatmaps. Otherwise it uses `VitPoseClassicDecoder`. - skeleton_edges (`list`, *optional*): + skeleton_edges (`list`, *optional*, defaults to `[[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]`): List of edges connecting skeleton nodes, each edge represented by two node indices. This edges are based on MSCOCO. - skeleton_nodes (`list`, *optional*): + skeleton_nodes (`list`, *optional*, defaults to `['Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Hip', 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle']`): List of node names representing different body parts in the skeleton. This edges are based on MSCOCO. From 899cb96bda881f76ec1fc4d274f64241d2aa3747 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 19 Oct 2024 08:44:59 +0000 Subject: [PATCH 145/181] simple fix --- docs/source/en/model_doc/vitpose.md | 66 +++++++++++++++---- .../models/vitpose/configuration_vitpose.py | 46 ------------- 2 files changed, 54 insertions(+), 58 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index fa598e22dd56..502694db80bc 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -101,7 +101,49 @@ boxes = [pascal_voc_to_coco(boxes.cpu().numpy())] image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") -config = VitPoseConfig() + +keypoint_edges = [ + [15, 13], + [13, 11], + [16, 14], + [14, 12], + [11, 12], + [5, 11], + [6, 12], + [5, 6], + [5, 7], + [6, 8], + [7, 9], + [8, 10], + [1, 2], + [0, 1], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], +], +keypoint_nodes = [ + "Nose", + "L_Eye", + "R_Eye", + "L_Ear", + "R_Ear", + "L_Shoulder", + "R_Shoulder", + "L_Elbow", + "R_Elbow", + "L_Wrist", + "R_Wrist", + "L_Hip", + "R_Hip", + "L_Knee", + "R_Knee", + "L_Ankle", + "R_Ankle", +], + +config = VitPoseConfig(keypoint_edges=keypoint_edges, keypoint_nodes=keypoint_nodes) # Stage 2. 
Run ViTPose pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values @@ -130,11 +172,11 @@ def draw_points(image, keypoints, keypoint_colors, keypoint_score_threshold, rad else: cv2.circle(image, (x_coord, y_coord), radius, color, -1) -def draw_links(image, keypoints, keypoint_connections, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): +def draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): height, width, _ = image.shape - if keypoint_connections is not None and link_colors is not None: - assert len(link_colors) == len(keypoint_connections) - for sk_id, sk in enumerate(keypoint_connections): + if keypoint_edges is not None and link_colors is not None: + assert len(link_colors) == len(keypoint_edges) + for sk_id, sk in enumerate(keypoint_edges): x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), keypoints[sk[0], 2]) x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), keypoints[sk[1], 2]) if ( @@ -169,7 +211,7 @@ def draw_links(image, keypoints, keypoint_connections, link_colors, keypoint_sco def visualize_keypoints( image, pose_result, - keypoint_connections=None, + keypoint_edges=None, keypoint_score_threshold=0.3, keypoint_colors=None, link_colors=None, @@ -185,8 +227,8 @@ def visualize_keypoints( pose_result (`List[numpy.ndarray]`): The poses to draw. Each element is a set of K keypoints as a Kx3 numpy.ndarray, where each keypoint is represented as x, y, score. - keypoint_connections (`List[tuple]`, *optional*): - Mapping index of the keypoint_connections links. + keypoint_edges (`List[tuple]`, *optional*): + Mapping index of the keypoint_edges links. keypoint_score_threshold (`float`, *optional*, defaults to 0.3): Minimum score of keypoints to be shown. keypoint_colors (`numpy.ndarray`, *optional*): @@ -210,12 +252,12 @@ def visualize_keypoints( draw_points(image, keypoints, keypoint_colors, keypoint_score_threshold, radius, show_keypoint_weight) # draw links - draw_links(image, keypoints, keypoint_connections, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight) + draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight) return image -# Note: keypoint_connections and color palette are dataset-specific -keypoint_connections = config.skeleton_edges +# Note: keypoint_edges and color palette are dataset-specific +keypoint_edges = config.keypoint_edges palette = np.array( [ @@ -250,7 +292,7 @@ pose_results = [result["keypoints"] for result in pose_results] result = visualize_keypoints( np.array(image), pose_result, - keypoint_connections=keypoint_connections, + keypoint_edges=keypoint_edges, keypoint_score_threshold=0.3, keypoint_colors=keypoint_colors, link_colors=link_colors, diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index ea0820cff667..edfbc173d0d2 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -54,10 +54,6 @@ class VitPoseConfig(PretrainedConfig): Factor to upscale the feature maps coming from the ViT backbone. use_simple_decoder (`bool`, *optional*, defaults to `True`): Whether to use a `VitPoseSimpleDecoder` to decode the feature maps from the backbone into heatmaps. Otherwise it uses `VitPoseClassicDecoder`. 
- skeleton_edges (`list`, *optional*, defaults to `[[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]`): - List of edges connecting skeleton nodes, each edge represented by two node indices. This edges are based on MSCOCO. - skeleton_nodes (`list`, *optional*, defaults to `['Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Hip', 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle']`): - List of node names representing different body parts in the skeleton. This edges are based on MSCOCO. Example: @@ -87,46 +83,6 @@ def __init__( initializer_range: float = 0.02, scale_factor: int = 4, use_simple_decoder: bool = True, - skeleton_edges: list = [ - [15, 13], - [13, 11], - [16, 14], - [14, 12], - [11, 12], - [5, 11], - [6, 12], - [5, 6], - [5, 7], - [6, 8], - [7, 9], - [8, 10], - [1, 2], - [0, 1], - [0, 2], - [1, 3], - [2, 4], - [3, 5], - [4, 6], - ], - skeleton_nodes: list = [ - "Nose", - "L_Eye", - "R_Eye", - "L_Ear", - "R_Ear", - "L_Shoulder", - "R_Shoulder", - "L_Elbow", - "R_Elbow", - "L_Wrist", - "R_Wrist", - "L_Hip", - "R_Hip", - "L_Knee", - "R_Knee", - "L_Ankle", - "R_Ankle", - ], **kwargs, ): super().__init__(**kwargs) @@ -163,5 +119,3 @@ def __init__( self.initializer_range = initializer_range self.scale_factor = scale_factor self.use_simple_decoder = use_simple_decoder - self.skeleton_edges = skeleton_edges - self.skeleton_nodes = skeleton_nodes From 9eb2e64cc240c77e9c7d5b8f67e8900d9e776899 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sun, 3 Nov 2024 11:48:20 +0000 Subject: [PATCH 146/181] change conversion to sam2/mllama style --- docs/source/en/model_doc/vitpose.md | 43 +-- .../models/vitpose/convert_vitpose_to_hf.py | 314 +++++++++--------- 2 files changed, 153 insertions(+), 204 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 502694db80bc..31d9525739ad 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -102,48 +102,7 @@ boxes = [pascal_voc_to_coco(boxes.cpu().numpy())] image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") -keypoint_edges = [ - [15, 13], - [13, 11], - [16, 14], - [14, 12], - [11, 12], - [5, 11], - [6, 12], - [5, 6], - [5, 7], - [6, 8], - [7, 9], - [8, 10], - [1, 2], - [0, 1], - [0, 2], - [1, 3], - [2, 4], - [3, 5], - [4, 6], -], -keypoint_nodes = [ - "Nose", - "L_Eye", - "R_Eye", - "L_Ear", - "R_Ear", - "L_Shoulder", - "R_Shoulder", - "L_Elbow", - "R_Elbow", - "L_Wrist", - "R_Wrist", - "L_Hip", - "R_Hip", - "L_Knee", - "R_Knee", - "L_Ankle", - "R_Ankle", -], - -config = VitPoseConfig(keypoint_edges=keypoint_edges, keypoint_nodes=keypoint_nodes) +config = VitPoseConfig() # Stage 2. 
Run ViTPose pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index d20f709f1b85..06cc8c196295 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -18,53 +18,34 @@ """ import argparse -from pathlib import Path +import os +import re -import numpy as np import requests import torch from huggingface_hub import hf_hub_download from PIL import Image from transformers import VitPoseBackboneConfig, VitPoseConfig, VitPoseForPoseEstimation, VitPoseImageProcessor -from transformers.models.vitpose.image_processing_vitpose import coco_to_pascal_voc -def get_original_pose_results(pixel_values, img_metas, output_heatmap, image_processor): - batch_size = pixel_values.shape[0] - - centers = np.zeros((batch_size, 2), dtype=np.float32) - scales = np.zeros((batch_size, 2), dtype=np.float32) - for i in range(batch_size): - centers[i, :] = img_metas[i]["center"] - scales[i, :] = img_metas[i]["scale"] - - preds, scores = image_processor.keypoints_from_heatmaps(output_heatmap, center=centers, scale=scales) - - all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) - all_boxes = np.zeros((batch_size, 6), dtype=np.float32) - all_preds[:, :, 0:2] = preds[:, :, 0:2] - all_preds[:, :, 2:3] = scores - all_boxes[:, 0:2] = centers[:, 0:2] - all_boxes[:, 2:4] = scales[:, 0:2] - all_boxes[:, 4] = np.prod(scales * 200.0, axis=1) - - poses = all_preds - - # create final results by adding person bbox information - filepath = hf_hub_download(repo_id="nielsr/test-image", filename="vitpose_person_results.pt", repo_type="dataset") - person_results = torch.load(filepath, map_location="cpu") - bboxes = np.array([box["bbox"] for box in person_results]) - bboxes_xyxy = coco_to_pascal_voc(bboxes) - - pose_results = [] - for pose, person_result, bbox_xyxy in zip(poses, person_results, bboxes_xyxy): - pose_result = person_result.copy() - pose_result["keypoints"] = pose - pose_result["bbox"] = bbox_xyxy - pose_results.append(pose_result) +KEYS_TO_MODIFY_MAPPING = { + r"patch_embed.proj": "embeddings.patch_embeddings.projection", + r"pos_embed": "embeddings.position_embeddings", + r"blocks": "encoder.layer", + r"attn.proj": "attention.output.dense", + r"attn": "attention.self", + r"norm1": "layernorm_before", + r"norm2": "layernorm_after", + r"last_norm": "layernorm", +} - return pose_results +MODEL_TO_FILE_NAME_MAPPING = { + "vitpose-base-simple": "vitpose-b-simple.pth", + "vitpose-base": "vitpose-b.pth", + "vitpose-base-coco-aic-mpii": "vitpose_base_coco_aic_mpii.pth", + "vitpose+-base": "vitpose+_base.pth", +} def get_config(model_name): @@ -91,81 +72,121 @@ def get_config(model_name): use_simple_decoder = "simple" in model_name + keypoint_edges = ( + [ + [15, 13], + [13, 11], + [16, 14], + [14, 12], + [11, 12], + [5, 11], + [6, 12], + [5, 6], + [5, 7], + [6, 8], + [7, 9], + [8, 10], + [1, 2], + [0, 1], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + ], + ) + keypoint_labels = ( + [ + "Nose", + "L_Eye", + "R_Eye", + "L_Ear", + "R_Ear", + "L_Shoulder", + "R_Shoulder", + "L_Elbow", + "R_Elbow", + "L_Wrist", + "R_Wrist", + "L_Hip", + "R_Hip", + "L_Knee", + "R_Knee", + "L_Ankle", + "R_Ankle", + ], + ) + config = VitPoseConfig( backbone_config=backbone_config, num_labels=17, use_simple_decoder=use_simple_decoder, + keypoint_edges=keypoint_edges, + 
keypoint_labels=keypoint_labels, ) return config -def rename_key(name, config): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "pos_embed" in name: - name = name.replace("pos_embed", "embeddings.position_embeddings") - if "blocks" in name: - name = name.replace("blocks", "encoder.layer") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "last_norm" in name: - name = name.replace("last_norm", "layernorm") - - # keypoint head - if "keypoint_head" in name and config.use_simple_decoder: - name = name.replace("final_layer.", "") - name = name.replace("keypoint_head", "head.conv") - elif "keypoint_head" in name and not config.use_simple_decoder: - name = name.replace("keypoint_head", "head") - name = name.replace("deconv_layers.0.weight", "deconv1.weight") - name = name.replace("deconv_layers.1.weight", "batchnorm1.weight") - name = name.replace("deconv_layers.1.bias", "batchnorm1.bias") - name = name.replace("deconv_layers.1.running_mean", "batchnorm1.running_mean") - name = name.replace("deconv_layers.1.running_var", "batchnorm1.running_var") - name = name.replace("deconv_layers.1.num_batches_tracked", "batchnorm1.num_batches_tracked") - name = name.replace("deconv_layers.3.weight", "deconv2.weight") - name = name.replace("deconv_layers.4.weight", "batchnorm2.weight") - name = name.replace("deconv_layers.4.bias", "batchnorm2.bias") - name = name.replace("deconv_layers.4.running_mean", "batchnorm2.running_mean") - name = name.replace("deconv_layers.4.running_var", "batchnorm2.running_var") - name = name.replace("deconv_layers.4.num_batches_tracked", "batchnorm2.num_batches_tracked") - - name = name.replace("final_layer.weight", "conv.weight") - name = name.replace("final_layer.bias", "conv.bias") - - return name - - -def convert_state_dict(orig_state_dict, dim, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[2]) +def convert_old_keys_to_new_keys(state_dict, config): + """ + This function should be applied only once, on the concatenated keys to efficiently rename using + the key mappings. 
+ """ + model_state_dict = {} + + output_hypernetworks_qkv_pattern = r".*.qkv.*" + output_hypernetworks_head_pattern = r"keypoint_head.*" + + dim = config.backbone_config.hidden_size + + for key in state_dict.copy().keys(): + value = state_dict.pop(key) + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + if re.match(output_hypernetworks_qkv_pattern, key): + layer_num = int(key.split(".")[3]) if "weight" in key: - orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :] - orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[ + model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.weight"] = value[ + :dim, : + ] + model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.weight"] = value[ dim : dim * 2, : ] - orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.weight"] = value[ + -dim:, : + ] else: - orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim] - orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[ + model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.bias"] = value[:dim] + model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.bias"] = value[ dim : dim * 2 ] - orig_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:] - else: - orig_state_dict[rename_key(key, config)] = val + model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.bias"] = value[-dim:] - return orig_state_dict + if re.match(output_hypernetworks_head_pattern, key): + if config.use_simple_decoder: + key = key.replace("keypoint_head.final_layer", "head.conv") + else: + key = key.replace("keypoint_head", "head") + key = key.replace("deconv_layers.0.weight", "deconv1.weight") + key = key.replace("deconv_layers.1.weight", "batchnorm1.weight") + key = key.replace("deconv_layers.1.bias", "batchnorm1.bias") + key = key.replace("deconv_layers.1.running_mean", "batchnorm1.running_mean") + key = key.replace("deconv_layers.1.running_var", "batchnorm1.running_var") + key = key.replace("deconv_layers.1.num_batches_tracked", "batchnorm1.num_batches_tracked") + key = key.replace("deconv_layers.3.weight", "deconv2.weight") + key = key.replace("deconv_layers.4.weight", "batchnorm2.weight") + key = key.replace("deconv_layers.4.bias", "batchnorm2.bias") + key = key.replace("deconv_layers.4.running_mean", "batchnorm2.running_mean") + key = key.replace("deconv_layers.4.running_var", "batchnorm2.running_var") + key = key.replace("deconv_layers.4.num_batches_tracked", "batchnorm2.num_batches_tracked") + key = key.replace("final_layer.weight", "conv.weight") + key = key.replace("final_layer.bias", "conv.bias") + model_state_dict[key] = value + + return model_state_dict # We will verify our results on a COCO image @@ -175,46 +196,38 @@ def prepare_img(): return image -model_name_to_file_name = { - "vitpose-base-simple": "vitpose-b-simple.pth", - "vitpose-base": "vitpose-b.pth", - "vitpose-base-coco-aic-mpii": "vitpose_base_coco_aic_mpii.pth", - "vitpose+-base": "vitpose+_base.pth", -} - - @torch.no_grad() -def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - 
Copy/paste/tweak model's weights to our VitPose structure. - """ +def write_model(model_path, model_name, push_to_hub): + os.makedirs(model_path, exist_ok=True) + + # ------------------------------------------------------------ + # Vision model params and config + # ------------------------------------------------------------ - # define default VitPose configuration + # params from config config = get_config(model_name) - # load HuggingFace model - model = VitPoseForPoseEstimation(config) - model.eval() + # ------------------------------------------------------------ + # Convert weights + # ------------------------------------------------------------ # load original state_dict - filename = model_name_to_file_name[model_name] + filename = MODEL_TO_FILE_NAME_MAPPING[model_name] + print(f"Fetching all parameters from the checkpoint at {filename}...") + checkpoint_path = hf_hub_download( repo_id="nielsr/vitpose-original-checkpoints", filename=filename, repo_type="model" ) - state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] - # rename some keys - new_state_dict = convert_state_dict(state_dict, dim=config.backbone_config.hidden_size, config=config) - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + print("Converting model...") + state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] + new_state_dict = convert_old_keys_to_new_keys(state_dict, config) - # TODO add associate_heads to the MoE models - if model_name in ["vitpose-base", "vitpose-base-simple"]: - assert missing_keys == [] - assert unexpected_keys == [] - elif model_name == "vitpose-base-coco-aic-mpii": - for key in unexpected_keys: - if key != "backbone.cls_token": - assert "associate_heads" in key + print("Loading the checkpoint in a Vitpose model.") + model = VitPoseForPoseEstimation(config) + model.eval() + model.load_state_dict(new_state_dict, strict=False) + print("Checkpoint loaded successfully.") # create image processor image_processor = VitPoseImageProcessor() @@ -228,10 +241,8 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub original_pixel_values = torch.load(filepath, map_location="cpu")["img"] assert torch.allclose(pixel_values, original_pixel_values, atol=1e-1) - img_metas = torch.load(filepath, map_location="cpu")["img_metas"] dataset_index = torch.tensor([0]) - print("Shape of pixel values:", pixel_values.shape) with torch.no_grad(): # first forward pass outputs = model(pixel_values, dataset_index=dataset_index) @@ -247,63 +258,38 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub ) output_flipped_heatmap = outputs_flipped.heatmaps - output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 + outputs.heatmaps = (output_heatmap + output_flipped_heatmap) * 0.5 # Verify pose_results - pose_results = get_original_pose_results(pixel_values, img_metas, output_heatmap, image_processor) - # This is a list of dictionaries, containing the bounding box and keypoints per detected person - assert torch.allclose( - torch.from_numpy(pose_results[0]["bbox"]).float(), torch.tensor([412.8, 157.61, 464.85, 294.62]) - ) - assert torch.allclose( - torch.from_numpy(pose_results[1]["bbox"]).float(), torch.tensor([384.43, 172.21, 398.55, 206.95]) - ) + pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] if model_name == "vitpose-base-simple": assert torch.allclose( - torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + pose_results[1]["keypoints"][0, 
:3], torch.tensor([3.98180511e02, 1.81808380e02, 8.66642594e-01]), atol=5e-2, ) elif model_name == "vitpose-base": assert torch.allclose( - torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + pose_results[1]["keypoints"][0, :3], torch.tensor([3.9807913e02, 1.8182812e02, 8.8235235e-01]), atol=5e-2, ) elif model_name == "vitpose-base-coco-aic-mpii": assert torch.allclose( - torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + pose_results[1]["keypoints"][0, :3], torch.tensor([3.98305542e02, 1.81741592e02, 8.69966745e-01]), atol=5e-2, ) elif model_name == "vitpose+-base": assert torch.allclose( - torch.from_numpy(pose_results[1]["keypoints"][0, :3]), + pose_results[1]["keypoints"][0, :3], torch.tensor([3.98201294e02, 1.81728302e02, 8.75046968e-01]), atol=5e-2, ) else: raise ValueError("Model not supported") - print("Looks ok!") - - # test post_process_pose_estimation - # results are slightly different due to no flip augmentation - hf_pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes[0]) - if model_name == "vitpose-base-simple": - assert torch.allclose( - torch.tensor(hf_pose_results[1]["keypoints"][0, :3]), - torch.tensor([3.9813846e02, 1.8180725e02, 8.7446749e-01]), - atol=5e-2, - ) - assert hf_pose_results[0]["keypoints"].shape == (17, 3) - assert hf_pose_results[1]["keypoints"].shape == (17, 3) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) + print("Conversion successfully done.") if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") @@ -311,13 +297,13 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub image_processor.push_to_hub(f"nielsr/{model_name}") -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--model_name", default="vitpose-base-simple", - choices=model_name_to_file_name.keys(), + choices=MODEL_TO_FILE_NAME_MAPPING.keys(), type=str, help="Name of the VitPose model you'd like to convert.", ) @@ -329,4 +315,8 @@ def convert_vitpose_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub ) args = parser.parse_args() - convert_vitpose_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + write_model(model_path=args.pytorch_dump_folder_path, model_name=args.model_name, push_to_hub=args.push_to_hub) + + +if __name__ == "__main__": + main() From 8738973724dd9df1a677ce8e8c2831863e4da787 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Wed, 6 Nov 2024 21:45:26 +0900 Subject: [PATCH 147/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: Pavel Iakubovskii --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 4b51807d0d75..5e5259b83636 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -114,7 +114,7 @@ def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: return bboxes -def get_keypoint_predictions(heatmaps: np.ndarray) -> tuple: +def 
get_keypoint_predictions(heatmaps: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Get keypoint predictions from score maps. Args: From 75b268f2cf78dc65a9512a9f5bfe5faaad77c363 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Wed, 6 Nov 2024 22:13:45 +0900 Subject: [PATCH 148/181] Update src/transformers/models/vitpose/image_processing_vitpose.py Co-authored-by: Pavel Iakubovskii --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 5e5259b83636..7be1521c9089 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -147,7 +147,7 @@ def get_keypoint_predictions(heatmaps: np.ndarray) -> Tuple[np.ndarray, np.ndarr return preds, scores -def post_dark_unbiased_data_processing(coords: np.ndarray, batch_heatmaps: np.ndarray, kernel: int = 3): +def post_dark_unbiased_data_processing(coords: np.ndarray, batch_heatmaps: np.ndarray, kernel: int = 3) -> np.ndarray: """DARK post-pocessing. Implemented by unbiased_data_processing. Paper references: From 5a1f6a33f7aa45444ebc7d8235a8306e186b02b5 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 9 Nov 2024 05:29:38 +0000 Subject: [PATCH 149/181] refactor convert --- docs/source/en/model_doc/vitpose.md | 18 +- .../models/vitpose/convert_vitpose_to_hf.py | 168 +++++++++--------- 2 files changed, 87 insertions(+), 99 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 31d9525739ad..d90268771146 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -40,7 +40,7 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo >>> outputs = model(pixel_values, dataset_index) ``` -- ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt-detr), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. +- ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. 
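The image processor expects person boxes in COCO format (top-left x, top-left y, width, height), while detectors such as RT-DETR return Pascal VOC corners (x1, y1, x2, y2); the example below defines a `pascal_voc_to_coco` helper for exactly this. A minimal standalone sketch of the same conversion, using a hypothetical box:

```py
import numpy as np

# hypothetical detector output in Pascal VOC format: (x1, y1, x2, y2)
voc_boxes = np.array([[412.8, 157.61, 464.85, 294.62]])

# COCO format keeps the top-left corner and stores width/height instead of the bottom-right corner
coco_boxes = voc_boxes.copy()
coco_boxes[:, 2:4] = coco_boxes[:, 2:4] - coco_boxes[:, 0:2]
# coco_boxes is now approximately [[412.8, 157.61, 52.05, 137.01]]
```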
```py import math @@ -117,20 +117,6 @@ for pose_result in pose_results: x, y, score = keypoint print(f"coordinate : [{x}, {y}], score : {score}") -def draw_points(image, keypoints, keypoint_colors, keypoint_score_threshold, radius, show_keypoint_weight): - if keypoint_colors is not None: - assert len(keypoint_colors) == len(keypoints) - for id, keypoint in enumerate(keypoints): - x_coord, y_coord, keypoint_score = int(keypoint[0]), int(keypoint[1]), keypoint[2] - if keypoint_score > keypoint_score_threshold: - color = tuple(int(c) for c in keypoint_colors[id]) - if show_keypoint_weight: - cv2.circle(image, (x_coord, y_coord), radius, color, -1) - transparency = max(0, min(1, keypoint_score)) - cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) - else: - cv2.circle(image, (x_coord, y_coord), radius, color, -1) - def draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): height, width, _ = image.shape if keypoint_edges is not None and link_colors is not None: @@ -216,7 +202,7 @@ def visualize_keypoints( return image # Note: keypoint_edges and color palette are dataset-specific -keypoint_edges = config.keypoint_edges +keypoint_edges = config.edges palette = np.array( [ diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 06cc8c196295..bf056d98ce8f 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -29,7 +29,7 @@ from transformers import VitPoseBackboneConfig, VitPoseConfig, VitPoseForPoseEstimation, VitPoseImageProcessor -KEYS_TO_MODIFY_MAPPING = { +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { r"patch_embed.proj": "embeddings.patch_embeddings.projection", r"pos_embed": "embeddings.position_embeddings", r"blocks": "encoder.layer", @@ -38,6 +38,8 @@ r"norm1": "layernorm_before", r"norm2": "layernorm_after", r"last_norm": "layernorm", + r"keypoint_head": "head", + r"final_layer": "conv", } MODEL_TO_FILE_NAME_MAPPING = { @@ -72,7 +74,7 @@ def get_config(model_name): use_simple_decoder = "simple" in model_name - keypoint_edges = ( + edges = ( [ [15, 13], [13, 11], @@ -95,98 +97,56 @@ def get_config(model_name): [4, 6], ], ) - keypoint_labels = ( - [ - "Nose", - "L_Eye", - "R_Eye", - "L_Ear", - "R_Ear", - "L_Shoulder", - "R_Shoulder", - "L_Elbow", - "R_Elbow", - "L_Wrist", - "R_Wrist", - "L_Hip", - "R_Hip", - "L_Knee", - "R_Knee", - "L_Ankle", - "R_Ankle", - ], - ) + id2label = { + 0: "Nose", + 1: "L_Eye", + 2: "R_Eye", + 3: "L_Ear", + 4: "R_Ear", + 5: "L_Shoulder", + 6: "R_Shoulder", + 7: "L_Elbow", + 8: "R_Elbow", + 9: "L_Wrist", + 10: "R_Wrist", + 11: "L_Hip", + 12: "R_Hip", + 13: "L_Knee", + 14: "R_Knee", + 15: "L_Ankle", + 16: "R_Ankle", + } + + label2id = {v: k for k, v in id2label.items()} config = VitPoseConfig( backbone_config=backbone_config, num_labels=17, use_simple_decoder=use_simple_decoder, - keypoint_edges=keypoint_edges, - keypoint_labels=keypoint_labels, + edges=edges, + id2label=id2label, + label2id=label2id, ) return config -def convert_old_keys_to_new_keys(state_dict, config): +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): """ This function should be applied only once, on the concatenated keys to efficiently rename using the key mappings. 
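    For example, with the mapping above, "blocks.0.attn.proj.weight" is renamed to
    "encoder.layer.0.attention.output.dense.weight" and "keypoint_head.final_layer.bias" to
    "head.conv.bias"; the fused qkv weights and the classic-decoder deconv/batchnorm keys are
    further split and renamed in `write_model` below.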
""" - model_state_dict = {} - - output_hypernetworks_qkv_pattern = r".*.qkv.*" - output_hypernetworks_head_pattern = r"keypoint_head.*" - - dim = config.backbone_config.hidden_size - - for key in state_dict.copy().keys(): - value = state_dict.pop(key) - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - if re.match(output_hypernetworks_qkv_pattern, key): - layer_num = int(key.split(".")[3]) - if "weight" in key: - model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.weight"] = value[ - :dim, : - ] - model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.weight"] = value[ - dim : dim * 2, : - ] - model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.weight"] = value[ - -dim:, : - ] - else: - model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.query.bias"] = value[:dim] - model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.key.bias"] = value[ - dim : dim * 2 - ] - model_state_dict[f"backbone.encoder.layer.{layer_num}.attention.attention.value.bias"] = value[-dim:] - - if re.match(output_hypernetworks_head_pattern, key): - if config.use_simple_decoder: - key = key.replace("keypoint_head.final_layer", "head.conv") - else: - key = key.replace("keypoint_head", "head") - key = key.replace("deconv_layers.0.weight", "deconv1.weight") - key = key.replace("deconv_layers.1.weight", "batchnorm1.weight") - key = key.replace("deconv_layers.1.bias", "batchnorm1.bias") - key = key.replace("deconv_layers.1.running_mean", "batchnorm1.running_mean") - key = key.replace("deconv_layers.1.running_var", "batchnorm1.running_var") - key = key.replace("deconv_layers.1.num_batches_tracked", "batchnorm1.num_batches_tracked") - key = key.replace("deconv_layers.3.weight", "deconv2.weight") - key = key.replace("deconv_layers.4.weight", "batchnorm2.weight") - key = key.replace("deconv_layers.4.bias", "batchnorm2.bias") - key = key.replace("deconv_layers.4.running_mean", "batchnorm2.running_mean") - key = key.replace("deconv_layers.4.running_var", "batchnorm2.running_var") - key = key.replace("deconv_layers.4.num_batches_tracked", "batchnorm2.num_batches_tracked") - key = key.replace("final_layer.weight", "conv.weight") - key = key.replace("final_layer.bias", "conv.bias") - model_state_dict[key] = value - - return model_state_dict + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict # We will verify our results on a COCO image @@ -220,13 +180,55 @@ def write_model(model_path, model_name, push_to_hub): ) print("Converting model...") - state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] - new_state_dict = convert_old_keys_to_new_keys(state_dict, config) + original_state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] + all_keys = list(original_state_dict.keys()) + new_keys = convert_old_keys_to_new_keys(all_keys) + + dim = config.backbone_config.hidden_size + + state_dict = {} + for key in all_keys: + new_key = new_keys[key] + value = original_state_dict[key] + + if re.search("qkv", new_key): + if "weight" in 
new_key: + state_dict[new_key.replace("self.qkv", "attention.query")] = value[:dim, :] + state_dict[new_key.replace("self.qkv", "attention.key")] = value[dim : dim * 2, :] + state_dict[new_key.replace("self.qkv", "attention.value")] = value[-dim:, :] + else: + state_dict[new_key.replace("self.qkv", "attention.query")] = value[:dim] + state_dict[new_key.replace("self.qkv", "attention.key")] = value[dim : dim * 2] + state_dict[new_key.replace("self.qkv", "attention.value")] = value[-dim:] + + elif re.search("head", new_key) and not config.use_simple_decoder: + # Pattern for deconvolution layers + print(new_key) + deconv_pattern = r"deconv_layers\.(0|3)\.weight" + new_key = re.sub(deconv_pattern, lambda m: f"deconv{int(m.group(1))//3 + 1}.weight", new_key) + # Pattern for batch normalization layers + bn_patterns = [ + (r"deconv_layers\.(\d+)\.weight", r"batchnorm\1.weight"), + (r"deconv_layers\.(\d+)\.bias", r"batchnorm\1.bias"), + (r"deconv_layers\.(\d+)\.running_mean", r"batchnorm\1.running_mean"), + (r"deconv_layers\.(\d+)\.running_var", r"batchnorm\1.running_var"), + (r"deconv_layers\.(\d+)\.num_batches_tracked", r"batchnorm\1.num_batches_tracked"), + ] + + for pattern, replacement in bn_patterns: + if re.search(pattern, new_key): + # Convert the layer number to the correct batch norm index + layer_num = int(re.search(pattern, key).group(1)) + bn_num = layer_num // 3 + 1 + new_key = re.sub(pattern, replacement.replace(r"\1", str(bn_num)), new_key) + state_dict[new_key] = value + else: + state_dict[new_key] = value print("Loading the checkpoint in a Vitpose model.") model = VitPoseForPoseEstimation(config) model.eval() - model.load_state_dict(new_state_dict, strict=False) + model.load_state_dict(state_dict, strict=False) print("Checkpoint loaded successfully.") # create image processor From 3bfc2191fa39d1c1d9bae9b24bb2ef0538d66b8d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sat, 9 Nov 2024 06:01:17 +0000 Subject: [PATCH 150/181] add supervision --- docs/source/en/model_doc/vitpose.md | 108 +++++++++++----------------- 1 file changed, 43 insertions(+), 65 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index d90268771146..498b641b3a3f 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -63,8 +63,7 @@ from transformers import ( url = "http://images.cocodataset.org/val2017/000000000139.jpg" image = Image.open(requests.get(url, stream=True).raw) -# Stage 1. Run Object Detector -# User can replace this object_detector part +# Stage 1. Run Object Detector (User can replace this object_detector part) person_image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") inputs = person_image_processor(images=image, return_tensors="pt") @@ -95,16 +94,15 @@ def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray: return bboxes -# 0 index indicates human label in COCO +# Human label refers 0 index in COCO dataset boxes = results[0]["boxes"][results[0]["labels"] == 0] boxes = [pascal_voc_to_coco(boxes.cpu().numpy())] +# Stage 2. Run ViTPose +config = VitPoseConfig() image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") -config = VitPoseConfig() - -# Stage 2. 
Run ViTPose pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values with torch.no_grad(): @@ -117,6 +115,35 @@ for pose_result in pose_results: x, y, score = keypoint print(f"coordinate : [{x}, {y}], score : {score}") +# Visualization for supervision user +import supervision as sv + +key_points = sv.KeyPoints(xy=torch.cat([pose_result['keypoints'].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()) + +edge_annotator = sv.EdgeAnnotator( + color=sv.Color.GREEN, + thickness=5 +) +annotated_frame = edge_annotator.annotate( + scene=image.copy(), + key_points=key_points +) + +# Visualization for advanced user +def draw_points(image, keypoints, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight): + if pose_keypoint_color is not None: + assert len(pose_keypoint_color) == len(keypoints) + for kid, kpt in enumerate(keypoints): + x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] + if kpt_score > keypoint_score_threshold: + color = tuple(int(c) for c in pose_keypoint_color[kid]) + if show_keypoint_weight: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) + else: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + def draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): height, width, _ = image.shape if keypoint_edges is not None and link_colors is not None: @@ -153,53 +180,6 @@ def draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_thr else: cv2.line(image, pos1, pos2, color, thickness=thickness) -def visualize_keypoints( - image, - pose_result, - keypoint_edges=None, - keypoint_score_threshold=0.3, - keypoint_colors=None, - link_colors=None, - radius=4, - thickness=1, - show_keypoint_weight=False, -): - """Draw keypoints and links on an image. - - Args: - image (`numpy.ndarray`): - The image to draw poses on. It will be modified in-place. - pose_result (`List[numpy.ndarray]`): - The poses to draw. Each element is a set of K keypoints as a Kx3 numpy.ndarray, where each keypoint - is represented as x, y, score. - keypoint_edges (`List[tuple]`, *optional*): - Mapping index of the keypoint_edges links. - keypoint_score_threshold (`float`, *optional*, defaults to 0.3): - Minimum score of keypoints to be shown. - keypoint_colors (`numpy.ndarray`, *optional*): - Color of N keypoints. If None, the keypoints will not be drawn. - link_colors (`numpy.ndarray`, *optional*): - Color of M links. If None, the links will not be drawn. - radius (`int`, *optional*, defaults to 4): - Radius of keypoint circles. - thickness (`int`, *optional*, defaults to 1): - Thickness of lines. - show_keypoint_weight (`bool`, *optional*, defaults to False): - Whether to adjust keypoint and link visibility based on the keypoint scores. - - Returns: - `numpy.ndarray`: Image with drawn keypoints and links. 
- """ - for keypoints in pose_result: - keypoints = np.array(keypoints, copy=False) - - # draw each point on image - draw_points(image, keypoints, keypoint_colors, keypoint_score_threshold, radius, show_keypoint_weight) - - # draw links - draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight) - - return image # Note: keypoint_edges and color palette are dataset-specific keypoint_edges = config.edges @@ -233,20 +213,18 @@ link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 1 keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] pose_results = [result["keypoints"] for result in pose_results] +numpy_image = np.array(image) -result = visualize_keypoints( - np.array(image), - pose_result, - keypoint_edges=keypoint_edges, - keypoint_score_threshold=0.3, - keypoint_colors=keypoint_colors, - link_colors=link_colors, - radius=4, - thickness=1, - show_keypoint_weight=False, -) +for keypoints in pose_result: + keypoints = np.array(keypoints, copy=False) + + # draw each point on image + draw_points(numpy_image, keypoints, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False) + + # draw links + draw_links(numpy_image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False) -pose_image = Image.fromarray(result) +pose_image = Image.fromarray(numpy_image) pose_image ``` drawing From dae04e8589bd1d1512d0b65cb93aa642ad34f65e Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Sat, 16 Nov 2024 11:17:59 +0900 Subject: [PATCH 151/181] Update src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py Co-authored-by: Pavel Iakubovskii --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index ed932c88ced6..b12ffc3ee809 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -243,7 +243,7 @@ def __init__(self, config: VitPoseBackboneConfig): experts = [nn.Linear(hidden_features, part_features) for _ in range(num_experts)] self.experts = nn.ModuleList(experts) - def forward(self, hidden_state, indices): + def forward(self, hidden_state: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: expert_hidden_state = torch.zeros_like(hidden_state[:, :, -self.part_features :]) hidden_state = self.fc1(hidden_state) From 0747e80f36636930bc0888f45dc68a1a27d61aa5 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 18 Nov 2024 05:49:45 +0000 Subject: [PATCH 152/181] remove reduntant def --- .../vitpose_backbone/test_modeling_vitpose_backbone.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py index 7d527149cad7..6120b444687f 100644 --- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py +++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py @@ -188,12 +188,6 @@ def test_forward_signature(self): self.assertListEqual(arg_names[:1], expected_arg_names) -# We will verify our results on an image of cute cats -def prepare_img(): - image = 
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - @require_torch class VitPoseBackboneTest(unittest.TestCase, BackboneTesterMixin): all_model_classes = (VitPoseBackbone,) if is_torch_available() else () From 5207b570699b195be0980716834dbd5cad447129 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 18 Nov 2024 05:52:51 +0000 Subject: [PATCH 153/181] seperate code block for visualization --- docs/source/en/model_doc/vitpose.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 498b641b3a3f..0b7e50512e1b 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -114,8 +114,11 @@ for pose_result in pose_results: for keypoint in pose_result["keypoints"]: x, y, score = keypoint print(f"coordinate : [{x}, {y}], score : {score}") +``` + -# Visualization for supervision user +### Visualization for supervision user +```py import supervision as sv key_points = sv.KeyPoints(xy=torch.cat([pose_result['keypoints'].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()) @@ -128,8 +131,10 @@ annotated_frame = edge_annotator.annotate( scene=image.copy(), key_points=key_points ) +``` -# Visualization for advanced user +### Visualization for advanced user +```py def draw_points(image, keypoints, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight): if pose_keypoint_color is not None: assert len(pose_keypoint_color) == len(keypoints) From 96b4da91bdcc0835206dc1dd46489ff6c453b2f0 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 18 Nov 2024 06:03:51 +0000 Subject: [PATCH 154/181] add validation for num_moe --- .../models/vitpose/convert_vitpose_to_hf.py | 55 +++++++++---------- .../modeling_vitpose_backbone.py | 17 +++++- .../test_modeling_vitpose_backbone.py | 2 +- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index bf056d98ce8f..0b019e37201e 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -191,44 +191,39 @@ def write_model(model_path, model_name, push_to_hub): new_key = new_keys[key] value = original_state_dict[key] - if re.search("qkv", new_key): - if "weight" in new_key: - state_dict[new_key.replace("self.qkv", "attention.query")] = value[:dim, :] - state_dict[new_key.replace("self.qkv", "attention.key")] = value[dim : dim * 2, :] - state_dict[new_key.replace("self.qkv", "attention.value")] = value[-dim:, :] - else: + if re.search("qkv", new_key): state_dict[new_key.replace("self.qkv", "attention.query")] = value[:dim] state_dict[new_key.replace("self.qkv", "attention.key")] = value[dim : dim * 2] state_dict[new_key.replace("self.qkv", "attention.value")] = value[-dim:] - elif re.search("head", new_key) and not config.use_simple_decoder: - # Pattern for deconvolution layers - print(new_key) - deconv_pattern = r"deconv_layers\.(0|3)\.weight" - new_key = re.sub(deconv_pattern, lambda m: f"deconv{int(m.group(1))//3 + 1}.weight", new_key) - # Pattern for batch normalization layers - bn_patterns = [ - (r"deconv_layers\.(\d+)\.weight", r"batchnorm\1.weight"), - (r"deconv_layers\.(\d+)\.bias", r"batchnorm\1.bias"), - (r"deconv_layers\.(\d+)\.running_mean", r"batchnorm\1.running_mean"), - (r"deconv_layers\.(\d+)\.running_var", r"batchnorm\1.running_var"), - 
(r"deconv_layers\.(\d+)\.num_batches_tracked", r"batchnorm\1.num_batches_tracked"), - ] - - for pattern, replacement in bn_patterns: - if re.search(pattern, new_key): - # Convert the layer number to the correct batch norm index - layer_num = int(re.search(pattern, key).group(1)) - bn_num = layer_num // 3 + 1 - new_key = re.sub(pattern, replacement.replace(r"\1", str(bn_num)), new_key) - state_dict[new_key] = value - else: - state_dict[new_key] = value + elif re.search("head", new_key) and not config.use_simple_decoder: + # Pattern for deconvolution layers + print(new_key) + deconv_pattern = r"deconv_layers\.(0|3)\.weight" + new_key = re.sub(deconv_pattern, lambda m: f"deconv{int(m.group(1))//3 + 1}.weight", new_key) + # Pattern for batch normalization layers + bn_patterns = [ + (r"deconv_layers\.(\d+)\.weight", r"batchnorm\1.weight"), + (r"deconv_layers\.(\d+)\.bias", r"batchnorm\1.bias"), + (r"deconv_layers\.(\d+)\.running_mean", r"batchnorm\1.running_mean"), + (r"deconv_layers\.(\d+)\.running_var", r"batchnorm\1.running_var"), + (r"deconv_layers\.(\d+)\.num_batches_tracked", r"batchnorm\1.num_batches_tracked"), + ] + + for pattern, replacement in bn_patterns: + if re.search(pattern, new_key): + # Convert the layer number to the correct batch norm index + layer_num = int(re.search(pattern, key).group(1)) + bn_num = layer_num // 3 + 1 + new_key = re.sub(pattern, replacement.replace(r"\1", str(bn_num)), new_key) + state_dict[new_key] = value + else: + state_dict[new_key] = value print("Loading the checkpoint in a Vitpose model.") model = VitPoseForPoseEstimation(config) model.eval() - model.load_state_dict(state_dict, strict=False) + model.load_state_dict(state_dict) print("Checkpoint loaded successfully.") # create image processor diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index b12ffc3ee809..16ebaebb0679 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -271,7 +271,7 @@ def __init__(self, config: VitPoseBackboneConfig) -> None: self.activation = ACT2FN[config.hidden_act] self.fc2 = nn.Linear(hidden_features, out_features, bias=True) - def forward(self, hidden_state: torch.Tensor, indices=None) -> torch.Tensor: + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: hidden_state = self.fc1(hidden_state) hidden_state = self.activation(hidden_state) hidden_state = self.fc2(hidden_state) @@ -281,8 +281,9 @@ def forward(self, hidden_state: torch.Tensor, indices=None) -> torch.Tensor: class VitPoseBackboneLayer(nn.Module): def __init__(self, config: VitPoseBackboneConfig) -> None: super().__init__() + self.num_experts = config.num_experts self.attention = VitPoseBackboneAttention(config) - self.mlp = VitPoseBackboneMLP(config) if config.num_experts == 1 else VitPoseBackboneMoeMLP(config) + self.mlp = VitPoseBackboneMLP(config) if self.num_experts == 1 else VitPoseBackboneMoeMLP(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -293,6 +294,13 @@ def forward( head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + # Validate dataset_index when using multiple experts + if self.num_experts > 1 and dataset_index is None: + raise ValueError( + "dataset_index 
must be provided when using multiple experts " + f"(num_experts={self.num_experts}). Please provide dataset_index " + "to the forward pass." + ) self_attention_outputs = self.attention( self.layernorm_before(hidden_states), # in VitPoseBackbone, layernorm is applied before self-attention head_mask, @@ -305,7 +313,10 @@ def forward( hidden_states = attention_output + hidden_states layer_output = self.layernorm_after(hidden_states) - layer_output = self.mlp(layer_output, indices=dataset_index) + if self.num_experts == 1: + layer_output = self.mlp(layer_output) + else: + layer_output = self.mlp(layer_output, indices=dataset_index) # second residual connection layer_output = layer_output + hidden_states diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py index 6120b444687f..c32a57e9f298 100644 --- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py +++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py @@ -31,7 +31,7 @@ if is_vision_available(): - from PIL import Image + pass class VitPoseBackboneModelTester: From d4aa3ee670076d17165ca976eb527cc53d11c203 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 11:04:16 +0000 Subject: [PATCH 155/181] final commit --- docs/source/en/model_doc/vitpose.md | 36 ++++---- .../models/vitpose/convert_vitpose_to_hf.py | 84 +++++++++++-------- .../vitpose/image_processing_vitpose.py | 21 +++-- 3 files changed, 80 insertions(+), 61 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 0b7e50512e1b..3e7794b43ff1 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -43,9 +43,6 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo - ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. 
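Each element returned by `post_process_pose_estimation` in the example below is a dictionary holding the predicted `"keypoints"`, their per-keypoint `"scores"` and the matching `"bbox"`; low-confidence keypoints can also be filtered by passing a `threshold` to `post_process_pose_estimation`. A minimal sketch of consuming one entry, assuming the `pose_results` produced by the pipeline that follows:

```py
# illustrative only: inspect the dictionaries that post_process_pose_estimation returns below
for person in pose_results:
    keypoints = person["keypoints"]  # per-keypoint (x, y) coordinates in the original image
    scores = person["scores"]        # per-keypoint confidence scores
    box = person["bbox"]             # the corresponding person box
    print(keypoints.shape, scores.shape, box)
```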
```py -import math - -import cv2 import numpy as np import requests import torch @@ -111,9 +108,7 @@ with torch.no_grad(): pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] for pose_result in pose_results: - for keypoint in pose_result["keypoints"]: - x, y, score = keypoint - print(f"coordinate : [{x}, {y}], score : {score}") + print(pose_result) ``` @@ -135,11 +130,14 @@ annotated_frame = edge_annotator.annotate( ### Visualization for advanced user ```py -def draw_points(image, keypoints, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight): +import math +import cv2 + +def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight): if pose_keypoint_color is not None: assert len(pose_keypoint_color) == len(keypoints) - for kid, kpt in enumerate(keypoints): - x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] + for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)): + x_coord, y_coord = int(kpt[0]), int(kpt[1]) if kpt_score > keypoint_score_threshold: color = tuple(int(c) for c in pose_keypoint_color[kid]) if show_keypoint_weight: @@ -149,13 +147,13 @@ def draw_points(image, keypoints, pose_keypoint_color, keypoint_score_threshold, else: cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) -def draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): +def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): height, width, _ = image.shape if keypoint_edges is not None and link_colors is not None: assert len(link_colors) == len(keypoint_edges) for sk_id, sk in enumerate(keypoint_edges): - x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), keypoints[sk[0], 2]) - x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), keypoints[sk[1], 2]) + x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]]) + x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]]) if ( x1 > 0 and x1 < width @@ -183,11 +181,11 @@ def draw_links(image, keypoints, keypoint_edges, link_colors, keypoint_score_thr transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2]))) cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) else: - cv2.line(image, pos1, pos2, color, thickness=thickness) + cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness) # Note: keypoint_edges and color palette are dataset-specific -keypoint_edges = config.edges +keypoint_edges = model.config.edges palette = np.array( [ @@ -217,17 +215,17 @@ palette = np.array( link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]] keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] -pose_results = [result["keypoints"] for result in pose_results] numpy_image = np.array(image) -for keypoints in pose_result: - keypoints = np.array(keypoints, copy=False) +for pose_result in pose_results: + scores = np.array(pose_result["scores"]) + keypoints = np.array(pose_result["keypoints"]) # draw each point on image - draw_points(numpy_image, keypoints, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False) + draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False) # draw 
links - draw_links(numpy_image, keypoints, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False) + draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False) pose_image = Image.fromarray(numpy_image) pose_image diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 0b019e37201e..30e068475eda 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -74,29 +74,27 @@ def get_config(model_name): use_simple_decoder = "simple" in model_name - edges = ( - [ - [15, 13], - [13, 11], - [16, 14], - [14, 12], - [11, 12], - [5, 11], - [6, 12], - [5, 6], - [5, 7], - [6, 8], - [7, 9], - [8, 10], - [1, 2], - [0, 1], - [0, 2], - [1, 3], - [2, 4], - [3, 5], - [4, 6], - ], - ) + edges = [ + [15, 13], + [13, 11], + [16, 14], + [14, 12], + [11, 12], + [5, 11], + [6, 12], + [5, 6], + [5, 7], + [6, 8], + [7, 9], + [8, 10], + [1, 2], + [0, 1], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + ] id2label = { 0: "Nose", 1: "L_Eye", @@ -262,26 +260,46 @@ def write_model(model_path, model_name, push_to_hub): if model_name == "vitpose-base-simple": assert torch.allclose( - pose_results[1]["keypoints"][0, :3], - torch.tensor([3.98180511e02, 1.81808380e02, 8.66642594e-01]), + pose_results[1]["keypoints"][0], + torch.tensor([3.98180511e02, 1.81808380e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.66642594e-01]), atol=5e-2, ) elif model_name == "vitpose-base": assert torch.allclose( - pose_results[1]["keypoints"][0, :3], - torch.tensor([3.9807913e02, 1.8182812e02, 8.8235235e-01]), + pose_results[1]["keypoints"][0], + torch.tensor([3.9807913e02, 1.8182812e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.8235235e-01]), atol=5e-2, ) elif model_name == "vitpose-base-coco-aic-mpii": assert torch.allclose( - pose_results[1]["keypoints"][0, :3], - torch.tensor([3.98305542e02, 1.81741592e02, 8.69966745e-01]), + pose_results[1]["keypoints"][0], + torch.tensor([3.98305542e02, 1.81741592e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.69966745e-01]), atol=5e-2, ) elif model_name == "vitpose+-base": assert torch.allclose( - pose_results[1]["keypoints"][0, :3], - torch.tensor([3.98201294e02, 1.81728302e02, 8.75046968e-01]), + pose_results[1]["keypoints"][0], + torch.tensor([3.98201294e02, 1.81728302e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.75046968e-01]), atol=5e-2, ) else: @@ -290,8 +308,8 @@ def write_model(model_path, model_name, push_to_hub): if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") - model.push_to_hub(f"nielsr/{model_name}") - image_processor.push_to_hub(f"nielsr/{model_name}") + model.push_to_hub(f"danelcsb/{model_name}") + image_processor.push_to_hub(f"danelcsb/{model_name}") def main(): diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 7be1521c9089..2e1e194332c0 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -595,6 +595,7 @@ def post_process_pose_estimation( outputs: torch.Tensor, boxes: 
Union[List[List[List[float]]], np.ndarray], kernel_size: int = 11, + threshold: float = None, target_sizes: Union[TensorType, List[Tuple]] = None, ): """ @@ -608,6 +609,8 @@ def post_process_pose_estimation( box coordinates in COCO format (top_left_x, top_left_y, width, height). kernel_size (`int`, *optional*, defaults to 11): Gaussian kernel size (K) for modulation. + threshold (`float`, *optional*, defaults to None): + Score threshold to keep object detection predictions. target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size `(height, width)` of each image in the batch. If unset, predictions will be resize with the default value. @@ -642,27 +645,27 @@ def post_process_pose_estimation( outputs.heatmaps.cpu().numpy(), centers, scales, kernel=kernel_size ) - all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) - all_boxes = np.zeros((batch_size, 6), dtype=np.float32) - all_preds[:, :, 0:2] = preds[:, :, 0:2] - all_preds[:, :, 2:3] = scores + all_boxes = np.zeros((batch_size, 4), dtype=np.float32) all_boxes[:, 0:2] = centers[:, 0:2] all_boxes[:, 2:4] = scales[:, 0:2] - all_boxes[:, 4] = np.prod(scales * self.normalize_factor, axis=1) - poses = torch.Tensor(all_preds) + poses = torch.Tensor(preds) + scores = torch.Tensor(scores) bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) results: List[List[Dict[str, torch.Tensor]]] = [] - pose_bbox_pairs = zip(poses, bboxes_xyxy) + pose_bbox_pairs = zip(poses, scores, bboxes_xyxy) for batch_bbox in boxes: batch_results: List[Dict[str, torch.Tensor]] = [] for _ in batch_bbox: # Unpack the next pose and bbox_xyxy from the iterator - pose, bbox_xyxy = next(pose_bbox_pairs) - pose_result = {"keypoints": pose, "bbox": bbox_xyxy} + pose, score, bbox_xyxy = next(pose_bbox_pairs) + if threshold is not None: + score_condition = (score > threshold).squeeze(1) + pose, score = pose[score_condition], score[score_condition] + pose_result = {"keypoints": pose, "scores": score, "bbox": bbox_xyxy} batch_results.append(pose_result) results.append(batch_results) From a9a3645406e8918796f89cbb67fdcbb19f142741 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 11:18:56 +0000 Subject: [PATCH 156/181] add labels --- .../models/vitpose/image_processing_vitpose.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 2e1e194332c0..7a0c2d1ad120 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -620,7 +620,7 @@ def post_process_pose_estimation( """ # First compute centers and scales for each bounding box - batch_size = len(outputs.heatmaps) + batch_size, num_keypoints, _, _ = outputs.heatmaps.shape if target_sizes is not None: if batch_size != len(target_sizes): @@ -651,6 +651,7 @@ def post_process_pose_estimation( poses = torch.Tensor(preds) scores = torch.Tensor(scores) + labels = torch.range(0, num_keypoints - 1) bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) results: List[List[Dict[str, torch.Tensor]]] = [] @@ -664,8 +665,8 @@ def post_process_pose_estimation( pose, score, bbox_xyxy = next(pose_bbox_pairs) if threshold is not None: score_condition = (score > threshold).squeeze(1) - pose, score = pose[score_condition], score[score_condition] - pose_result = {"keypoints": pose, 
"scores": score, "bbox": bbox_xyxy} + pose, score, labels = pose[score_condition], score[score_condition], labels[score_condition] + pose_result = {"keypoints": pose, "scores": score, "labels": labels, "bbox": bbox_xyxy} batch_results.append(pose_result) results.append(batch_results) From d8e6e2ecd343fadbf1ed19501cf738e5b774dd1b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 11:20:40 +0000 Subject: [PATCH 157/181] [run-slow] vitpose, vitpose_backbone From e588f4ff30ea33a9eec84d8ca5a23c9fa5b09ef2 Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:06:25 +0900 Subject: [PATCH 158/181] Update src/transformers/models/vitpose/convert_vitpose_to_hf.py Co-authored-by: Pavel Iakubovskii --- src/transformers/models/vitpose/convert_vitpose_to_hf.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 30e068475eda..efff83bffb70 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -306,6 +306,10 @@ def write_model(model_path, model_name, push_to_hub): raise ValueError("Model not supported") print("Conversion successfully done.") + # save the model to a local directory + model.save_pretrained(model_path) + image_processor.save_pretrained(model_path) + if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") model.push_to_hub(f"danelcsb/{model_name}") From 78fe1b9c52d659facefdb9e06518ba45e9298ca4 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 13:49:15 +0000 Subject: [PATCH 159/181] enable all conversion --- .../models/vitpose/convert_vitpose_to_hf.py | 16 +++++++++------- tests/models/vitpose/test_modeling_vitpose.py | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 30e068475eda..30c35515283b 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -46,13 +46,13 @@ "vitpose-base-simple": "vitpose-b-simple.pth", "vitpose-base": "vitpose-b.pth", "vitpose-base-coco-aic-mpii": "vitpose_base_coco_aic_mpii.pth", - "vitpose+-base": "vitpose+_base.pth", + "vitpose-plus-base": "vitpose+_base.pth", } def get_config(model_name): - num_experts = 6 if "+" in model_name else 1 - part_features = 192 if "+" in model_name else 0 + num_experts = 6 if "plus" in model_name else 1 + part_features = 192 if "plus" in model_name else 0 backbone_config = VitPoseBackboneConfig(out_indices=[12], num_experts=num_experts, part_features=part_features) # size of the architecture @@ -189,14 +189,16 @@ def write_model(model_path, model_name, push_to_hub): new_key = new_keys[key] value = original_state_dict[key] - if re.search("qkv", new_key): + if re.search("associate_heads", new_key) or re.search("backbone.cls_token", new_key): + # This associated_heads is concept of auxiliary head so does not require in inference stage. 
+ # backbone.cls_token is optional forward function for dynamically change of size, see detail in https://github.com/ViTAE-Transformer/ViTPose/issues/34 + pass + elif re.search("qkv", new_key): state_dict[new_key.replace("self.qkv", "attention.query")] = value[:dim] state_dict[new_key.replace("self.qkv", "attention.key")] = value[dim : dim * 2] state_dict[new_key.replace("self.qkv", "attention.value")] = value[-dim:] - elif re.search("head", new_key) and not config.use_simple_decoder: # Pattern for deconvolution layers - print(new_key) deconv_pattern = r"deconv_layers\.(0|3)\.weight" new_key = re.sub(deconv_pattern, lambda m: f"deconv{int(m.group(1))//3 + 1}.weight", new_key) # Pattern for batch normalization layers @@ -291,7 +293,7 @@ def write_model(model_path, model_name, push_to_hub): torch.tensor([8.69966745e-01]), atol=5e-2, ) - elif model_name == "vitpose+-base": + elif model_name == "vitpose-plus-base": assert torch.allclose( pose_results[1]["keypoints"][0], torch.tensor([3.98201294e02, 1.81728302e02]), diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index a6d87fbc6fda..1e0f8b2f379c 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -232,13 +232,13 @@ class VitPoseModelIntegrationTest(unittest.TestCase): @cached_property def default_image_processor(self): # TODO update organization - return VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") if is_vision_available() else None + return VitPoseImageProcessor.from_pretrained("danelcsb/vitpose-base-simple") if is_vision_available() else None @slow def test_inference_pose_estimation(self): image_processor = self.default_image_processor # TODO update organization - model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") + model = VitPoseForPoseEstimation.from_pretrained("danelcsb/vitpose-base-simple") image = prepare_img() boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] @@ -279,7 +279,7 @@ def test_inference_pose_estimation(self): def test_batched_inference(self): image_processor = self.default_image_processor # TODO update organization - model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") + model = VitPoseForPoseEstimation.from_pretrained("danelcsb/vitpose-base-simple") image = prepare_img() boxes = [ From 831f70dabd7c018f882e0b70989159f6d8044100 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 14:48:08 +0000 Subject: [PATCH 160/181] final commit --- .../models/vitpose/convert_vitpose_to_hf.py | 2 +- tests/models/vitpose/test_modeling_vitpose.py | 63 +++++++++++++------ 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index accfe436a621..fe1bb1177a96 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -311,7 +311,7 @@ def write_model(model_path, model_name, push_to_hub): # save the model to a local directory model.save_pretrained(model_path) image_processor.save_pretrained(model_path) - + if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") model.push_to_hub(f"danelcsb/{model_name}") diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 1e0f8b2f379c..f1fc32c6afe5 100644 --- 
a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -239,13 +239,16 @@ def test_inference_pose_estimation(self): image_processor = self.default_image_processor # TODO update organization model = VitPoseForPoseEstimation.from_pretrained("danelcsb/vitpose-base-simple") + model.to(torch_device) + model.eval() image = prepare_img() boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] - inputs = image_processor(images=image, boxes=boxes, return_tensors="pt") + inputs = image_processor(images=image, boxes=boxes, return_tensors="pt").to(torch_device) - outputs = model(**inputs) + with torch.no_grad(): + outputs = model(**inputs) heatmaps = outputs.heatmaps assert heatmaps.shape == (2, 17, 64, 48) @@ -256,30 +259,40 @@ def test_inference_pose_estimation(self): [9.9330e-06, 9.9330e-06, 9.9330e-06], [9.9330e-06, 9.9330e-06, 9.9330e-06], ] - ) + ).to(torch_device) assert torch.allclose(heatmaps[0, 0, :3, :3], expected_slice, atol=1e-4) pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] - expected_bbox = torch.tensor([439.3250, 226.6150, 438.9719, 226.4776, 22320.4219, 0.0000]).to(torch_device) + expected_bbox = torch.tensor([391.9900, 190.0800, 391.1575, 189.3034]) expected_keypoints = torch.tensor( [ - [3.9813e02, 1.8184e02, 8.7529e-01], - [3.9828e02, 1.7981e02, 8.4315e-01], - [3.9596e02, 1.7948e02, 9.2678e-01], + [3.9813e02, 1.8184e02], + [3.9828e02, 1.7981e02], + [3.9596e02, 1.7948e02], ] - ).to(torch_device) + ) + expected_scores = torch.tensor( + [ + [8.7529e-01], + [8.4315e-01], + [9.2678e-01], + ] + ) self.assertEqual(len(pose_results), 2) - self.assertTrue(torch.allclose(pose_results[0]["bbox"], expected_bbox, atol=1e-4)) - self.assertTrue(torch.allclose(pose_results[0]["keypoints"], expected_keypoints, atol=1e-4)) + self.assertTrue(torch.allclose(pose_results[1]["bbox"].cpu(), expected_bbox, atol=1e-4)) + self.assertTrue(torch.allclose(pose_results[1]["keypoints"][:3].cpu(), expected_keypoints, atol=1e-2)) + self.assertTrue(torch.allclose(pose_results[1]["scores"][:3].cpu(), expected_scores, atol=1e-4)) @slow def test_batched_inference(self): image_processor = self.default_image_processor # TODO update organization model = VitPoseForPoseEstimation.from_pretrained("danelcsb/vitpose-base-simple") + model.to(torch_device) + model.eval() image = prepare_img() boxes = [ @@ -287,9 +300,10 @@ def test_batched_inference(self): [[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]], ] - inputs = image_processor(images=[image, image], boxes=boxes, return_tensors="pt") + inputs = image_processor(images=[image, image], boxes=boxes, return_tensors="pt").to(torch_device) - outputs = model(**inputs) + with torch.no_grad(): + outputs = model(**inputs) heatmaps = outputs.heatmaps assert heatmaps.shape == (4, 17, 64, 48) @@ -300,22 +314,31 @@ def test_batched_inference(self): [9.9330e-06, 9.9330e-06, 9.9330e-06], [9.9330e-06, 9.9330e-06, 9.9330e-06], ] - ) + ).to(torch_device) assert torch.allclose(heatmaps[0, 0, :3, :3], expected_slice, atol=1e-4) pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes) + print(pose_results) - expected_bbox = torch.tensor([439.3250, 226.6150, 438.9719, 226.4776, 22320.4219, 0.0000]).to(torch_device) + expected_bbox = torch.tensor([391.9900, 190.0800, 391.1575, 189.3034]) expected_keypoints = torch.tensor( [ - [3.9813e02, 1.8184e02, 8.7529e-01], - [3.9828e02, 1.7981e02, 8.4315e-01], - [3.9596e02, 1.7948e02, 9.2678e-01], + [3.9813e02, 
1.8184e02], + [3.9828e02, 1.7981e02], + [3.9596e02, 1.7948e02], ] - ).to(torch_device) + ) + expected_scores = torch.tensor( + [ + [8.7529e-01], + [8.4315e-01], + [9.2678e-01], + ] + ) self.assertEqual(len(pose_results), 2) self.assertEqual(len(pose_results[0]), 2) - self.assertTrue(torch.allclose(pose_results[0][0]["bbox"], expected_bbox, atol=1e-4)) - self.assertTrue(torch.allclose(pose_results[0][0]["keypoints"], expected_keypoints, atol=1e-4)) + self.assertTrue(torch.allclose(pose_results[0][1]["bbox"].cpu(), expected_bbox, atol=1e-4)) + self.assertTrue(torch.allclose(pose_results[0][1]["keypoints"][:3].cpu(), expected_keypoints, atol=1e-2)) + self.assertTrue(torch.allclose(pose_results[0][1]["scores"][:3].cpu(), expected_scores, atol=1e-4)) \ No newline at end of file From ac00401657dbfefd4987e0825b4befbab3445e16 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 14:48:28 +0000 Subject: [PATCH 161/181] [run-slow] vitpose, vitpose_backbone From 241211d190d86085b4150a563ed5f3c5c993f4aa Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 14:52:44 +0000 Subject: [PATCH 162/181] ruff check --fix --- tests/models/vitpose/test_modeling_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index f1fc32c6afe5..e09706369196 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -341,4 +341,4 @@ def test_batched_inference(self): self.assertEqual(len(pose_results[0]), 2) self.assertTrue(torch.allclose(pose_results[0][1]["bbox"].cpu(), expected_bbox, atol=1e-4)) self.assertTrue(torch.allclose(pose_results[0][1]["keypoints"][:3].cpu(), expected_keypoints, atol=1e-2)) - self.assertTrue(torch.allclose(pose_results[0][1]["scores"][:3].cpu(), expected_scores, atol=1e-4)) \ No newline at end of file + self.assertTrue(torch.allclose(pose_results[0][1]["scores"][:3].cpu(), expected_scores, atol=1e-4)) From 5610d5dd90166a842936accd509c83c7d7140b68 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 20 Nov 2024 14:54:43 +0000 Subject: [PATCH 163/181] [run-slow] vitpose, vitpose_backbone From 28623b1c6c31ede475b5b6efcddec8b425347404 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Thu, 21 Nov 2024 14:24:26 +0000 Subject: [PATCH 164/181] rename split module --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 16ebaebb0679..d3f057988370 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -391,7 +391,7 @@ class VitPoseBackbonePreTrainedModel(PreTrainedModel): base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True - _no_split_modules = ["VitPoseEmbeddings", "VitPoseLayer"] + _no_split_modules = ["VitPoseBackboneEmbeddings", "VitPoseBackboneLayer"] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" From e86751a548f0841e0a6f4c019a75a6be606b598f Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Thu, 21 Nov 2024 14:25:03 +0000 Subject: [PATCH 165/181] [run-slow] vitpose, vitpose_backbone From 2c56a4806e30bc9b5753b142fa04b913306c54ff Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 
16 Dec 2024 08:56:13 +0000 Subject: [PATCH 166/181] fix pos_embed --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index c445f2ee21f7..26e80cad64cc 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -89,7 +89,11 @@ def __init__(self, config: ViTPoseBackboneConfig) -> None: self.patch_embeddings = ViTPoseBackbonePatchEmbeddings(config) num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + position_embeddings = torch.zeros(1, num_patches + 1, config.hidden_size) + # Pre-compute the modified position embeddings + self.position_embeddings = nn.Parameter( + position_embeddings[:, 1:] + position_embeddings[:, :1] + ) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config @@ -97,7 +101,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: embeddings = self.patch_embeddings(pixel_values) # add positional encoding to each token - embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] + embeddings = embeddings + self.position_embeddings embeddings = self.dropout(embeddings) From ba7373f96fd35c58355705822ee36f605a4cee41 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 16 Dec 2024 14:35:30 +0100 Subject: [PATCH 167/181] Simplify init --- src/transformers/models/vitpose/__init__.py | 53 +++---------------- .../models/vitpose/configuration_vitpose.py | 3 ++ .../vitpose/image_processing_vitpose.py | 3 ++ .../models/vitpose/modeling_vitpose.py | 3 ++ .../modeling_vitpose_backbone.py | 4 +- 5 files changed, 17 insertions(+), 49 deletions(-) diff --git a/src/transformers/models/vitpose/__init__.py b/src/transformers/models/vitpose/__init__.py index b6fd47c37056..4a57524cce21 100644 --- a/src/transformers/models/vitpose/__init__.py +++ b/src/transformers/models/vitpose/__init__.py @@ -13,55 +13,16 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = {"configuration_vitpose": ["VitPoseConfig"]} - - -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["image_processing_vitpose"] = ["VitPoseImageProcessor"] - - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_vitpose"] = [ - "VitPosePreTrainedModel", - "VitPoseForPoseEstimation", - ] - if TYPE_CHECKING: - from .configuration_vitpose import VitPoseConfig - - try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .image_processing_vitpose import VitPoseImageProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_vitpose import ( - VitPoseForPoseEstimation, - VitPosePreTrainedModel, - ) - + from .configuration_vitpose import * + from .image_processing_vitpose import * + from .modeling_vitpose import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index edfbc173d0d2..ae50167d571d 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -119,3 +119,6 @@ def __init__( self.initializer_range = initializer_range self.scale_factor = scale_factor self.use_simple_decoder = use_simple_decoder + + +__all__ = ["VitPoseConfig"] diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 7a0c2d1ad120..f8592e586737 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -671,3 +671,6 @@ def post_process_pose_estimation( results.append(batch_results) return results + + +__all__ = ["VitPoseImageProcessor"] diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 0a5663b0e62c..009c5afcae69 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -334,3 +334,6 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = ["VitPosePreTrainedModel", "VitPoseForPoseEstimation"] diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 5ac9fccd5b82..0fa81a807ff7 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -91,9 +91,7 @@ def __init__(self, config: VitPoseBackboneConfig) -> None: num_patches = self.patch_embeddings.num_patches position_embeddings = torch.zeros(1, num_patches + 1, 
config.hidden_size) # Pre-compute the modified position embeddings - self.position_embeddings = nn.Parameter( - position_embeddings[:, 1:] + position_embeddings[:, :1] - ) + self.position_embeddings = nn.Parameter(position_embeddings[:, 1:] + position_embeddings[:, :1]) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: From e2fbb269fee23ad13dffbdf4794893b0fe300ead Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 17 Dec 2024 09:13:01 +0000 Subject: [PATCH 168/181] Revert "fix pos_embed" This reverts commit 2c56a4806e30bc9b5753b142fa04b913306c54ff. --- .../models/vitpose_backbone/modeling_vitpose_backbone.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 0fa81a807ff7..d3f057988370 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -89,16 +89,14 @@ def __init__(self, config: VitPoseBackboneConfig) -> None: self.patch_embeddings = VitPoseBackbonePatchEmbeddings(config) num_patches = self.patch_embeddings.num_patches - position_embeddings = torch.zeros(1, num_patches + 1, config.hidden_size) - # Pre-compute the modified position embeddings - self.position_embeddings = nn.Parameter(position_embeddings[:, 1:] + position_embeddings[:, :1]) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: embeddings = self.patch_embeddings(pixel_values) # add positional encoding to each token - embeddings = embeddings + self.position_embeddings + embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] embeddings = self.dropout(embeddings) From a9bb08f04874209f5447e63d653529ef1943836f Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 24 Dec 2024 00:37:52 +0000 Subject: [PATCH 169/181] refactor single loop --- .../vitpose/image_processing_vitpose.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index f8592e586737..c8a35202213d 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -528,19 +528,17 @@ def preprocess( # When using a list input, the number of boxes can vary dynamically per image. 
# The image processor creates pixel_values of shape (batch_size*num_persons, num_channels, height, width) - if self.do_rescale: - images = [ - self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - for image in images - ] - if self.do_normalize: - images = [ - self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) - for image in images - ] - + all_images = [] + for image in images: + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + all_images.append(image) images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in all_images ] data = {"pixel_values": images} From 0dd96133c131bf5c261f4da6344fa44d979bf182 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 24 Dec 2024 01:06:38 +0000 Subject: [PATCH 170/181] allow flag to enable custom model --- .../models/vitpose/convert_vitpose_to_hf.py | 100 +++++++++--------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index fe1bb1177a96..9033457d3a07 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -155,7 +155,7 @@ def prepare_img(): @torch.no_grad() -def write_model(model_path, model_name, push_to_hub): +def write_model(model_path, model_name, push_to_hub, check_logits=True): os.makedirs(model_path, exist_ok=True) # ------------------------------------------------------------ @@ -260,52 +260,53 @@ def write_model(model_path, model_name, push_to_hub): # Verify pose_results pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] - if model_name == "vitpose-base-simple": - assert torch.allclose( - pose_results[1]["keypoints"][0], - torch.tensor([3.98180511e02, 1.81808380e02]), - atol=5e-2, - ) - assert torch.allclose( - pose_results[1]["scores"][0], - torch.tensor([8.66642594e-01]), - atol=5e-2, - ) - elif model_name == "vitpose-base": - assert torch.allclose( - pose_results[1]["keypoints"][0], - torch.tensor([3.9807913e02, 1.8182812e02]), - atol=5e-2, - ) - assert torch.allclose( - pose_results[1]["scores"][0], - torch.tensor([8.8235235e-01]), - atol=5e-2, - ) - elif model_name == "vitpose-base-coco-aic-mpii": - assert torch.allclose( - pose_results[1]["keypoints"][0], - torch.tensor([3.98305542e02, 1.81741592e02]), - atol=5e-2, - ) - assert torch.allclose( - pose_results[1]["scores"][0], - torch.tensor([8.69966745e-01]), - atol=5e-2, - ) - elif model_name == "vitpose-plus-base": - assert torch.allclose( - pose_results[1]["keypoints"][0], - torch.tensor([3.98201294e02, 1.81728302e02]), - atol=5e-2, - ) - assert torch.allclose( - pose_results[1]["scores"][0], - torch.tensor([8.75046968e-01]), - atol=5e-2, - ) - else: - raise ValueError("Model not supported") + if check_logits: + if model_name == "vitpose-base-simple": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([3.98180511e02, 1.81808380e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.66642594e-01]), + atol=5e-2, + ) + elif model_name == "vitpose-base": + 
assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([3.9807913e02, 1.8182812e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.8235235e-01]), + atol=5e-2, + ) + elif model_name == "vitpose-base-coco-aic-mpii": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([3.98305542e02, 1.81741592e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.69966745e-01]), + atol=5e-2, + ) + elif model_name == "vitpose-plus-base": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([3.98201294e02, 1.81728302e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.75046968e-01]), + atol=5e-2, + ) + else: + raise ValueError("Model not supported") print("Conversion successfully done.") # save the model to a local directory @@ -334,9 +335,12 @@ def main(): parser.add_argument( "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." ) + parser.add_argument( + "--push_to_hub", default=True, type=bool, help="Whether to check the logits of public converted model to the 🤗 hub. You can disable when using custom model." + ) args = parser.parse_args() - write_model(model_path=args.pytorch_dump_folder_path, model_name=args.model_name, push_to_hub=args.push_to_hub) + write_model(model_path=args.pytorch_dump_folder_path, model_name=args.model_name, push_to_hub=args.push_to_hub, check_logits=args.check_logits) if __name__ == "__main__": From b21bb06bb86636efade7311e5bf987f173cb97c9 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 24 Dec 2024 02:23:03 +0000 Subject: [PATCH 171/181] efficiency of MoE to not use unused experts --- .../modeling_vitpose_backbone.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index d3f057988370..512fd4e74600 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -249,13 +249,19 @@ def forward(self, hidden_state: torch.Tensor, indices: torch.Tensor) -> torch.Te hidden_state = self.fc1(hidden_state) hidden_state = self.act(hidden_state) shared_hidden_state = self.fc2(hidden_state) - indices = indices.view(-1, 1, 1) - - # to support ddp training - for i in range(self.num_experts): - selected_index = indices == i - current_hidden_state = self.experts[i](hidden_state) * selected_index - expert_hidden_state = expert_hidden_state + current_hidden_state + indices = indices.view(-1) + + # Convert indices to boolean mask + router_mask = nn.functional.one_hot(indices, num_classes=self.num_experts).bool() + # Identify active experts + seq_len, num_experts = router_mask.shape + idx_mask = router_mask.sum(dim=[0]) + active_experts = torch.nonzero(idx_mask, as_tuple=True)[0].tolist() + + for idx, expert in enumerate(active_experts): + mask = router_mask[idx, :] + current_hidden_state = self.experts[i](hidden_state[mask]) + expert_hidden_state[mask] = current_hidden_state hidden_state = torch.cat([shared_hidden_state, expert_hidden_state], dim=-1) From 9a5c86de97cbd9c730030d23de1ebe08a669fc5c Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 24 Dec 2024 02:25:30 +0000 Subject: [PATCH 172/181] make style --- .../models/vitpose/convert_vitpose_to_hf.py | 12 ++++++++++-- 
.../models/vitpose/image_processing_vitpose.py | 9 ++++++--- .../vitpose_backbone/modeling_vitpose_backbone.py | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/vitpose/convert_vitpose_to_hf.py b/src/transformers/models/vitpose/convert_vitpose_to_hf.py index 9033457d3a07..f151adebbce7 100644 --- a/src/transformers/models/vitpose/convert_vitpose_to_hf.py +++ b/src/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -336,11 +336,19 @@ def main(): "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." ) parser.add_argument( - "--push_to_hub", default=True, type=bool, help="Whether to check the logits of public converted model to the 🤗 hub. You can disable when using custom model." + "--push_to_hub", + default=True, + type=bool, + help="Whether to check the logits of public converted model to the 🤗 hub. You can disable when using custom model.", ) args = parser.parse_args() - write_model(model_path=args.pytorch_dump_folder_path, model_name=args.model_name, push_to_hub=args.push_to_hub, check_logits=args.check_logits) + write_model( + model_path=args.pytorch_dump_folder_path, + model_name=args.model_name, + push_to_hub=args.push_to_hub, + check_logits=args.check_logits, + ) if __name__ == "__main__": diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index c8a35202213d..55386867470e 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -534,11 +534,14 @@ def preprocess( image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) if do_normalize: - image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) - + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + all_images.append(image) images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in all_images + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images ] data = {"pixel_values": images} diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 512fd4e74600..4cda2be52a0d 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -260,7 +260,7 @@ def forward(self, hidden_state: torch.Tensor, indices: torch.Tensor) -> torch.Te for idx, expert in enumerate(active_experts): mask = router_mask[idx, :] - current_hidden_state = self.experts[i](hidden_state[mask]) + current_hidden_state = self.experts[expert](hidden_state[mask]) expert_hidden_state[mask] = current_hidden_state hidden_state = torch.cat([shared_hidden_state, expert_hidden_state], dim=-1) From a5e796610c3b72a1f797bba2c127a60e53a34839 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 12:45:24 +0000 Subject: [PATCH 173/181] Fix range -> arange to avoid warning --- src/transformers/models/vitpose/image_processing_vitpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 55386867470e..8cc187c4860a 100644 --- 
a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -652,7 +652,7 @@ def post_process_pose_estimation( poses = torch.Tensor(preds) scores = torch.Tensor(scores) - labels = torch.range(0, num_keypoints - 1) + labels = torch.arange(0, num_keypoints) bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) results: List[List[Dict[str, torch.Tensor]]] = [] From fa6e6130e2b8cb921cb5e08a67ead0229c8050fb Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 12:45:50 +0000 Subject: [PATCH 174/181] Revert MOE router, a new one does not work --- .../modeling_vitpose_backbone.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 4cda2be52a0d..d3f057988370 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -249,19 +249,13 @@ def forward(self, hidden_state: torch.Tensor, indices: torch.Tensor) -> torch.Te hidden_state = self.fc1(hidden_state) hidden_state = self.act(hidden_state) shared_hidden_state = self.fc2(hidden_state) - indices = indices.view(-1) - - # Convert indices to boolean mask - router_mask = nn.functional.one_hot(indices, num_classes=self.num_experts).bool() - # Identify active experts - seq_len, num_experts = router_mask.shape - idx_mask = router_mask.sum(dim=[0]) - active_experts = torch.nonzero(idx_mask, as_tuple=True)[0].tolist() - - for idx, expert in enumerate(active_experts): - mask = router_mask[idx, :] - current_hidden_state = self.experts[expert](hidden_state[mask]) - expert_hidden_state[mask] = current_hidden_state + indices = indices.view(-1, 1, 1) + + # to support ddp training + for i in range(self.num_experts): + selected_index = indices == i + current_hidden_state = self.experts[i](hidden_state) * selected_index + expert_hidden_state = expert_hidden_state + current_hidden_state hidden_state = torch.cat([shared_hidden_state, expert_hidden_state], dim=-1) From f2037ce957b4cf3981c6d7346b0898939267f723 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 13:05:03 +0000 Subject: [PATCH 175/181] Fix postprocessing a bit (labels) --- .../vitpose/image_processing_vitpose.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index 8cc187c4860a..fa4d0eadc221 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -650,26 +650,30 @@ def post_process_pose_estimation( all_boxes[:, 0:2] = centers[:, 0:2] all_boxes[:, 2:4] = scales[:, 0:2] - poses = torch.Tensor(preds) - scores = torch.Tensor(scores) + poses = torch.tensor(preds) + scores = torch.tensor(scores) labels = torch.arange(0, num_keypoints) - bboxes_xyxy = torch.Tensor(coco_to_pascal_voc(all_boxes)) + bboxes_xyxy = torch.tensor(coco_to_pascal_voc(all_boxes)) results: List[List[Dict[str, torch.Tensor]]] = [] pose_bbox_pairs = zip(poses, scores, bboxes_xyxy) - for batch_bbox in boxes: - batch_results: List[Dict[str, torch.Tensor]] = [] - for _ in batch_bbox: + for image_bboxes in boxes: + image_results: List[Dict[str, torch.Tensor]] = [] + for _ in image_bboxes: # Unpack the next pose and 
bbox_xyxy from the iterator pose, score, bbox_xyxy = next(pose_bbox_pairs) + score = score.squeeze() + keypoints_labels = labels if threshold is not None: - score_condition = (score > threshold).squeeze(1) - pose, score, labels = pose[score_condition], score[score_condition], labels[score_condition] - pose_result = {"keypoints": pose, "scores": score, "labels": labels, "bbox": bbox_xyxy} - batch_results.append(pose_result) - results.append(batch_results) + keep = score > threshold + pose = pose[keep] + score = score[keep] + keypoints_labels = keypoints_labels[keep] + pose_result = {"keypoints": pose, "scores": score, "labels": keypoints_labels, "bbox": bbox_xyxy} + image_results.append(pose_result) + results.append(image_results) return results From 3cbd9e3f410f032cfd002ab6edda3be011691830 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 13:09:06 +0000 Subject: [PATCH 176/181] Fix type hint --- .../models/vitpose/image_processing_vitpose.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/vitpose/image_processing_vitpose.py b/src/transformers/models/vitpose/image_processing_vitpose.py index fa4d0eadc221..e7c5c524cb05 100644 --- a/src/transformers/models/vitpose/image_processing_vitpose.py +++ b/src/transformers/models/vitpose/image_processing_vitpose.py @@ -16,7 +16,7 @@ import itertools import math -from typing import Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np @@ -46,6 +46,9 @@ from scipy.linalg import inv from scipy.ndimage import affine_transform, gaussian_filter +if TYPE_CHECKING: + from .modeling_vitpose import VitPoseEstimatorOutput + logger = logging.get_logger(__name__) @@ -593,7 +596,7 @@ def keypoints_from_heatmaps( def post_process_pose_estimation( self, - outputs: torch.Tensor, + outputs: "VitPoseEstimatorOutput", boxes: Union[List[List[List[float]]], np.ndarray], kernel_size: int = 11, threshold: float = None, @@ -603,8 +606,8 @@ def post_process_pose_estimation( Transform the heatmaps into keypoint predictions and transform them back to the image. Args: - outputs (torch.Tensor): - Model outputs. + outputs (`VitPoseEstimatorOutput`): + VitPoseForPoseEstimation model outputs. boxes (`List[List[List[float]]]` or `np.ndarray`): List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding box coordinates in COCO format (top_left_x, top_left_y, width, height). From fdd080ce8361da0bff17bce6ad0443885ec3e964 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 13:15:45 +0000 Subject: [PATCH 177/181] Fix docs snippets --- docs/source/en/model_doc/vitpose.md | 88 +++++++++++++++-------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 3e7794b43ff1..40e7360d0eb5 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -43,27 +43,32 @@ The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPo - ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. 
```py -import numpy as np -import requests import torch +import requests +import numpy as np + from PIL import Image from transformers import ( + AutoProcessor, RTDetrForObjectDetection, - RTDetrImageProcessor, - VitPoseConfig, VitPoseForPoseEstimation, - VitPoseImageProcessor, ) +device = "cuda" if torch.cuda.is_available() else "cpu" url = "http://images.cocodataset.org/val2017/000000000139.jpg" image = Image.open(requests.get(url, stream=True).raw) -# Stage 1. Run Object Detector (User can replace this object_detector part) -person_image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") -person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") -inputs = person_image_processor(images=image, return_tensors="pt") +# ------------------------------------------------------------------------ +# Stage 1. Detect humans on the image +# ------------------------------------------------------------------------ + +# You can choose detector by your choice +person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") +person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device) + +inputs = person_image_processor(images=image, return_tensors="pt").to(device) with torch.no_grad(): outputs = person_model(**inputs) @@ -71,44 +76,30 @@ with torch.no_grad(): results = person_image_processor.post_process_object_detection( outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3 ) +result = results[0] # take first image results -def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray: - """ - Converts bounding boxes from the Pascal VOC format to the COCO format. - - In other words, converts from (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format - to (top_left_x, top_left_y, width, height). - - Args: - bboxes (`np.ndarray` of shape `(batch_size, 4)): - Bounding boxes in Pascal VOC format. - - Returns: - `np.ndarray` of shape `(batch_size, 4) in COCO format. - """ - bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] - bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] +# Human label refers 0 index in COCO dataset +person_boxes = result["boxes"][result["labels"] == 0] +person_boxes = person_boxes.cpu().numpy() - return bboxes +# Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format +person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0] +person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1] -# Human label refers 0 index in COCO dataset -boxes = results[0]["boxes"][results[0]["labels"] == 0] -boxes = [pascal_voc_to_coco(boxes.cpu().numpy())] +# ------------------------------------------------------------------------ +# Stage 2. Detect keypoints for each person found +# ------------------------------------------------------------------------ -# Stage 2. 
Run ViTPose -config = VitPoseConfig() -image_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple") -model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple") +image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple") +model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device) -pixel_values = image_processor(image, boxes=boxes, return_tensors="pt").pixel_values +inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device) with torch.no_grad(): - outputs = model(pixel_values) + outputs = model(**inputs) -pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0] - -for pose_result in pose_results: - print(pose_result) +pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes]) +image_pose_result = pose_results[0] # results for first image ``` @@ -116,16 +107,29 @@ for pose_result in pose_results: ```py import supervision as sv -key_points = sv.KeyPoints(xy=torch.cat([pose_result['keypoints'].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()) +xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy() +scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy() + +key_points = sv.KeyPoints( + xy=xy, confidence=scores +) edge_annotator = sv.EdgeAnnotator( color=sv.Color.GREEN, - thickness=5 + thickness=1 +) +vertex_annotator = sv.VertexAnnotator( + color=sv.Color.RED, + radius=2 ) annotated_frame = edge_annotator.annotate( scene=image.copy(), key_points=key_points ) +annotated_frame = vertex_annotator.annotate( + scene=annotated_frame, + key_points=key_points +) ``` ### Visualization for advanced user @@ -217,7 +221,7 @@ keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, numpy_image = np.array(image) -for pose_result in pose_results: +for pose_result in image_pose_result: scores = np.array(pose_result["scores"]) keypoints = np.array(pose_result["keypoints"]) From 3cb154cf1b08090efd0d478685730ca3a25c7fac Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 13:20:55 +0000 Subject: [PATCH 178/181] Fix links to checkpoints --- src/transformers/models/vitpose/configuration_vitpose.py | 2 +- src/transformers/models/vitpose/modeling_vitpose.py | 7 ++++--- .../vitpose_backbone/configuration_vitpose_backbone.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index ae50167d571d..763c1f1bd7bd 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -28,7 +28,7 @@ class VitPoseConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`VitPoseForPoseEstimation`]. It is used to instantiate a VitPose model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the VitPose - [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. + [usyd-community/vitpose-base-simple](https://huggingface.co/usyd-community/vitpose-base-simple) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the documentation from [`PretrainedConfig`] for more information. diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 009c5afcae69..b5dd274654ac 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -280,15 +280,16 @@ def forward( >>> from PIL import Image >>> import requests - >>> processor = AutoImageProcessor.from_pretrained("") - >>> model = VitPoseForPoseEstimation.from_pretrained("") + >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple") + >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] >>> inputs = processor(image, boxes=boxes, return_tensors="pt") - >>> outputs = model(**inputs) + >>> with torch.no_grad(): + ... outputs = model(**inputs) >>> heatmaps = outputs.heatmaps ```""" diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index 08768eaae6d1..2872d39a2a30 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -27,7 +27,7 @@ class VitPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig): This is the configuration class to store the configuration of a [`VitPoseBackbone`]. It is used to instantiate a VitPose model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the VitPose - [google/vitpose-base-patch16-224](https://huggingface.co/google/vitpose-base-patch16-224) architecture. + [usyd-community/vitpose-base-simple](https://huggingface.co/usyd-community/vitpose-base-simple) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
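With the checkpoints now hosted under `usyd-community` and the post-processing returning separate tensors per person, a minimal end-to-end sketch of the API at this point in the series looks roughly as follows. This is illustrative rather than part of any patch: the image URL and COCO-format boxes are the ones used in the docstring example above, and `threshold=0.3` is an arbitrary value chosen only to demonstrate the optional filtering introduced earlier.

```py
import torch
import requests
from PIL import Image

from transformers import AutoProcessor, VitPoseForPoseEstimation

device = "cuda" if torch.cuda.is_available() else "cpu"

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# COCO-format boxes (top_left_x, top_left_y, width, height), one list per image;
# in practice these come from a person detector such as RT-DETR.
boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]

processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple").to(device)

inputs = processor(images=image, boxes=boxes, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)

# One list of results per image, one dict per input box; keypoints whose score
# falls below the optional threshold are dropped along with their scores and labels.
pose_results = processor.post_process_pose_estimation(outputs, boxes=boxes, threshold=0.3)
for person in pose_results[0]:
    print(person["keypoints"].shape, person["scores"].shape, person["labels"], person["bbox"])
```

Each dictionary holds the kept `keypoints` with their matching `scores` and keypoint `labels`, plus a `bbox` tensor for the corresponding person.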
From 8a4d9c143e2238c37662143928297910a5095608 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 13:26:08 +0000 Subject: [PATCH 179/181] Fix checkpoints in tests --- tests/models/vitpose/test_modeling_vitpose.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index e09706369196..6afc2ecb3803 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -213,8 +213,7 @@ def test_for_pose_estimation(self): @slow def test_model_from_pretrained(self): - # TODO update organization - model_name = "nielsr/vitpose-base-simple" + model_name = "usyd-community/vitpose-base-simple" model = VitPoseForPoseEstimation.from_pretrained(model_name) self.assertIsNotNone(model) @@ -231,14 +230,16 @@ def prepare_img(): class VitPoseModelIntegrationTest(unittest.TestCase): @cached_property def default_image_processor(self): - # TODO update organization - return VitPoseImageProcessor.from_pretrained("danelcsb/vitpose-base-simple") if is_vision_available() else None + return ( + VitPoseImageProcessor.from_pretrained("usyd-community/vitpose-base-simple") + if is_vision_available() + else None + ) @slow def test_inference_pose_estimation(self): image_processor = self.default_image_processor - # TODO update organization - model = VitPoseForPoseEstimation.from_pretrained("danelcsb/vitpose-base-simple") + model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple") model.to(torch_device) model.eval() @@ -289,8 +290,7 @@ def test_inference_pose_estimation(self): @slow def test_batched_inference(self): image_processor = self.default_image_processor - # TODO update organization - model = VitPoseForPoseEstimation.from_pretrained("danelcsb/vitpose-base-simple") + model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple") model.to(torch_device) model.eval() From 09752cfa2d03e6bcefb4f8a492da212bfde65780 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 13:29:44 +0000 Subject: [PATCH 180/181] Fix test --- tests/models/vitpose/test_modeling_vitpose.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index 6afc2ecb3803..1c33b6cf3671 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -274,13 +274,7 @@ def test_inference_pose_estimation(self): [3.9596e02, 1.7948e02], ] ) - expected_scores = torch.tensor( - [ - [8.7529e-01], - [8.4315e-01], - [9.2678e-01], - ] - ) + expected_scores = torch.tensor([8.7529e-01, 8.4315e-01, 9.2678e-01]) self.assertEqual(len(pose_results), 2) self.assertTrue(torch.allclose(pose_results[1]["bbox"].cpu(), expected_bbox, atol=1e-4)) @@ -329,13 +323,7 @@ def test_batched_inference(self): [3.9596e02, 1.7948e02], ] ) - expected_scores = torch.tensor( - [ - [8.7529e-01], - [8.4315e-01], - [9.2678e-01], - ] - ) + expected_scores = torch.tensor([8.7529e-01, 8.4315e-01, 9.2678e-01]) self.assertEqual(len(pose_results), 2) self.assertEqual(len(pose_results[0]), 2) From 1bea6c1644c51b1644cdce275f150413fe746202 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 8 Jan 2025 14:59:54 +0000 Subject: [PATCH 181/181] Add image to docs --- docs/source/en/model_doc/vitpose.md | 32 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 
deletions(-) diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 40e7360d0eb5..361f8e30c75d 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -20,27 +20,14 @@ The abstract from the paper is the following: *Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.* +![vitpose-architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-architecture.png) This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi). The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). ## Usage Tips -- To enable MoE (Mixture of Experts) function in the backbone, user has to give appropriate configuration such as `num_experts` and input value `dataset_index` to the backbone model. - However, it is not used in default parameters. Below is the code snippet for usage of MoE function. -```py ->>> from transformers import VitPoseBackboneConfig, VitPoseBackbone ->>> import torch - ->>> config = VitPoseBackboneConfig(num_experts=3, out_indices=[-1]) ->>> model = VitPoseBackbone(config) - ->>> pixel_values = torch.randn(3, 3, 256, 192) ->>> dataset_index = torch.tensor([1, 2, 3]) ->>> outputs = model(pixel_values, dataset_index) -``` - -- ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. +ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. 
```py import torch @@ -236,6 +223,21 @@ pose_image ``` drawing +### MoE backbone + +To enable the MoE (Mixture of Experts) functionality in the backbone, the user has to provide an appropriate configuration (such as `num_experts`) and pass a `dataset_index` input to the backbone model; MoE is not used with the default parameters. Below is a code snippet showing how to use the MoE functionality. + +```py +>>> from transformers import VitPoseBackboneConfig, VitPoseBackbone +>>> import torch + +>>> config = VitPoseBackboneConfig(num_experts=3, out_indices=[-1]) +>>> model = VitPoseBackbone(config) + +>>> pixel_values = torch.randn(3, 3, 256, 192) +>>> dataset_index = torch.tensor([1, 2, 3]) +>>> outputs = model(pixel_values, dataset_index) +``` ## VitPoseImageProcessor
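Editor's note: as a follow-up to the MoE backbone snippet added above, one may also want to drive the full pose-estimation head from an MoE backbone. The sketch below is not taken from these patches; it assumes that `VitPoseConfig` accepts a `backbone_config` argument and that `VitPoseForPoseEstimation` forwards `dataset_index` to the backbone, so treat it as a starting point rather than a verified recipe.

```py
>>> from transformers import VitPoseBackboneConfig, VitPoseConfig, VitPoseForPoseEstimation
>>> import torch

>>> # Assumed wiring: a randomly initialized pose head on top of a 3-expert MoE backbone.
>>> backbone_config = VitPoseBackboneConfig(num_experts=3, out_indices=[-1])
>>> config = VitPoseConfig(backbone_config=backbone_config)
>>> model = VitPoseForPoseEstimation(config)
>>> _ = model.eval()

>>> pixel_values = torch.randn(3, 3, 256, 192)
>>> dataset_index = torch.tensor([0, 1, 2])  # one expert index per image in the batch
>>> with torch.no_grad():
...     outputs = model(pixel_values, dataset_index=dataset_index)
>>> outputs.heatmaps.shape  # (batch_size, num_keypoints, heatmap_height, heatmap_width)
```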