From aadf16a75fc2fb2552e9b24916e1ef55af08a3bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
Date: Tue, 20 Feb 2024 10:56:13 +0000
Subject: [PATCH] Fix drop path not being used

---
 src/transformers/models/dinov2/modeling_dinov2.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py
index ddf70f08b750..accdf0a9b23b 100644
--- a/src/transformers/models/dinov2/modeling_dinov2.py
+++ b/src/transformers/models/dinov2/modeling_dinov2.py
@@ -380,7 +380,7 @@ def __init__(self, config: Dinov2Config) -> None:
         self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.attention = Dinov2Attention(config)
         self.layer_scale1 = Dinov2LayerScale(config)
-        self.drop_path1 = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

         self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

@@ -389,7 +389,6 @@ def __init__(self, config: Dinov2Config) -> None:
         else:
             self.mlp = Dinov2MLP(config)
         self.layer_scale2 = Dinov2LayerScale(config)
-        self.drop_path2 = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

     def forward(
         self,
@@ -408,7 +407,7 @@ def forward(
         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

         # first residual connection
-        hidden_states = attention_output + hidden_states
+        hidden_states = self.drop_path(attention_output) + hidden_states

         # in Dinov2, layernorm is also applied after self-attention
         layer_output = self.norm2(hidden_states)
@@ -416,7 +415,7 @@ def forward(
         layer_output = self.layer_scale2(layer_output)

         # second residual connection
-        layer_output = layer_output + hidden_states
+        layer_output = self.drop_path(layer_output) + hidden_states

         outputs = (layer_output,) + outputs
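
For context, the sketch below illustrates the stochastic-depth ("drop path") behaviour that the patch wires into both residual connections: during training, each sample's residual branch is dropped with some probability and the surviving branches are rescaled. This is a generic, hypothetical helper written for illustration only, not the actual Dinov2DropPath implementation; before the fix the drop path modules were constructed but never called, so this regularization never took effect.

import torch
from torch import nn


class DropPathSketch(nn.Module):
    # Illustrative stochastic depth: drop a whole residual branch per sample.
    # Hypothetical stand-in for the drop-path idea, not Dinov2DropPath itself.

    def __init__(self, drop_prob: float = 0.0) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # No-op at inference time or when the rate is zero, mirroring how the
        # layer falls back to nn.Identity() when drop_path_rate == 0.0.
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1.0 - self.drop_prob
        # One Bernoulli draw per sample, broadcast over all remaining dims.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = (torch.rand(shape, device=x.device) < keep_prob).to(x.dtype)
        # Rescale the kept branches so the expected value is unchanged.
        return x * mask / keep_prob


# With the patch applied, each residual connection effectively computes
#     hidden_states = drop_path(branch_output) + hidden_states
# so some samples skip the attention/MLP branch entirely during training.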