Fix: Mamba2 norm_before_gate usage #32686

Merged (3 commits) on Aug 20, 2024
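The flag removed here selects between two orderings of the gated RMS norm inside the fused CUDA kernel. Below is a minimal sketch of the two orderings, assuming the `mamba_ssm` convention that `norm_before_gate=True` computes `norm(x) * silu(z)` while `norm_before_gate=False` computes `norm(x * silu(z))`; the function and variable names are illustrative, not from this PR:

```python
import torch
import torch.nn.functional as F

def gated_rmsnorm(x, z, weight, eps=1e-6, norm_before_gate=False):
    # Illustrative reference for what the kernel flag toggles.
    if norm_before_gate:
        # norm(x) * silu(z): normalize first, then apply the gate.
        x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
        return weight * x * F.silu(z)
    # norm(x * silu(z)): gate first, then normalize. This is the ordering
    # the Transformers Mamba2 code uses (Mamba2RMSNormGated), which is why
    # the kernel call in this PR hard-codes norm_before_gate=False.
    x = x * F.silu(z)
    return weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
```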
4 changes: 0 additions & 4 deletions src/transformers/models/mamba2/configuration_mamba2.py
@@ -83,8 +83,6 @@ class Mamba2Config(PretrainedConfig):
             Whether or not to rescale `out_proj` weights when initializing.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the cache should be used.
-        norm_before_gate (`bool`, *optional*, defaults to `True`):
-            Option of cuda kernels -whether to normalize before the gate or not.
         rms_norm (`bool`, *optional*, defaults to `True`):
             Whether to use RMS norm or not.
         chunk_size (`int`, *optional*, defaults to 256):
@@ -137,7 +135,6 @@ def __init__(
         time_step_limit=(0.0, float("inf")),
         rescale_prenorm_residual=False,
         use_cache=True,
-        norm_before_gate=True,
         rms_norm=True,
         chunk_size=256,
         tie_word_embeddings=False,
@@ -168,7 +165,6 @@ def __init__(
         self.n_groups = n_groups
         self.num_heads = num_heads
         self.head_dim = head_dim
-        self.norm_before_gate = norm_before_gate
         self.rms_norm = rms_norm
         self.state_size = state_size
         self.chunk_size = chunk_size
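With the option gone from the config, a `Mamba2Config` no longer carries a `norm_before_gate` attribute. A quick illustrative check (not part of the PR):

```python
from transformers import Mamba2Config

# The normalization ordering is now fixed in the kernel call rather than
# configurable, so a fresh config has no norm_before_gate attribute.
config = Mamba2Config()
assert not hasattr(config, "norm_before_gate")
```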
3 changes: 1 addition & 2 deletions src/transformers/models/mamba2/modeling_mamba2.py
@@ -208,7 +208,6 @@ def __init__(self, config: Mamba2Config, layer_idx: int):
         self.activation = config.hidden_act
         self.act = ACT2FN[config.hidden_act]

-        self.norm_before_gate = config.norm_before_gate
         self.layer_norm_epsilon = config.layer_norm_epsilon
         self.rms_norm = config.rms_norm

@@ -347,7 +346,7 @@ def cuda_kernels_forward(
                 outproj_bias=self.out_proj.bias,
                 headdim=self.head_dim,
                 ngroups=self.n_groups,
-                norm_before_gate=self.norm_before_gate,
+                norm_before_gate=False,
                 return_final_states=True,
                 **dt_limit_kwargs,
             )
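Hard-coding `norm_before_gate=False` keeps the fused-kernel path consistent with the pure-PyTorch path: `Mamba2RMSNormGated` applies `silu(gate)` before the RMS normalization, i.e. the `norm(x * silu(z))` ordering. Exposing the flag in the config suggested a configurability the non-kernel path never implemented, so removing it avoids silently divergent outputs between the two code paths.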