From 461d14d171d867326a11724c71d35a4787e3d955 Mon Sep 17 00:00:00 2001
From: Aymeric
Date: Wed, 4 Dec 2024 23:33:46 +0100
Subject: [PATCH] Try fix for unused config.intermediate_size

---
 .../models/aria/configuration_aria.py         |  2 +-
 src/transformers/models/aria/modeling_aria.py | 14 ++------------
 src/transformers/models/aria/modular_aria.py  | 15 ++-------------
 3 files changed, 5 insertions(+), 26 deletions(-)

diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py
index 5c695eb64dc9..d09f6d119084 100644
--- a/src/transformers/models/aria/configuration_aria.py
+++ b/src/transformers/models/aria/configuration_aria.py
@@ -188,7 +188,7 @@ def __init__(
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
+        self.intermediate_size = moe_intermediate_size * moe_num_shared_experts
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
 
diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
index 933d8cd7c598..8a9120958e89 100644
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@@ -225,21 +225,11 @@ def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tens
 
 
 class AriaSharedExpertsMLP(nn.Module):
-    """
-    Shared Expert MLP for shared experts.
-
-    Unlike routed experts, shared experts process all tokens without routing.
-    This class reconfigures the intermediate size in comparison to the LlamaMLP.
-
-    Args:
-        config (`AriaTextConfig`): Configuration object for the Aria language model.
-    """
-
-    def __init__(self, config: AriaTextConfig):
+    def __init__(self, config):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
-        self.intermediate_size = config.moe_intermediate_size * config.moe_num_shared_experts
+        self.intermediate_size = config.intermediate_size
         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py
index 39296ff98d70..68a0f51198a3 100644
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@@ -232,6 +232,7 @@ def __init__(
         self.moe_num_experts = moe_num_experts
         self.moe_topk = moe_topk
         self.moe_num_shared_experts = moe_num_shared_experts
+        self.intermediate_size = moe_intermediate_size * moe_num_shared_experts
 
 
 class AriaConfig(PretrainedConfig):
@@ -1009,19 +1010,7 @@ def model_input_names(self):
 
 
 class AriaSharedExpertsMLP(LlamaMLP):
-    """
-    Shared Expert MLP for shared experts.
-
-    Unlike routed experts, shared experts process all tokens without routing.
-    This class reconfigures the intermediate size in comparison to the LlamaMLP.
-
-    Args:
-        config (`AriaTextConfig`): Configuration object for the Aria language model.
-    """
-
-    def __init__(self, config: AriaTextConfig):
-        super().__init__(self)
-        self.intermediate_size = config.moe_intermediate_size * config.moe_num_shared_experts
+    pass
 
 
 class AriaGroupedExpertsGemm(nn.Module):
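
Not part of the patch, just a quick sketch of the intended behaviour once the change above is applied: config.intermediate_size is now derived from the MoE settings, so it matches the size the shared-experts MLP actually uses and AriaSharedExpertsMLP can inherit LlamaMLP unchanged. The kwargs and attributes below come from the diff context; the numeric values are arbitrary examples, and this is an illustration, not a test shipped with the patch.

from transformers.models.aria.configuration_aria import AriaTextConfig
from transformers.models.aria.modeling_aria import AriaSharedExpertsMLP

# Arbitrary example values, passed explicitly so the check does not rely on library defaults.
config = AriaTextConfig(moe_intermediate_size=1664, moe_num_shared_experts=2)

# With the patch, intermediate_size is derived from the MoE settings instead of
# coming from the previously unused intermediate_size argument.
assert config.intermediate_size == config.moe_intermediate_size * config.moe_num_shared_experts

# The shared-experts MLP now reads config.intermediate_size directly, so its
# projections end up with the same derived size as before the refactor.
mlp = AriaSharedExpertsMLP(config)
assert mlp.gate_proj.out_features == config.intermediate_size
assert mlp.down_proj.in_features == config.intermediate_size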