
Commit

Try fix for unused config.intermediate_size
aymeric-roucher committed Dec 4, 2024
1 parent: 959702b · commit: 461d14d
Showing 3 changed files with 5 additions and 26 deletions.
src/transformers/models/aria/configuration_aria.py (2 changes: 1 addition & 1 deletion)
@@ -188,7 +188,7 @@ def __init__(
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
+        self.intermediate_size = moe_intermediate_size * moe_num_shared_experts
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads

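The net effect of the configuration change above is that intermediate_size is no longer taken from its own constructor argument but derived from the MoE settings, so the config can no longer carry a value the shared-experts MLP ignores. Below is a minimal sketch of the resulting behaviour, assuming AriaTextConfig is the class whose __init__ is shown above and that it still accepts moe_intermediate_size and moe_num_shared_experts as keyword arguments at this commit; the numbers are purely illustrative.

# Sketch only, not part of the commit; reflects the code as of this commit,
# not necessarily a released transformers version.
from transformers.models.aria.configuration_aria import AriaTextConfig

config = AriaTextConfig(moe_intermediate_size=1024, moe_num_shared_experts=2)

# intermediate_size is now derived rather than read from its own kwarg, so it
# always matches what the shared-experts MLP will build.
assert config.intermediate_size == 1024 * 2
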
src/transformers/models/aria/modeling_aria.py (14 changes: 2 additions & 12 deletions)
@@ -225,21 +225,11 @@ def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tens


 class AriaSharedExpertsMLP(nn.Module):
-    """
-    Shared Expert MLP for shared experts.
-    Unlike routed experts, shared experts process all tokens without routing.
-    This class reconfigures the intermediate size in comparison to the LlamaMLP.
-    Args:
-        config (`AriaTextConfig`): Configuration object for the Aria language model.
-    """
-
-    def __init__(self, config: AriaTextConfig):
+    def __init__(self, config):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
-        self.intermediate_size = config.moe_intermediate_size * config.moe_num_shared_experts
+        self.intermediate_size = config.intermediate_size
         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
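With the size computation moved into the config, AriaSharedExpertsMLP builds its projections from config.intermediate_size just like a plain Llama-style MLP. The following shape check is a stand-in sketch, not code from the repository: it uses a SimpleNamespace instead of the real AriaTextConfig and assumes a SiLU gate, with only hidden_size, intermediate_size and mlp_bias taken from the diff above.

from types import SimpleNamespace

import torch
import torch.nn as nn

# Hypothetical stand-in for the config object the module receives.
config = SimpleNamespace(hidden_size=64, intermediate_size=128, mlp_bias=False)

# The same three projections the diff above constructs.
gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=config.mlp_bias)
up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=config.mlp_bias)
down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=config.mlp_bias)

x = torch.randn(2, 8, config.hidden_size)
# Llama-style gated MLP; SiLU is an assumed activation here.
y = down_proj(nn.functional.silu(gate_proj(x)) * up_proj(x))
assert y.shape == x.shape  # hidden_size in, hidden_size out
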
src/transformers/models/aria/modular_aria.py (15 changes: 2 additions & 13 deletions)
@@ -232,6 +232,7 @@ def __init__(
         self.moe_num_experts = moe_num_experts
         self.moe_topk = moe_topk
         self.moe_num_shared_experts = moe_num_shared_experts
+        self.intermediate_size = moe_intermediate_size * moe_num_shared_experts


 class AriaConfig(PretrainedConfig):
@@ -1009,19 +1010,7 @@ def model_input_names(self):


 class AriaSharedExpertsMLP(LlamaMLP):
-    """
-    Shared Expert MLP for shared experts.
-    Unlike routed experts, shared experts process all tokens without routing.
-    This class reconfigures the intermediate size in comparison to the LlamaMLP.
-    Args:
-        config (`AriaTextConfig`): Configuration object for the Aria language model.
-    """
-
-    def __init__(self, config: AriaTextConfig):
-        super().__init__(self)
-        self.intermediate_size = config.moe_intermediate_size * config.moe_num_shared_experts
+    pass


 class AriaGroupedExpertsGemm(nn.Module):
