Fix rope theta for OpenLlama (#29893)
fix: rope_theta for open llama
jla524 authored Mar 30, 2024
1 parent 5ad7f17 commit 6fd93fe
Showing 2 changed files with 5 additions and 0 deletions.
@@ -66,6 +66,8 @@ class OpenLlamaConfig(PretrainedConfig):
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
+ rope_theta (`float`, *optional*, defaults to 10000.0):
+     The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
@@ -113,6 +115,7 @@ def __init__(
attention_dropout_prob=0.1,
use_stable_embedding=True,
shared_input_output_embedding=True,
+ rope_theta=10000.0,
rope_scaling=None,
**kwargs,
):
@@ -133,6 +136,7 @@ def __init__(
self.attention_dropout_prob = attention_dropout_prob
self.use_stable_embedding = use_stable_embedding
self.shared_input_output_embedding = shared_input_output_embedding
+ self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()

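For reference, a minimal usage sketch (not part of this commit) of what the configuration change enables. It assumes `OpenLlamaConfig` is importable from the top-level `transformers` package, which may vary by version since Open-Llama is a deprecated model.

```python
# Hypothetical usage sketch: pass a non-default RoPE base through the config.
# Open-Llama is deprecated, so the import location may differ across versions.
from transformers import OpenLlamaConfig

config = OpenLlamaConfig(rope_theta=500000.0)  # default stays 10000.0
print(config.rope_theta)  # the value is now stored on the config object
```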
@@ -214,6 +214,7 @@ def __init__(self, config: OpenLlamaConfig):
self.head_dim = self.hidden_size // self.num_heads
self.max_position_embeddings = config.max_position_embeddings
self.dropout_prob = config.attention_dropout_prob
+ self.rope_theta = config.rope_theta

if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
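The modeling change mirrors `config.rope_theta` onto the attention module. As a generic illustration (not the actual Open-Llama code), a RoPE base typically enters the rotary embedding as the base of the inverse-frequency term:

```python
import torch

def rope_inverse_frequencies(head_dim: int, rope_theta: float = 10000.0) -> torch.Tensor:
    # Standard RoPE inverse frequencies: rope_theta ** (-2i / head_dim) for i in [0, head_dim // 2).
    return 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))

# A larger base gives longer rotation periods; with this commit, rope_theta is
# defined on the config and stored on the attention module as shown above.
inv_freq_default = rope_inverse_frequencies(64, rope_theta=10000.0)
inv_freq_large = rope_inverse_frequencies(64, rope_theta=500000.0)
```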
