diff --git a/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py
index 63e83ff840..4d7e3320be 100644
--- a/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py
+++ b/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py
@@ -1247,7 +1247,7 @@ def prefill_forward(
                 f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                 f" {attn_weights.size()}"
             )
-        assert attention_mask is not None
+        if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             raise ValueError(
@@ -1311,7 +1311,7 @@ def decode_forward(
         cos, sin = self.rotary_emb(q_pe, seq_len=kv_seq_len)
         q_pe = apply_rotary_pos_emb(q_pe, cos, sin, q_position_ids)
-        q_nope = torch.matmul(q_nope.transpose(0, 1), self.q_absorb).transpose(0, 1)  # opti
+        q_nope = torch.matmul(q_nope.transpose(0, 1), self.q_absorb).transpose(0, 1)
         compressed_kv, k_pe = self.compress_kv(hidden_states_kv, kv_position_ids)  # update & get all compressed_kv, k_pe
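
For context on the first hunk: the change swaps a hard `assert attention_mask is not None` for an `if attention_mask is not None:` guard, so a missing mask is skipped instead of aborting the forward pass. Below is a minimal sketch of that guarded-mask pattern; the helper name `apply_attention_mask`, the toy tensor sizes, and the final `attn_weights + attention_mask` step are illustrative assumptions, not lines from the patched file.

```python
import torch


def apply_attention_mask(attn_weights, attention_mask, bsz, q_len, kv_seq_len):
    # Guarded-mask pattern: validate and apply the mask only when one is
    # actually passed, instead of hard-failing on None with `assert`
    # (which is also stripped under `python -O`).
    if attention_mask is not None:
        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, "
                f"but is {attention_mask.size()}"
            )
        # Adding the additive mask to the attention scores is the usual
        # follow-up step; it is assumed here, not shown in the hunk above.
        attn_weights = attn_weights + attention_mask
    return attn_weights


# Toy usage with made-up sizes:
bsz, num_heads, q_len, kv_seq_len = 1, 2, 4, 4
weights = torch.zeros(bsz, num_heads, q_len, kv_seq_len)
mask = torch.zeros(bsz, 1, q_len, kv_seq_len)
out = apply_attention_mask(weights, mask, bsz, q_len, kv_seq_len)
print(out.shape)  # torch.Size([1, 2, 4, 4])
```

Note the behavioral difference: with the `if` guard, a `None` mask now means "no masking" rather than an `AssertionError`, so callers that rely on the mask always being present should enforce that upstream.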