[nit][rotary embeddings] Relaxing the test, does not pass on some new…

…er pytorch/cuda (#270) could be because of tf32 default use for newer cuda
facebookresearch · Apr 14, 2022 · b7ca410 · b7ca410
1 parent 0c555e9
commit b7ca410
Showing 1 changed file with 7 additions and 6 deletions.
diff --git a/tests/test_rotary_embeddings.py b/tests/test_rotary_embeddings.py
@@ -29,19 +29,20 @@ def test_rotary_embeddings(device):
     q = torch.ones((BATCH, HEADS, SEQ, EMB), device=device)  # uniform on purpose
     k = q.clone()
 
-    k_rot, q_rot = rotary(q, k)
+    q_rot, k_rot = rotary(q, k)
 
     # Check that the sequences now encode relative position information
     att = torch.einsum("bhne,bhme->bhnm", q, k)
     att_rot = torch.einsum("bhne,bhme->bhnm", q_rot, k_rot)
 
-    # - the attention for the same positions is not changed
-    assert torch.allclose(torch.diag(att[0, 0, :, :]), torch.diag(att_rot[0, 0, :, :]))
+    # - the attention for the same positions is not meaningfully changed
+    assert torch.allclose(
+        torch.diag(att[0, 0, :, :]), torch.diag(att_rot[0, 0, :, :]), rtol=0.1
+    )
 
     # - the post-rotary attention is more focused on the diagonal
-    att_rot -= att_rot[
-        0, 0, 0, 0
-    ].clone()  # all diagonal elements will have the same value
+    diag_max = torch.max(torch.diag(att_rot[0, 0, :, :]))
+    att_rot -= diag_max
     att_rot = (
         att_rot <= 1e-4
     )  # all non diagonal elements had lower attention than diagonal (+ float tolerance)