Merge pull request #1632 from YTianZHU/master
Update readme of Diff-Transformer
donglixp authored Oct 7, 2024
2 parents 62aab43 + f80f28d commit f65d928
Showing 3 changed files with 11 additions and 4 deletions.
9 changes: 8 additions & 1 deletion Diff-Transformer/README.md
@@ -2,4 +2,11 @@
## Approach
<div align="center">
<img src="./imgs/arch.png" width=90%/>
</div>
</div>

## Contents
`multihead_diffattn.py` contains a naive implementation of multi-head differential attention (see the sketch after this list).

`multihead_flashdiff_1.py` contains multi-head differential attention implemented with FlashAttention, for packages that support different qk/v dimensions (e.g., our [customized-flash-attention](https://aka.ms/flash-diff) and [xformers](https://github.com/facebookresearch/xformers)).

`multihead_flashdiff_2.py` contains multi-head differential attention implemented with FlashAttention, for packages that **do not** support different qk/v dimensions (e.g., [flash-attention](https://github.com/Dao-AILab/flash-attention)).
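
For orientation, here is a minimal, self-contained PyTorch sketch of the naive multi-head differential attention that `multihead_diffattn.py` provides: each head computes two softmax attention maps and subtracts the second from the first, weighted by a learned, re-parameterized lambda. This is an illustrative sketch rather than the repository code; it omits the causal mask, rotary embeddings, and the per-head RMSNorm and `(1 - lambda_init)` output scaling used in the actual implementation, and the class and parameter names are invented for the example.

```python
import math

import torch
import torch.nn.functional as F
from torch import nn


def lambda_init_fn(depth):
    # Depth-dependent initialization of lambda, as described in the paper.
    return 0.8 - 0.6 * math.exp(-0.3 * depth)


class NaiveMultiheadDiffAttn(nn.Module):
    """Illustrative multi-head differential attention (no mask, RoPE, or head norm)."""

    def __init__(self, embed_dim, num_heads, depth):
        super().__init__()
        self.num_heads = num_heads
        # Each head holds two query/key groups of size head_dim and values of size 2 * head_dim.
        self.head_dim = embed_dim // num_heads // 2
        self.scaling = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

        # Lambda is re-parameterized via dot products of learned vectors.
        self.lambda_init = lambda_init_fn(depth)
        self.lambda_q1 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_k1 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_q2 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_k2 = nn.Parameter(torch.randn(self.head_dim) * 0.1)

    def forward(self, x):
        bsz, seq_len, _ = x.shape

        # 2 * num_heads query/key heads of head_dim; num_heads value heads of 2 * head_dim.
        q = self.q_proj(x).view(bsz, seq_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(bsz, seq_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(bsz, seq_len, self.num_heads, 2 * self.head_dim).transpose(1, 2)

        attn = F.softmax(torch.matmul(q * self.scaling, k.transpose(-1, -2)), dim=-1)

        lambda_full = (torch.exp(torch.dot(self.lambda_q1, self.lambda_k1))
                       - torch.exp(torch.dot(self.lambda_q2, self.lambda_k2))
                       + self.lambda_init)

        # Differential step: subtract the second attention map from the first.
        attn = attn.view(bsz, self.num_heads, 2, seq_len, seq_len)
        attn = attn[:, :, 0] - lambda_full * attn[:, :, 1]

        out = torch.matmul(attn, v)  # (bsz, num_heads, seq_len, 2 * head_dim)
        out = out.transpose(1, 2).reshape(bsz, seq_len, -1)
        return self.out_proj(out)
```

For example, `NaiveMultiheadDiffAttn(embed_dim=512, num_heads=8, depth=1)(torch.randn(2, 16, 512))` returns a `(2, 16, 512)` tensor; in the paper's setup, `num_heads` is half the head count of a conventional Transformer of the same width, since each differential head spans two attention maps.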
4 changes: 2 additions & 2 deletions Diff-Transformer/multihead_flashdiff_1.py
@@ -38,8 +38,8 @@ def lambda_init_fn(depth):
class MultiheadFlashDiff1(nn.Module):
"""
(Recommended)
DiffAttn implemented with FlashAttention, for packages that support different qkv dimensions
e.g., our customized-flash-attention (https://github.com/xiayuqing0622/customized-flash-attention) and xformers (https://github.com/facebookresearch/xformers)
DiffAttn implemented with FlashAttention, for packages that support different qk/v dimensions
e.g., our customized-flash-attention (https://aka.ms/flash-diff) and xformers (https://github.com/facebookresearch/xformers)
"""
def __init__(
self,
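To make the "different qk/v dimensions" requirement concrete, below is a hedged sketch of how the `multihead_flashdiff_1.py`-style forward pass can be expressed with a FlashAttention kernel that accepts a value head dimension different from the query/key head dimension. The `flash_attn_func` import path and `causal` argument mirror the stock flash-attention API and are only assumed to carry over to the customized kernel; the helper function, names, and shapes are illustrative, not the repository code.

```python
# Hypothetical wiring of the "different qk/v dimensions" variant. The import below
# mirrors the stock flash-attention API; the customized kernel (https://aka.ms/flash-diff)
# is assumed to expose a compatible entry point -- check that repo for the actual one.
from flash_attn import flash_attn_func


def diff_attn_split_heads(q, k, v, lambda_full, num_heads, head_dim):
    """q, k: (bsz, seq_len, 2 * num_heads, head_dim); v: (bsz, seq_len, num_heads, 2 * head_dim)."""
    bsz, seq_len = q.shape[:2]

    # Split the 2 * num_heads query/key heads into two groups of num_heads each.
    q = q.view(bsz, seq_len, num_heads, 2, head_dim)
    k = k.view(bsz, seq_len, num_heads, 2, head_dim)
    q1, q2 = q[:, :, :, 0], q[:, :, :, 1]
    k1, k2 = k[:, :, :, 0], k[:, :, :, 1]

    # Each call uses a qk head dim of head_dim but a v head dim of 2 * head_dim,
    # which is exactly why a kernel supporting different qk/v dimensions is needed.
    attn1 = flash_attn_func(q1, k1, v, causal=True)
    attn2 = flash_attn_func(q2, k2, v, causal=True)
    return attn1 - lambda_full * attn2
```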
2 changes: 1 addition & 1 deletion Diff-Transformer/multihead_flashdiff_2.py
@@ -37,7 +37,7 @@ def lambda_init_fn(depth):

class MultiheadFlashDiff2(nn.Module):
"""
DiffAttn implemented with FlashAttention, for packages that does not support different qkv dimensions
DiffAttn implemented with FlashAttention, for packages that do not support different qk/v dimensions
e.g., flash-attention (https://github.com/Dao-AILab/flash-attention)
"""
def __init__(
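For comparison, here is a hedged sketch of the `multihead_flashdiff_2.py`-style workaround for kernels whose qk and v head dimensions must match (such as stock flash-attention): the doubled value dimension is split into two `head_dim` halves, each half is attended over separately, and the halves are concatenated back together. The helper function, names, and shapes are illustrative assumptions, not the repository code; flash-attention additionally expects fp16/bf16 CUDA tensors.

```python
import torch
from flash_attn import flash_attn_func  # stock kernel: qk and v head dims must match


def diff_attn_equal_dims(q, k, v, lambda_full, num_heads, head_dim):
    """q, k: (bsz, seq_len, 2 * num_heads, head_dim); v: (bsz, seq_len, num_heads, 2 * head_dim)."""
    bsz, seq_len = q.shape[:2]

    q = q.view(bsz, seq_len, num_heads, 2, head_dim)
    k = k.view(bsz, seq_len, num_heads, 2, head_dim)
    # Split the doubled value dimension into two head_dim halves so every
    # flash_attn_func call sees matching qk/v head dimensions.
    v = v.view(bsz, seq_len, num_heads, 2, head_dim)
    q1, q2 = q[:, :, :, 0], q[:, :, :, 1]
    k1, k2 = k[:, :, :, 0], k[:, :, :, 1]
    v1, v2 = v[:, :, :, 0], v[:, :, :, 1]

    # Attend to each value half separately, then stitch the halves back together.
    attn1 = torch.cat([flash_attn_func(q1, k1, v1, causal=True),
                       flash_attn_func(q1, k1, v2, causal=True)], dim=-1)
    attn2 = torch.cat([flash_attn_func(q2, k2, v1, causal=True),
                       flash_attn_func(q2, k2, v2, causal=True)], dim=-1)
    return attn1 - lambda_full * attn2
```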
