upgrade diffusers to 0.31.dev version to support flux and cogvideox simultaneously (#268)
feifeibear authored Sep 14, 2024
1 parent da885f4 commit 2e2ad16
Showing 5 changed files with 62 additions and 32 deletions.
7 changes: 3 additions & 4 deletions examples/flux_example.py
@@ -63,10 +63,9 @@ def main():
         if is_dp_last_group():
             for i, image in enumerate(output.images):
                 image_rank = dp_group_index * dp_batch_size + i
-                image.save(f"./results/flux_result_{parallel_info}_{image_rank}.png")
-                print(
-                    f"image {i} saved to ./results/flux_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
-                )
+                image_name = f"flux_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
+                image.save(f"./results/{image_name}")
+                print(f"image {i} saved to ./results/{image_name}")

     if get_world_group().rank == get_world_group().world_size - 1:
         print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
6 changes: 3 additions & 3 deletions setup.py
@@ -26,9 +26,9 @@ def get_cuda_version():
     author_email="fangjiarui123@gmail.com",
     packages=find_packages(),
     install_requires=[
-        "torch>=2.3.0",
-        "accelerate==0.33.0",
-        "diffusers==0.30.2",
+        "torch>=2.1.0",
+        "accelerate>=0.33.0",
+        "diffusers @ git+https://github.com/huggingface/diffusers.git",
         "transformers>=4.39.1",
         "sentencepiece>=0.1.99",
         "beautifulsoup4>=4.12.3",
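Since the pinned diffusers release is replaced here by an install from the development branch, a quick sanity check can confirm which build is active. The snippet below is illustrative only and not part of this commit; the exact dev version string will vary.

    import diffusers
    from packaging import version

    # The from-source install should report a 0.31 development build,
    # e.g. "0.31.0.dev0", which sorts above the previously pinned 0.30.2.
    print(diffusers.__version__)
    assert version.parse(diffusers.__version__) > version.parse("0.30.2")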
10 changes: 2 additions & 8 deletions tests/layers/attention_processor_test.py
@@ -14,11 +14,9 @@
     AttentionProcessor,
     FusedHunyuanAttnProcessor2_0,
     HunyuanAttnProcessor2_0,
-    FluxSingleAttnProcessor2_0,
 )
 from xfuser.model_executor.layers.attention_processor import (
     xFuserHunyuanAttnProcessor2_0,
-    xFuserFluxSingleAttnProcessor2_0,
 )

 from xfuser.core.cache_manager.cache_manager import get_cache_manager
@@ -56,10 +54,6 @@ def run_attn_test(rank, world_size, attn_type: str):

     _type_dict = {
         "HunyuanDiT": (HunyuanAttnProcessor2_0(), xFuserHunyuanAttnProcessor2_0()),
-        "FluxSingle": (
-            FluxSingleAttnProcessor2_0(),
-            xFuserFluxSingleAttnProcessor2_0(),
-        ),
     }
     processor, parallel_processor = _type_dict[attn_type]

@@ -140,7 +134,7 @@ def run_attn_test(rank, world_size, attn_type: str):
     ), "Outputs are not close"


-@pytest.mark.parametrize("attn_type", ["HunyuanDiT", "FluxSingle"])
+@pytest.mark.parametrize("attn_type", ["HunyuanDiT"])
 def test_multi_process(attn_type):
     world_size = 4  # Number of processes
     processes = []
@@ -160,4 +154,4 @@ def test_multi_process(attn_type):


 if __name__ == "__main__":
-    test_multi_process("FluxSingle")
+    test_multi_process("HunyuanDiT")
11 changes: 11 additions & 0 deletions xfuser/config/config.py
@@ -31,6 +31,17 @@ def check_env():
             "with `pip3 install --pre torch torchvision torchaudio --index-url "
             "https://download.pytorch.org/whl/nightly/cu121`"
         )
+    try:
+        import diffusers
+
+        if version.parse(diffusers.__version__) <= version.parse("0.30.2"):
+            raise RuntimeError(
+                "This project requires diffusers >= 0.31.0, which is not yet on PyPI. Please install it from source!"
+            )
+    except ImportError:
+        raise ImportError(
+            "diffusers is not installed. Please install it with `pip install diffusers`"
+        )


 @dataclass
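For context, the gate above compares against 0.30.2 rather than 0.31.0 because development builds installed from the diffusers main branch sort below the final 0.31.0 release under PEP 440. A small illustration, not part of this commit and assuming a dev build such as 0.31.0.dev0:

    from packaging import version

    # Dev builds of 0.31.0 are newer than the last PyPI release (0.30.2) ...
    assert version.parse("0.31.0.dev0") > version.parse("0.30.2")
    # ... but still compare as older than the final 0.31.0 release.
    assert version.parse("0.31.0.dev0") < version.parse("0.31.0")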
60 changes: 43 additions & 17 deletions xfuser/model_executor/layers/attention_processor.py
@@ -11,9 +11,9 @@
     AttnProcessor2_0,
     JointAttnProcessor2_0,
     FluxAttnProcessor2_0,
-    FluxSingleAttnProcessor2_0,
     HunyuanAttnProcessor2_0,
 )
+
 try:
     from diffusers.models.attention_processor import CogVideoXAttnProcessor2_0
 except ImportError:
@@ -80,14 +80,20 @@ def apply_rotary_emb(

         if use_real_unbind_dim == -1:
             # Used for flux, cogvideox, hunyuan-dit
-            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(
+                -1
+            )  # [B, S, H, D//2]
             x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
         elif use_real_unbind_dim == -2:
             # Used for Stable Audio
-            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(
+                -2
+            )  # [B, S, H, D//2]
             x_rotated = torch.cat([-x_imag, x_real], dim=-1)
         else:
-            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
+            raise ValueError(
+                f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2."
+            )

         out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

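As a standalone illustration of the use_real_unbind_dim=-1 branch reformatted above (not part of this commit; shapes and values here are made up), the last dimension holds interleaved (real, imaginary) pairs and the rotation is the real-valued form of multiplying by cos + i*sin:

    import torch

    # Interleaved (real, imag) pairs in the last dim; mirrors the -1 branch above.
    x = torch.randn(1, 4, 2, 8)    # [B, S, H, D], D even
    cos = torch.randn(1, 4, 1, 8)  # broadcast over heads
    sin = torch.randn(1, 4, 1, 8)

    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)    # [B, S, H, D//2]
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)  # [B, S, H, D]
    out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)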
@@ -343,8 +349,9 @@ def __call__(
         #! ---------------------------------------- KV CACHE ----------------------------------------

         #! ---------------------------------------- ATTENTION ----------------------------------------
-        if (HAS_LONG_CTX_ATTN
-            and get_sequence_parallel_world_size() > 1
+        if (
+            HAS_LONG_CTX_ATTN
+            and get_sequence_parallel_world_size() > 1
             and not latte_temporal_attention
         ):
             query = query.transpose(1, 2)
@@ -638,7 +645,11 @@ def __call__(
         *args,
         **kwargs,
     ) -> torch.FloatTensor:
-        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        batch_size, _, _ = (
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
+        )

         # `sample` projections.
         query = attn.to_q(hidden_states)
@@ -675,9 +686,13 @@ def __call__(
         ).transpose(1, 2)

         if attn.norm_added_q is not None:
-            encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            encoder_hidden_states_query_proj = attn.norm_added_q(
+                encoder_hidden_states_query_proj
+            )
         if attn.norm_added_k is not None:
-            encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+            encoder_hidden_states_key_proj = attn.norm_added_k(
+                encoder_hidden_states_key_proj
+            )

         num_encoder_hidden_states_tokens = encoder_hidden_states_query_proj.shape[2]
         num_query_tokens = query.shape[2]
@@ -808,7 +823,7 @@ def __init__(self):
             )
         else:
             self.hybrid_seq_parallel_attn = None

     # NOTE() torch.compile does not work for V100
     @torch_compile_disable_if_v100
     def __call__(
@@ -1029,12 +1044,18 @@ def __call__(
         hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

         batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
         )

         if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+            attention_mask = attn.prepare_attention_mask(
+                attention_mask, sequence_length, batch_size
+            )
+            attention_mask = attention_mask.view(
+                batch_size, attn.heads, -1, attention_mask.shape[-1]
+            )

         query = attn.to_q(hidden_states)
         key = attn.to_k(hidden_states)
@@ -1054,9 +1075,13 @@ def __call__(

         # Apply RoPE if needed
         if image_rotary_emb is not None:
-            query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
+            query[:, :, text_seq_length:] = apply_rotary_emb(
+                query[:, :, text_seq_length:], image_rotary_emb
+            )
             if not attn.is_cross_attention:
-                key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
+                key[:, :, text_seq_length:] = apply_rotary_emb(
+                    key[:, :, text_seq_length:], image_rotary_emb
+                )

         #! ---------------------------------------- KV CACHE ----------------------------------------
         if not self.use_long_ctx_attn_kvcache:
@@ -1082,7 +1107,9 @@ def __call__(
                 causal=False,
                 joint_strategy="none",
             )
-            hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * head_dim)
+            hidden_states = hidden_states.reshape(
+                batch_size, -1, attn.heads * head_dim
+            )
         else:
             if HAS_FLASH_ATTN:
                 from flash_attn import flash_attn_func
@@ -1114,7 +1141,6 @@ def __call__(
         # hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         #! ---------------------------------------- ATTENTION ----------------------------------------

-
         # linear proj
         hidden_states = attn.to_out[0](hidden_states)
         # dropout
