From de00c632174704d6d92405120f171e58b38e0ee8 Mon Sep 17 00:00:00 2001
From: Pi Esposito
Date: Thu, 27 Oct 2022 11:52:21 -0300
Subject: [PATCH] Document sequential CPU offload method on Stable Diffusion
 pipeline (#1024)

* document cpu offloading method

* address review comments

Co-authored-by: Patrick von Platen
Co-authored-by: Patrick von Platen
---
 docs/source/optimization/fp16.mdx             | 64 +++++++++++++++++--
 .../pipeline_stable_diffusion.py              |  5 ++
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/docs/source/optimization/fp16.mdx b/docs/source/optimization/fp16.mdx
index d1dd87f7652f..b4ffed31c76b 100644
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -14,9 +14,8 @@ specific language governing permissions and limitations under the License.

 We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed.

-
 | | Latency | Speedup |
-|------------------|---------|---------|
+| ---------------- | ------- | ------- |
 | original         | 9.50s   | x1      |
 | cuDNN auto-tuner | 9.37s   | x1.01   |
 | autocast (fp16)  | 5.47s   | x1.91   |
@@ -24,7 +23,11 @@ We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for
 | channels last    | 3.30s   | x2.87   |
 | traced UNet      | 3.21s   | x2.96   |

-obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps.
+
+  obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
+  the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM
+  steps.
+

 ## Enable cuDNN auto-tuner

@@ -61,7 +64,7 @@ pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 with autocast("cuda"):
-    image = pipe(prompt).images[0]
+    image = pipe(prompt).images[0]
 ```

 Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
@@ -79,7 +82,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
 pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).images[0]
+image = pipe(prompt).images[0]
 ```

 ## Sliced attention for additional memory savings
@@ -87,7 +90,10 @@ image = pipe(prompt).images[0]

 For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.

-Attention slicing is useful even if a batch size of just 1 is used - as long as the model uses more than one attention head. If there is more than one attention head the *QK^T* attention matrix can be computed sequentially for each head which can save a significant amount of memory.
+Attention slicing is useful even if a batch size of just 1 is used - as long
+as the model uses more than one attention head. If there is more than one
+attention head the *QK^T* attention matrix can be computed sequentially for
+each head which can save a significant amount of memory.

 To perform the attention computation sequentially over each head, you only need to invoke [`~StableDiffusionPipeline.enable_attention_slicing`] in your pipeline before inference, like here:

@@ -105,11 +111,55 @@ pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_attention_slicing()
-image = pipe(prompt).images[0]
+image = pipe(prompt).images[0]
 ```

 There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!

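Conceptually, attention slicing computes the *QK^T* product for one head (or a small slice of heads) at a time rather than for all heads at once. A minimal sketch of the idea in plain PyTorch (the `sliced_attention` helper below is hypothetical and only illustrates the technique; the actual diffusers implementation differs):

```Python
import torch


def sliced_attention(q, k, v, slice_size=1):
    # q, k, v have shape (batch * heads, seq_len, head_dim). Iterating in
    # chunks of `slice_size` along the first dimension bounds the size of
    # the intermediate (seq_len x seq_len) attention matrix held in memory.
    scale = q.shape[-1] ** -0.5
    out = torch.empty_like(q)
    for i in range(0, q.shape[0], slice_size):
        s = slice(i, i + slice_size)
        weights = (q[s] @ k[s].transpose(1, 2) * scale).softmax(dim=-1)
        out[s] = weights @ v[s]
    return out
```

Trading the single large matmul for a loop over slices is what caps peak memory, and it is also the source of the roughly 10% slowdown noted above.
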
+## Offloading to CPU with accelerate for memory savings
+
+For additional memory savings, you can offload the weights to CPU and load them to GPU only when performing the forward pass.
+
+To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+With this, you can get memory consumption down to less than 2 GB.
+
+It is also possible to chain it with attention slicing for minimal memory consumption, running inference in as little as 800 MB of GPU VRAM:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+pipe.enable_attention_slicing(1)
+
+image = pipe(prompt).images[0]
+```
+
 ## Using Channels Last memory format

 Channels last memory format is an alternative way of ordering NCHW tensors in memory while preserving the dimension ordering. Channels last tensors are ordered in such a way that the channels become the densest dimension (i.e., images are stored pixel-by-pixel). Since not all operators currently support the channels last format, using it may result in worse performance, so it's better to try it and see if it works for your model.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index cca11281359a..e80cc1360b33 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -120,6 +120,11 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)

     def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and are then moved to
+        `torch.device('meta')`, getting loaded to GPU only when their specific submodule has its `forward` method
+        called.
+        """
         if is_accelerate_available():
             from accelerate import cpu_offload
         else:
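
As the hunk above shows, `enable_sequential_cpu_offload` delegates the heavy lifting to accelerate's `cpu_offload`. A minimal sketch of the same idea applied by hand to each submodule (assuming an already loaded pipeline; the exact wiring inside the pipeline method may differ):

```Python
import torch
from accelerate import cpu_offload
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    revision="fp16",
    torch_dtype=torch.float16,
)

device = torch.device("cuda")
# Each offloaded submodule keeps its weights off the GPU and streams them to
# `device` only for the duration of its own forward pass.
for submodule in [pipe.unet, pipe.text_encoder, pipe.vae, pipe.safety_checker]:
    if submodule is not None:
        cpu_offload(submodule, device)
```

Calling the pipeline afterwards works as usual: each component is pulled onto the GPU just in time, which is why memory stays low at the cost of extra host-to-device transfers per step.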