From de00c632174704d6d92405120f171e58b38e0ee8 Mon Sep 17 00:00:00 2001
From: Pi Esposito
Date: Thu, 27 Oct 2022 11:52:21 -0300
Subject: [PATCH] Document sequential CPU offload method on Stable Diffusion
 pipeline (#1024)

* document cpu offloading method

* address review comments

Co-authored-by: Patrick von Platen
Co-authored-by: Patrick von Platen
---
 docs/source/optimization/fp16.mdx             | 64 +++++++++++++++++--
 .../pipeline_stable_diffusion.py              |  5 ++
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/docs/source/optimization/fp16.mdx b/docs/source/optimization/fp16.mdx
index d1dd87f7652f..b4ffed31c76b 100644
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -14,9 +14,8 @@ specific language governing permissions and limitations under the License.

 We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed.

-
 | | Latency | Speedup |
-|------------------|---------|---------|
+| ---------------- | ------- | ------- |
 | original         | 9.50s   | x1      |
 | cuDNN auto-tuner | 9.37s   | x1.01   |
 | autocast (fp16)  | 5.47s   | x1.91   |
@@ -24,7 +23,11 @@ We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for
 | channels last    | 3.30s   | x2.87   |
 | traced UNet      | 3.21s   | x2.96   |

-obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps.
+
+  obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
+  the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM
+  steps.
+

 ## Enable cuDNN auto-tuner

@@ -61,7 +64,7 @@ pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 with autocast("cuda"):
-    image = pipe(prompt).images[0]
+    image = pipe(prompt).images[0]
 ```

 Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
@@ -79,7 +82,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
 pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).images[0]
+image = pipe(prompt).images[0]
 ```

 ## Sliced attention for additional memory savings
@@ -87,7 +90,10 @@ image = pipe(prompt).images[0]

 For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.

-Attention slicing is useful even if a batch size of just 1 is used - as long as the model uses more than one attention head. If there is more than one attention head the *QK^T* attention matrix can be computed sequentially for each head which can save a significant amount of memory.
+Attention slicing is useful even if a batch size of just 1 is used - as long
+as the model uses more than one attention head. If there is more than one
+attention head the *QK^T* attention matrix can be computed sequentially for
+each head which can save a significant amount of memory.

 To perform the attention computation sequentially over each head, you only need to invoke [`~StableDiffusionPipeline.enable_attention_slicing`] in your pipeline before inference, like here:

@@ -105,11 +111,55 @@ pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_attention_slicing()
-image = pipe(prompt).images[0]
+image = pipe(prompt).images[0]
 ```

 There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!

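Conceptually, attention slicing computes the *QK^T* product for one head (or a small slice of heads) at a time rather than for all heads at once. A minimal sketch of the idea in plain PyTorch (the `sliced_attention` helper below is hypothetical and only illustrates the technique; the actual diffusers implementation differs):

```Python
import torch


def sliced_attention(q, k, v, slice_size=1):
    # q, k, v have shape (batch * heads, seq_len, head_dim). Iterating in
    # chunks of `slice_size` along the first dimension bounds the size of
    # the intermediate (seq_len x seq_len) attention matrix held in memory.
    scale = q.shape[-1] ** -0.5
    out = torch.empty_like(q)
    for i in range(0, q.shape[0], slice_size):
        s = slice(i, i + slice_size)
        weights = (q[s] @ k[s].transpose(1, 2) * scale).softmax(dim=-1)
        out[s] = weights @ v[s]
    return out
```

Trading the single large matmul for a loop over slices is what caps peak memory, and it is also the source of the roughly 10% slowdown noted above.
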
+## Offloading to CPU with accelerate for memory savings
+
+For additional memory savings, you can offload the weights to CPU and load them to GPU only when performing the forward pass.
+
+To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+With this, you can get memory consumption down to less than 2 GB.
+
+It is also possible to chain it with attention slicing for minimal memory consumption, running inference in as little as 800 MB of GPU VRAM:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+pipe.enable_attention_slicing(1)
+
+image = pipe(prompt).images[0]
+```
+
 ## Using Channels Last memory format

 Channels last memory format is an alternative way of ordering NCHW tensors in memory while preserving the dimension ordering. Channels last tensors are ordered in such a way that the channels become the densest dimension (i.e., images are stored pixel-by-pixel). Since not all operators currently support the channels last format, using it may result in worse performance, so it's better to try it and see if it works for your model.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index cca11281359a..e80cc1360b33 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -120,6 +120,11 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)

     def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and are then moved to
+        `torch.device('meta')`, getting loaded to GPU only when their specific submodule has its `forward` method
+        called.
+        """
         if is_accelerate_available():
             from accelerate import cpu_offload
         else:
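
As the hunk above shows, `enable_sequential_cpu_offload` delegates the heavy lifting to accelerate's `cpu_offload`. A minimal sketch of the same idea applied by hand to each submodule (assuming an already loaded pipeline; the exact wiring inside the pipeline method may differ):

```Python
import torch
from accelerate import cpu_offload
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    revision="fp16",
    torch_dtype=torch.float16,
)

device = torch.device("cuda")
# Each offloaded submodule keeps its weights off the GPU and streams them to
# `device` only for the duration of its own forward pass.
for submodule in [pipe.unet, pipe.text_encoder, pipe.vae, pipe.safety_checker]:
    if submodule is not None:
        cpu_offload(submodule, device)
```

Calling the pipeline afterwards works as usual: each component is pulled onto the GPU just in time, which is why memory stays low at the cost of extra host-to-device transfers per step.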