SDXL Inpainting VAE Normalization #7225

Closed
@@ -308,6 +308,25 @@ def retrieve_timesteps(
    return timesteps, num_inference_steps


def requires_vae_latents_normalization(vae):
Review comment (Member): I think we can definitely do this in place without delegating it to a method. That way, the readability of the code stays linear and the reader doesn't have to refer to another method to see what's going on.
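One possible reading of this suggestion, sketched inline (a sketch, not the author's code; names are taken from this diff, and `self` and `image_latents` are assumed to be in scope inside `_encode_vae_image`):

    # Sketch of the suggested inline alternative to the helper functions below.
    has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
    has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
    if has_latents_mean and has_latents_std:
        latents_mean = (
            torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
            .to(image_latents.device, image_latents.dtype)
        )
        latents_std = (
            torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
            .to(image_latents.device, image_latents.dtype)
        )
        image_latents = (image_latents - latents_mean) / latents_std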

    return hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None and \
        hasattr(vae.config, "latents_std") and vae.config.latents_std is not None


def normalize_vae_latents(latents, latents_mean, latents_std):
    latents_mean = latents_mean.to(device=latents.device, dtype=latents.dtype)
    latents_std = latents_std.to(device=latents.device, dtype=latents.dtype)
    latents = (latents - latents_mean) / latents_std
    return latents
Review comment (Member): Same as above.



def denormalize_vae_latents(latents, latents_mean, latents_std):
    latents_mean = latents_mean.to(device=latents.device, dtype=latents.dtype)
    latents_std = latents_std.to(device=latents.device, dtype=latents.dtype)
    latents = latents * latents_std + latents_mean
    return latents
Review comment (Member): Same as above.
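As a quick illustration of what these two helpers do together (a hypothetical standalone snippet, not part of the diff): normalizing and then denormalizing with the same statistics recovers the original latents.

    import torch

    latents = torch.randn(1, 4, 64, 64)
    latents_mean = torch.tensor([0.1, -0.2, 0.0, 0.3]).view(1, 4, 1, 1)
    latents_std = torch.tensor([1.5, 0.9, 1.1, 1.2]).view(1, 4, 1, 1)

    normalized = normalize_vae_latents(latents, latents_mean, latents_std)    # (x - mean) / std
    recovered = denormalize_vae_latents(normalized, latents_mean, latents_std)  # x * std + mean
    assert torch.allclose(recovered, latents, atol=1e-6)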



class StableDiffusionXLInpaintPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
@@ -939,6 +958,12 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
        else:
            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)

        if requires_vae_latents_normalization(self.vae):
            image_latents = normalize_vae_latents(
                image_latents,
                latents_mean=torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1),
                latents_std=torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1),
            )
        if self.vae.config.force_upcast:
            self.vae.to(dtype)

@@ -1763,6 +1788,13 @@ def denoising_value_valid(dnv):
                if XLA_AVAILABLE:
                    xm.mark_step()

        if requires_vae_latents_normalization(self.vae):
            latents = denormalize_vae_latents(
                latents,
                latents_mean=torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1),
                latents_std=torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1),
            )

        if not output_type == "latent":
            # make sure the VAE is in float32 mode, as it overflows in float16
            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
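For context, the overflow guard referenced in the comment above follows the usual upcast-decode-downcast pattern in the SDXL pipelines (a simplified sketch, not the exact code; the actual decode block continues below the shown diff context):

    if needs_upcasting:
        self.vae.to(torch.float32)          # decode in float32 to avoid fp16 overflow
        latents = latents.to(torch.float32)
    image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
    if needs_upcasting:
        self.vae.to(torch.float16)          # restore the original dtype afterwards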