Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IPAdapterTesterMixin #6862

Merged
merged 33 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
fdbb31a
begin IPAdapterTesterMixin
a-r-r-o-w Feb 5, 2024
ab8c2aa
add image encoder to sd pipe test
a-r-r-o-w Feb 5, 2024
0f445f7
add ip-adapter test
a-r-r-o-w Feb 5, 2024
ae0e789
update signature test
a-r-r-o-w Feb 5, 2024
724ef3a
fix
a-r-r-o-w Feb 5, 2024
6136218
add multi-adapter test
a-r-r-o-w Feb 5, 2024
7f5979a
Update tests/pipelines/test_pipelines_common.py
a-r-r-o-w Feb 6, 2024
b08daa5
Update tests/pipelines/test_pipelines_common.py
a-r-r-o-w Feb 6, 2024
214ebc5
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 6, 2024
b8ddccc
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 8, 2024
c568d5b
update tests to use ip adapter image embeds
a-r-r-o-w Feb 8, 2024
1bc0697
add mixin to most fast tests
a-r-r-o-w Feb 8, 2024
0711d70
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 9, 2024
3ac557f
add check for ip_adapter_image in pipeline call as well
a-r-r-o-w Feb 9, 2024
13af51d
fix return_dict related issues on accessing .images for video pipelines
a-r-r-o-w Feb 9, 2024
53c39e8
use cross attention dim instead of sample size from unet config
a-r-r-o-w Feb 9, 2024
ca36d10
fix
a-r-r-o-w Feb 9, 2024
501e7f3
remove ip adapter test mixin from LCM
a-r-r-o-w Feb 10, 2024
11475a7
remove ip adapter test mixin from SAG
a-r-r-o-w Feb 10, 2024
7954cb8
fixes
a-r-r-o-w Feb 10, 2024
ff4181d
fix LCM IPA
a-r-r-o-w Feb 10, 2024
ceabed5
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 10, 2024
4e9d60a
fix animatediff based tests
a-r-r-o-w Feb 10, 2024
274c33e
Merge branch 'main' into ip-adapter-test-mixin
yiyixuxu Feb 15, 2024
91a35e3
fix pia test
a-r-r-o-w Feb 15, 2024
08d6b15
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 15, 2024
6744cac
fix tests
a-r-r-o-w Feb 15, 2024
9db5474
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 16, 2024
0dfc4d3
remove ip adapter test from sd panorama
a-r-r-o-w Feb 16, 2024
e4cafed
split test into single and multi
a-r-r-o-w Feb 16, 2024
a22fe2e
final final fix, i promise
a-r-r-o-w Feb 18, 2024
b2d84c5
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 18, 2024
cd225d4
Merge branch 'main' into ip-adapter-test-mixin
a-r-r-o-w Feb 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/diffusers/pipelines/animatediff/pipeline_animatediff.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,7 +1066,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7. Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 8. Denoising loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,41 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state

return image_embeds, uncond_image_embeds

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
):
    """Build one image-embedding tensor per loaded IP Adapter for the denoising loop.

    Args:
        ip_adapter_image: A single image (or image batch) or a list with one
            entry per loaded IP Adapter. Ignored when
            ``ip_adapter_image_embeds`` is provided.
        ip_adapter_image_embeds: Optional pre-computed embeddings. When given,
            they are returned unchanged — NOTE(review): they are not moved to
            ``device`` nor concatenated with negative embeds here, so callers
            must supply them already formatted; confirm against pipeline usage.
        device: Device on which freshly encoded embeddings are placed.
        num_images_per_prompt: Number of copies stacked per embedding.

    Returns:
        list: One embedding tensor per IP Adapter (with negative embeds
        prepended when classifier-free guidance is active).
    """
    if ip_adapter_image_embeds is None:
        # Normalize to a list so a single image works with one adapter.
        if not isinstance(ip_adapter_image, list):
            ip_adapter_image = [ip_adapter_image]

        # Each loaded IP Adapter needs its own image input.
        if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
            )

        image_embeds = []
        for single_ip_adapter_image, image_proj_layer in zip(
            ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
        ):
            # Plain ImageProjection layers consume pooled CLIP embeddings;
            # any other projection layer consumes hidden states instead.
            output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
            single_image_embeds, single_negative_image_embeds = self.encode_image(
                single_ip_adapter_image, device, 1, output_hidden_state
            )
            # Replicate along a new leading dim for num_images_per_prompt.
            single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
            single_negative_image_embeds = torch.stack(
                [single_negative_image_embeds] * num_images_per_prompt, dim=0
            )

            if self.do_classifier_free_guidance:
                # Negative embeds come first, matching the [uncond, cond]
                # ordering used for the prompt embeddings under CFG.
                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                # NOTE(review): the .to(device) move only happens on the CFG
                # path — presumably encode_image already returns tensors on
                # `device`; confirm for the non-CFG path.
                single_image_embeds = single_image_embeds.to(device)

            image_embeds.append(single_image_embeds)
    else:
        # Pre-computed embeddings are passed through as-is.
        image_embeds = ip_adapter_image_embeds
    return image_embeds

# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
Expand Down Expand Up @@ -731,6 +766,7 @@ def __call__(
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -780,6 +816,9 @@ def __call__(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. If not
provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
`np.array`.
Expand Down Expand Up @@ -866,13 +905,10 @@ def __call__(
if self.do_classifier_free_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
image_embeds = self.prepare_ip_adapter_image_embeds(
ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
)
if self.do_classifier_free_guidance:
image_embeds = torch.cat([negative_image_embeds, image_embeds])

# 4. Prepare timesteps
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
Expand All @@ -899,7 +935,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7. Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 8. Denoising loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
Expand Down
6 changes: 5 additions & 1 deletion src/diffusers/pipelines/controlnet/pipeline_controlnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,7 +1206,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 7.2 Create tensor stating which controlnets to keep
controlnet_keep = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1200,7 +1200,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 7.2 Create tensor stating which controlnets to keep
controlnet_keep = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1495,7 +1495,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 7.2 Create tensor stating which controlnets to keep
controlnet_keep = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -477,8 +477,9 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state

return image_embeds, uncond_image_embeds

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
def prepare_ip_adapter_image_embeds(
self, ip_adapter_image, ip_adapter_image_embeds, do_classifier_free_guidance, device, num_images_per_prompt
self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
):
if ip_adapter_image_embeds is None:
if not isinstance(ip_adapter_image, list):
Expand All @@ -502,7 +503,7 @@ def prepare_ip_adapter_image_embeds(
[single_negative_image_embeds] * num_images_per_prompt, dim=0
)

if do_classifier_free_guidance:
if self.do_classifier_free_guidance:
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
single_image_embeds = single_image_embeds.to(device)

Expand Down Expand Up @@ -699,6 +700,10 @@ def cross_attention_kwargs(self):
def clip_skip(self):
return self._clip_skip

@property
def do_classifier_free_guidance(self):
    """Whether classifier-free guidance is applied.

    This pipeline never performs classifier-free guidance, so the flag is
    always ``False``; helpers shared with CFG-capable pipelines (such as
    ``prepare_ip_adapter_image_embeds``) read it to skip the negative-embed
    path.
    """
    return False

@property
def num_timesteps(self):
    """Number of denoising timesteps used by the most recent pipeline call."""
    return self._num_timesteps
Expand Down Expand Up @@ -845,7 +850,7 @@ def __call__(

if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
image_embeds = self.prepare_ip_adapter_image_embeds(
ip_adapter_image, ip_adapter_image_embeds, False, device, batch_size * num_images_per_prompt
ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt
)

# 3. Encode input prompt
Expand All @@ -860,7 +865,7 @@ def __call__(
prompt,
device,
num_images_per_prompt,
False,
self.do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=None,
Expand Down Expand Up @@ -906,7 +911,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)

# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 8. LCM Multistep Sampling Loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,41 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state

return image_embeds, uncond_image_embeds

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
):
    """Produce one image-embedding tensor per loaded IP Adapter.

    Args:
        ip_adapter_image: A single image (or image batch) or a list with one
            entry per loaded IP Adapter. Ignored when
            ``ip_adapter_image_embeds`` is provided.
        ip_adapter_image_embeds: Optional pre-computed embeddings, returned
            unchanged when given — NOTE(review): no ``.to(device)`` move or
            negative-embed concatenation is applied to them here; confirm
            callers pass them pre-formatted.
        device: Device for freshly encoded embeddings.
        num_images_per_prompt: Number of copies stacked per embedding.

    Returns:
        list: One embedding tensor per IP Adapter, with negative embeddings
        prepended when ``self.do_classifier_free_guidance`` is True.
    """
    if ip_adapter_image_embeds is None:
        # Accept a bare image by wrapping it into a one-element list.
        if not isinstance(ip_adapter_image, list):
            ip_adapter_image = [ip_adapter_image]

        # One image input is required per loaded IP Adapter.
        if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
            )

        image_embeds = []
        for single_ip_adapter_image, image_proj_layer in zip(
            ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
        ):
            # ImageProjection layers take pooled CLIP embeddings; other
            # projection layers take hidden states.
            output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
            single_image_embeds, single_negative_image_embeds = self.encode_image(
                single_ip_adapter_image, device, 1, output_hidden_state
            )
            # Stack copies along a new leading dim for num_images_per_prompt.
            single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
            single_negative_image_embeds = torch.stack(
                [single_negative_image_embeds] * num_images_per_prompt, dim=0
            )

            if self.do_classifier_free_guidance:
                # Negative embeds first — mirrors the [uncond, cond] prompt
                # embedding layout used for CFG batching.
                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                single_image_embeds = single_image_embeds.to(device)

            image_embeds.append(single_image_embeds)
    else:
        # Pass pre-computed embeddings through unchanged.
        image_embeds = ip_adapter_image_embeds
    return image_embeds

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
Expand Down Expand Up @@ -590,6 +625,10 @@ def cross_attention_kwargs(self):
def clip_skip(self):
return self._clip_skip

@property
def do_classifier_free_guidance(self):
    """Whether classifier-free guidance is in effect.

    Always ``False`` for this pipeline; shared helpers such as
    ``prepare_ip_adapter_image_embeds`` consult this flag to decide whether
    negative embeddings must be prepared.
    """
    return False

@property
def num_timesteps(self):
    """Number of denoising timesteps from the most recent pipeline call."""
    return self._num_timesteps
Expand All @@ -610,6 +649,7 @@ def __call__(
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -660,6 +700,9 @@ def __call__(
provided, text embeddings are generated from the `prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. If not
provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Expand Down Expand Up @@ -726,12 +769,10 @@ def __call__(
batch_size = prompt_embeds.shape[0]

device = self._execution_device
# do_classifier_free_guidance = guidance_scale > 1.0

if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
image_embeds = self.prepare_ip_adapter_image_embeds(
ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt
)

# 3. Encode input prompt
Expand All @@ -746,7 +787,7 @@ def __call__(
prompt,
device,
num_images_per_prompt,
False,
self.do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=None,
Expand Down Expand Up @@ -786,7 +827,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)

# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 8. LCM MultiStep Sampling Loop:
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
Expand Down
6 changes: 5 additions & 1 deletion src/diffusers/pipelines/pia/pipeline_pia.py
Original file line number Diff line number Diff line change
Expand Up @@ -1188,7 +1188,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7. Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 8. Denoising loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1111,7 +1111,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 7.2 Optionally get Guidance Scale Embedding
timestep_cond = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1397,7 +1397,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 9.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 9.2 Optionally get Guidance Scale Embedding
timestep_cond = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,11 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_cond_kwargs = (
{"image_embeds": image_embeds}
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
else None
)

# 8. Denoising loop
# Each denoising step also includes refinement of the latents with respect to the
Expand Down
6 changes: 5 additions & 1 deletion tests/models/unets/test_models_unet_2d_condition.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ def create_ip_adapter_state_dict(model):
key_id = 1

for name in model.attn_processors.keys():
cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
cross_attention_dim = (
None if name.endswith("attn1.processor") or "motion_module" in name else model.config.cross_attention_dim
)

if name.startswith("mid_block"):
hidden_size = model.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
Expand All @@ -71,6 +74,7 @@ def create_ip_adapter_state_dict(model):
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = model.config.block_out_channels[block_id]

if cross_attention_dim is not None:
sd = IPAdapterAttnProcessor(
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
Expand Down
Loading
Loading