Merge branch 'dev' into SD3-parsing
vladmandic authored Jun 21, 2024
2 parents 1fcd378 + e51599c commit 6b6170b
Showing 14 changed files with 87 additions and 66 deletions.
7 changes: 4 additions & 3 deletions CHANGELOG.md
@@ -3,12 +3,11 @@
## Pending

- Diffusers==0.30.0
- https://github.com/huggingface/diffusers/pull/8566
- https://github.com/huggingface/diffusers/pull/8584

## Update for 2024-06-19
## Update for 2024-06-20

### Highlights for 2024-06-19
### Highlights for 2024-06-20

Following zero-day **SD3** release, a week later here's a refresh with more than a few improvements.
But there's more than SD3:
@@ -51,6 +50,7 @@ But there's more than SD3:
- improved google.colab support
- css tweaks for standardui
- css tweaks for modernui
- additional torch gc checks, thanks @Disty0!

### Fixes

@@ -68,6 +68,7 @@ But there's more than SD3:
- fix sdxl "has been incorrectly initialized"
- fix api face-hires
- fix api ip-adapter
- fix memory exceptions with ROCm, thanks @Disty0!
- cleanup image metadata
- restructure api examples: `cli/api-*`
- handle theme fallback when invalid theme is specified
1 change: 1 addition & 0 deletions TODO.md
@@ -11,6 +11,7 @@ Main ToDo list can be found at [GitHub projects](https://github.com/users/vladma
- diffusers public callbacks
- include reference styles
- lora: sc lora, dora, etc
- sd3 controlnet: <https://github.com/huggingface/diffusers/pull/8566>

## Experimental

2 changes: 1 addition & 1 deletion installer.py
@@ -399,7 +399,7 @@ def check_python(supported_minors=[9, 10, 11, 12], reason=None):
if args.quick:
return
log.info(f'Python version={platform.python_version()} platform={platform.system()} bin="{sys.executable}" venv="{sys.prefix}"')
if int(sys.version_info.major) == 3 and int(sys.version_info.minor) == 12 and int(sys.version_info.minor) > 3: # TODO python 3.12.4 or higher cause a mess with pydantic
if int(sys.version_info.major) == 3 and int(sys.version_info.minor) == 12 and int(sys.version_info.micro) > 3: # TODO python 3.12.4 or higher cause a mess with pydantic
log.error(f"Incompatible Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro} required 3.12.3 or lower")
if reason is not None:
log.error(reason)
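For reference, `sys.version_info` is a named tuple `(major, minor, micro, releaselevel, serial)`; the old condition compared `minor` a second time where it meant `micro`, so every 3.12.x interpreter was flagged rather than only 3.12.4 and later. A minimal standalone sketch of the corrected check (illustrative, not the installer's exact logging):

```python
import sys

# sys.version_info fields: major, minor, micro, releaselevel, serial
# Python 3.12.4+ is rejected here because it is known to break pydantic
if sys.version_info.major == 3 and sys.version_info.minor == 12 and sys.version_info.micro > 3:
    print(f"Incompatible Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}, required 3.12.3 or lower")
```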
9 changes: 6 additions & 3 deletions modules/devices.py
@@ -140,13 +140,16 @@ def torch_gc(force=False):
used_gpu = round(100 * gpu.get('used', 0) / gpu.get('total', 1)) if gpu.get('total', 1) > 1 else 0
used_ram = round(100 * ram.get('used', 0) / ram.get('total', 1)) if ram.get('total', 1) > 1 else 0
global previous_oom # pylint: disable=global-statement
if force or shared.opts.torch_gc_threshold == 0:
log.debug(f'Forced Torch GC: GPU={used_gpu}% RAM={used_ram}% {mem}')
force = True
elif used_gpu >= shared.opts.torch_gc_threshold or used_ram >= shared.opts.torch_gc_threshold:
log.info(f'High memory utilization: GPU={used_gpu}% RAM={used_ram}% {mem}')
force = True
if oom > previous_oom:
previous_oom = oom
log.warning(f'GPU out-of-memory error: {mem}')
force = True
if used_gpu >= shared.opts.torch_gc_threshold or used_ram >= shared.opts.torch_gc_threshold:
log.info(f'High memory utilization: GPU={used_gpu}% RAM={used_ram}% {mem}')
force = True
if not force:
return

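Read in isolation, the reordered checks treat an explicit `force` or a threshold of 0 as an unconditional collection, fold the high-utilization test into an `elif`, and then let a newly observed OOM event force collection regardless of the above. A rough standalone sketch of that decision order (function and argument names are illustrative, not the module's API):

```python
def gc_decision(force, threshold, used_gpu, used_ram, oom, previous_oom):
    # illustrative only: mirrors the order of checks in the updated torch_gc
    if force or threshold == 0:
        force = True                              # explicit request or threshold disabled
    elif used_gpu >= threshold or used_ram >= threshold:
        force = True                              # high GPU or RAM utilization
    if oom > previous_oom:
        previous_oom = oom                        # remember the new OOM count
        force = True                              # a fresh out-of-memory event always collects
    return force, previous_oom
```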
53 changes: 25 additions & 28 deletions modules/pag/pipe_sdxl.py
@@ -461,8 +461,9 @@ def __init__(
image_encoder=image_encoder,
feature_extractor=feature_extractor,
)
if 'requires_aesthetics_score' in self.config:
self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.default_sample_size = self.unet.config.sample_size
@@ -1500,7 +1501,7 @@ def __call__(
else:
replace_processor = PAGIdentitySelfAttnProcessor()

if(self.pag_applied_layers_index):
if self.pag_applied_layers_index:
drop_layers = self.pag_applied_layers_index
for drop_layer in drop_layers:
layer_number = int(drop_layer[1:])
@@ -1517,7 +1518,7 @@
raise ValueError(
f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
)
elif(self.pag_applied_layers):
elif self.pag_applied_layers:
drop_full_layers = self.pag_applied_layers
for drop_full_layer in drop_full_layers:
try:
@@ -1621,7 +1622,7 @@ def __call__(
if XLA_AVAILABLE:
xm.mark_step()

if not output_type == "latent":
if output_type != "latent":
# make sure the VAE is in float32 mode, as it overflows in float16
needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast

@@ -1656,7 +1657,7 @@ def __call__(
else:
image = latents

if not output_type == "latent":
if output_type != "latent":
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
@@ -1671,7 +1672,7 @@

#Change the attention layers back to original ones after PAG was applied
if self.do_adversarial_guidance:
if(self.pag_applied_layers_index):
if self.pag_applied_layers_index:
drop_layers = self.pag_applied_layers_index
for drop_layer in drop_layers:
layer_number = int(drop_layer[1:])
@@ -1685,26 +1686,22 @@
else:
raise ValueError(f"Invalid layer type: {drop_layer[0]}")
except IndexError:
raise ValueError(
f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
)
elif(self.pag_applied_layers):
drop_full_layers = self.pag_applied_layers
for drop_full_layer in drop_full_layers:
try:
if drop_full_layer == "down":
for down_layer in down_layers:
down_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "mid":
for mid_layer in mid_layers:
mid_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "up":
for up_layer in up_layers:
up_layer.processor = AttnProcessor2_0()
else:
raise ValueError(f"Invalid layer type: {drop_full_layer}")
except IndexError:
raise ValueError(
f"Invalid layer index: {drop_full_layer}. Available layers are: down, mid and up. If you need to specify each layer index, you can use `pag_applied_layers_index`"
)
raise ValueError(f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers.")
elif self.pag_applied_layers:
drop_full_layers = self.pag_applied_layers
for drop_full_layer in drop_full_layers:
try:
if drop_full_layer == "down":
for down_layer in down_layers:
down_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "mid":
for mid_layer in mid_layers:
mid_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "up":
for up_layer in up_layers:
up_layer.processor = AttnProcessor2_0()
else:
raise ValueError(f"Invalid layer type: {drop_full_layer}")
except IndexError:
raise ValueError(f"Invalid layer index: {drop_full_layer}. Available layers are: down, mid and up. If you need to specify each layer index, you can use `pag_applied_layers_index`")
return StableDiffusionXLPipelineOutput(images=image)
1 change: 1 addition & 0 deletions modules/processing_helpers.py
@@ -400,6 +400,7 @@ def resize_hires(p, latents): # input=latents output=pil if not latent_upscaler
else:
resized_image = img
resized_images.append(resized_image)
devices.torch_gc()
return resized_images


2 changes: 2 additions & 0 deletions modules/processing_vae.py
@@ -140,6 +140,7 @@ def vae_decode(latents, model, output_type='np', full_quality=True):
if shared.cmd_opts.profile:
t1 = time.time()
shared.log.debug(f'Profile: VAE decode: {t1-t0:.2f}')
devices.torch_gc()
return imgs


@@ -155,4 +156,5 @@ def vae_encode(image, model, full_quality=True): # pylint: disable=unused-variab
latents = full_vae_encode(image=tensor, model=shared.sd_model)
else:
latents = taesd_vae_encode(image=tensor)
devices.torch_gc()
return latents
34 changes: 18 additions & 16 deletions modules/prompt_parser_diffusers.py
@@ -12,10 +12,9 @@
debug = shared.log.trace if os.environ.get('SD_PROMPT_DEBUG', None) is not None else lambda *args, **kwargs: None
debug('Trace: PROMPT')
orig_encode_token_ids_to_embeddings = EmbeddingsProvider._encode_token_ids_to_embeddings # pylint: disable=protected-access
token_dict = None
token_type = None
token_dict = None # used by helper get_tokens
token_type = None # used by helper get_tokens
cache = {}
cache_type = None


def compel_hijack(self, token_ids: torch.Tensor,
@@ -151,7 +150,7 @@ def encode_prompts(pipe, p, prompts: list, negative_prompts: list, steps: int, c
if 'StableDiffusion' not in pipe.__class__.__name__ and 'DemoFusion' not in pipe.__class__.__name__ and 'StableCascade' not in pipe.__class__.__name__:
shared.log.warning(f"Prompt parser not supported: {pipe.__class__.__name__}")
return
elif prompts == cache.get('prompts', None) and negative_prompts == cache.get('negative_prompts', None) and clip_skip == cache.get('clip_skip', None) and cache.get('model_type', None) == shared.sd_model_type and steps == cache.get('steps', None):
elif shared.opts.sd_textencoder_cache and prompts == cache.get('prompts', None) and negative_prompts == cache.get('negative_prompts', None) and clip_skip == cache.get('clip_skip', None) and cache.get('model_type', None) == shared.sd_model_type and steps == cache.get('steps', None):
p.prompt_embeds = cache.get('prompt_embeds', None)
p.positive_pooleds = cache.get('positive_pooleds', None)
p.negative_embeds = cache.get('negative_embeds', None)
@@ -182,18 +181,21 @@ def encode_prompts(pipe, p, prompts: list, negative_prompts: list, steps: int, c
if negative_pooled is not None:
p.negative_pooleds.append(torch.cat([negative_pooled] * len(negative_prompts), dim=0))

cache.update({
'prompt_embeds': p.prompt_embeds,
'negative_embeds': p.negative_embeds,
'positive_pooleds': p.positive_pooleds,
'negative_pooleds': p.negative_pooleds,
'scheduled_prompt': p.scheduled_prompt,
'prompts': prompts,
'negative_prompts': negative_prompts,
'clip_skip': clip_skip,
'steps': steps,
'model_type': shared.sd_model_type
})
if shared.opts.sd_textencoder_cache:
cache.update({
'prompt_embeds': p.prompt_embeds,
'negative_embeds': p.negative_embeds,
'positive_pooleds': p.positive_pooleds,
'negative_pooleds': p.negative_pooleds,
'scheduled_prompt': p.scheduled_prompt,
'prompts': prompts,
'negative_prompts': negative_prompts,
'clip_skip': clip_skip,
'steps': steps,
'model_type': shared.sd_model_type
})
else:
cache.clear()
if debug_enabled:
get_tokens('positive', prompts[0])
get_tokens('negative', negative_prompts[0])
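The embedding cache is now consulted and written only when `shared.opts.sd_textencoder_cache` is enabled, and cleared otherwise, so disabling the option cannot leave stale embeddings behind. A compact sketch of the pattern (the option name comes from the diff; everything else is illustrative):

```python
cache: dict = {}

def store_embeddings(enabled: bool, key: dict, embeddings: dict) -> None:
    # keep results only while caching is enabled; otherwise drop anything stale
    if enabled:
        cache.update({**key, **embeddings})
    else:
        cache.clear()
```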
14 changes: 10 additions & 4 deletions modules/sd_models.py
@@ -255,15 +255,16 @@ def select_checkpoint(op='model'):
shared.log.info(" or use --ckpt-dir <path-to-folder> to specify folder with sd models")
shared.log.info(" or use --ckpt <path-to-checkpoint> to force using specific model")
return None
checkpoint_info = next(iter(checkpoints_list.values()))
# checkpoint_info = next(iter(checkpoints_list.values()))
if model_checkpoint is not None:
if model_checkpoint != 'model.ckpt' and model_checkpoint != 'runwayml/stable-diffusion-v1-5':
shared.log.warning(f"Selected checkpoint not found: {model_checkpoint}")
shared.log.warning(f'Selected: {op}="{model_checkpoint}" not found')
else:
shared.log.info("Selecting first available checkpoint")
# shared.log.warning(f"Loading fallback checkpoint: {checkpoint_info.title}")
shared.opts.data['sd_model_checkpoint'] = checkpoint_info.title
shared.log.info(f'Select: {op}="{checkpoint_info.title if checkpoint_info is not None else None}"')
# shared.opts.data['sd_model_checkpoint'] = checkpoint_info.title
else:
shared.log.info(f'Select: {op}="{checkpoint_info.title if checkpoint_info is not None else None}"')
return checkpoint_info


@@ -936,6 +937,7 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No

checkpoint_info = checkpoint_info or select_checkpoint(op=op)
if checkpoint_info is None:
print('HERE1')
unload_model_weights(op=op)
return

@@ -1113,6 +1115,8 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
if sd_model is None:
shared.log.error('Diffuser model not loaded')
return
if 'requires_aesthetics_score' in sd_model.config:
sd_model.register_to_config(requires_aesthetics_score=False)
sd_model.sd_model_hash = checkpoint_info.calculate_shorthash() # pylint: disable=attribute-defined-outside-init
sd_model.sd_checkpoint_info = checkpoint_info # pylint: disable=attribute-defined-outside-init
sd_model.sd_model_checkpoint = checkpoint_info.filename # pylint: disable=attribute-defined-outside-init
@@ -1575,6 +1579,7 @@ def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model',
else:
load_diffuser(checkpoint_info, already_loaded_state_dict=state_dict, timer=timer, op=op)
if load_dict and next_checkpoint_info is not None:
print('HERE2')
model_data.sd_dict = shared.opts.sd_model_dict
shared.opts.data["sd_model_checkpoint"] = next_checkpoint_info.title
reload_model_weights(reuse_dict=True) # ok we loaded dict now lets redo and load model on top of it
@@ -1588,6 +1593,7 @@ def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model',
shared.opts.data["sd_model_refiner"] = checkpoint_info.title
return model_data.sd_refiner

print('HERE3')
# fallback
shared.log.info(f"Loading using fallback: {op} model={checkpoint_info.title}")
try:
8 changes: 5 additions & 3 deletions modules/sd_samplers_common.py
@@ -40,15 +40,17 @@ def single_sample_to_image(sample, approximation=None):
warn_once('Unknown decode type')
approximation = 0
# normal sample is [4,64,64]
if sample.dtype == torch.bfloat16:
sample = sample.to(torch.float16)
try:
if sample.dtype == torch.bfloat16:
sample = sample.to(torch.float16)
except Exception as e:
warn_once(f'live preview: {e}')
if len(sample.shape) > 4: # likely unknown video latent (e.g. svd)
return Image.new(mode="RGB", size=(512, 512))
if len(sample) == 16: # sd_cascade
sd_cascade = True
if len(sample.shape) == 4 and sample.shape[0]: # likely animatediff latent
sample = sample.permute(1, 0, 2, 3)[0]

if shared.native: # [-x,x] to [-5,5]
sample_max = torch.max(sample)
if sample_max > 5:
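The new try/except wraps the bfloat16-to-float16 downcast used for live previews: numpy has no bfloat16 dtype, so the latent has to be cast before it can be turned into an image, and a failed cast now only degrades the preview instead of raising. A minimal sketch of the idea, assuming the sample is a torch tensor:

```python
import torch

def previewable(sample: torch.Tensor) -> torch.Tensor:
    # numpy (and therefore PIL conversion) cannot handle bfloat16 tensors
    try:
        if sample.dtype == torch.bfloat16:
            sample = sample.to(torch.float16)
    except Exception as e:
        print(f'live preview: {e}')  # degrade the preview rather than abort generation
    return sample
```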
13 changes: 8 additions & 5 deletions modules/sd_vae_approx.py
@@ -34,21 +34,24 @@ def forward(self, x):

def nn_approximation(sample): # Approximate NN
global sd_vae_approx_model # pylint: disable=global-statement
# ROCm throws memory exceptions and crashes the GPU with it if we use approx on the GPU
device = devices.device if devices.backend != "rocm" else "cpu"
dtype = devices.dtype_vae if devices.backend != "rocm" else torch.float32
if sd_vae_approx_model is None:
model_path = os.path.join(paths.models_path, "VAE-approx", "model.pt")
sd_vae_approx_model = VAEApprox()
if not os.path.exists(model_path):
model_path = os.path.join(paths.script_path, "models", "VAE-approx", "model.pt")
approx_weights = torch.load(model_path, map_location='cpu' if devices.device.type != 'cuda' else None)
approx_weights = torch.load(model_path, map_location='cpu' if devices.device.type != 'cuda' or devices.backend == "rocm" else None)
sd_vae_approx_model.load_state_dict(approx_weights)
sd_vae_approx_model.eval()
sd_vae_approx_model.to(devices.device, sample.dtype)
sd_vae_approx_model.to(device, dtype)
shared.log.debug(f'VAE load: type=approximate model={model_path}')
try:
in_sample = sample.to(devices.device).unsqueeze(0)
sd_vae_approx_model.to(devices.device, devices.dtype)
in_sample = sample.to(device, dtype).unsqueeze(0)
sd_vae_approx_model.to(device, dtype)
x_sample = sd_vae_approx_model(in_sample)
x_sample = x_sample[0].detach().cpu()
x_sample = x_sample[0].to(torch.float32).detach().cpu()
return x_sample
except Exception as e:
shared.log.error(f'VAE decode approximate: {e}')
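The workaround pins the approximate-preview VAE to CPU in float32 when the backend is ROCm, since running it on the GPU there has triggered memory exceptions; the decoded sample is also cast back to float32 before being moved to the CPU. A small sketch of the device/dtype selection under that assumption (names are illustrative, not the module's API):

```python
import torch

def approx_placement(backend: str, default_device: torch.device, default_dtype: torch.dtype):
    # ROCm has shown memory exceptions / GPU crashes with the approx VAE on-device,
    # so fall back to CPU in float32 there
    if backend == "rocm":
        return torch.device("cpu"), torch.float32
    return default_device, default_dtype
```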