* start refactoring - not yet functional
* first phase of refactor done - not sure weighted prompts working
* Second phase of refactoring. Everything mostly working.
* The refactoring has moved all the hard-core inference work into ldm.dream.generator.*, where there are submodules for txt2img and img2img. inpaint will go in there as well.
* Some additional refactoring will be done soon, but relatively minor work.
* fix -save_orig flag to actually work
* add @neonsecret attention.py memory optimization
* remove unneeded imports
* move token logging into conditioning.py
* add placeholder version of inpaint; porting in progress
* fix crash in img2img
* inpainting working; not tested on variations
* fix crashes in img2img
* ported attention.py memory optimization #117 from basujindal branch
* added @torch.no_grad() decorators to img2img, txt2img, inpaint closures
* Final commit prior to PR against development
* fixup crash when generating intermediate images in web UI
* rename ldm.simplet2i to ldm.generate
* add backward-compatibility simplet2i shell with deprecation warning
* add back in mps exception, addresses @Vargol comment in #354
* replaced Conditioning class with exported functions
* fix wrong type of with_variations attribute during initialization
* changed "image_iterator()" to "get_make_image()"
* raise NotImplementedError for calling get_make_image() in parent class
* Update ldm/generate.py: better error message (Co-authored-by: Kevin Gibbons <bakkot@gmail.com>)
* minor stylistic fixes and assertion checks from code review
* moved get_noise() method into img2img class
* break get_noise() into two methods, one for txt2img and the other for img2img
* inpainting works on non-square images now
* make get_noise() an abstract method in base class
* much improved inpainting

Co-authored-by: Kevin Gibbons <bakkot@gmail.com>
Showing 16 changed files with 1,261 additions and 990 deletions.
ldm/dream/conditioning.py
@@ -0,0 +1,96 @@
'''
This module handles the generation of the conditioning tensors, including management of
weighted subprompts.

Useful function exports:

get_uc_and_c()                  get the conditioned and unconditioned latent
split_weighted_subprompts()     split subprompts, normalize and weight them
log_tokenization()              print out colour-coded tokens and warn if truncated
'''
import re
import torch

def get_uc_and_c(prompt, model, log_tokens=False, skip_normalize=False):
    uc = model.get_learned_conditioning([''])

    # get weighted sub-prompts
    weighted_subprompts = split_weighted_subprompts(
        prompt, skip_normalize
    )

    if len(weighted_subprompts) > 1:
        # i dont know if this is correct.. but it works
        c = torch.zeros_like(uc)
        # normalize each "sub prompt" and add it
        for subprompt, weight in weighted_subprompts:
            log_tokenization(subprompt, model, log_tokens)
            c = torch.add(
                c,
                model.get_learned_conditioning([subprompt]),
                alpha=weight,
            )
    else:   # just standard 1 prompt
        log_tokenization(prompt, model, log_tokens)
        c = model.get_learned_conditioning([prompt])
    return (uc, c)

def split_weighted_subprompts(text, skip_normalize=False) -> list:
    """
    Grabs all text up to the first occurrence of ':',
    uses the grabbed text as a sub-prompt, and takes the value following ':' as weight.
    If ':' has no value defined, defaults to 1.0.
    Repeats until no text remains.
    """
    prompt_parser = re.compile("""
            (?P<prompt>         # capture group for 'prompt'
            (?:\\\:|[^:])+      # match one or more non ':' characters or escaped colons '\:'
            )                   # end 'prompt'
            (?:                 # non-capture group
            :+                  # match one or more ':' characters
            (?P<weight>         # capture group for 'weight'
            -?\d+(?:\.\d+)?     # match positive or negative integer or decimal number
            )?                  # end weight capture group, make optional
            \s*                 # strip spaces after weight
            |                   # OR
            $                   # else, if no ':' then match end of line
            )                   # end non-capture group
            """, re.VERBOSE)
    parsed_prompts = [(match.group("prompt").replace("\\:", ":"), float(
        match.group("weight") or 1)) for match in re.finditer(prompt_parser, text)]
    if skip_normalize:
        return parsed_prompts
    weight_sum = sum(map(lambda x: x[1], parsed_prompts))
    if weight_sum == 0:
        print(
            "Warning: Subprompt weights add up to zero. Discarding and using even weights instead.")
        equal_weight = 1 / len(parsed_prompts)
        return [(x[0], equal_weight) for x in parsed_prompts]
    return [(x[0], x[1] / weight_sum) for x in parsed_prompts]

# shows how the prompt is tokenized
# usually tokens have '</w>' to indicate end-of-word,
# but for readability it has been replaced with ' '
def log_tokenization(text, model, log=False):
    if not log:
        return
    tokens = model.cond_stage_model.tokenizer._tokenize(text)
    tokenized = ""
    discarded = ""
    usedTokens = 0
    totalTokens = len(tokens)
    for i in range(0, totalTokens):
        token = tokens[i].replace('</w>', ' ')
        # alternate color
        s = (usedTokens % 6) + 1
        if i < model.cond_stage_model.max_length:
            tokenized = tokenized + f"\x1b[0;3{s};40m{token}"
            usedTokens += 1
        else:  # over max token length
            discarded = discarded + f"\x1b[0;3{s};40m{token}"
    print(f"\n>> Tokens ({usedTokens}):\n{tokenized}\x1b[0m")
    if discarded != "":
        print(
            f">> Tokens Discarded ({totalTokens-usedTokens}):\n{discarded}\x1b[0m"
        )
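For reference, a minimal sketch of how the weighted-subprompt syntax behaves (the prompt string is illustrative; it assumes ldm.dream.conditioning above is importable):

from ldm.dream.conditioning import split_weighted_subprompts

# unescaped colons delimit subprompts; the number after each colon is its
# weight (default 1.0), and weights are normalized to sum to 1
parts = split_weighted_subprompts('a cat playing with string:2 a dog:1')
for subprompt, weight in parts:
    print(f'{weight:.3f} {subprompt!r}')
# -> 0.667 'a cat playing with string'
# -> 0.333 'a dog'

An escaped colon (\:) passes through as a literal colon instead of starting a new weight.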
ldm/dream/generator/__init__.py
@@ -0,0 +1,4 @@
'''
Initialization file for the ldm.dream.generator package
'''
from .base import Generator
ldm/dream/generator/base.py
@@ -0,0 +1,158 @@
'''
Base class for ldm.dream.generator.*,
including img2img, txt2img, and inpaint
'''
import torch
import numpy as np
import random
from tqdm import tqdm, trange
from PIL import Image
from einops import rearrange, repeat
from pytorch_lightning import seed_everything
from ldm.dream.devices import choose_autocast_device

downsampling = 8

class Generator():
    def __init__(self, model):
        self.model               = model
        self.seed                = None
        self.latent_channels     = model.channels
        self.downsampling_factor = downsampling   # BUG: should come from model or config
        self.variation_amount    = 0
        self.with_variations     = []

    # this is going to be overridden in img2img.py, txt2img.py and inpaint.py
    def get_make_image(self, prompt, **kwargs):
        """
        Returns a function returning an image derived from the prompt and the initial image.
        Return value depends on the seed at the time you call it.
        """
        raise NotImplementedError("get_make_image() must be implemented in a descendant class")

    def set_variation(self, seed, variation_amount, with_variations):
        self.seed             = seed
        self.variation_amount = variation_amount
        self.with_variations  = with_variations

    def generate(self, prompt, init_image, width, height, iterations=1, seed=None,
                 image_callback=None, step_callback=None,
                 **kwargs):
        device_type, scope = choose_autocast_device(self.model.device)
        make_image = self.get_make_image(
            prompt,
            init_image    = init_image,
            width         = width,
            height        = height,
            step_callback = step_callback,
            **kwargs
        )

        results = []
        seed    = seed if seed else self.new_seed()
        seed, initial_noise = self.generate_initial_noise(seed, width, height)
        with scope(device_type), self.model.ema_scope():
            for n in trange(iterations, desc='Generating'):
                x_T = None
                if self.variation_amount > 0:
                    seed_everything(seed)
                    target_noise = self.get_noise(width, height)
                    x_T = self.slerp(self.variation_amount, initial_noise, target_noise)
                elif initial_noise is not None:
                    # i.e. we specified particular variations
                    x_T = initial_noise
                else:
                    seed_everything(seed)
                    if self.model.device.type == 'mps':
                        x_T = self.get_noise(width, height)

                # make_image will do the equivalent of get_noise itself
                image = make_image(x_T)
                results.append([image, seed])
                if image_callback is not None:
                    image_callback(image, seed)
                seed = self.new_seed()
        return results

    def sample_to_image(self, samples):
        """
        Decodes a batch of exactly one latent sample and returns the
        corresponding PIL Image; raises if the batch contains more than one.
        """
        x_samples = self.model.decode_first_stage(samples)
        x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
        if len(x_samples) != 1:
            raise Exception(
                f'>> expected to get a single image, but got {len(x_samples)}')
        x_sample = 255.0 * rearrange(
            x_samples[0].cpu().numpy(), 'c h w -> h w c'
        )
        return Image.fromarray(x_sample.astype(np.uint8))

    def generate_initial_noise(self, seed, width, height):
        initial_noise = None
        if self.variation_amount > 0 or len(self.with_variations) > 0:
            # use fixed initial noise plus random noise per iteration
            seed_everything(seed)
            initial_noise = self.get_noise(width, height)
            for v_seed, v_weight in self.with_variations:
                seed = v_seed
                seed_everything(seed)
                next_noise = self.get_noise(width, height)
                initial_noise = self.slerp(v_weight, initial_noise, next_noise)
            if self.variation_amount > 0:
                random.seed()   # reset RNG to an actually random state, so we can get a random seed for variations
                seed = random.randrange(0, np.iinfo(np.uint32).max)
            return (seed, initial_noise)
        else:
            return (seed, None)

    def get_noise(self, width, height):
        """
        Returns a tensor filled with random numbers, either from a normal distribution
        (txt2img) or from the latent image (img2img, inpaint)
        """
        raise NotImplementedError("get_noise() must be implemented in a descendant class")

    def new_seed(self):
        self.seed = random.randrange(0, np.iinfo(np.uint32).max)
        return self.seed

    def slerp(self, t, v0, v1, DOT_THRESHOLD=0.9995):
        '''
        Spherical linear interpolation

        Args:
            t (float/np.ndarray): Float value between 0.0 and 1.0
            v0 (np.ndarray): Starting vector
            v1 (np.ndarray): Final vector
            DOT_THRESHOLD (float): Threshold for considering the two vectors as
                                   colinear. Not recommended to alter this.

        Returns:
            v2 (np.ndarray): Interpolation vector between v0 and v1
        '''
        inputs_are_torch = False
        if not isinstance(v0, np.ndarray):
            inputs_are_torch = True
            v0 = v0.detach().cpu().numpy()
        if not isinstance(v1, np.ndarray):
            inputs_are_torch = True
            v1 = v1.detach().cpu().numpy()

        dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
        if np.abs(dot) > DOT_THRESHOLD:
            v2 = (1 - t) * v0 + t * v1
        else:
            theta_0 = np.arccos(dot)
            sin_theta_0 = np.sin(theta_0)
            theta_t = theta_0 * t
            sin_theta_t = np.sin(theta_t)
            s0 = np.sin(theta_0 - theta_t) / sin_theta_0
            s1 = sin_theta_t / sin_theta_0
            v2 = s0 * v0 + s1 * v1

        if inputs_are_torch:
            v2 = torch.from_numpy(v2).to(self.model.device)

        return v2
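To make the subclass contract concrete, here is a minimal hypothetical descendant (the class name NullGenerator and its blank-canvas output are illustrative only, not part of this commit): get_make_image() closes over per-call state and returns a make_image(x_T) callable, while get_noise() produces noise in whatever space the subclass samples in.

import torch
from PIL import Image
from ldm.dream.generator.base import Generator

class NullGenerator(Generator):
    '''Hypothetical do-nothing generator illustrating the subclass contract.'''

    def get_make_image(self, prompt, width=512, height=512, **kwargs):
        # generate() calls the returned closure once per iteration,
        # passing the initial noise tensor (or None) as x_T
        def make_image(x_T):
            noise = x_T if x_T is not None else self.get_noise(width, height)
            # a real subclass would run its sampler on `noise` here and
            # decode the result with self.sample_to_image()
            return Image.new('RGB', (width, height))
        return make_image

    def get_noise(self, width, height):
        # txt2img-style noise, sampled in the downsampled latent space
        return torch.randn(
            1,
            self.latent_channels,
            height // self.downsampling_factor,
            width  // self.downsampling_factor,
            device=self.model.device,
        )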
ldm/dream/generator/img2img.py
@@ -0,0 +1,72 @@
'''
ldm.dream.generator.img2img descends from ldm.dream.generator
'''

import torch
import numpy as np
from ldm.dream.devices import choose_autocast_device
from ldm.dream.generator.base import Generator
from ldm.models.diffusion.ddim import DDIMSampler

class Img2Img(Generator):
    def __init__(self, model):
        super().__init__(model)
        self.init_latent = None    # set by get_make_image(), used by get_noise()

    @torch.no_grad()
    def get_make_image(self, prompt, sampler, steps, cfg_scale, ddim_eta,
                       conditioning, init_image, strength, step_callback=None, **kwargs):
        """
        Returns a function returning an image derived from the prompt and the initial image.
        Return value depends on the seed at the time you call it.
        """

        # PLMS sampler not supported yet, so ignore previous sampler
        if not isinstance(sampler, DDIMSampler):
            print(
                f">> sampler '{sampler.__class__.__name__}' is not yet supported. Using DDIM sampler"
            )
            sampler = DDIMSampler(self.model, device=self.model.device)

        sampler.make_schedule(
            ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False
        )

        device_type, scope = choose_autocast_device(self.model.device)
        with scope(device_type):
            self.init_latent = self.model.get_first_stage_encoding(
                self.model.encode_first_stage(init_image)
            )  # move to latent space

        t_enc = int(strength * steps)
        uc, c = conditioning

        @torch.no_grad()
        def make_image(x_T):
            # encode (scaled latent)
            z_enc = sampler.stochastic_encode(
                self.init_latent,
                torch.tensor([t_enc]).to(self.model.device),
                noise=x_T
            )
            # decode it
            samples = sampler.decode(
                z_enc,
                c,
                t_enc,
                img_callback=step_callback,
                unconditional_guidance_scale=cfg_scale,
                unconditional_conditioning=uc,
            )
            return self.sample_to_image(samples)

        return make_image

    def get_noise(self, width, height):
        device      = self.model.device
        init_latent = self.init_latent
        assert init_latent is not None, 'call to get_noise() when init_latent not set'
        if device.type == 'mps':
            return torch.randn_like(init_latent, device='cpu').to(device)
        else:
            return torch.randn_like(init_latent, device=device)
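Putting the pieces together, a sketch of the end-to-end call path (load_model() and preprocess_image() are hypothetical stand-ins for checkpoint loading and PIL-to-tensor preprocessing; everything else follows the signatures above):

from ldm.dream.conditioning import get_uc_and_c
from ldm.dream.generator.img2img import Img2Img
from ldm.models.diffusion.ddim import DDIMSampler

model      = load_model('models/ldm/stable-diffusion-v1/model.ckpt')  # hypothetical helper
init_image = preprocess_image('sketch.png')                           # hypothetical helper

prompt = 'a watercolor landscape:2 sunset:1'
uc, c  = get_uc_and_c(prompt, model, log_tokens=True)

generator = Img2Img(model)
results   = generator.generate(
    prompt,
    init_image   = init_image,
    width        = 512,
    height       = 512,
    iterations   = 2,
    sampler      = DDIMSampler(model, device=model.device),
    steps        = 50,
    cfg_scale    = 7.5,
    ddim_eta     = 0.0,
    conditioning = (uc, c),
    strength     = 0.75,
)
for image, seed in results:
    image.save(f'img2img.{seed}.png')

Each entry in results is an [image, seed] pair, with a fresh random seed drawn via new_seed() between iterations.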