From c5e223492c6f7ed9b09b6808654e832c674425b5 Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Wed, 29 May 2024 17:05:32 +0800
Subject: [PATCH 01/14] refactor

---
 .../compile/__init__.py                       |   9 +
 .../{ => compile}/compile_ldm.py              |   0
 .../{ => compile}/compile_sgm.py              |   0
 .../compile/compile_utils.py                  |  74 ++++++++
 .../{ => compile}/compile_vae.py              |   0
 .../compile/onediff_compiled_graph.py         |  29 +++
 onediff_sd_webui_extensions/onediff_hijack.py |   3 +-
 onediff_sd_webui_extensions/onediff_shared.py |  13 ++
 .../scripts/onediff.py                        | 179 +++++++-----------
 onediff_sd_webui_extensions/ui_utils.py       |  72 ++++++-
 10 files changed, 262 insertions(+), 117 deletions(-)
 create mode 100644 onediff_sd_webui_extensions/compile/__init__.py
 rename onediff_sd_webui_extensions/{ => compile}/compile_ldm.py (100%)
 rename onediff_sd_webui_extensions/{ => compile}/compile_sgm.py (100%)
 create mode 100644 onediff_sd_webui_extensions/compile/compile_utils.py
 rename onediff_sd_webui_extensions/{ => compile}/compile_vae.py (100%)
 create mode 100644 onediff_sd_webui_extensions/compile/onediff_compiled_graph.py
 create mode 100644 onediff_sd_webui_extensions/onediff_shared.py

diff --git a/onediff_sd_webui_extensions/compile/__init__.py b/onediff_sd_webui_extensions/compile/__init__.py
new file mode 100644
index 000000000..4d225f4c6
--- /dev/null
+++ b/onediff_sd_webui_extensions/compile/__init__.py
@@ -0,0 +1,9 @@
+# from .compile_ldm import SD21CompileCtx, compile_ldm_unet
+from .compile_ldm import SD21CompileCtx
+
+# from .compile_sgm import compile_sgm_unet
+from .compile_vae import VaeCompileCtx
+
+# from .compile_utils import compile_unet, get_compiled_unet
+from .compile_utils import get_compiled_graph
+from .onediff_compiled_graph import OneDiffCompiledGraph
diff --git a/onediff_sd_webui_extensions/compile_ldm.py b/onediff_sd_webui_extensions/compile/compile_ldm.py
similarity index 100%
rename from onediff_sd_webui_extensions/compile_ldm.py
rename to onediff_sd_webui_extensions/compile/compile_ldm.py
diff --git a/onediff_sd_webui_extensions/compile_sgm.py b/onediff_sd_webui_extensions/compile/compile_sgm.py
similarity index 100%
rename from onediff_sd_webui_extensions/compile_sgm.py
rename to onediff_sd_webui_extensions/compile/compile_sgm.py
diff --git a/onediff_sd_webui_extensions/compile/compile_utils.py b/onediff_sd_webui_extensions/compile/compile_utils.py
new file mode 100644
index 000000000..66c5fc503
--- /dev/null
+++ b/onediff_sd_webui_extensions/compile/compile_utils.py
@@ -0,0 +1,74 @@
+import os
+from typing import Dict
+
+# import modules.shared as shared
+import warnings
+from typing import Union, Dict
+from pathlib import Path
+
+from .compile_ldm import compile_ldm_unet
+from .compile_sgm import compile_sgm_unet
+from .onediff_compiled_graph import OneDiffCompiledGraph
+from ldm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelLDM
+from sgm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelSGM
+from onediff.optimization.quant_optimizer import (
+    quantize_model,
+    varify_can_use_quantization,
+)
+from onediff.utils import logger
+from onediff_shared import graph_dict
+
+from modules.sd_models import select_checkpoint
+
+
+def compile_unet(
+    unet_model, quantization=False, *, options=None,
+):
+    if isinstance(unet_model, UNetModelLDM):
+        compiled_unet = compile_ldm_unet(unet_model, options=options)
+    elif isinstance(unet_model, UNetModelSGM):
+        compiled_unet = compile_sgm_unet(unet_model, options=options)
+    else:
+        warnings.warn(
+            f"Unsupported model type: {type(unet_model)} for compilation , skip",
+            RuntimeWarning,
+        )
+        compiled_unet = unet_model
+    # In OneDiff Community, quantization can be True when called by api
+    if quantization and varify_can_use_quantization():
+        calibrate_info = get_calibrate_info(
+            f"{Path(select_checkpoint().filename).stem}_sd_calibrate_info.txt"
+        )
+        compiled_unet = quantize_model(
+            compiled_unet, inplace=False, calibrate_info=calibrate_info
+        )
+    return compiled_unet
+
+
+def get_calibrate_info(filename: str) -> Union[None, Dict]:
+    """Load whitespace-separated `key float int f1,f2,...` rows from a file beside the checkpoint; None if it is missing."""
+    calibration_path = Path(select_checkpoint().filename).parent / filename
+    if not calibration_path.exists():
+        return None
+
+    logger.info(f"Got calibrate info at {str(calibration_path)}")
+    calibrate_info = {}
+    with open(calibration_path, "r") as f:
+        for line in f:
+            items = line.split()
+            if not items:
+                continue  # a blank (e.g. trailing) line used to raise IndexError on items[1]
+            calibrate_info[items[0]] = [
+                float(items[1]), int(items[2]), [float(x) for x in items[3].split(",")]
+            ]
+    return calibrate_info
+
+
+def get_compiled_graph(sd_model, quantization) -> OneDiffCompiledGraph:
+    if sd_model.sd_model_hash in graph_dict:
+        return graph_dict[sd_model.sd_model_hash]
+    else:
+        compiled_unet = compile_unet(
+            sd_model.model.diffusion_model, quantization=quantization
+        )
+        return OneDiffCompiledGraph(sd_model, compiled_unet, quantization)
diff --git a/onediff_sd_webui_extensions/compile_vae.py b/onediff_sd_webui_extensions/compile/compile_vae.py
similarity index 100%
rename from onediff_sd_webui_extensions/compile_vae.py
rename to onediff_sd_webui_extensions/compile/compile_vae.py
diff --git a/onediff_sd_webui_extensions/compile/onediff_compiled_graph.py b/onediff_sd_webui_extensions/compile/onediff_compiled_graph.py
new file mode 100644
index 000000000..efeaf6cfc
--- /dev/null
+++ b/onediff_sd_webui_extensions/compile/onediff_compiled_graph.py
@@ -0,0 +1,29 @@
+import dataclasses
+import torch
+from onediff.infer_compiler import DeployableModule
+from modules import sd_models_types
+
+
+@dataclasses.dataclass
+class OneDiffCompiledGraph:
+    name: str = None
+    filename: str = None
+    sha: str = None
+    eager_module: torch.nn.Module = None
+    graph_module: DeployableModule = None
+    quantized: bool = False
+
+    def __init__(
+        self,
+        sd_model: sd_models_types.WebuiSdModel = None,
+        graph_module: DeployableModule = None,
+        quantized=False,
+    ):
+        if sd_model is None:
+            return
+        self.name = sd_model.sd_checkpoint_info.name
+        self.filename = sd_model.sd_checkpoint_info.filename
+        self.sha = sd_model.sd_model_hash
+        self.eager_module = sd_model.model.diffusion_model
+        self.graph_module = graph_module
+        self.quantized = quantized
diff --git a/onediff_sd_webui_extensions/onediff_hijack.py b/onediff_sd_webui_extensions/onediff_hijack.py
index c8da677c6..65241da36 100644
--- a/onediff_sd_webui_extensions/onediff_hijack.py
+++ b/onediff_sd_webui_extensions/onediff_hijack.py
@@ -1,5 +1,4 @@
-import compile_ldm
-import compile_sgm
+from compile import compile_ldm, compile_sgm
 import oneflow
 
 
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
new file mode 100644
index 000000000..a2b04c834
--- /dev/null
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -0,0 +1,13 @@
+from typing import Dict
+from compile.onediff_compiled_graph import OneDiffCompiledGraph
+
+# from compile_utils import OneDiffCompiledGraph
+
+current_unet_graph = OneDiffCompiledGraph()
+graph_dict = dict()
+current_unet_type = {
+    "is_sdxl": False,
+    "is_sd2": False,
+    "is_sd1": False,
+    "is_ssd": False,
+}
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index 5e5766c04..b39caa716 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -7,9 +7,12 @@
 import gradio as gr
 import modules.scripts as scripts
 import modules.shared as shared
-from compile_ldm import SD21CompileCtx, compile_ldm_unet
-from compile_sgm import compile_sgm_unet
-from compile_vae import VaeCompileCtx
+from compile import (
+    SD21CompileCtx,
+    VaeCompileCtx,
+    get_compiled_graph,
+    OneDiffCompiledGraph,
+)
 from modules import script_callbacks
 from modules.processing import process_images
 from modules.sd_models import select_checkpoint
@@ -22,6 +25,9 @@
     get_all_compiler_caches,
     hints_message,
     refresh_all_compiler_caches,
+    check_structure_change_and_update,
+    load_graph,
+    save_graph,
 )
 
 from onediff import __version__ as onediff_version
@@ -30,11 +36,13 @@
     varify_can_use_quantization,
 )
 from onediff.utils import logger, parse_boolean_from_env
+import onediff_shared
 
 """oneflow_compiled UNetModel"""
-compiled_unet = None
-is_unet_quantized = False
-compiled_ckpt_name = None
+# compiled_unet = {}
+# compiled_unet = None
+# is_unet_quantized = False
+# compiled_ckpt_name = None
 
 
 def generate_graph_path(ckpt_name: str, model_name: str) -> str:
@@ -68,43 +76,18 @@ def get_calibrate_info(filename: str) -> Union[None, Dict]:
     return calibrate_info
 
 
-def compile_unet(
-    unet_model, quantization=False, *, options=None,
-):
-    from ldm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelLDM
-    from sgm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelSGM
-
-    if isinstance(unet_model, UNetModelLDM):
-        compiled_unet = compile_ldm_unet(unet_model, options=options)
-    elif isinstance(unet_model, UNetModelSGM):
-        compiled_unet = compile_sgm_unet(unet_model, options=options)
-    else:
-        warnings.warn(
-            f"Unsupported model type: {type(unet_model)} for compilation , skip",
-            RuntimeWarning,
-        )
-        compiled_unet = unet_model
-    # In OneDiff Community, quantization can be True when called by api
-    if quantization and varify_can_use_quantization():
-        calibrate_info = get_calibrate_info(
-            f"{Path(select_checkpoint().filename).stem}_sd_calibrate_info.txt"
-        )
-        compiled_unet = quantize_model(
-            compiled_unet, inplace=False, calibrate_info=calibrate_info
-        )
-    return compiled_unet
-
-
 class UnetCompileCtx(object):
     """The unet model is stored in a global variable.
     The global variables need to be replaced with compiled_unet before process_images is run,
     and then the original model restored so that subsequent reasoning with onediff disabled meets expectations.
     """
 
+    def __init__(self, compiled_unet):
+        self.compiled_unet = compiled_unet
+
     def __enter__(self):
         self._original_model = shared.sd_model.model.diffusion_model
-        global compiled_unet
-        shared.sd_model.model.diffusion_model = compiled_unet
+        shared.sd_model.model.diffusion_model = self.compiled_unet
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         shared.sd_model.model.diffusion_model = self._original_model
@@ -112,16 +95,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 
 class Script(scripts.Script):
-    current_type = None
-
     def title(self):
         return "onediff_diffusion_model"
 
     def ui(self, is_img2img):
-        """this function should create gradio UI elements. See https://gradio.app/docs/#components
-        The return value should be an array of all components that are used in processing.
-        Values of those returned components will be passed to run() and process() functions.
-        """
         with gr.Row():
             # TODO: set choices as Tuple[str, str] after the version of gradio specified webui upgrades
             compiler_cache = gr.Dropdown(
@@ -142,7 +119,11 @@ def ui(self, is_img2img):
                 label="always_recompile",
                 visible=parse_boolean_from_env("ONEDIFF_DEBUG"),
             )
-        gr.HTML(hints_message, elem_id="hintMessage", visible=not varify_can_use_quantization())
+        gr.HTML(
+            hints_message,
+            elem_id="hintMessage",
+            visible=not varify_can_use_quantization(),
+        )
         is_quantized = gr.components.Checkbox(
             label="Model Quantization(int8) Speed Up",
             visible=varify_can_use_quantization(),
@@ -150,30 +131,7 @@ def ui(self, is_img2img):
         return [is_quantized, compiler_cache, save_cache_name, always_recompile]
 
     def show(self, is_img2img):
-        return True
-
-    def check_model_change(self, model):
-        is_changed = False
-
-        def get_model_type(model):
-            return {
-                "is_sdxl": model.is_sdxl,
-                "is_sd2": model.is_sd2,
-                "is_sd1": model.is_sd1,
-                "is_ssd": model.is_ssd,
-            }
-
-        if self.current_type is None:
-            is_changed = True
-        else:
-            for key, v in self.current_type.items():
-                if v != getattr(model, key):
-                    is_changed = True
-                    break
-
-        if is_changed is True:
-            self.current_type = get_model_type(model)
-        return is_changed
+        return scripts.AlwaysVisible
 
     def run(
         self,
@@ -184,67 +142,44 @@ def run(
         always_recompile=False,
     ):
 
-        global compiled_unet, compiled_ckpt_name, is_unet_quantized
-        current_checkpoint = shared.opts.sd_model_checkpoint
-        original_diffusion_model = shared.sd_model.model.diffusion_model
-
-        ckpt_changed = current_checkpoint != compiled_ckpt_name
-        model_changed = self.check_model_change(shared.sd_model)
-        quantization_changed = quantization != is_unet_quantized
+        current_checkpoint_name = shared.sd_model.sd_checkpoint_info.name
+        ckpt_changed = (
+            shared.sd_model.sd_checkpoint_info.name
+            != onediff_shared.current_unet_graph.name
+        )
+        structure_changed = check_structure_change_and_update(
+            onediff_shared.current_unet_type, shared.sd_model
+        )
+        quantization_changed = (
+            quantization != onediff_shared.current_unet_graph.quantized
+        )
         need_recompile = (
             (
                 quantization and ckpt_changed
             )  # always recompile when switching ckpt with 'int8 speed model' enabled
-            or model_changed  # always recompile when switching model to another structure
+            or structure_changed  # always recompile when switching model to another structure
             or quantization_changed  # always recompile when switching model from non-quantized to quantized (and vice versa)
             or always_recompile
         )
-
-        is_unet_quantized = quantization
-        compiled_ckpt_name = current_checkpoint
         if need_recompile:
-            compiled_unet = compile_unet(
-                original_diffusion_model, quantization=quantization
+            onediff_shared.current_unet_graph = get_compiled_graph(
+                shared.sd_model, quantization
             )
-
-            # Due to the version of gradio compatible with sd-webui, the CompilerCache dropdown box always returns a string
-            if compiler_cache not in [None, "None"]:
-                compiler_cache_path = all_compiler_caches_path() + f"/{compiler_cache}"
-                if not Path(compiler_cache_path).exists():
-                    raise FileNotFoundError(
-                        f"Cannot find cache {compiler_cache_path}, please make sure it exists"
-                    )
-                try:
-                    compiled_unet.load_graph(compiler_cache_path, run_warmup=True)
-                except zipfile.BadZipFile:
-                    raise RuntimeError(
-                        "Load cache failed. Please make sure that the --disable-safe-unpickle parameter is added when starting the webui"
-                    )
-                except Exception as e:
-                    raise RuntimeError(
-                        f"Load cache failed ({e}). Please make sure cache has the same sd version (or unet architure) with current checkpoint"
-                    )
-
+            load_graph(onediff_shared.current_unet_graph, compiler_cache)
         else:
             logger.info(
-                f"Model {current_checkpoint} has same sd type of graph type {self.current_type}, skip compile"
+                f"Model {current_checkpoint_name} has same sd type of graph type {onediff_shared.current_unet_type}, skip compile"
             )
 
-        with UnetCompileCtx(), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate():
+        # register graph
+        onediff_shared.graph_dict[shared.sd_model.sd_model_hash] = OneDiffCompiledGraph(
+            shared.sd_model, graph_module=onediff_shared.current_unet_graph.graph_module
+        )
+        with UnetCompileCtx(
+            onediff_shared.current_unet_graph.graph_module
+        ), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate():
             proc = process_images(p)
-
-        if saved_cache_name != "":
-            if not os.access(str(all_compiler_caches_path()), os.W_OK):
-                raise PermissionError(
-                    f"The directory {all_compiler_caches_path()} does not have write permissions, and compiler cache cannot be written to this directory. \
-                                      Please change it in the settings to a directory with write permissions"
-                )
-            if not Path(all_compiler_caches_path()).exists():
-                Path(all_compiler_caches_path()).mkdir()
-            saved_cache_name = all_compiler_caches_path() + f"/{saved_cache_name}"
-            if not Path(saved_cache_name).exists():
-                compiled_unet.save_graph(saved_cache_name)
-
+        save_graph(onediff_shared.current_unet_graph, saved_cache_name)
         return proc
 
 
@@ -260,5 +195,23 @@ def on_ui_settings():
     )
 
 
+def cfg_denoisers_callback(params):
+    # print(f"current checkpoint: {shared.opts.sd_model_checkpoint}")
+    # import ipdb; ipdb.set_trace()
+    if "refiner" in shared.sd_model.sd_checkpoint_info.name:
+        pass
+        # import ipdb; ipdb.set_trace()
+        # shared.sd_model.model.diffusion_model
+
+    print(f"current checkpoint info: {shared.sd_model.sd_checkpoint_info.name}")
+    # shared.sd_model.model.diffusion_model = compile_unet(
+    #     shared.sd_model.model.diffusion_model
+    # )
+
+    # have to check if onediff enabled
+    # print('onediff denoiser callback')
+
+
 script_callbacks.on_ui_settings(on_ui_settings)
+script_callbacks.on_cfg_denoiser(cfg_denoisers_callback)
 onediff_do_hijack()
diff --git a/onediff_sd_webui_extensions/ui_utils.py b/onediff_sd_webui_extensions/ui_utils.py
index 7e442be4a..a23efbdf1 100644
--- a/onediff_sd_webui_extensions/ui_utils.py
+++ b/onediff_sd_webui_extensions/ui_utils.py
@@ -1,7 +1,12 @@
+import os
 from pathlib import Path
 from textwrap import dedent
+from onediff.infer_compiler import DeployableModule
+from zipfile import BadZipFile
+import onediff_shared
 
-hints_message = dedent("""\
+hints_message = dedent(
+    """\
 <div id="hintMessage" style="position: relative; padding: 20px; border: 1px solid #e0e0e0; border-radius: 5px; background-color: #f9f9f9;">
     <button onclick="document.getElementById('hintMessage').style.display = 'none'" style="position: absolute; top: 10px; right: 10px; background: none; border: none; font-size: 18px; cursor: pointer;">&times;</button>
     <div style="font-size: 18px; font-weight: bold; margin-bottom: 15px; color: #31708f;">
@@ -21,7 +26,8 @@
         <a href="https://github.com/siliconflow/onediff/issues" style="color: #31708f; text-decoration: none;">https://github.com/siliconflow/onediff/issues</a>
     </p>
 </div>
-""")
+"""
+)
 
 all_compiler_caches = []
 
@@ -46,3 +52,65 @@ def refresh_all_compiler_caches(path: Path = None):
     global all_compiler_caches
     path = path or all_compiler_caches_path()
     all_compiler_caches = [f.stem for f in Path(path).iterdir() if f.is_file()]
+
+
+def check_structure_change_and_update(current_type: dict[str, bool], model):
+    """Sync `current_type` with the model's four type flags; return True if they differed."""
+    # Snapshot of the flags read from the model, keyed like `current_type`.
+    latest = {
+        "is_sdxl": model.is_sdxl,
+        "is_sd2": model.is_sd2,
+        "is_sd1": model.is_sd1,
+        "is_ssd": model.is_ssd,
+    }
+    differs = current_type != latest
+    current_type.update(latest)  # updated in place so the caller's dict reflects the model
+    return differs
+
+
+def load_graph(compiled_unet: DeployableModule, compiler_cache: str):
+    """Load the named compiler cache into the graph module; return early when no cache is selected."""
+    from compile import OneDiffCompiledGraph
+
+    if isinstance(compiled_unet, OneDiffCompiledGraph):
+        compiled_unet = compiled_unet.graph_module
+    # "None" (the string) is accepted as well as None to mean "no cache selected".
+    if compiler_cache in [None, "None"]:
+        return
+    compiler_cache_path = all_compiler_caches_path() + f"/{compiler_cache}"
+    if not Path(compiler_cache_path).exists():
+        raise FileNotFoundError(
+            f"Cannot find cache {compiler_cache_path}, please make sure it exists"
+        )
+    try:
+        compiled_unet.load_graph(compiler_cache_path, run_warmup=True)
+    except BadZipFile as e:
+        raise RuntimeError(
+            "Load cache failed. Please make sure that the --disable-safe-unpickle parameter is added when starting the webui"
+        ) from e
+    except Exception as e:
+        raise RuntimeError(
+            f"Load cache failed ({e}). Please make sure cache has the same sd version (or unet architure) with current checkpoint"
+        ) from e
+    return compiled_unet
+
+
+def save_graph(compiled_unet: DeployableModule, saved_cache_name: str = ""):
+    """Save the graph as `saved_cache_name` in the caches directory; no-op if the name is empty or already exists."""
+    from compile import OneDiffCompiledGraph
+
+    if isinstance(compiled_unet, OneDiffCompiledGraph):
+        compiled_unet = compiled_unet.graph_module
+    if saved_cache_name in ["", None]:
+        return
+
+    caches_dir = all_compiler_caches_path()
+    # Create the directory before probing it: os.access() is False for a missing path, which made the mkdir unreachable.
+    Path(caches_dir).mkdir(parents=True, exist_ok=True)
+    if not os.access(str(caches_dir), os.W_OK):
+        raise PermissionError(
+            f"The directory {caches_dir} does not have write permissions, and compiler cache cannot be written to this directory. \
+                                Please change it in the settings to a directory with write permissions"
+        )
+    if not Path(caches_dir + f"/{saved_cache_name}").exists():
+        compiled_unet.save_graph(caches_dir + f"/{saved_cache_name}")

From e4332cf7dec6cefaaa14ce29aab57f590b3ce469 Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Wed, 29 May 2024 17:07:56 +0800
Subject: [PATCH 02/14] move mock utils

---
 .../{ => compile}/sd_webui_onediff_utils.py                       | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename onediff_sd_webui_extensions/{ => compile}/sd_webui_onediff_utils.py (100%)

diff --git a/onediff_sd_webui_extensions/sd_webui_onediff_utils.py b/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py
similarity index 100%
rename from onediff_sd_webui_extensions/sd_webui_onediff_utils.py
rename to onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py

From 686d5333248e6ea6decaf5817179aebd99a0520b Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Tue, 4 Jun 2024 15:58:20 +0800
Subject: [PATCH 03/14] fix bug of refiner

---
 .../compile/__init__.py                       |   6 +-
 .../compile/compile_ldm.py                    |   2 +-
 .../compile/compile_sgm.py                    |   2 +-
 .../compile/compile_utils.py                  |  14 +--
 .../compile/onediff_compiled_graph.py         |   4 +-
 onediff_sd_webui_extensions/onediff_hijack.py |   2 +-
 onediff_sd_webui_extensions/onediff_lora.py   | 118 ++++++++++++++++++
 onediff_sd_webui_extensions/onediff_shared.py |   5 +-
 .../scripts/onediff.py                        |  36 ++++--
 onediff_sd_webui_extensions/ui_utils.py       |  14 ++-
 10 files changed, 176 insertions(+), 27 deletions(-)

diff --git a/onediff_sd_webui_extensions/compile/__init__.py b/onediff_sd_webui_extensions/compile/__init__.py
index 4d225f4c6..c08ce8c49 100644
--- a/onediff_sd_webui_extensions/compile/__init__.py
+++ b/onediff_sd_webui_extensions/compile/__init__.py
@@ -1,9 +1,9 @@
 # from .compile_ldm import SD21CompileCtx, compile_ldm_unet
 from .compile_ldm import SD21CompileCtx
 
-# from .compile_sgm import compile_sgm_unet
-from .compile_vae import VaeCompileCtx
-
 # from .compile_utils import compile_unet, get_compiled_unet
 from .compile_utils import get_compiled_graph
+
+# from .compile_sgm import compile_sgm_unet
+from .compile_vae import VaeCompileCtx
 from .onediff_compiled_graph import OneDiffCompiledGraph
diff --git a/onediff_sd_webui_extensions/compile/compile_ldm.py b/onediff_sd_webui_extensions/compile/compile_ldm.py
index e87f7f696..9847e91b1 100644
--- a/onediff_sd_webui_extensions/compile/compile_ldm.py
+++ b/onediff_sd_webui_extensions/compile/compile_ldm.py
@@ -9,7 +9,7 @@
 from ldm.modules.diffusionmodules.openaimodel import ResBlock, UNetModel
 from ldm.modules.diffusionmodules.util import GroupNorm32
 from modules import shared
-from sd_webui_onediff_utils import (
+from .sd_webui_onediff_utils import (
     CrossAttentionOflow,
     GroupNorm32Oflow,
     timestep_embedding,
diff --git a/onediff_sd_webui_extensions/compile/compile_sgm.py b/onediff_sd_webui_extensions/compile/compile_sgm.py
index 154b3dc5c..4a6ad6d7e 100644
--- a/onediff_sd_webui_extensions/compile/compile_sgm.py
+++ b/onediff_sd_webui_extensions/compile/compile_sgm.py
@@ -1,5 +1,5 @@
 import oneflow as flow
-from sd_webui_onediff_utils import (
+from .sd_webui_onediff_utils import (
     CrossAttentionOflow,
     GroupNorm32Oflow,
     timestep_embedding,
diff --git a/onediff_sd_webui_extensions/compile/compile_utils.py b/onediff_sd_webui_extensions/compile/compile_utils.py
index 66c5fc503..26b4fa39c 100644
--- a/onediff_sd_webui_extensions/compile/compile_utils.py
+++ b/onediff_sd_webui_extensions/compile/compile_utils.py
@@ -1,24 +1,23 @@
 import os
-from typing import Dict
 
 # import modules.shared as shared
 import warnings
-from typing import Union, Dict
 from pathlib import Path
+from typing import Dict, Union
 
-from .compile_ldm import compile_ldm_unet
-from .compile_sgm import compile_sgm_unet
-from .onediff_compiled_graph import OneDiffCompiledGraph
 from ldm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelLDM
+from modules.sd_models import select_checkpoint
 from sgm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelSGM
+
 from onediff.optimization.quant_optimizer import (
     quantize_model,
     varify_can_use_quantization,
 )
 from onediff.utils import logger
-from onediff_shared import graph_dict
 
-from modules.sd_models import select_checkpoint
+from .compile_ldm import compile_ldm_unet
+from .compile_sgm import compile_sgm_unet
+from .onediff_compiled_graph import OneDiffCompiledGraph
 
 
 def compile_unet(
@@ -65,6 +64,7 @@ def get_calibrate_info(filename: str) -> Union[None, Dict]:
 
 
 def get_compiled_graph(sd_model, quantization) -> OneDiffCompiledGraph:
+    from onediff_shared import graph_dict
     if sd_model.sd_model_hash in graph_dict:
         return graph_dict[sd_model.sd_model_hash]
     else:
diff --git a/onediff_sd_webui_extensions/compile/onediff_compiled_graph.py b/onediff_sd_webui_extensions/compile/onediff_compiled_graph.py
index efeaf6cfc..d6a09aca3 100644
--- a/onediff_sd_webui_extensions/compile/onediff_compiled_graph.py
+++ b/onediff_sd_webui_extensions/compile/onediff_compiled_graph.py
@@ -1,8 +1,10 @@
 import dataclasses
+
 import torch
-from onediff.infer_compiler import DeployableModule
 from modules import sd_models_types
 
+from onediff.infer_compiler import DeployableModule
+
 
 @dataclasses.dataclass
 class OneDiffCompiledGraph:
diff --git a/onediff_sd_webui_extensions/onediff_hijack.py b/onediff_sd_webui_extensions/onediff_hijack.py
index 65241da36..b6df91af0 100644
--- a/onediff_sd_webui_extensions/onediff_hijack.py
+++ b/onediff_sd_webui_extensions/onediff_hijack.py
@@ -1,5 +1,5 @@
-from compile import compile_ldm, compile_sgm
 import oneflow
+from compile import compile_ldm, compile_sgm
 
 
 # https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/1c0a0c4c26f78c32095ebc7f8af82f5c04fca8c0/modules/sd_hijack_unet.py#L8
diff --git a/onediff_sd_webui_extensions/onediff_lora.py b/onediff_sd_webui_extensions/onediff_lora.py
index 0bee88e9d..0d8ccfa80 100644
--- a/onediff_sd_webui_extensions/onediff_lora.py
+++ b/onediff_sd_webui_extensions/onediff_lora.py
@@ -1,10 +1,17 @@
 import torch
+from typing import Mapping, Any
 
 from onediff.infer_compiler import DeployableModule
 from onediff.infer_compiler.backends.oneflow.param_utils import (
     update_graph_related_tensor,
 )
 
+from onediff_shared import onediff_enabled
+
+from modules import sd_models
+from modules.sd_hijack_utils import CondFunc
+from compile import OneDiffCompiledGraph
+
 
 class HijackLoraActivate:
     def __init__(self):
@@ -57,3 +64,114 @@ def activate(self, p, params_list):
 
     activate._onediff_hijacked = True
     return activate
+
+
+# class HijackLoadModelWeights:
+#     # def __init__(self):
+#         # from modules import extra_networks
+
+#         # if "lora" in extra_networks.extra_network_registry:
+#         #     cls_extra_network_lora = type(extra_networks.extra_network_registry["lora"])
+#         # else:
+#         #     cls_extra_network_lora = None
+#         # self.lora_class = cls_extra_network_lora
+
+#     def __enter__(self):
+#         self.orig_func = sd_models.load_model_weights
+#         sd_models.load_model_weights = onediff_hijack_load_model_weights
+
+#     def __exit__(self, exc_type, exc_val, exc_tb):
+#         sd_models.load_model_weights = self.orig_func
+
+def onediff_hijack_load_model_weights(orig_func, model, checkpoint_info: sd_models.CheckpointInfo, state_dict: dict, timer):
+    # load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer)
+    sd_model_hash = checkpoint_info.calculate_shorthash()
+    import onediff_shared
+    cached_model: OneDiffCompiledGraph = onediff_shared.graph_dict.get(sd_model_hash, None)
+    if cached_model is not None:
+        model.model.diffusion_model = cached_model.graph_module
+        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("model.diffusion_model.")}
+    return orig_func(model, checkpoint_info, state_dict, timer)
+
+
+def onediff_hijack_load_state_dict(orig_func, self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False):
+    # Force assign=True only when CUDA weights go into a module whose first parameter is on meta; next(..., None) tolerates parameter-free modules.
+    first_param = next(self.parameters(), None) if len(state_dict) > 0 and next(iter(state_dict.values())).is_cuda else None
+    to_meta = first_param is not None and first_param.is_meta
+    return orig_func(self, state_dict, strict, assign=True if to_meta else assign)
+
+
+def onediff_hijaced_LoadStateDictOnMeta___enter__(orig_func, self):
+    from modules import shared
+    if shared.cmd_opts.disable_model_loading_ram_optimization:
+        return
+
+    sd = self.state_dict
+    device = self.device
+
+    def load_from_state_dict(original, module, state_dict, prefix, *args, **kwargs):
+        used_param_keys = []
+
+        for name, param in module._parameters.items():
+            if param is None:
+                continue
+
+            key = prefix + name
+            sd_param = sd.pop(key, None)
+            if sd_param is not None:
+                state_dict[key] = sd_param.to(dtype=self.get_weight_dtype(key))
+                used_param_keys.append(key)
+
+            if param.is_meta:
+                dtype = sd_param.dtype if sd_param is not None else param.dtype
+                module._parameters[name] = torch.nn.parameter.Parameter(torch.zeros_like(param, device=device, dtype=dtype), requires_grad=param.requires_grad)
+
+        for name in module._buffers:
+            key = prefix + name
+
+            sd_param = sd.pop(key, None)
+            if sd_param is not None:
+                state_dict[key] = sd_param
+                used_param_keys.append(key)
+
+        original(module, state_dict, prefix, *args, **kwargs)
+
+        for key in used_param_keys:
+            state_dict.pop(key, None)
+
+    # def load_state_dict(original, module, state_dict, strict=True):
+    def load_state_dict(original, module, state_dict, strict=True):
+        """torch makes a lot of copies of the dictionary with weights, so just deleting entries from state_dict does not help
+        because the same values are stored in multiple copies of the dict. The trick used here is to give torch a dict with
+        all weights on meta device, i.e. deleted, and then it doesn't matter how many copies torch makes.
+
+        In _load_from_state_dict, the correct weight will be obtained from a single dict with the right weights (sd).
+
+        The dangerous thing about this is if _load_from_state_dict is not called, (if some exotic module overloads
+        the function and does not call the original) the state dict will just fail to load because weights
+        would be on the meta device.
+        """
+
+        if state_dict is sd:
+            state_dict = {k: v.to(device="meta", dtype=v.dtype) for k, v in state_dict.items()}
+
+        # ------------------- DIFF HERE -------------------
+        # original(module, state_dict, strict=strict)
+        if len(state_dict) > 0 and next(iter(state_dict.values())).is_cuda and next(module.parameters()).is_meta:
+            assign = True
+        else:
+            assign = False
+        # orig_func(original, module, state_dict, strict=strict, assign=assign)
+        original(module, state_dict, strict=strict, assign=assign)
+
+    module_load_state_dict = self.replace(torch.nn.Module, 'load_state_dict', lambda *args, **kwargs: load_state_dict(module_load_state_dict, *args, **kwargs))
+    module_load_from_state_dict = self.replace(torch.nn.Module, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(module_load_from_state_dict, *args, **kwargs))
+    linear_load_from_state_dict = self.replace(torch.nn.Linear, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(linear_load_from_state_dict, *args, **kwargs))
+    conv2d_load_from_state_dict = self.replace(torch.nn.Conv2d, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(conv2d_load_from_state_dict, *args, **kwargs))
+    mha_load_from_state_dict = self.replace(torch.nn.MultiheadAttention, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(mha_load_from_state_dict, *args, **kwargs))
+    layer_norm_load_from_state_dict = self.replace(torch.nn.LayerNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(layer_norm_load_from_state_dict, *args, **kwargs))
+    group_norm_load_from_state_dict = self.replace(torch.nn.GroupNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(group_norm_load_from_state_dict, *args, **kwargs))
+
+
+CondFunc("modules.sd_disable_initialization.LoadStateDictOnMeta.__enter__", onediff_hijaced_LoadStateDictOnMeta___enter__, lambda _, *args, **kwargs: onediff_enabled)
+CondFunc("modules.sd_models.load_model_weights", onediff_hijack_load_model_weights, lambda _, *args, **kwargs: onediff_enabled)
\ No newline at end of file
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
index a2b04c834..9bdd82678 100644
--- a/onediff_sd_webui_extensions/onediff_shared.py
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -1,13 +1,16 @@
 from typing import Dict
+
 from compile.onediff_compiled_graph import OneDiffCompiledGraph
 
 # from compile_utils import OneDiffCompiledGraph
 
 current_unet_graph = OneDiffCompiledGraph()
-graph_dict = dict()
+graph_dict: Dict[str, OneDiffCompiledGraph] = dict()
+refiner_dict: Dict[str, str] = dict()
 current_unet_type = {
     "is_sdxl": False,
     "is_sd2": False,
     "is_sd1": False,
     "is_ssd": False,
 }
+onediff_enabled = True
\ No newline at end of file
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index b39caa716..4e27db5d5 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -1,4 +1,5 @@
 import os
+import torch
 import warnings
 import zipfile
 from pathlib import Path
@@ -7,11 +8,13 @@
 import gradio as gr
 import modules.scripts as scripts
 import modules.shared as shared
+import modules.sd_models as sd_models
+import onediff_shared
 from compile import (
+    OneDiffCompiledGraph,
     SD21CompileCtx,
     VaeCompileCtx,
     get_compiled_graph,
-    OneDiffCompiledGraph,
 )
 from modules import script_callbacks
 from modules.processing import process_images
@@ -22,12 +25,13 @@
 from oneflow import __version__ as oneflow_version
 from ui_utils import (
     all_compiler_caches_path,
+    check_structure_change_and_update,
     get_all_compiler_caches,
     hints_message,
-    refresh_all_compiler_caches,
-    check_structure_change_and_update,
     load_graph,
+    refresh_all_compiler_caches,
     save_graph,
+    onediff_enabled,
 )
 
 from onediff import __version__ as onediff_version
@@ -36,7 +40,6 @@
     varify_can_use_quantization,
 )
 from onediff.utils import logger, parse_boolean_from_env
-import onediff_shared
 
 """oneflow_compiled UNetModel"""
 # compiled_unet = {}
@@ -82,12 +85,13 @@ class UnetCompileCtx(object):
     and then the original model restored so that subsequent reasoning with onediff disabled meets expectations.
     """
 
-    def __init__(self, compiled_unet):
-        self.compiled_unet = compiled_unet
+    # def __init__(self, compiled_unet):
+    #     self.compiled_unet = compiled_unet
 
     def __enter__(self):
         self._original_model = shared.sd_model.model.diffusion_model
-        shared.sd_model.model.diffusion_model = self.compiled_unet
+            # onediff_shared.current_unet_graph.graph_module
+        shared.sd_model.model.diffusion_model = onediff_shared.current_unet_graph.graph_module
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         shared.sd_model.model.diffusion_model = self._original_model
@@ -131,7 +135,7 @@ def ui(self, is_img2img):
         return [is_quantized, compiler_cache, save_cache_name, always_recompile]
 
     def show(self, is_img2img):
-        return scripts.AlwaysVisible
+        return True
 
     def run(
         self,
@@ -141,6 +145,11 @@ def run(
         saved_cache_name="",
         always_recompile=False,
     ):
+        # restore checkpoint_info from refiner to base model
+        if sd_models.checkpoint_aliases.get(p.override_settings.get('sd_model_checkpoint')) is None:
+            p.override_settings.pop('sd_model_checkpoint', None)
+            sd_models.reload_model_weights()
+            torch.cuda.empty_cache()
 
         current_checkpoint_name = shared.sd_model.sd_checkpoint_info.name
         ckpt_changed = (
@@ -175,9 +184,8 @@ def run(
         onediff_shared.graph_dict[shared.sd_model.sd_model_hash] = OneDiffCompiledGraph(
             shared.sd_model, graph_module=onediff_shared.current_unet_graph.graph_module
         )
-        with UnetCompileCtx(
-            onediff_shared.current_unet_graph.graph_module
-        ), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate():
+
+        with UnetCompileCtx(), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate(), onediff_enabled():
             proc = process_images(p)
         save_graph(onediff_shared.current_unet_graph, saved_cache_name)
         return proc
@@ -196,9 +204,15 @@ def on_ui_settings():
 
 
 def cfg_denoisers_callback(params):
+    # check refiner model
     # print(f"current checkpoint: {shared.opts.sd_model_checkpoint}")
     # import ipdb; ipdb.set_trace()
     if "refiner" in shared.sd_model.sd_checkpoint_info.name:
+        # onediff_shared.current_unet_graph = get_compiled_graph(
+        #     shared.sd_model, quantization
+        # )
+        # load_graph(onediff_shared.current_unet_graph, compiler_cache)
+        # import ipdb; ipdb.set_trace()
         pass
         # import ipdb; ipdb.set_trace()
         # shared.sd_model.model.diffusion_model
diff --git a/onediff_sd_webui_extensions/ui_utils.py b/onediff_sd_webui_extensions/ui_utils.py
index a23efbdf1..b4fbf369e 100644
--- a/onediff_sd_webui_extensions/ui_utils.py
+++ b/onediff_sd_webui_extensions/ui_utils.py
@@ -1,10 +1,12 @@
 import os
 from pathlib import Path
 from textwrap import dedent
-from onediff.infer_compiler import DeployableModule
 from zipfile import BadZipFile
+
 import onediff_shared
 
+from onediff.infer_compiler import DeployableModule
+
 hints_message = dedent(
     """\
 <div id="hintMessage" style="position: relative; padding: 20px; border: 1px solid #e0e0e0; border-radius: 5px; background-color: #f9f9f9;">
@@ -114,3 +116,13 @@ def save_graph(compiled_unet: DeployableModule, saved_cache_name: str = ""):
     saved_cache_name = all_compiler_caches_path() + f"/{saved_cache_name}"
     if not Path(saved_cache_name).exists():
         compiled_unet.save_graph(saved_cache_name)
+
+
+from contextlib import contextmanager
+@contextmanager
+def onediff_enabled():
+    onediff_shared.onediff_enabled = True
+    try:
+        yield
+    finally:
+        onediff_shared.onediff_enabled = False

From 156724c0c78a845bdfb78c4eecd912923e77c0d3 Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Tue, 4 Jun 2024 16:14:06 +0800
Subject: [PATCH 04/14] refine, format

---
 .../compile/__init__.py                       | 12 ++-
 .../compile/compile_ldm.py                    |  7 +-
 .../compile/compile_sgm.py                    | 11 ++-
 .../compile/compile_utils.py                  |  4 +-
 onediff_sd_webui_extensions/onediff_lora.py   | 60 +++++++-----
 onediff_sd_webui_extensions/onediff_shared.py |  2 +-
 .../scripts/onediff.py                        | 94 ++++---------------
 onediff_sd_webui_extensions/ui_utils.py       |  2 +-
 8 files changed, 72 insertions(+), 120 deletions(-)

diff --git a/onediff_sd_webui_extensions/compile/__init__.py b/onediff_sd_webui_extensions/compile/__init__.py
index c08ce8c49..90afcaceb 100644
--- a/onediff_sd_webui_extensions/compile/__init__.py
+++ b/onediff_sd_webui_extensions/compile/__init__.py
@@ -1,9 +1,11 @@
-# from .compile_ldm import SD21CompileCtx, compile_ldm_unet
 from .compile_ldm import SD21CompileCtx
-
-# from .compile_utils import compile_unet, get_compiled_unet
 from .compile_utils import get_compiled_graph
-
-# from .compile_sgm import compile_sgm_unet
 from .compile_vae import VaeCompileCtx
 from .onediff_compiled_graph import OneDiffCompiledGraph
+
+__all__ = [
+    "get_compiled_graph",
+    "SD21CompileCtx",
+    "VaeCompileCtx",
+    "OneDiffCompiledGraph",
+]
diff --git a/onediff_sd_webui_extensions/compile/compile_ldm.py b/onediff_sd_webui_extensions/compile/compile_ldm.py
index 9847e91b1..7b04e16aa 100644
--- a/onediff_sd_webui_extensions/compile/compile_ldm.py
+++ b/onediff_sd_webui_extensions/compile/compile_ldm.py
@@ -9,15 +9,16 @@
 from ldm.modules.diffusionmodules.openaimodel import ResBlock, UNetModel
 from ldm.modules.diffusionmodules.util import GroupNorm32
 from modules import shared
+
+from onediff.infer_compiler import oneflow_compile
+from onediff.infer_compiler.backends.oneflow.transform import proxy_class, register
+
 from .sd_webui_onediff_utils import (
     CrossAttentionOflow,
     GroupNorm32Oflow,
     timestep_embedding,
 )
 
-from onediff.infer_compiler import oneflow_compile
-from onediff.infer_compiler.backends.oneflow.transform import proxy_class, register
-
 __all__ = ["compile_ldm_unet"]
 
 
diff --git a/onediff_sd_webui_extensions/compile/compile_sgm.py b/onediff_sd_webui_extensions/compile/compile_sgm.py
index 4a6ad6d7e..09b86be59 100644
--- a/onediff_sd_webui_extensions/compile/compile_sgm.py
+++ b/onediff_sd_webui_extensions/compile/compile_sgm.py
@@ -1,9 +1,4 @@
 import oneflow as flow
-from .sd_webui_onediff_utils import (
-    CrossAttentionOflow,
-    GroupNorm32Oflow,
-    timestep_embedding,
-)
 from sgm.modules.attention import (
     BasicTransformerBlock,
     CrossAttention,
@@ -15,6 +10,12 @@
 from onediff.infer_compiler import oneflow_compile
 from onediff.infer_compiler.backends.oneflow.transform import proxy_class, register
 
+from .sd_webui_onediff_utils import (
+    CrossAttentionOflow,
+    GroupNorm32Oflow,
+    timestep_embedding,
+)
+
 __all__ = ["compile_sgm_unet"]
 
 
diff --git a/onediff_sd_webui_extensions/compile/compile_utils.py b/onediff_sd_webui_extensions/compile/compile_utils.py
index 26b4fa39c..42d53bc40 100644
--- a/onediff_sd_webui_extensions/compile/compile_utils.py
+++ b/onediff_sd_webui_extensions/compile/compile_utils.py
@@ -1,6 +1,3 @@
-import os
-
-# import modules.shared as shared
 import warnings
 from pathlib import Path
 from typing import Dict, Union
@@ -65,6 +62,7 @@ def get_calibrate_info(filename: str) -> Union[None, Dict]:
 
 def get_compiled_graph(sd_model, quantization) -> OneDiffCompiledGraph:
     from onediff_shared import graph_dict
+
     if sd_model.sd_model_hash in graph_dict:
         return graph_dict[sd_model.sd_model_hash]
     else:
diff --git a/onediff_sd_webui_extensions/onediff_lora.py b/onediff_sd_webui_extensions/onediff_lora.py
index 0d8ccfa80..a11705867 100644
--- a/onediff_sd_webui_extensions/onediff_lora.py
+++ b/onediff_sd_webui_extensions/onediff_lora.py
@@ -66,41 +66,44 @@ def activate(self, p, params_list):
     return activate
 
 
-# class HijackLoadModelWeights:
-#     # def __init__(self):
-#         # from modules import extra_networks
-
-#         # if "lora" in extra_networks.extra_network_registry:
-#         #     cls_extra_network_lora = type(extra_networks.extra_network_registry["lora"])
-#         # else:
-#         #     cls_extra_network_lora = None
-#         # self.lora_class = cls_extra_network_lora
-
-#     def __enter__(self):
-#         self.orig_func = sd_models.load_model_weights
-#         sd_models.load_model_weights = onediff_hijack_load_model_weights
-
-#     def __exit__(self, exc_type, exc_val, exc_tb):
-#         sd_models.load_model_weights = self.orig_func
-
-def onediff_hijack_load_model_weights(orig_func, model, checkpoint_info: sd_models.CheckpointInfo, state_dict: dict, timer):
+def onediff_hijack_load_model_weights(
+    orig_func, model, checkpoint_info: sd_models.CheckpointInfo, state_dict: dict, timer
+):
     # load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer)
     sd_model_hash = checkpoint_info.calculate_shorthash()
     import onediff_shared
-    cached_model: OneDiffCompiledGraph = onediff_shared.graph_dict.get(sd_model_hash, None)
+
+    cached_model: OneDiffCompiledGraph = onediff_shared.graph_dict.get(
+        sd_model_hash, None
+    )
     if cached_model is not None:
         model.model.diffusion_model = cached_model.graph_module
-        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("model.diffusion_model.")}
+        state_dict = {
+            k: v
+            for k, v in state_dict.items()
+            if not k.startswith("model.diffusion_model.")
+        }
     return orig_func(model, checkpoint_info, state_dict, timer)
 
 
-def onediff_hijack_load_state_dict(orig_func, self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False):
-    if len(state_dict) > 0 and next(iter(state_dict.values())).is_cuda and next(self.parameters()).is_meta:
+def onediff_hijack_load_state_dict(
+    orig_func,
+    self,
+    state_dict: Mapping[str, Any],
+    strict: bool = True,
+    assign: bool = False,
+):
+    if (
+        len(state_dict) > 0
+        and next(iter(state_dict.values())).is_cuda
+        and next(self.parameters()).is_meta
+    ):
         return orig_func(self, state_dict, strict, assign=True)
     else:
         return orig_func(self, state_dict, strict, assign)
 
 
+# fmt: off
 def onediff_hijaced_LoadStateDictOnMeta___enter__(orig_func, self):
     from modules import shared
     if shared.cmd_opts.disable_model_loading_ram_optimization:
@@ -171,7 +174,16 @@ def load_state_dict(original, module, state_dict, strict=True):
     mha_load_from_state_dict = self.replace(torch.nn.MultiheadAttention, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(mha_load_from_state_dict, *args, **kwargs))
     layer_norm_load_from_state_dict = self.replace(torch.nn.LayerNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(layer_norm_load_from_state_dict, *args, **kwargs))
     group_norm_load_from_state_dict = self.replace(torch.nn.GroupNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(group_norm_load_from_state_dict, *args, **kwargs))
+# fmt: on
 
 
-CondFunc("modules.sd_disable_initialization.LoadStateDictOnMeta.__enter__", onediff_hijaced_LoadStateDictOnMeta___enter__, lambda _, *args, **kwargs: onediff_enabled)
-CondFunc("modules.sd_models.load_model_weights", onediff_hijack_load_model_weights, lambda _, *args, **kwargs: onediff_enabled)
\ No newline at end of file
+CondFunc(
+    "modules.sd_disable_initialization.LoadStateDictOnMeta.__enter__",
+    onediff_hijaced_LoadStateDictOnMeta___enter__,
+    lambda _, *args, **kwargs: onediff_enabled,
+)
+CondFunc(
+    "modules.sd_models.load_model_weights",
+    onediff_hijack_load_model_weights,
+    lambda _, *args, **kwargs: onediff_enabled,
+)
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
index 9bdd82678..233f0c887 100644
--- a/onediff_sd_webui_extensions/onediff_shared.py
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -13,4 +13,4 @@
     "is_sd1": False,
     "is_ssd": False,
 }
-onediff_enabled = True
\ No newline at end of file
+onediff_enabled = True
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index 4e27db5d5..890cff67e 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -1,15 +1,11 @@
-import os
-import torch
-import warnings
-import zipfile
 from pathlib import Path
-from typing import Dict, Union
 
 import gradio as gr
 import modules.scripts as scripts
-import modules.shared as shared
 import modules.sd_models as sd_models
+import modules.shared as shared
 import onediff_shared
+import torch
 from compile import (
     OneDiffCompiledGraph,
     SD21CompileCtx,
@@ -18,65 +14,23 @@
 )
 from modules import script_callbacks
 from modules.processing import process_images
-from modules.sd_models import select_checkpoint
 from modules.ui_common import create_refresh_button
 from onediff_hijack import do_hijack as onediff_do_hijack
 from onediff_lora import HijackLoraActivate
-from oneflow import __version__ as oneflow_version
 from ui_utils import (
-    all_compiler_caches_path,
     check_structure_change_and_update,
     get_all_compiler_caches,
     hints_message,
     load_graph,
+    onediff_enabled,
     refresh_all_compiler_caches,
     save_graph,
-    onediff_enabled,
 )
 
-from onediff import __version__ as onediff_version
-from onediff.optimization.quant_optimizer import (
-    quantize_model,
-    varify_can_use_quantization,
-)
+from onediff.optimization.quant_optimizer import varify_can_use_quantization
 from onediff.utils import logger, parse_boolean_from_env
 
 """oneflow_compiled UNetModel"""
-# compiled_unet = {}
-# compiled_unet = None
-# is_unet_quantized = False
-# compiled_ckpt_name = None
-
-
-def generate_graph_path(ckpt_name: str, model_name: str) -> str:
-    base_output_dir = shared.opts.outdir_samples or shared.opts.outdir_txt2img_samples
-    save_ckpt_graphs_path = os.path.join(base_output_dir, "graphs", ckpt_name)
-    os.makedirs(save_ckpt_graphs_path, exist_ok=True)
-
-    file_name = f"{model_name}_graph_{onediff_version}_oneflow_{oneflow_version}"
-
-    graph_file_path = os.path.join(save_ckpt_graphs_path, file_name)
-
-    return graph_file_path
-
-
-def get_calibrate_info(filename: str) -> Union[None, Dict]:
-    calibration_path = Path(select_checkpoint().filename).parent / filename
-    if not calibration_path.exists():
-        return None
-
-    logger.info(f"Got calibrate info at {str(calibration_path)}")
-    calibrate_info = {}
-    with open(calibration_path, "r") as f:
-        for line in f.readlines():
-            line = line.strip()
-            items = line.split(" ")
-            calibrate_info[items[0]] = [
-                float(items[1]),
-                int(items[2]),
-                [float(x) for x in items[3].split(",")],
-            ]
-    return calibrate_info
 
 
 class UnetCompileCtx(object):
@@ -85,13 +39,11 @@ class UnetCompileCtx(object):
     and then the original model restored so that subsequent reasoning with onediff disabled meets expectations.
     """
 
-    # def __init__(self, compiled_unet):
-    #     self.compiled_unet = compiled_unet
-
     def __enter__(self):
         self._original_model = shared.sd_model.model.diffusion_model
-            # onediff_shared.current_unet_graph.graph_module
-        shared.sd_model.model.diffusion_model = onediff_shared.current_unet_graph.graph_module
+        shared.sd_model.model.diffusion_model = (
+            onediff_shared.current_unet_graph.graph_module
+        )
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         shared.sd_model.model.diffusion_model = self._original_model
@@ -146,8 +98,13 @@ def run(
         always_recompile=False,
     ):
         # restore checkpoint_info from refiner to base model
-        if sd_models.checkpoint_aliases.get(p.override_settings.get('sd_model_checkpoint')) is None:
-            p.override_settings.pop('sd_model_checkpoint', None)
+        if (
+            sd_models.checkpoint_aliases.get(
+                p.override_settings.get("sd_model_checkpoint")
+            )
+            is None
+        ):
+            p.override_settings.pop("sd_model_checkpoint", None)
             sd_models.reload_model_weights()
             torch.cuda.empty_cache()
 
@@ -204,28 +161,9 @@ def on_ui_settings():
 
 
 def cfg_denoisers_callback(params):
-    # check refiner model
-    # print(f"current checkpoint: {shared.opts.sd_model_checkpoint}")
-    # import ipdb; ipdb.set_trace()
-    if "refiner" in shared.sd_model.sd_checkpoint_info.name:
-        # onediff_shared.current_unet_graph = get_compiled_graph(
-        #     shared.sd_model, quantization
-        # )
-        # load_graph(onediff_shared.current_unet_graph, compiler_cache)
-        # import ipdb; ipdb.set_trace()
-        pass
-        # import ipdb; ipdb.set_trace()
-        # shared.sd_model.model.diffusion_model
-
-    print(f"current checkpoint info: {shared.sd_model.sd_checkpoint_info.name}")
-    # shared.sd_model.model.diffusion_model = compile_unet(
-    #     shared.sd_model.model.diffusion_model
-    # )
-
-    # have to check if onediff enabled
-    # print('onediff denoiser callback')
+    pass
 
 
 script_callbacks.on_ui_settings(on_ui_settings)
-script_callbacks.on_cfg_denoiser(cfg_denoisers_callback)
+# script_callbacks.on_cfg_denoiser(cfg_denoisers_callback)
 onediff_do_hijack()
diff --git a/onediff_sd_webui_extensions/ui_utils.py b/onediff_sd_webui_extensions/ui_utils.py
index b4fbf369e..bdb875a38 100644
--- a/onediff_sd_webui_extensions/ui_utils.py
+++ b/onediff_sd_webui_extensions/ui_utils.py
@@ -1,4 +1,5 @@
 import os
+from contextlib import contextmanager
 from pathlib import Path
 from textwrap import dedent
 from zipfile import BadZipFile
@@ -118,7 +119,6 @@ def save_graph(compiled_unet: DeployableModule, saved_cache_name: str = ""):
         compiled_unet.save_graph(saved_cache_name)
 
 
-from contextlib import contextmanager
 @contextmanager
 def onediff_enabled():
     onediff_shared.onediff_enabled = True

From 7b51da0b3ac3ea60d432df4316241b57508939ac Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Tue, 4 Jun 2024 17:01:16 +0800
Subject: [PATCH 05/14] add test

---
 tests/sd-webui/test_api.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/sd-webui/test_api.py b/tests/sd-webui/test_api.py
index c745ad86d..2fbc40cfb 100644
--- a/tests/sd-webui/test_api.py
+++ b/tests/sd-webui/test_api.py
@@ -79,3 +79,14 @@ def test_onediff_load_graph(url_txt2img):
     }
     data = {**get_base_args(), **script_args}
     post_request_and_check(url_txt2img, data)
+
+
+def test_onediff_refiner(url_txt2img):
+    extra_args = {
+        "refiner_checkpoint" :"sd_xl_refiner_1.0.safetensors [7440042bbd]",
+        "refiner_switch_at" : 0.8,
+    }
+    data = {**get_base_args(), **extra_args}
+    # loop 5 times for checking model switching between base and refiner
+    for _ in range(5):
+        post_request_and_check(url_txt2img, data)

From 0843f459251a52627c67bf70cabbd93707702ef0 Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Tue, 4 Jun 2024 23:12:27 +0800
Subject: [PATCH 06/14] fix cuda memory of refiner

---
 .../compile/compile_utils.py                  | 14 +++-----
 onediff_sd_webui_extensions/onediff_lora.py   | 32 +++++++++++--------
 onediff_sd_webui_extensions/onediff_shared.py |  6 ++--
 .../scripts/onediff.py                        | 16 +++-------
 tests/sd-webui/test_api.py                    |  1 +
 5 files changed, 31 insertions(+), 38 deletions(-)

diff --git a/onediff_sd_webui_extensions/compile/compile_utils.py b/onediff_sd_webui_extensions/compile/compile_utils.py
index 42d53bc40..89339832f 100644
--- a/onediff_sd_webui_extensions/compile/compile_utils.py
+++ b/onediff_sd_webui_extensions/compile/compile_utils.py
@@ -5,6 +5,7 @@
 from ldm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelLDM
 from modules.sd_models import select_checkpoint
 from sgm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelSGM
+from ui_utils import check_structure_change_and_update
 
 from onediff.optimization.quant_optimizer import (
     quantize_model,
@@ -61,12 +62,7 @@ def get_calibrate_info(filename: str) -> Union[None, Dict]:
 
 
 def get_compiled_graph(sd_model, quantization) -> OneDiffCompiledGraph:
-    from onediff_shared import graph_dict
-
-    if sd_model.sd_model_hash in graph_dict:
-        return graph_dict[sd_model.sd_model_hash]
-    else:
-        compiled_unet = compile_unet(
-            sd_model.model.diffusion_model, quantization=quantization
-        )
-        return OneDiffCompiledGraph(sd_model, compiled_unet, quantization)
+    compiled_unet = compile_unet(
+        sd_model.model.diffusion_model, quantization=quantization
+    )
+    return OneDiffCompiledGraph(sd_model, compiled_unet, quantization)
diff --git a/onediff_sd_webui_extensions/onediff_lora.py b/onediff_sd_webui_extensions/onediff_lora.py
index a11705867..fb8e8b817 100644
--- a/onediff_sd_webui_extensions/onediff_lora.py
+++ b/onediff_sd_webui_extensions/onediff_lora.py
@@ -1,17 +1,15 @@
+from typing import Any, Mapping
+
 import torch
-from typing import Mapping, Any
+from modules import sd_models
+from modules.sd_hijack_utils import CondFunc
+from onediff_shared import onediff_enabled
 
 from onediff.infer_compiler import DeployableModule
 from onediff.infer_compiler.backends.oneflow.param_utils import (
     update_graph_related_tensor,
 )
 
-from onediff_shared import onediff_enabled
-
-from modules import sd_models
-from modules.sd_hijack_utils import CondFunc
-from compile import OneDiffCompiledGraph
-
 
 class HijackLoraActivate:
     def __init__(self):
@@ -60,7 +58,11 @@ def activate(self, p, params_list):
                     continue
                 networks.network_apply_weights(sub_module)
                 if isinstance(sub_module, torch.nn.Conv2d):
-                    update_graph_related_tensor(sub_module)
+                    # TODO(WangYi): refine here
+                    try:
+                        update_graph_related_tensor(sub_module)
+                    except:
+                        pass
 
     activate._onediff_hijacked = True
     return activate
@@ -73,16 +75,20 @@ def onediff_hijack_load_model_weights(
     sd_model_hash = checkpoint_info.calculate_shorthash()
     import onediff_shared
 
-    cached_model: OneDiffCompiledGraph = onediff_shared.graph_dict.get(
-        sd_model_hash, None
-    )
-    if cached_model is not None:
-        model.model.diffusion_model = cached_model.graph_module
+    if onediff_shared.current_unet_graph.sha == sd_model_hash:
+        model.model.diffusion_model = onediff_shared.current_unet_graph.graph_module
         state_dict = {
             k: v
             for k, v in state_dict.items()
             if not k.startswith("model.diffusion_model.")
         }
+
+        # for stable-diffusion-webui/modules/sd_models.py:load_model_weights model.is_ssd check
+        state_dict[
+            "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight"
+        ] = model.get_parameter(
+            "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight"
+        )
     return orig_func(model, checkpoint_info, state_dict, timer)
 
 
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
index 233f0c887..a5dcd563a 100644
--- a/onediff_sd_webui_extensions/onediff_shared.py
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -2,10 +2,8 @@
 
 from compile.onediff_compiled_graph import OneDiffCompiledGraph
 
-# from compile_utils import OneDiffCompiledGraph
-
 current_unet_graph = OneDiffCompiledGraph()
-graph_dict: Dict[str, OneDiffCompiledGraph] = dict()
+current_quantization = False
 refiner_dict: Dict[str, str] = dict()
 current_unet_type = {
     "is_sdxl": False,
@@ -13,4 +11,4 @@
     "is_sd1": False,
     "is_ssd": False,
 }
-onediff_enabled = True
+onediff_enabled = False
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index 890cff67e..0ab98eab2 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -5,13 +5,9 @@
 import modules.sd_models as sd_models
 import modules.shared as shared
 import onediff_shared
+import oneflow as flow
 import torch
-from compile import (
-    OneDiffCompiledGraph,
-    SD21CompileCtx,
-    VaeCompileCtx,
-    get_compiled_graph,
-)
+from compile import SD21CompileCtx, VaeCompileCtx, get_compiled_graph
 from modules import script_callbacks
 from modules.processing import process_images
 from modules.ui_common import create_refresh_button
@@ -97,7 +93,7 @@ def run(
         saved_cache_name="",
         always_recompile=False,
     ):
-        # restore checkpoint_info from refiner to base model
+        # restore checkpoint_info from refiner to base model if necessary
         if (
             sd_models.checkpoint_aliases.get(
                 p.override_settings.get("sd_model_checkpoint")
@@ -107,6 +103,7 @@ def run(
             p.override_settings.pop("sd_model_checkpoint", None)
             sd_models.reload_model_weights()
             torch.cuda.empty_cache()
+            flow.cuda.empty_cache()
 
         current_checkpoint_name = shared.sd_model.sd_checkpoint_info.name
         ckpt_changed = (
@@ -137,11 +134,6 @@ def run(
                 f"Model {current_checkpoint_name} has same sd type of graph type {onediff_shared.current_unet_type}, skip compile"
             )
 
-        # register graph
-        onediff_shared.graph_dict[shared.sd_model.sd_model_hash] = OneDiffCompiledGraph(
-            shared.sd_model, graph_module=onediff_shared.current_unet_graph.graph_module
-        )
-
         with UnetCompileCtx(), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate(), onediff_enabled():
             proc = process_images(p)
         save_graph(onediff_shared.current_unet_graph, saved_cache_name)
diff --git a/tests/sd-webui/test_api.py b/tests/sd-webui/test_api.py
index 2fbc40cfb..9c6d32fdc 100644
--- a/tests/sd-webui/test_api.py
+++ b/tests/sd-webui/test_api.py
@@ -83,6 +83,7 @@ def test_onediff_load_graph(url_txt2img):
 
 def test_onediff_refiner(url_txt2img):
     extra_args = {
+        "sd_model_checkpoint": "sd_xl_base_1.0.safetensors",
         "refiner_checkpoint" :"sd_xl_refiner_1.0.safetensors [7440042bbd]",
         "refiner_switch_at" : 0.8,
     }

From 345da80d6de630114d4c1654989585b13e29d16d Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Wed, 5 Jun 2024 12:43:53 +0800
Subject: [PATCH 07/14] refine

---
 onediff_sd_webui_extensions/README.md         |   2 +
 .../compile/compile_utils.py                  |   1 -
 onediff_sd_webui_extensions/onediff_hijack.py | 133 ++++++++++++++++++
 onediff_sd_webui_extensions/onediff_lora.py   | 132 -----------------
 onediff_sd_webui_extensions/onediff_shared.py |   3 -
 .../scripts/onediff.py                        |   6 +-
 tests/sd-webui/test_api.py                    |   3 +-
 7 files changed, 141 insertions(+), 139 deletions(-)

diff --git a/onediff_sd_webui_extensions/README.md b/onediff_sd_webui_extensions/README.md
index e4a0e3f3a..0e7b14d14 100644
--- a/onediff_sd_webui_extensions/README.md
+++ b/onediff_sd_webui_extensions/README.md
@@ -4,8 +4,10 @@
 - [Installation Guide](#installation-guide)
 - [Extensions Usage](#extensions-usage)
     - [Fast Model Switching](#fast-model-switching)
+    - [Compiler cache saving and loading](#compiler-cache-saving-and-loading)
     - [LoRA](#lora)
 - [Quantization](#quantization)
+- [Use OneDiff by API](#use-onediff-by-api)
 - [Contact](#contact)
 
 ## Performance of Community Edition
diff --git a/onediff_sd_webui_extensions/compile/compile_utils.py b/onediff_sd_webui_extensions/compile/compile_utils.py
index 89339832f..9d39fbc96 100644
--- a/onediff_sd_webui_extensions/compile/compile_utils.py
+++ b/onediff_sd_webui_extensions/compile/compile_utils.py
@@ -5,7 +5,6 @@
 from ldm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelLDM
 from modules.sd_models import select_checkpoint
 from sgm.modules.diffusionmodules.openaimodel import UNetModel as UNetModelSGM
-from ui_utils import check_structure_change_and_update
 
 from onediff.optimization.quant_optimizer import (
     quantize_model,
diff --git a/onediff_sd_webui_extensions/onediff_hijack.py b/onediff_sd_webui_extensions/onediff_hijack.py
index b6df91af0..355180202 100644
--- a/onediff_sd_webui_extensions/onediff_hijack.py
+++ b/onediff_sd_webui_extensions/onediff_hijack.py
@@ -1,5 +1,11 @@
+from typing import Any, Mapping
+
 import oneflow
+import torch
 from compile import compile_ldm, compile_sgm
+from modules import sd_models
+from modules.sd_hijack_utils import CondFunc
+from onediff_shared import onediff_enabled
 
 
 # https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/1c0a0c4c26f78c32095ebc7f8af82f5c04fca8c0/modules/sd_hijack_unet.py#L8
@@ -94,3 +100,130 @@ def undo_hijack():
         name="send_model_to_cpu",
         new_name="__onediff_original_send_model_to_cpu",
     )
+
+
+def onediff_hijack_load_model_weights(
+    orig_func, model, checkpoint_info: sd_models.CheckpointInfo, state_dict: dict, timer
+):
+    """Wrap webui's `load_model_weights`, reusing the current compiled UNet on a hash match.
+
+    On a match the UNet entries are dropped from `state_dict` before delegating.
+    """
+    sd_model_hash = checkpoint_info.calculate_shorthash()
+    import onediff_shared
+    # key read by stable-diffusion-webui/modules/sd_models.py:load_model_weights (model.is_ssd check)
+    probe_key = (
+        "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight"
+    )
+    cached_graph = onediff_shared.current_unet_graph
+    if cached_graph.sha == sd_model_hash:
+        model.model.diffusion_model = cached_graph.graph_module
+        unet_prefix = "model.diffusion_model."
+        state_dict = {
+            k: v for k, v in state_dict.items() if not k.startswith(unet_prefix)
+        }
+        state_dict[probe_key] = model.get_parameter(probe_key)
+    return orig_func(model, checkpoint_info, state_dict, timer)
+
+
+def onediff_hijack_load_state_dict(
+    orig_func, self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False
+):
+    """Call `orig_func`, forcing `assign=True` when CUDA weights target meta parameters.
+
+    The first state-dict value and the first module parameter serve as probes. A
+    module without parameters keeps the caller's `assign` instead of raising
+    StopIteration.
+    """
+    first_value = next(iter(state_dict.values()), None)
+    if first_value is not None and first_value.is_cuda:
+        first_param = next(self.parameters(), None)  # None for a parameterless module
+        if first_param is not None and first_param.is_meta:
+            return orig_func(self, state_dict, strict, assign=True)
+    return orig_func(self, state_dict, strict, assign)
+
+
+# fmt: off
+def onediff_hijaced_LoadStateDictOnMeta___enter__(orig_func, self):
+    from modules import shared
+    if shared.cmd_opts.disable_model_loading_ram_optimization:
+        return
+    # NOTE: orig_func is never called in this function; the body below runs in its place.
+    sd = self.state_dict
+    device = self.device
+    # Stand-in for _load_from_state_dict: moves this module's tensors from sd into state_dict first.
+    def load_from_state_dict(original, module, state_dict, prefix, *args, **kwargs):
+        used_param_keys = []
+
+        for name, param in module._parameters.items():
+            if param is None:
+                continue
+
+            key = prefix + name
+            sd_param = sd.pop(key, None)
+            if sd_param is not None:
+                state_dict[key] = sd_param.to(dtype=self.get_weight_dtype(key))
+                used_param_keys.append(key)
+            # give a meta parameter real, zero-filled storage on `device`
+            if param.is_meta:
+                dtype = sd_param.dtype if sd_param is not None else param.dtype
+                module._parameters[name] = torch.nn.parameter.Parameter(torch.zeros_like(param, device=device, dtype=dtype), requires_grad=param.requires_grad)
+
+        for name in module._buffers:
+            key = prefix + name
+
+            sd_param = sd.pop(key, None)
+            if sd_param is not None:
+                state_dict[key] = sd_param
+                used_param_keys.append(key)
+
+        original(module, state_dict, prefix, *args, **kwargs)
+        # remove the keys inserted above so they do not stay in state_dict
+        for key in used_param_keys:
+            state_dict.pop(key, None)
+
+    # Stand-in for torch.nn.Module.load_state_dict (installed via self.replace below).
+    def load_state_dict(original, module, state_dict, strict=True):
+        """torch makes a lot of copies of the dictionary with weights, so just deleting entries from state_dict does not help
+        because the same values are stored in multiple copies of the dict. The trick used here is to give torch a dict with
+        all weights on meta device, i.e. deleted, and then it doesn't matter how many copies torch makes.
+
+        In _load_from_state_dict, the correct weight will be obtained from a single dict with the right weights (sd).
+
+        The dangerous thing about this is if _load_from_state_dict is not called, (if some exotic module overloads
+        the function and does not call the original) the state dict will just fail to load because weights
+        would be on the meta device.
+        """
+
+        if state_dict is sd:
+            state_dict = {k: v.to(device="meta", dtype=v.dtype) for k, v in state_dict.items()}
+
+        # ------------------- DIFF HERE -------------------
+        # assign=True only when the first state_dict value is on CUDA and the first module parameter is on meta
+        if len(state_dict) > 0 and next(iter(state_dict.values())).is_cuda and next(module.parameters()).is_meta:
+            assign = True
+        else:
+            assign = False
+        # replaces the call: original(module, state_dict, strict=strict)
+        original(module, state_dict, strict=strict, assign=assign)
+    # Install the wrappers; each receives what self.replace returned (presumably the original method) as `original`.
+    module_load_state_dict = self.replace(torch.nn.Module, 'load_state_dict', lambda *args, **kwargs: load_state_dict(module_load_state_dict, *args, **kwargs))
+    module_load_from_state_dict = self.replace(torch.nn.Module, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(module_load_from_state_dict, *args, **kwargs))
+    linear_load_from_state_dict = self.replace(torch.nn.Linear, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(linear_load_from_state_dict, *args, **kwargs))
+    conv2d_load_from_state_dict = self.replace(torch.nn.Conv2d, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(conv2d_load_from_state_dict, *args, **kwargs))
+    mha_load_from_state_dict = self.replace(torch.nn.MultiheadAttention, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(mha_load_from_state_dict, *args, **kwargs))
+    layer_norm_load_from_state_dict = self.replace(torch.nn.LayerNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(layer_norm_load_from_state_dict, *args, **kwargs))
+    group_norm_load_from_state_dict = self.replace(torch.nn.GroupNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(group_norm_load_from_state_dict, *args, **kwargs))
+# fmt: on
+
+
+CondFunc(
+    "modules.sd_disable_initialization.LoadStateDictOnMeta.__enter__",
+    onediff_hijaced_LoadStateDictOnMeta___enter__,
+    lambda _, *args, **kwargs: onediff_enabled,
+)
+CondFunc(
+    "modules.sd_models.load_model_weights",
+    onediff_hijack_load_model_weights,
+    lambda _, *args, **kwargs: onediff_enabled,
+)
diff --git a/onediff_sd_webui_extensions/onediff_lora.py b/onediff_sd_webui_extensions/onediff_lora.py
index fb8e8b817..a1f4da8da 100644
--- a/onediff_sd_webui_extensions/onediff_lora.py
+++ b/onediff_sd_webui_extensions/onediff_lora.py
@@ -1,9 +1,4 @@
-from typing import Any, Mapping
-
 import torch
-from modules import sd_models
-from modules.sd_hijack_utils import CondFunc
-from onediff_shared import onediff_enabled
 
 from onediff.infer_compiler import DeployableModule
 from onediff.infer_compiler.backends.oneflow.param_utils import (
@@ -66,130 +61,3 @@ def activate(self, p, params_list):
 
     activate._onediff_hijacked = True
     return activate
-
-
-def onediff_hijack_load_model_weights(
-    orig_func, model, checkpoint_info: sd_models.CheckpointInfo, state_dict: dict, timer
-):
-    # load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer)
-    sd_model_hash = checkpoint_info.calculate_shorthash()
-    import onediff_shared
-
-    if onediff_shared.current_unet_graph.sha == sd_model_hash:
-        model.model.diffusion_model = onediff_shared.current_unet_graph.graph_module
-        state_dict = {
-            k: v
-            for k, v in state_dict.items()
-            if not k.startswith("model.diffusion_model.")
-        }
-
-        # for stable-diffusion-webui/modules/sd_models.py:load_model_weights model.is_ssd check
-        state_dict[
-            "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight"
-        ] = model.get_parameter(
-            "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight"
-        )
-    return orig_func(model, checkpoint_info, state_dict, timer)
-
-
-def onediff_hijack_load_state_dict(
-    orig_func,
-    self,
-    state_dict: Mapping[str, Any],
-    strict: bool = True,
-    assign: bool = False,
-):
-    if (
-        len(state_dict) > 0
-        and next(iter(state_dict.values())).is_cuda
-        and next(self.parameters()).is_meta
-    ):
-        return orig_func(self, state_dict, strict, assign=True)
-    else:
-        return orig_func(self, state_dict, strict, assign)
-
-
-# fmt: off
-def onediff_hijaced_LoadStateDictOnMeta___enter__(orig_func, self):
-    from modules import shared
-    if shared.cmd_opts.disable_model_loading_ram_optimization:
-        return
-
-    sd = self.state_dict
-    device = self.device
-
-    def load_from_state_dict(original, module, state_dict, prefix, *args, **kwargs):
-        used_param_keys = []
-
-        for name, param in module._parameters.items():
-            if param is None:
-                continue
-
-            key = prefix + name
-            sd_param = sd.pop(key, None)
-            if sd_param is not None:
-                state_dict[key] = sd_param.to(dtype=self.get_weight_dtype(key))
-                used_param_keys.append(key)
-
-            if param.is_meta:
-                dtype = sd_param.dtype if sd_param is not None else param.dtype
-                module._parameters[name] = torch.nn.parameter.Parameter(torch.zeros_like(param, device=device, dtype=dtype), requires_grad=param.requires_grad)
-
-        for name in module._buffers:
-            key = prefix + name
-
-            sd_param = sd.pop(key, None)
-            if sd_param is not None:
-                state_dict[key] = sd_param
-                used_param_keys.append(key)
-
-        original(module, state_dict, prefix, *args, **kwargs)
-
-        for key in used_param_keys:
-            state_dict.pop(key, None)
-
-    # def load_state_dict(original, module, state_dict, strict=True):
-    def load_state_dict(original, module, state_dict, strict=True):
-        """torch makes a lot of copies of the dictionary with weights, so just deleting entries from state_dict does not help
-        because the same values are stored in multiple copies of the dict. The trick used here is to give torch a dict with
-        all weights on meta device, i.e. deleted, and then it doesn't matter how many copies torch makes.
-
-        In _load_from_state_dict, the correct weight will be obtained from a single dict with the right weights (sd).
-
-        The dangerous thing about this is if _load_from_state_dict is not called, (if some exotic module overloads
-        the function and does not call the original) the state dict will just fail to load because weights
-        would be on the meta device.
-        """
-
-        if state_dict is sd:
-            state_dict = {k: v.to(device="meta", dtype=v.dtype) for k, v in state_dict.items()}
-
-        # ------------------- DIFF HERE -------------------
-        # original(module, state_dict, strict=strict)
-        if len(state_dict) > 0 and next(iter(state_dict.values())).is_cuda and next(module.parameters()).is_meta:
-            assign = True
-        else:
-            assign = False
-        # orig_func(original, module, state_dict, strict=strict, assign=assign)
-        original(module, state_dict, strict=strict, assign=assign)
-
-    module_load_state_dict = self.replace(torch.nn.Module, 'load_state_dict', lambda *args, **kwargs: load_state_dict(module_load_state_dict, *args, **kwargs))
-    module_load_from_state_dict = self.replace(torch.nn.Module, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(module_load_from_state_dict, *args, **kwargs))
-    linear_load_from_state_dict = self.replace(torch.nn.Linear, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(linear_load_from_state_dict, *args, **kwargs))
-    conv2d_load_from_state_dict = self.replace(torch.nn.Conv2d, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(conv2d_load_from_state_dict, *args, **kwargs))
-    mha_load_from_state_dict = self.replace(torch.nn.MultiheadAttention, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(mha_load_from_state_dict, *args, **kwargs))
-    layer_norm_load_from_state_dict = self.replace(torch.nn.LayerNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(layer_norm_load_from_state_dict, *args, **kwargs))
-    group_norm_load_from_state_dict = self.replace(torch.nn.GroupNorm, '_load_from_state_dict', lambda *args, **kwargs: load_from_state_dict(group_norm_load_from_state_dict, *args, **kwargs))
-# fmt: on
-
-
-CondFunc(
-    "modules.sd_disable_initialization.LoadStateDictOnMeta.__enter__",
-    onediff_hijaced_LoadStateDictOnMeta___enter__,
-    lambda _, *args, **kwargs: onediff_enabled,
-)
-CondFunc(
-    "modules.sd_models.load_model_weights",
-    onediff_hijack_load_model_weights,
-    lambda _, *args, **kwargs: onediff_enabled,
-)
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
index a5dcd563a..8d9e4cf15 100644
--- a/onediff_sd_webui_extensions/onediff_shared.py
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -1,10 +1,7 @@
-from typing import Dict
-
 from compile.onediff_compiled_graph import OneDiffCompiledGraph
 
 current_unet_graph = OneDiffCompiledGraph()
 current_quantization = False
-refiner_dict: Dict[str, str] = dict()
 current_unet_type = {
     "is_sdxl": False,
     "is_sd2": False,
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index 0ab98eab2..0561469d8 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -6,9 +6,9 @@
 import modules.shared as shared
 import onediff_shared
 import oneflow as flow
-import torch
 from compile import SD21CompileCtx, VaeCompileCtx, get_compiled_graph
 from modules import script_callbacks
+from modules.devices import torch_gc
 from modules.processing import process_images
 from modules.ui_common import create_refresh_button
 from onediff_hijack import do_hijack as onediff_do_hijack
@@ -102,7 +102,7 @@ def run(
         ):
             p.override_settings.pop("sd_model_checkpoint", None)
             sd_models.reload_model_weights()
-            torch.cuda.empty_cache()
+            torch_gc()
             flow.cuda.empty_cache()
 
         current_checkpoint_name = shared.sd_model.sd_checkpoint_info.name
@@ -137,6 +137,8 @@ def run(
         with UnetCompileCtx(), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate(), onediff_enabled():
             proc = process_images(p)
         save_graph(onediff_shared.current_unet_graph, saved_cache_name)
+        torch_gc()
+        flow.cuda.empty_cache()
         return proc
 
 
diff --git a/tests/sd-webui/test_api.py b/tests/sd-webui/test_api.py
index 9c6d32fdc..0ec72553c 100644
--- a/tests/sd-webui/test_api.py
+++ b/tests/sd-webui/test_api.py
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 import pytest
 from PIL import Image
@@ -89,5 +90,5 @@ def test_onediff_refiner(url_txt2img):
     }
     data = {**get_base_args(), **extra_args}
     # loop 5 times for checking model switching between base and refiner
-    for _ in range(5):
+    for _ in range(3):
         post_request_and_check(url_txt2img, data)

From 03b3a89ee357c4b7a8ae4990da962602fa48afcc Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Thu, 6 Jun 2024 11:37:45 +0800
Subject: [PATCH 08/14] api test add model

---
 tests/sd-webui/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/sd-webui/utils.py b/tests/sd-webui/utils.py
index 4dc28773b..f0f520f2e 100644
--- a/tests/sd-webui/utils.py
+++ b/tests/sd-webui/utils.py
@@ -30,6 +30,7 @@ def get_base_args() -> Dict[str, Any]:
     return {
         "prompt": "1girl",
         "negative_prompt": "",
+        "sd_model_checkpoint": "checkpoints/AWPainting_v1.2.safetensors",
         "seed": SEED,
         "steps": NUM_STEPS,
         "width": WIDTH,

From e3acdbb830b47a4982a323f915520d6f33ffabe9 Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Thu, 13 Jun 2024 16:48:38 +0800
Subject: [PATCH 09/14] support controlnet unet (controlnet model not supported
 now)

---
 .../compile/compile_utils.py                  |    6 +-
 .../compile/sd_webui_onediff_utils.py         |   25 +-
 .../onediff_controlnet.py                     | 1008 +++++++++++++++++
 onediff_sd_webui_extensions/onediff_shared.py |    4 +
 .../{ui_utils.py => onediff_utils.py}         |   27 +-
 .../scripts/onediff.py                        |   40 +-
 6 files changed, 1083 insertions(+), 27 deletions(-)
 create mode 100644 onediff_sd_webui_extensions/onediff_controlnet.py
 rename onediff_sd_webui_extensions/{ui_utils.py => onediff_utils.py} (89%)

diff --git a/onediff_sd_webui_extensions/compile/compile_utils.py b/onediff_sd_webui_extensions/compile/compile_utils.py
index 9d39fbc96..451fc26ba 100644
--- a/onediff_sd_webui_extensions/compile/compile_utils.py
+++ b/onediff_sd_webui_extensions/compile/compile_utils.py
@@ -61,7 +61,11 @@ def get_calibrate_info(filename: str) -> Union[None, Dict]:
 
 
 def get_compiled_graph(sd_model, quantization) -> OneDiffCompiledGraph:
+    diffusion_model = sd_model.model.diffusion_model
+    # for controlnet: remove any `forward` stored in the instance __dict__ before
+    # compiling (presumably an override left by the ControlNet hook - confirm)
+    diffusion_model.__dict__.pop("forward", None)
     compiled_unet = compile_unet(
-        sd_model.model.diffusion_model, quantization=quantization
+        diffusion_model, quantization=quantization
     )
     return OneDiffCompiledGraph(sd_model, compiled_unet, quantization)
diff --git a/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py b/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py
index db338fbf1..93aad2f49 100644
--- a/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py
+++ b/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py
@@ -13,17 +13,20 @@ def forward(self, x):
 
 
 # https://github.com/Stability-AI/generative-models/blob/059d8e9cd9c55aea1ef2ece39abf605efb8b7cc9/sgm/modules/diffusionmodules/util.py#L207
-def timestep_embedding(timesteps, dim, max_period=10000):
-    half = dim // 2
-    freqs = flow.exp(
-        -math.log(max_period)
-        * flow.arange(start=0, end=half, dtype=flow.float32)
-        / half
-    ).to(device=timesteps.device)
-    args = timesteps[:, None].float() * freqs[None]
-    embedding = flow.cat([flow.cos(args), flow.sin(args)], dim=-1)
-    if dim % 2:
-        embedding = flow.cat([embedding, flow.zeros_like(embedding[:, :1])], dim=-1)
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+    """Return sinusoidal (cos | sin) timestep embeddings of width `dim` as a OneFlow tensor.
+
+    `repeat_only=True` raises NotImplementedError.
+    """
+    if repeat_only:
+        raise NotImplementedError("repeat_only=True is not implemented in timestep_embedding")
+    half = dim // 2
+    exponent = -math.log(max_period) * flow.arange(start=0, end=half, dtype=flow.float32) / half
+    freqs = flow.exp(exponent).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = flow.cat([flow.cos(args), flow.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = flow.cat([embedding, flow.zeros_like(embedding[:, :1])], dim=-1)
     return embedding
 
 
diff --git a/onediff_sd_webui_extensions/onediff_controlnet.py b/onediff_sd_webui_extensions/onediff_controlnet.py
new file mode 100644
index 000000000..6e3899a7d
--- /dev/null
+++ b/onediff_sd_webui_extensions/onediff_controlnet.py
@@ -0,0 +1,1008 @@
+import onediff_shared
+import oneflow as flow
+import torch
+import torch as th
+from compile import OneDiffCompiledGraph
+from compile.sd_webui_onediff_utils import (CrossAttentionOflow,
+                                            GroupNorm32Oflow,
+                                            timestep_embedding)
+from ldm.modules.attention import BasicTransformerBlock, CrossAttention
+from ldm.modules.diffusionmodules.openaimodel import ResBlock, UNetModel
+from ldm.modules.diffusionmodules.util import GroupNorm32
+from modules import devices
+from modules.sd_hijack_utils import CondFunc
+from onediff_utils import singleton_decorator
+
+from onediff.infer_compiler import oneflow_compile
+from onediff.infer_compiler.backends.oneflow.transform import (proxy_class,
+                                                               register)
+
+
+def torch_aligned_adding(base, x, require_channel_alignment):
+    """Return `base + x`, first padding/resizing a tensor `x` to match `base`.
+
+    A float `x` is added as a scalar; for 0.0 `base` itself is returned.
+    """
+    if isinstance(x, float):
+        return base if x == 0.0 else base + x
+
+    if require_channel_alignment:
+        # zero-pad x along dim 1 up to base's channel count
+        padded = torch.zeros_like(base)
+        padded[:, : x.shape[1], ...] = x
+        x = padded
+
+    # resize to sample resolution
+    target_hw = tuple(base.shape[-2:])
+    src_h, src_w = x.shape[-2:]
+    needs_resize = (src_h > 1 or src_w > 1) and (src_h, src_w) != target_hw
+    if needs_resize:
+        x = th.nn.functional.interpolate(x, size=target_hw, mode="nearest")
+    return base + x
+
+
+def oneflow_aligned_adding(base, x, require_channel_alignment):
+    """Return `base + x`, first padding/resizing a OneFlow tensor `x` to match `base`."""
+    if isinstance(x, float):
+        return base + x
+
+    if require_channel_alignment:
+        # zero-pad x along dim 1 up to base's channel count
+        zeros = flow.zeros_like(base)
+        zeros[:, : x.shape[1], ...] = x
+        x = zeros
+    # resize to sample resolution
+    base_h, base_w = base.shape[-2:]
+    xh, xw = x.shape[-2:]
+    # parenthesised: the size-mismatch test must guard both axes, as in torch_aligned_adding
+    if (xh > 1 or xw > 1) and (base_h != xh or base_w != xw):
+        x = flow.nn.functional.interpolate(x, size=(base_h, base_w), mode="nearest")
+    return base + x
+
+
+cond_cast_unet = getattr(devices, "cond_cast_unet", lambda x: x)
+
+
+class TorchOnediffControlNetModel(torch.nn.Module):  # reuses a UNet's submodules; forward also takes ControlNet / T2I inputs
+    def __init__(self, unet):
+        super().__init__()
+        self.time_embed = unet.time_embed
+        self.input_blocks = unet.input_blocks
+        self.label_emb = getattr(unet, "label_emb", None)
+        self.middle_block = unet.middle_block
+        self.output_blocks = unet.output_blocks
+        self.out = unet.out
+        self.model_channels = unet.model_channels
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        y,
+        total_t2i_adapter_embedding,
+        total_controlnet_embedding,
+        is_sdxl,
+        require_inpaint_hijack,
+    ):
+        from ldm.modules.diffusionmodules.util import timestep_embedding
+        # hs stacks the input-block outputs; the decoder loop pops them in reverse order
+        hs = []
+        with th.no_grad():
+            t_emb = cond_cast_unet(
+                timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+            )
+            emb = self.time_embed(t_emb)
+            # SDXL path: add the label embedding of y to the timestep embedding
+            if is_sdxl:
+                assert y.shape[0] == x.shape[0]
+                emb = emb + self.label_emb(y)
+
+            h = x
+            for i, module in enumerate(self.input_blocks):
+                self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3])
+                h = module(h, emb, context)
+                # input-block indices after which one T2I-adapter embedding is taken from the front of the list
+                t2i_injection = [3, 5, 8] if is_sdxl else [2, 5, 8, 11]
+
+                if i in t2i_injection:
+                    h = torch_aligned_adding(
+                        h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack
+                    )
+
+                hs.append(h)
+
+            self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3])
+            h = self.middle_block(h, emb, context)
+        # NOTE: both embedding lists are consumed in place via pop()
+        # U-Net Middle Block: add the ControlNet entry popped from the end of the list
+        h = torch_aligned_adding(
+            h, total_controlnet_embedding.pop(), require_inpaint_hijack
+        )
+        # SDXL only: consume one more T2I-adapter embedding if any is left
+        if len(total_t2i_adapter_embedding) > 0 and is_sdxl:
+            h = torch_aligned_adding(
+                h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack
+            )
+
+        # U-Net Decoder: concatenate h with each stacked output (plus its ControlNet entry) on dim 1
+        for i, module in enumerate(self.output_blocks):
+            self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3])
+            h = th.cat(
+                [
+                    h,
+                    torch_aligned_adding(
+                        hs.pop(),
+                        total_controlnet_embedding.pop(),
+                        require_inpaint_hijack,
+                    ),
+                ],
+                dim=1,
+            )
+            h = module(h, emb, context)
+
+        # U-Net Output: cast back to the input dtype before the final `out` module
+        h = h.type(x.dtype)
+        h = self.out(h)
+
+        return h
+
+
+class OneFlowOnediffControlNetModel(proxy_class(UNetModel)):  # OneFlow-side forward mapped from TorchOnediffControlNetModel
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        y,
+        total_t2i_adapter_embedding,
+        total_controlnet_embedding,
+        is_sdxl,
+        require_inpaint_hijack,
+    ):
+        x = x.half()  # x, y (when given) and context are cast with .half() up front
+        if y is not None:
+            y = y.half()
+        context = context.half()
+        hs = []  # stacks the input-block outputs; popped in reverse by the decoder loop
+        with flow.no_grad():
+            t_emb = cond_cast_unet(
+                timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+            )
+            emb = self.time_embed(t_emb.half())
+            # SDXL path: add the label embedding of y to the timestep embedding
+            if is_sdxl:
+                assert y.shape[0] == x.shape[0]
+                emb = emb + self.label_emb(y)
+
+            h = x
+            for i, module in enumerate(self.input_blocks):
+                self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3])
+                h = module(h, emb, context)
+                # input-block indices after which one T2I-adapter embedding is taken from the front of the list
+                t2i_injection = [3, 5, 8] if is_sdxl else [2, 5, 8, 11]
+
+                if i in t2i_injection:
+                    h = oneflow_aligned_adding(
+                        h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack
+                    )
+
+                hs.append(h)
+
+            self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3])
+            h = self.middle_block(h, emb, context)
+        # NOTE: both embedding lists are consumed in place via pop()
+        # U-Net Middle Block: add the ControlNet entry popped from the end of the list
+        h = oneflow_aligned_adding(
+            h, total_controlnet_embedding.pop(), require_inpaint_hijack
+        )
+        # SDXL only: consume one more T2I-adapter embedding if any is left
+        if len(total_t2i_adapter_embedding) > 0 and is_sdxl:
+            h = oneflow_aligned_adding(
+                h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack
+            )
+
+        # U-Net Decoder: concatenate h with each stacked output (plus its ControlNet entry) on dim 1
+        for i, module in enumerate(self.output_blocks):
+            self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3])
+            h = flow.cat(
+                [
+                    h,
+                    oneflow_aligned_adding(
+                        hs.pop(),
+                        total_controlnet_embedding.pop(),
+                        require_inpaint_hijack,
+                    ),
+                ],
+                dim=1,
+            )
+            h = h.half()
+            h = module(h, emb, context)
+
+        # U-Net Output: cast to x's dtype (x was cast with .half() above) before the final `out` module
+        h = h.type(x.dtype)
+        h = self.out(h)
+
+        return h
+
+
+def compile_controlnet_ldm_unet(sd_model, unet_model, *, options=None):
+    """Compile `unet_model` with OneFlow and wrap it as a "_controlnet" compiled graph."""
+    # turn off the checkpoint flags on transformer / residual blocks before compiling
+    for submodule in unet_model.modules():
+        if isinstance(submodule, BasicTransformerBlock):
+            submodule.checkpoint = False
+        if isinstance(submodule, ResBlock):
+            submodule.use_checkpoint = False
+    graph = OneDiffCompiledGraph(sd_model, oneflow_compile(unet_model, options=options))
+    graph.eager_module = unet_model
+    graph.name += "_controlnet"
+    return graph
+
+
+torch2oflow_class_map = {
+    CrossAttention: CrossAttentionOflow,
+    GroupNorm32: GroupNorm32Oflow,
+    TorchOnediffControlNetModel: OneFlowOnediffControlNetModel,
+}
+register(package_names=["scripts.hook"], torch2oflow_class_map=torch2oflow_class_map)
+
+
+def check_if_controlnet_ext_loaded() -> bool:
+    """Return True if webui lists "sd-webui-controlnet" among its loaded extensions."""
+    from modules.extensions import loaded_extensions
+    return "sd-webui-controlnet" in loaded_extensions
+
+
+def hijacked_main_entry(self, p):
+    """Run ControlNet's original main entry, then compile the wrapped UNet once.
+
+    Nothing is compiled unless `onediff_shared.controlnet_compiled` is False.
+    """
+    self._original_controlnet_main_entry(p)
+    sd_ldm = p.sd_model
+    unet = sd_ldm.model.diffusion_model
+    if onediff_shared.controlnet_compiled is not False:
+        return
+
+    from onediff_controlnet import TorchOnediffControlNetModel
+    wrapped_unet = TorchOnediffControlNetModel(unet)
+    onediff_shared.current_unet_graph = compile_controlnet_ldm_unet(sd_ldm, wrapped_unet)
+    onediff_shared.controlnet_compiled = True
+
+
+
+
+def get_controlnet_script(p):
+    """Return the first script whose module is "controlnet.py", or None."""
+    return next(
+        (s for s in p.scripts.scripts if s.__module__ == "controlnet.py"), None
+    )
+
+
+def check_if_controlnet_enabled(p):
+    """Return True when the ControlNet script exists and reports at least one enabled unit."""
+    script = get_controlnet_script(p)
+    enabled_units = () if script is None else script.get_enabled_units(p)
+    return len(enabled_units) != 0
+
+
+@singleton_decorator
+def create_condfunc(p):
+    """Install the ControlNet hijacks for OneDiff.
+
+    Registers `hijacked_hook` for "scripts.hook.UnetHook.hook" and, when the
+    ControlNet script is found in `p`, swaps its `controlnet_main_entry` for
+    `hijacked_main_entry` (the original stays in `_original_controlnet_main_entry`).
+    """
+    CondFunc(
+        "scripts.hook.UnetHook.hook", hijacked_hook, lambda _, *arg, **kwargs: True
+    )
+    script = get_controlnet_script(p)
+    if script is not None:
+        # keep the original entry reachable, then bind the hijacked one to this script
+        script._original_controlnet_main_entry = script.controlnet_main_entry
+        script.controlnet_main_entry = hijacked_main_entry.__get__(script)
+
+
+
+def hijacked_hook(
+    orig_func,
+    self,
+    model,
+    sd_ldm,
+    control_params,
+    process,
+    batch_option_uint_separate=False,
+    batch_option_style_align=False,
+):
+    from modules import devices, lowvram, scripts, shared
+    from scripts.controlnet_sparsectrl import SparseCtrl
+    from scripts.enums import AutoMachine, ControlModelType, HiResFixOption
+    from scripts.hook import (AbstractLowScaleModel, blur, mark_prompt_context,
+                              predict_noise_from_start, predict_q_sample,
+                              predict_start_from_noise, register_schedule,
+                              torch_dfs, unmark_prompt_context)
+    from scripts.ipadapter.ipadapter_model import ImageEmbed
+    from scripts.logging import logger
+
+    self.model = model
+    self.sd_ldm = sd_ldm
+    self.control_params = control_params
+
+    model_is_sdxl = getattr(self.sd_ldm, "is_sdxl", False)
+
+    outer = self
+
+    def process_sample(*args, **kwargs):
+        # ControlNet must know whether a prompt is conditional prompt (positive prompt) or unconditional conditioning prompt (negative prompt).
+        # You can use the hook.py's `mark_prompt_context` to mark the prompts that will be seen by ControlNet.
+        # Let us say XXX is a MulticondLearnedConditioning or a ComposableScheduledPromptConditioning or a ScheduledPromptConditioning or a list of these components,
+        # if XXX is a positive prompt, you should call mark_prompt_context(XXX, positive=True)
+        # if XXX is a negative prompt, you should call mark_prompt_context(XXX, positive=False)
+        # After you mark the prompts, the ControlNet will know which prompt is cond/uncond and works as expected.
+        # After you mark the prompts, the mismatch errors will disappear.
+        mark_prompt_context(kwargs.get("conditioning", []), positive=True)
+        mark_prompt_context(
+            kwargs.get("unconditional_conditioning", []), positive=False
+        )
+        mark_prompt_context(getattr(process, "hr_c", []), positive=True)
+        mark_prompt_context(getattr(process, "hr_uc", []), positive=False)
+        return process.sample_before_CN_hack(*args, **kwargs)
+
+    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
+        """Hijacked UNet forward: gather control signals, then run the OneDiff graph.
+
+        Strips the cond/uncond marker from ``context``, accumulates the weighted
+        ControlNet / T2I-Adapter outputs of ``outer.control_params``, rebuilds ``x``
+        for 9-channel inpaint models, runs the reference (attention/AdaIN) write
+        pass, and calls ``onediff_shared.current_unet_graph.graph_module``.
+        The prediction is then blended by the ``colorfix`` and ``inpaint_only``
+        post-processing steps and returned.
+        """
+        is_sdxl = y is not None and model_is_sdxl
+        total_t2i_adapter_embedding = [0.0] * 4
+        total_controlnet_embedding = [0.0] * (10 if is_sdxl else 13)
+        require_inpaint_hijack = False
+        is_in_high_res_fix = False
+        batch_size = int(x.shape[0])
+
+        # Handle cond-uncond marker
+        (
+            cond_mark,
+            outer.current_uc_indices,
+            outer.current_c_indices,
+            context,
+        ) = unmark_prompt_context(context)
+        outer.model.cond_mark = cond_mark
+
+        # Revision
+        if is_sdxl:
+            revision_y1280 = 0
+
+            for param in outer.control_params:
+                if param.guidance_stopped:
+                    continue
+                if param.control_model_type == ControlModelType.ReVision:
+                    if param.vision_hint_count is None:
+                        k = (
+                            torch.Tensor(
+                                [int(param.preprocessor["threshold_a"] * 1000)]
+                            )
+                            .to(param.hint_cond)
+                            .long()
+                            .clip(0, 999)
+                        )
+                        param.vision_hint_count = outer.revision_q_sampler.q_sample(
+                            param.hint_cond, k
+                        )
+                    revision_emb = param.vision_hint_count
+                    if isinstance(revision_emb, torch.Tensor):
+                        revision_y1280 += revision_emb * param.weight
+
+            if isinstance(revision_y1280, torch.Tensor):
+                y[:, :1280] = revision_y1280 * cond_mark[:, :, 0, 0]
+                if any(
+                    "ignore_prompt" in param.preprocessor["name"]
+                    for param in outer.control_params
+                ) or (
+                    getattr(process, "prompt", "") == ""
+                    and getattr(process, "negative_prompt", "") == ""
+                ):
+                    context = torch.zeros_like(context)
+
+        # High-res fix
+        for param in outer.control_params:
+            # select which hint_cond to use
+            if param.used_hint_cond is None:
+                param.used_hint_cond = param.hint_cond
+                param.used_hint_cond_latent = None
+                param.used_hint_inpaint_hijack = None
+
+            # has high-res fix
+            if (
+                isinstance(param.hr_hint_cond, torch.Tensor)
+                and x.ndim == 4
+                and param.hint_cond.ndim == 4
+                and param.hr_hint_cond.ndim == 4
+            ):
+                _, _, h_lr, w_lr = param.hint_cond.shape
+                _, _, h_hr, w_hr = param.hr_hint_cond.shape
+                _, _, h, w = x.shape
+                h, w = h * 8, w * 8
+                if abs(h - h_lr) < abs(h - h_hr):
+                    is_in_high_res_fix = False
+                    if param.used_hint_cond is not param.hint_cond:
+                        param.used_hint_cond = param.hint_cond
+                        param.used_hint_cond_latent = None
+                        param.used_hint_inpaint_hijack = None
+                else:
+                    is_in_high_res_fix = True
+                    if param.used_hint_cond is not param.hr_hint_cond:
+                        param.used_hint_cond = param.hr_hint_cond
+                        param.used_hint_cond_latent = None
+                        param.used_hint_inpaint_hijack = None
+
+        self.is_in_high_res_fix = is_in_high_res_fix
+        outer.is_in_high_res_fix = is_in_high_res_fix
+
+        # Convert control image to latent
+        for param in outer.control_params:
+            if param.used_hint_cond_latent is not None:
+                continue
+            if (
+                param.control_model_type not in [ControlModelType.AttentionInjection]
+                and "colorfix" not in param.preprocessor["name"]
+                and "inpaint_only" not in param.preprocessor["name"]
+            ):
+                continue
+            param.used_hint_cond_latent = outer.call_vae_using_process(
+                process, param.used_hint_cond, batch_size=batch_size
+            )
+
+        # vram
+        for param in outer.control_params:
+            if getattr(param.control_model, "disable_memory_management", False):
+                continue
+
+            if param.control_model is not None:
+                if (
+                    outer.lowvram
+                    and is_sdxl
+                    and hasattr(param.control_model, "aggressive_lowvram")
+                ):
+                    param.control_model.aggressive_lowvram()
+                elif hasattr(param.control_model, "fullvram"):
+                    param.control_model.fullvram()
+                elif hasattr(param.control_model, "to"):
+                    param.control_model.to(devices.get_device_for("controlnet"))
+
+        # handle prompt token control
+        for param in outer.control_params:
+            if param.guidance_stopped or param.disabled_by_hr_option(
+                self.is_in_high_res_fix
+            ):
+                continue
+
+            if param.control_model_type not in [ControlModelType.T2I_StyleAdapter]:
+                continue
+
+            control = param.control_model(
+                x=x, hint=param.used_hint_cond, timesteps=timesteps, context=context
+            )
+            control = torch.cat([control.clone() for _ in range(batch_size)], dim=0)
+            control *= param.weight
+            control *= cond_mark[:, :, :, 0]
+            context = torch.cat([context, control.clone()], dim=1)
+
+        # handle ControlNet / T2I_Adapter
+        for param_index, param in enumerate(outer.control_params):
+            if param.guidance_stopped or param.disabled_by_hr_option(
+                self.is_in_high_res_fix
+            ):
+                continue
+
+            if not (
+                param.control_model_type.is_controlnet
+                or param.control_model_type == ControlModelType.T2I_Adapter
+            ):
+                continue
+
+            # inpaint model workaround
+            x_in = x
+            control_model = param.control_model.control_model
+
+            if param.control_model_type.is_controlnet:
+                if (
+                    x.shape[1] != control_model.input_blocks[0][0].in_channels
+                    and x.shape[1] == 9
+                ):
+                    # inpaint_model: 4 data + 4 downscaled image + 1 mask
+                    x_in = x[:, :4, ...]
+                    require_inpaint_hijack = True
+
+            assert (
+                param.used_hint_cond is not None
+            ), "Controlnet is enabled but no input image is given"
+
+            hint = param.used_hint_cond
+            if param.control_model_type == ControlModelType.InstantID:
+                assert isinstance(param.control_context_override, ImageEmbed)
+                controlnet_context = param.control_context_override.eval(cond_mark).to(
+                    x.device, dtype=x.dtype
+                )
+            else:
+                controlnet_context = context
+
+            # ControlNet inpaint protocol
+            if hint.shape[1] == 4 and not isinstance(control_model, SparseCtrl):
+                c = hint[:, 0:3, :, :]
+                m = hint[:, 3:4, :, :]
+                m = (m > 0.5).float()
+                hint = c * (1 - m) - m
+
+            control = param.control_model(
+                x=x_in, hint=hint, timesteps=timesteps, context=controlnet_context, y=y
+            )
+
+            control_scales = [param.weight] * (10 if is_sdxl else 13)
+
+            if param.cfg_injection or param.global_average_pooling:
+                if param.control_model_type == ControlModelType.T2I_Adapter:
+                    control = [
+                        torch.cat([c.clone() for _ in range(batch_size)], dim=0)
+                        for c in control
+                    ]
+                control = [c * cond_mark for c in control]
+
+            high_res_fix_forced_soft_injection = False
+
+            if is_in_high_res_fix:
+                if "canny" in param.preprocessor["name"]:
+                    high_res_fix_forced_soft_injection = True
+                if "mlsd" in param.preprocessor["name"]:
+                    high_res_fix_forced_soft_injection = True
+
+            if param.soft_injection or high_res_fix_forced_soft_injection:
+                # important! use the soft weights with high-res fix can significantly reduce artifacts.
+                if param.control_model_type == ControlModelType.T2I_Adapter:
+                    control_scales = [
+                        param.weight * ratio for ratio in (0.25, 0.62, 0.825, 1.0)
+                    ]
+                elif param.control_model_type.is_controlnet:
+                    control_scales = [
+                        param.weight * (0.825 ** float(12 - i)) for i in range(13)
+                    ]
+
+            if is_sdxl and param.control_model_type.is_controlnet:
+                control_scales = control_scales[:10]
+
+            if param.advanced_weighting is not None:
+                logger.info(f"Advanced weighting enabled. {param.advanced_weighting}")
+                if param.soft_injection or high_res_fix_forced_soft_injection:
+                    logger.warn("Advanced weighting overwrites soft_injection effect.")
+                control_scales = param.advanced_weighting
+
+            control = [
+                param.apply_effective_region_mask(c * scale)
+                for c, scale in zip(control, control_scales)
+            ]
+            if param.global_average_pooling:
+                control = [torch.mean(c, dim=(2, 3), keepdim=True) for c in control]
+
+            for idx, item in enumerate(control):
+                target = None
+                if param.control_model_type.is_controlnet:
+                    target = total_controlnet_embedding
+                if param.control_model_type == ControlModelType.T2I_Adapter:
+                    target = total_t2i_adapter_embedding
+                if target is not None:
+                    if batch_option_uint_separate:  # zero the rows that belong to other units
+                        for pi, ci in enumerate(outer.current_c_indices):
+                            if pi % len(outer.control_params) != param_index:
+                                item[ci] = 0
+                        for pi, ci in enumerate(outer.current_uc_indices):
+                            if pi % len(outer.control_params) != param_index:
+                                item[ci] = 0
+                    target[idx] = item + target[idx]
+
+        # Replace x_t to support inpaint models
+        for param in outer.control_params:
+            if not isinstance(param.used_hint_cond, torch.Tensor):
+                continue
+            if param.used_hint_cond.ndim < 2 or param.used_hint_cond.shape[1] != 4:
+                continue
+            if x.shape[1] != 9:
+                continue
+            if param.used_hint_inpaint_hijack is None:
+                mask_pixel = param.used_hint_cond[:, 3:4, :, :]
+                image_pixel = param.used_hint_cond[:, 0:3, :, :]
+                mask_pixel = (mask_pixel > 0.5).to(mask_pixel.dtype)
+                masked_latent = outer.call_vae_using_process(
+                    process, image_pixel, batch_size, mask=mask_pixel
+                )
+                mask_latent = torch.nn.functional.max_pool2d(mask_pixel, (8, 8))
+                if mask_latent.shape[0] != batch_size:
+                    mask_latent = torch.cat(
+                        [mask_latent.clone() for _ in range(batch_size)], dim=0
+                    )
+                # Tensor.to() returns a new tensor, so the converted result must be kept.
+                param.used_hint_inpaint_hijack = torch.cat(
+                    [mask_latent, masked_latent], dim=1
+                ).to(x.dtype).to(x.device)
+            x = torch.cat([x[:, :4, :, :], param.used_hint_inpaint_hijack], dim=1)
+
+        # vram
+        for param in outer.control_params:
+            if param.control_model is not None:
+                if outer.lowvram:
+                    param.control_model.to("cpu")
+
+        # A1111 fix for medvram.
+        if shared.cmd_opts.medvram or (
+            getattr(shared.cmd_opts, "medvram_sdxl", False) and is_sdxl
+        ):
+            try:
+                # Trigger the register_forward_pre_hook
+                outer.sd_ldm.model()
+            except Exception as e:
+                logger.debug("register_forward_pre_hook")
+                logger.debug(e)
+
+        # Clear attention and AdaIn cache
+        for module in outer.attn_module_list:
+            module.bank = []
+            module.style_cfgs = []
+        for module in outer.gn_module_list:
+            module.mean_bank = []
+            module.var_bank = []
+            module.style_cfgs = []
+
+        # Handle attention and AdaIn control
+        for param in outer.control_params:
+            if param.guidance_stopped or param.disabled_by_hr_option(
+                self.is_in_high_res_fix
+            ):
+                continue
+
+            if param.used_hint_cond_latent is None:
+                continue
+
+            if param.control_model_type not in [ControlModelType.AttentionInjection]:
+                continue
+
+            ref_xt = predict_q_sample(
+                outer.sd_ldm,
+                param.used_hint_cond_latent,
+                torch.round(timesteps.float()).long(),
+            )
+
+            # Inpaint Hijack
+            if x.shape[1] == 9:
+                ref_xt = torch.cat(
+                    [
+                        ref_xt,
+                        torch.zeros_like(ref_xt)[:, 0:1, :, :],
+                        param.used_hint_cond_latent,
+                    ],
+                    dim=1,
+                )
+
+            outer.current_style_fidelity = float(param.preprocessor["threshold_a"])
+            outer.current_style_fidelity = max(
+                0.0, min(1.0, outer.current_style_fidelity)
+            )
+
+            if is_sdxl:
+                # sdxl's attention hacking is highly unstable.
+                # We have no other methods but to reduce the style_fidelity a bit.
+                # By default, 0.5 ** 3.0 = 0.125
+                outer.current_style_fidelity = outer.current_style_fidelity ** 3.0
+
+            if param.cfg_injection:
+                outer.current_style_fidelity = 1.0
+            elif param.soft_injection or is_in_high_res_fix:
+                outer.current_style_fidelity = 0.0
+
+            control_name = param.preprocessor["name"]
+
+            if control_name in ["reference_only", "reference_adain+attn"]:
+                outer.attention_auto_machine = AutoMachine.Write
+                outer.attention_auto_machine_weight = param.weight
+
+            if control_name in ["reference_adain", "reference_adain+attn"]:
+                outer.gn_auto_machine = AutoMachine.Write
+                outer.gn_auto_machine_weight = param.weight
+
+            if is_sdxl:
+                outer.original_forward(
+                    x=ref_xt.to(devices.dtype_unet),
+                    timesteps=timesteps.to(devices.dtype_unet),
+                    context=context.to(devices.dtype_unet),
+                    y=y,
+                )
+            else:
+                outer.original_forward(
+                    x=ref_xt.to(devices.dtype_unet),
+                    timesteps=timesteps.to(devices.dtype_unet),
+                    context=context.to(devices.dtype_unet),
+                )
+
+            outer.attention_auto_machine = AutoMachine.Read
+            outer.gn_auto_machine = AutoMachine.Read
+
+        h = onediff_shared.current_unet_graph.graph_module(
+            x,
+            timesteps,
+            context,
+            y,
+            total_t2i_adapter_embedding,
+            total_controlnet_embedding,
+            is_sdxl,
+            require_inpaint_hijack,
+        )
+
+        # Post-processing for color fix
+        for param in outer.control_params:
+            if param.used_hint_cond_latent is None:
+                continue
+            if "colorfix" not in param.preprocessor["name"]:
+                continue
+
+            k = int(param.preprocessor["threshold_a"])
+            if is_in_high_res_fix and not param.disabled_by_hr_option(
+                self.is_in_high_res_fix
+            ):
+                k *= 2
+
+            # Inpaint hijack
+            xt = x[:, :4, :, :]
+
+            x0_origin = param.used_hint_cond_latent
+            t = torch.round(timesteps.float()).long()
+            x0_prd = predict_start_from_noise(outer.sd_ldm, xt, t, h)
+            x0 = x0_prd - blur(x0_prd, k) + blur(x0_origin, k)
+
+            if "+sharp" in param.preprocessor["name"]:
+                detail_weight = float(param.preprocessor["threshold_b"]) * 0.01
+                neg = detail_weight * blur(x0, k) + (1 - detail_weight) * x0
+                x0 = cond_mark * x0 + (1 - cond_mark) * neg
+
+            eps_prd = predict_noise_from_start(outer.sd_ldm, xt, t, x0)
+
+            w = max(0.0, min(1.0, float(param.weight)))
+            h = eps_prd * w + h * (1 - w)
+
+        # Post-processing for restore
+        for param in outer.control_params:
+            if param.used_hint_cond_latent is None:
+                continue
+            if "inpaint_only" not in param.preprocessor["name"]:
+                continue
+            if param.used_hint_cond.shape[1] != 4:
+                continue
+
+            # Inpaint hijack
+            xt = x[:, :4, :, :]
+
+            mask = param.used_hint_cond[:, 3:4, :, :]
+            mask = torch.nn.functional.max_pool2d(
+                mask, (10, 10), stride=(8, 8), padding=1
+            )
+
+            x0_origin = param.used_hint_cond_latent
+            t = torch.round(timesteps.float()).long()
+            x0_prd = predict_start_from_noise(outer.sd_ldm, xt, t, h)
+            x0 = x0_prd * mask + x0_origin * (1 - mask)
+            eps_prd = predict_noise_from_start(outer.sd_ldm, xt, t, x0)
+
+            w = max(0.0, min(1.0, float(param.weight)))
+            h = eps_prd * w + h * (1 - w)
+
+        return h
+
+    def move_all_control_model_to_cpu():  # tolerates a missing or None ``control_params``
+        for unit in getattr(outer, "control_params", None) or ():
+            if isinstance(unit.control_model, torch.nn.Module):
+                unit.control_model.to("cpu")
+
+    def forward_webui(*args, **kwargs):
+        forward_func = None  # stays None when no instance-level ``forward`` override exists
+        if "forward" in onediff_shared.current_unet_graph.graph_module._torch_module.__dict__:
+            forward_func = onediff_shared.current_unet_graph.graph_module._torch_module.__dict__.pop("forward")
+            _original_forward_func = onediff_shared.current_unet_graph.graph_module._torch_module.__dict__.pop("_original_forward")
+        # The popped instance-level hooks are put back in ``finally``; webui will handle other components.
+        try:
+            if shared.cmd_opts.lowvram:
+                lowvram.send_everything_to_cpu()
+            return forward(*args, **kwargs)
+        except Exception:
+            move_all_control_model_to_cpu()
+            raise  # bare raise re-raises the active exception without adding a frame
+        finally:
+            if outer.lowvram:
+                move_all_control_model_to_cpu()
+            if forward_func is not None:
+                onediff_shared.current_unet_graph.graph_module._torch_module.forward = forward_func
+                onediff_shared.current_unet_graph.graph_module._torch_module._original_forward = _original_forward_func
+
+    def hacked_basic_transformer_inner_forward(self, x, context=None):
+        """Transformer-block ``_forward`` replacement adding reference (Write/Read) and StyleAlign self-attention."""
+        x_norm1 = self.norm1(x)
+        self_attn1 = None  # left as None until one of the branches below produces a result
+        if self.disable_self_attn:
+            # Do not use self-attention
+            self_attn1 = self.attn1(x_norm1, context=context)
+        else:
+            # Use self-attention
+            self_attention_context = x_norm1
+            if outer.attention_auto_machine == AutoMachine.Write:
+                # Write pass: bank a detached copy of the normalised input plus the current fidelity.
+                if outer.attention_auto_machine_weight > self.attn_weight:
+                    self.bank.append(self_attention_context.detach().clone())
+                    self.style_cfgs.append(outer.current_style_fidelity)
+            if outer.attention_auto_machine == AutoMachine.Read:
+                if len(self.bank) > 0:
+                    style_cfg = sum(self.style_cfgs) / float(len(self.style_cfgs))  # mean banked fidelity
+                    # Attend over this block's own tokens concatenated (dim 1) with every banked entry.
+                    self_attn1_uc = self.attn1(
+                        x_norm1,
+                        context=torch.cat([self_attention_context] + self.bank, dim=1),
+                    )
+                    self_attn1_c = self_attn1_uc.clone()
+                    if len(outer.current_uc_indices) > 0 and style_cfg > 1e-5:
+                        # Rows listed in current_uc_indices are recomputed without the banked context.
+                        self_attn1_c[outer.current_uc_indices] = self.attn1(
+                            x_norm1[outer.current_uc_indices],
+                            context=self_attention_context[outer.current_uc_indices],
+                        )
+                    # Linear blend of the two results, weighted by style_cfg.
+                    self_attn1 = (
+                        style_cfg * self_attn1_c + (1.0 - style_cfg) * self_attn1_uc
+                    )
+                # A Read pass always empties the bank, whether or not it held anything.
+                self.bank = []
+                self.style_cfgs = []
+            if (
+                outer.attention_auto_machine == AutoMachine.StyleAlign
+                and not outer.is_in_high_res_fix
+            ):
+                # very VRAM hungry - disable at high_res_fix
+                def shared_attn1(inner_x):
+                    # Fold the leading dim into the second one so attn1 sees a single sequence, then unfold.
+                    BB, FF, CC = inner_x.shape
+                    return self.attn1(inner_x.reshape(1, BB * FF, CC)).reshape(
+                        BB, FF, CC
+                    )
+
+                # The uc rows and the c rows are each attended as one shared group.
+                uc_layer = shared_attn1(x_norm1[outer.current_uc_indices])
+                c_layer = shared_attn1(x_norm1[outer.current_c_indices])
+                self_attn1 = torch.zeros_like(x_norm1).to(uc_layer)
+                self_attn1[outer.current_uc_indices] = uc_layer
+                self_attn1[outer.current_c_indices] = c_layer
+                del uc_layer, c_layer
+            if self_attn1 is None:
+                # No hijack branch produced a result: plain self-attention.
+                self_attn1 = self.attn1(x_norm1, context=self_attention_context)
+        # Residual connections around self-attention, attn2 (with ``context``) and the feed-forward.
+        x = self_attn1.to(x.dtype) + x
+        x = self.attn2(self.norm2(x), context=context) + x
+        x = self.ff(self.norm3(x)) + x
+        return x
+
+    def hacked_group_norm_forward(self, *args, **kwargs):
+        """Run the wrapped forward, then record (Write) or re-apply (Read) banked mean/variance statistics."""
+        floor = 1e-6
+        x = self.original_forward_cn_hijack(*args, **kwargs)
+        out = x  # returned as-is unless the Read pass below replaces it
+        mode = outer.gn_auto_machine
+        if mode == AutoMachine.Write and outer.gn_auto_machine_weight > self.gn_weight:
+            ref_var, ref_mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+            self.mean_bank.append(ref_mean)
+            self.var_bank.append(ref_var)
+            self.style_cfgs.append(outer.current_style_fidelity)
+        if mode == AutoMachine.Read:
+            if self.mean_bank and self.var_bank:
+                fidelity = sum(self.style_cfgs) / float(len(self.style_cfgs))
+                cur_var, cur_mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+                cur_std = torch.maximum(cur_var, torch.zeros_like(cur_var) + floor) ** 0.5
+                bank_mean = sum(self.mean_bank) / float(len(self.mean_bank))
+                bank_var = sum(self.var_bank) / float(len(self.var_bank))
+                bank_std = torch.maximum(bank_var, torch.zeros_like(bank_var) + floor) ** 0.5
+                restyled = (((x - cur_mean) / cur_std) * bank_std) + bank_mean
+                guided = restyled.clone()
+                uc_rows = outer.current_uc_indices
+                if len(uc_rows) > 0 and fidelity > 1e-5:
+                    # Rows in current_uc_indices keep the un-restyled activations.
+                    guided[uc_rows] = x.to(guided.dtype)[uc_rows]
+                out = fidelity * guided + (1.0 - fidelity) * restyled
+            # A Read pass always empties the banks, even when they held nothing.
+            self.mean_bank = []
+            self.var_bank = []
+            self.style_cfgs = []
+        return out.to(x.dtype)
+
+    if getattr(process, "sample_before_CN_hack", None) is None:
+        process.sample_before_CN_hack = process.sample
+    process.sample = process_sample
+
+    model._original_forward = model.forward
+    outer.original_forward = model.forward
+    model.forward = forward_webui.__get__(model, UNetModel)
+
+    if model_is_sdxl:
+        register_schedule(sd_ldm)
+        outer.revision_q_sampler = AbstractLowScaleModel()
+
+    need_attention_hijack = False
+
+    for param in outer.control_params:
+        if param.control_model_type in [ControlModelType.AttentionInjection]:
+            need_attention_hijack = True
+
+    if batch_option_style_align:
+        need_attention_hijack = True
+        outer.attention_auto_machine = AutoMachine.StyleAlign
+        outer.gn_auto_machine = AutoMachine.StyleAlign
+
+    all_modules = torch_dfs(model)
+
+    if need_attention_hijack:
+        attn_modules = [
+            module
+            for module in all_modules
+            if isinstance(module, BasicTransformerBlock)
+            or isinstance(module, BasicTransformerBlockSGM)
+        ]
+        attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
+
+        for i, module in enumerate(attn_modules):
+            if getattr(module, "_original_inner_forward_cn_hijack", None) is None:
+                module._original_inner_forward_cn_hijack = module._forward
+            module._forward = hacked_basic_transformer_inner_forward.__get__(
+                module, BasicTransformerBlock
+            )
+            module.bank = []
+            module.style_cfgs = []
+            module.attn_weight = float(i) / float(len(attn_modules))
+
+        gn_modules = [model.middle_block]
+        model.middle_block.gn_weight = 0
+
+        if model_is_sdxl:
+            input_block_indices = [4, 5, 7, 8]
+            output_block_indices = [0, 1, 2, 3, 4, 5]
+        else:
+            input_block_indices = [4, 5, 7, 8, 10, 11]
+            output_block_indices = [0, 1, 2, 3, 4, 5, 6, 7]
+
+        for w, i in enumerate(input_block_indices):
+            module = model.input_blocks[i]
+            module.gn_weight = 1.0 - float(w) / float(len(input_block_indices))
+            gn_modules.append(module)
+
+        for w, i in enumerate(output_block_indices):
+            module = model.output_blocks[i]
+            module.gn_weight = float(w) / float(len(output_block_indices))
+            gn_modules.append(module)
+
+        for i, module in enumerate(gn_modules):
+            if getattr(module, "original_forward_cn_hijack", None) is None:
+                module.original_forward_cn_hijack = module.forward
+            module.forward = hacked_group_norm_forward.__get__(module, torch.nn.Module)
+            module.mean_bank = []
+            module.var_bank = []
+            module.style_cfgs = []
+            module.gn_weight *= 2
+
+        outer.attn_module_list = attn_modules
+        outer.gn_module_list = gn_modules
+    else:
+        for module in all_modules:
+            _original_inner_forward_cn_hijack = getattr(
+                module, "_original_inner_forward_cn_hijack", None
+            )
+            original_forward_cn_hijack = getattr(
+                module, "original_forward_cn_hijack", None
+            )
+            if _original_inner_forward_cn_hijack is not None:
+                module._forward = _original_inner_forward_cn_hijack
+            if original_forward_cn_hijack is not None:
+                module.forward = original_forward_cn_hijack
+        outer.attn_module_list = []
+        outer.gn_module_list = []
+
+    scripts.script_callbacks.on_cfg_denoiser(self.guidance_schedule_handler)
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
index 8d9e4cf15..e06a51b24 100644
--- a/onediff_sd_webui_extensions/onediff_shared.py
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -9,3 +9,7 @@
     "is_ssd": False,
 }
 onediff_enabled = False
+
+# controlnet
+controlnet_compiled = False
+current_is_controlnet = False
diff --git a/onediff_sd_webui_extensions/ui_utils.py b/onediff_sd_webui_extensions/onediff_utils.py
similarity index 89%
rename from onediff_sd_webui_extensions/ui_utils.py
rename to onediff_sd_webui_extensions/onediff_utils.py
index bdb875a38..441bcdfc7 100644
--- a/onediff_sd_webui_extensions/ui_utils.py
+++ b/onediff_sd_webui_extensions/onediff_utils.py
@@ -1,4 +1,5 @@
 import os
+from functools import wraps
 from contextlib import contextmanager
 from pathlib import Path
 from textwrap import dedent
@@ -119,10 +120,22 @@ def save_graph(compiled_unet: DeployableModule, saved_cache_name: str = ""):
         compiled_unet.save_graph(saved_cache_name)
 
 
-@contextmanager
-def onediff_enabled():
-    onediff_shared.onediff_enabled = True
-    try:
-        yield
-    finally:
-        onediff_shared.onediff_enabled = False
+def onediff_enabled_decorator(func):
+    @wraps(func)  # expose func's own name and docstring on the wrapper
+    def enabled_call(*args, **kwargs):
+        onediff_shared.onediff_enabled = True  # flag stays set for the whole call
+        try:
+            return func(*args, **kwargs)
+        finally:
+            onediff_shared.onediff_enabled = False  # always reset to False, also on exceptions
+    return enabled_call
+
+
+def singleton_decorator(func):
+    has_been_called = False  # closure flag, flipped on the first invocation
+    def wrapper(*args, **kwargs):
+        nonlocal has_been_called
+        if not has_been_called:  # every later call falls through and returns None
+            has_been_called = True  # set before calling: a failing first call is not retried
+            return func(*args, **kwargs)
+    return wraps(func)(wrapper)  # keep func's __name__/__doc__, like onediff_enabled_decorator
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index 0561469d8..5e7f23513 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -13,14 +13,15 @@
 from modules.ui_common import create_refresh_button
 from onediff_hijack import do_hijack as onediff_do_hijack
 from onediff_lora import HijackLoraActivate
-from ui_utils import (
+import onediff_controlnet
+from onediff_utils import (
     check_structure_change_and_update,
     get_all_compiler_caches,
     hints_message,
     load_graph,
-    onediff_enabled,
     refresh_all_compiler_caches,
     save_graph,
+    onediff_enabled_decorator,
 )
 
 from onediff.optimization.quant_optimizer import varify_can_use_quantization
@@ -34,19 +35,25 @@ class UnetCompileCtx(object):
     The global variables need to be replaced with compiled_unet before process_images is run,
     and then the original model restored so that subsequent reasoning with onediff disabled meets expectations.
     """
+    def __init__(self, enabled):
+        self.enabled = enabled
 
     def __enter__(self):
+        if not self.enabled:
+            return
         self._original_model = shared.sd_model.model.diffusion_model
         shared.sd_model.model.diffusion_model = (
             onediff_shared.current_unet_graph.graph_module
         )
 
     def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.enabled:
+            return
         shared.sd_model.model.diffusion_model = self._original_model
-        return False
 
 
 class Script(scripts.Script):
+
     def title(self):
         return "onediff_diffusion_model"
 
@@ -85,6 +92,7 @@ def ui(self, is_img2img):
     def show(self, is_img2img):
         return True
 
+    @onediff_enabled_decorator
     def run(
         self,
         p,
@@ -93,6 +101,10 @@ def run(
         saved_cache_name="",
         always_recompile=False,
     ):
+        controlnet_enabled = onediff_controlnet.check_if_controlnet_enabled(p)
+        if controlnet_enabled:
+            onediff_controlnet.create_condfunc(p)
+
         # restore checkpoint_info from refiner to base model if necessary
         if (
             sd_models.checkpoint_aliases.get(
@@ -116,27 +128,39 @@ def run(
         quantization_changed = (
             quantization != onediff_shared.current_unet_graph.quantized
         )
+        controlnet_enabled_status_changed = (
+            controlnet_enabled != onediff_shared.current_is_controlnet
+        )
         need_recompile = (
             (
                 quantization and ckpt_changed
             )  # always recompile when switching ckpt with 'int8 speed model' enabled
             or structure_changed  # always recompile when switching model to another structure
             or quantization_changed  # always recompile when switching model from non-quantized to quantized (and vice versa)
+            or controlnet_enabled_status_changed
             or always_recompile
         )
         if need_recompile:
-            onediff_shared.current_unet_graph = get_compiled_graph(
-                shared.sd_model, quantization
-            )
-            load_graph(onediff_shared.current_unet_graph, compiler_cache)
+            if not controlnet_enabled:
+                onediff_shared.current_unet_graph = get_compiled_graph(
+                    shared.sd_model, quantization
+                )
+                load_graph(onediff_shared.current_unet_graph, compiler_cache)
         else:
             logger.info(
                 f"Model {current_checkpoint_name} has same sd type of graph type {onediff_shared.current_unet_type}, skip compile"
             )
 
-        with UnetCompileCtx(), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate(), onediff_enabled():
+        with UnetCompileCtx(not controlnet_enabled), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate():
             proc = process_images(p)
         save_graph(onediff_shared.current_unet_graph, saved_cache_name)
+
+        if controlnet_enabled:
+            onediff_shared.current_is_controlnet = True
+        else:
+            onediff_shared.controlnet_compiled = False
+            onediff_shared.current_is_controlnet = False
+
         torch_gc()
         flow.cuda.empty_cache()
         return proc

From b66fed59c4d7bc6bce49a3d0be6d50136633788e Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Mon, 17 Jun 2024 18:30:43 +0800
Subject: [PATCH 10/14] refine

---
 .../compile/compile_utils.py                  |   4 +-
 .../compile/sd_webui_onediff_utils.py         |   4 +-
 .../onediff_controlnet.py                     | 113 ++++++++++++++----
 onediff_sd_webui_extensions/onediff_shared.py |   3 +-
 onediff_sd_webui_extensions/onediff_utils.py  |  13 +-
 .../scripts/onediff.py                        |  26 ++--
 tests/sd-webui/test_api.py                    |   1 -
 tests/sd-webui/utils.py                       |   1 -
 8 files changed, 113 insertions(+), 52 deletions(-)

diff --git a/onediff_sd_webui_extensions/compile/compile_utils.py b/onediff_sd_webui_extensions/compile/compile_utils.py
index 451fc26ba..d79278be2 100644
--- a/onediff_sd_webui_extensions/compile/compile_utils.py
+++ b/onediff_sd_webui_extensions/compile/compile_utils.py
@@ -65,7 +65,5 @@ def get_compiled_graph(sd_model, quantization) -> OneDiffCompiledGraph:
     # for controlnet
     if "forward" in diffusion_model.__dict__:
         diffusion_model.__dict__.pop("forward")
-    compiled_unet = compile_unet(
-        diffusion_model, quantization=quantization
-    )
+    compiled_unet = compile_unet(diffusion_model, quantization=quantization)
     return OneDiffCompiledGraph(sd_model, compiled_unet, quantization)
diff --git a/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py b/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py
index 93aad2f49..c77f5c3d1 100644
--- a/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py
+++ b/onediff_sd_webui_extensions/compile/sd_webui_onediff_utils.py
@@ -26,7 +26,9 @@ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
         if dim % 2:
             embedding = flow.cat([embedding, flow.zeros_like(embedding[:, :1])], dim=-1)
     else:
-        raise NotImplementedError("repeat_only=True is not implemented in timestep_embedding")
+        raise NotImplementedError(
+            "repeat_only=True is not implemented in timestep_embedding"
+        )
     return embedding
 
 
diff --git a/onediff_sd_webui_extensions/onediff_controlnet.py b/onediff_sd_webui_extensions/onediff_controlnet.py
index 6e3899a7d..fa25c8fbb 100644
--- a/onediff_sd_webui_extensions/onediff_controlnet.py
+++ b/onediff_sd_webui_extensions/onediff_controlnet.py
@@ -1,11 +1,15 @@
+from functools import wraps
+
 import onediff_shared
 import oneflow as flow
 import torch
 import torch as th
 from compile import OneDiffCompiledGraph
-from compile.sd_webui_onediff_utils import (CrossAttentionOflow,
-                                            GroupNorm32Oflow,
-                                            timestep_embedding)
+from compile.sd_webui_onediff_utils import (
+    CrossAttentionOflow,
+    GroupNorm32Oflow,
+    timestep_embedding,
+)
 from ldm.modules.attention import BasicTransformerBlock, CrossAttention
 from ldm.modules.diffusionmodules.openaimodel import ResBlock, UNetModel
 from ldm.modules.diffusionmodules.util import GroupNorm32
@@ -14,10 +18,10 @@
 from onediff_utils import singleton_decorator
 
 from onediff.infer_compiler import oneflow_compile
-from onediff.infer_compiler.backends.oneflow.transform import (proxy_class,
-                                                               register)
+from onediff.infer_compiler.backends.oneflow.transform import proxy_class, register
 
 
+# https://github.com/Mikubill/sd-webui-controlnet/blob/8bbbd0e55ef6e5d71b09c2de2727b36e7bc825b0/scripts/hook.py#L238
 def torch_aligned_adding(base, x, require_channel_alignment):
     if isinstance(x, float):
         if x == 0.0:
@@ -41,8 +45,12 @@ def torch_aligned_adding(base, x, require_channel_alignment):
     return base + x
 
 
+# Due to the tracing mechanism in OneFlow, it's crucial to ensure that
+# the same conditional branches are taken during the first run as in subsequent runs.
+# Therefore, certain "optimizations" have been modified.
 def oneflow_aligned_adding(base, x, require_channel_alignment):
     if isinstance(x, float):
+        # remove `if x == 0.0: return base` here
         return base + x
 
     if require_channel_alignment:
@@ -226,13 +234,31 @@ def forward(
         return h
 
 
+def onediff_controlnet_decorator(func):
+    @wraps(func)
+    def wrapper(self, p, *arg, **kwargs):
+        try:
+            onediff_shared.controlnet_enabled = check_if_controlnet_enabled(p)
+            if onediff_shared.controlnet_enabled:
+                hijack_controlnet_extension(p)
+            return func(self, p, *arg, **kwargs)
+        finally:
+            if onediff_shared.controlnet_enabled:
+                onediff_shared.previous_is_controlnet = True
+            else:
+                onediff_shared.controlnet_compiled = False
+                onediff_shared.previous_is_controlnet = False
+
+    return wrapper
+
+
 def compile_controlnet_ldm_unet(sd_model, unet_model, *, options=None):
     for module in unet_model.modules():
         if isinstance(module, BasicTransformerBlock):
             module.checkpoint = False
         if isinstance(module, ResBlock):
             module.use_checkpoint = False
-    # return oneflow_compile(unet_model, options=options)
+    # TODO: refine here
     compiled_model = oneflow_compile(unet_model, options=options)
     compiled_graph = OneDiffCompiledGraph(sd_model, compiled_model)
     compiled_graph.eager_module = unet_model
@@ -260,8 +286,6 @@ def hijacked_main_entry(self, p):
     unet = sd_ldm.model.diffusion_model
 
     if onediff_shared.controlnet_compiled is False:
-    # if not getattr(self, "compiled", False):
-        from onediff_controlnet import TorchOnediffControlNetModel
         onediff_model = TorchOnediffControlNetModel(unet)
         onediff_shared.current_unet_graph = compile_controlnet_ldm_unet(
             sd_ldm, onediff_model
@@ -271,8 +295,6 @@ def hijacked_main_entry(self, p):
         pass
 
 
-
-
 def get_controlnet_script(p):
     for script in p.scripts.scripts:
         if script.__module__ == "controlnet.py":
@@ -282,15 +304,21 @@ def get_controlnet_script(p):
 
 def check_if_controlnet_enabled(p):
     controlnet_script_class = get_controlnet_script(p)
-    if controlnet_script_class is None:
-        return False
-    return len(controlnet_script_class.get_enabled_units(p)) != 0
+    return (
+        controlnet_script_class is not None
+        and len(controlnet_script_class.get_enabled_units(p)) != 0
+    )
 
 
+# When OneDiff is initializing, the controlnet extension has not yet been loaded.
+# Therefore, this function should be called during image generation
+# rather than during the initialization of OneDiff.
 @singleton_decorator
-def create_condfunc(p):
+def hijack_controlnet_extension(p):
     CondFunc(
-        "scripts.hook.UnetHook.hook", hijacked_hook, lambda _, *arg, **kwargs: True
+        "scripts.hook.UnetHook.hook",
+        hijacked_controlnet_hook,
+        lambda _, *arg, **kwargs: onediff_shared.onediff_enabled,
     )
     # get controlnet script
     controlnet_script = get_controlnet_script(p)
@@ -305,8 +333,18 @@ def create_condfunc(p):
     )
 
 
+# We originally intended to hack only the closure function `forward`
+# in the member function `hook` of the UnetHook class in the ControlNet extension.
+# However, due to certain limitations, we were unable to hack only
+# the closure function `forward` within the `hook` method.
+# So we have to hack the entire member function `hook` in the `UnetHook` class.
 
-def hijacked_hook(
+# The function largely retains its original content,
+# with modifications specifically made within the `forward` function.
+# To identify the altered parts, you can search for the tag "modified by OneDiff".
+
+# https://github.com/Mikubill/sd-webui-controlnet/blob/8bbbd0e55ef6e5d71b09c2de2727b36e7bc825b0/scripts/hook.py#L442
+def hijacked_controlnet_hook(
     orig_func,
     self,
     model,
@@ -319,10 +357,17 @@ def hijacked_hook(
     from modules import devices, lowvram, scripts, shared
     from scripts.controlnet_sparsectrl import SparseCtrl
     from scripts.enums import AutoMachine, ControlModelType, HiResFixOption
-    from scripts.hook import (AbstractLowScaleModel, blur, mark_prompt_context,
-                              predict_noise_from_start, predict_q_sample,
-                              predict_start_from_noise, register_schedule,
-                              torch_dfs, unmark_prompt_context)
+    from scripts.hook import (
+        AbstractLowScaleModel,
+        blur,
+        mark_prompt_context,
+        predict_noise_from_start,
+        predict_q_sample,
+        predict_start_from_noise,
+        register_schedule,
+        torch_dfs,
+        unmark_prompt_context,
+    )
     from scripts.ipadapter.ipadapter_model import ImageEmbed
     from scripts.logging import logger
 
@@ -731,6 +776,7 @@ def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
             outer.attention_auto_machine = AutoMachine.Read
             outer.gn_auto_machine = AutoMachine.Read
 
+        # modified by OneDiff
         h = onediff_shared.current_unet_graph.graph_module(
             x,
             timesteps,
@@ -807,10 +853,20 @@ def move_all_control_model_to_cpu():
                 param.control_model.to("cpu")
 
     def forward_webui(*args, **kwargs):
+        # ------ modified by OneDiff below ------
         forward_func = None
-        if "forward" in onediff_shared.current_unet_graph.graph_module._torch_module.__dict__:
-            forward_func = onediff_shared.current_unet_graph.graph_module._torch_module.__dict__.pop("forward")
-            _original_forward_func = onediff_shared.current_unet_graph.graph_module._torch_module.__dict__.pop("_original_forward")
+        if (
+            "forward"
+            in onediff_shared.current_unet_graph.graph_module._torch_module.__dict__
+        ):
+            forward_func = onediff_shared.current_unet_graph.graph_module._torch_module.__dict__.pop(
+                "forward"
+            )
+            _original_forward_func = onediff_shared.current_unet_graph.graph_module._torch_module.__dict__.pop(
+                "_original_forward"
+            )
+        # ------ modified by OneDiff above ------
+
         # webui will handle other compoments
         try:
             if shared.cmd_opts.lowvram:
@@ -822,9 +878,16 @@ def forward_webui(*args, **kwargs):
         finally:
             if outer.lowvram:
                 move_all_control_model_to_cpu()
+
+            # ------ modified by OneDiff below ------
             if forward_func is not None:
-                onediff_shared.current_unet_graph.graph_module._torch_module.forward = forward_func
-                onediff_shared.current_unet_graph.graph_module._torch_module._original_forward = _original_forward_func
+                onediff_shared.current_unet_graph.graph_module._torch_module.forward = (
+                    forward_func
+                )
+                onediff_shared.current_unet_graph.graph_module._torch_module._original_forward = (
+                    _original_forward_func
+                )
+            # ------ modified by OneDiff above ------
 
     def hacked_basic_transformer_inner_forward(self, x, context=None):
         x_norm1 = self.norm1(x)
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
index e06a51b24..75da3d953 100644
--- a/onediff_sd_webui_extensions/onediff_shared.py
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -11,5 +11,6 @@
 onediff_enabled = False
 
 # controlnet
+controlnet_enabled = False
 controlnet_compiled = False
-current_is_controlnet = False
+previous_is_controlnet = False
diff --git a/onediff_sd_webui_extensions/onediff_utils.py b/onediff_sd_webui_extensions/onediff_utils.py
index 441bcdfc7..1ea53bebe 100644
--- a/onediff_sd_webui_extensions/onediff_utils.py
+++ b/onediff_sd_webui_extensions/onediff_utils.py
@@ -1,11 +1,13 @@
 import os
-from functools import wraps
 from contextlib import contextmanager
+from functools import wraps
 from pathlib import Path
 from textwrap import dedent
 from zipfile import BadZipFile
 
 import onediff_shared
+import oneflow as flow
+from modules.devices import torch_gc
 
 from onediff.infer_compiler import DeployableModule
 
@@ -122,20 +124,25 @@ def save_graph(compiled_unet: DeployableModule, saved_cache_name: str = ""):
 
 def onediff_enabled_decorator(func):
     @wraps(func)
-    def wrapper(*arg, **kwargs):
+    def wrapper(self, p, *arg, **kwargs):
         onediff_shared.onediff_enabled = True
         try:
-            return func(*arg, **kwargs)
+            return func(self, p, *arg, **kwargs)
         finally:
             onediff_shared.onediff_enabled = False
+            torch_gc()
+            flow.cuda.empty_cache()
+
     return wrapper
 
 
 def singleton_decorator(func):
     has_been_called = False
+
     def wrapper(*args, **kwargs):
         nonlocal has_been_called
         if not has_been_called:
             has_been_called = True
             return func(*args, **kwargs)
+
     return wrapper
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index 5aa866cfe..180b31040 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -4,6 +4,7 @@
 import modules.scripts as scripts
 import modules.sd_models as sd_models
 import modules.shared as shared
+import onediff_controlnet
 import onediff_shared
 import oneflow as flow
 from compile import SD21CompileCtx, VaeCompileCtx, get_compiled_graph
@@ -13,15 +14,14 @@
 from modules.ui_common import create_refresh_button
 from onediff_hijack import do_hijack as onediff_do_hijack
 from onediff_lora import HijackLoraActivate
-import onediff_controlnet
 from onediff_utils import (
     check_structure_change_and_update,
     get_all_compiler_caches,
     hints_message,
     load_graph,
+    onediff_enabled_decorator,
     refresh_all_compiler_caches,
     save_graph,
-    onediff_enabled_decorator,
 )
 
 from onediff.optimization.quant_optimizer import varify_can_use_quantization
@@ -35,6 +35,7 @@ class UnetCompileCtx(object):
     The global variables need to be replaced with compiled_unet before process_images is run,
     and then the original model restored so that subsequent reasoning with onediff disabled meets expectations.
     """
+
     def __init__(self, enabled):
         self.enabled = enabled
 
@@ -92,6 +93,7 @@ def show(self, is_img2img):
         return True
 
     @onediff_enabled_decorator
+    @onediff_controlnet.onediff_controlnet_decorator
     def run(
         self,
         p,
@@ -100,10 +102,6 @@ def run(
         saved_cache_name="",
         always_recompile=False,
     ):
-        controlnet_enabled = onediff_controlnet.check_if_controlnet_enabled(p)
-        if controlnet_enabled:
-            onediff_controlnet.create_condfunc(p)
-
         # restore checkpoint_info from refiner to base model if necessary
         if (
             sd_models.checkpoint_aliases.get(
@@ -128,7 +126,7 @@ def run(
             quantization != onediff_shared.current_unet_graph.quantized
         )
         controlnet_enabled_status_changed = (
-            controlnet_enabled != onediff_shared.current_is_controlnet
+            onediff_shared.controlnet_enabled != onediff_shared.previous_is_controlnet
         )
         need_recompile = (
             (
@@ -140,7 +138,7 @@ def run(
             or always_recompile
         )
         if need_recompile:
-            if not controlnet_enabled:
+            if not onediff_shared.controlnet_enabled:
                 onediff_shared.current_unet_graph = get_compiled_graph(
                     shared.sd_model, quantization
                 )
@@ -150,18 +148,12 @@ def run(
                 f"Model {current_checkpoint_name} has same sd type of graph type {onediff_shared.current_unet_type}, skip compile"
             )
 
-        with UnetCompileCtx(not controlnet_enabled), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate():
+        with UnetCompileCtx(
+            not onediff_shared.controlnet_enabled
+        ), VaeCompileCtx(), SD21CompileCtx(), HijackLoraActivate():
             proc = process_images(p)
         save_graph(onediff_shared.current_unet_graph, saved_cache_name)
 
-        if controlnet_enabled:
-            onediff_shared.current_is_controlnet = True
-        else:
-            onediff_shared.controlnet_compiled = False
-            onediff_shared.current_is_controlnet = False
-
-        torch_gc()
-        flow.cuda.empty_cache()
         return proc
 
 
diff --git a/tests/sd-webui/test_api.py b/tests/sd-webui/test_api.py
index accd2036d..fa7550abe 100644
--- a/tests/sd-webui/test_api.py
+++ b/tests/sd-webui/test_api.py
@@ -21,7 +21,6 @@
     get_threshold,
 )
 
-THRESHOLD = 0.97
 
 @pytest.fixture(scope="session", autouse=True)
 def change_model():
diff --git a/tests/sd-webui/utils.py b/tests/sd-webui/utils.py
index 658829571..3a1bbaedd 100644
--- a/tests/sd-webui/utils.py
+++ b/tests/sd-webui/utils.py
@@ -30,7 +30,6 @@ def get_base_args() -> Dict[str, Any]:
     return {
         "prompt": "1girl",
         "negative_prompt": "",
-        "sd_model_checkpoint": "checkpoints/AWPainting_v1.2.safetensors",
         "seed": SEED,
         "steps": NUM_STEPS,
         "width": WIDTH,

From e35667280fc2bb488055258f37fabd110b17197e Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Tue, 18 Jun 2024 11:23:49 +0800
Subject: [PATCH 11/14] support recompile when switching model

---
 onediff_sd_webui_extensions/onediff_controlnet.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/onediff_sd_webui_extensions/onediff_controlnet.py b/onediff_sd_webui_extensions/onediff_controlnet.py
index fa25c8fbb..13b8100a2 100644
--- a/onediff_sd_webui_extensions/onediff_controlnet.py
+++ b/onediff_sd_webui_extensions/onediff_controlnet.py
@@ -1,6 +1,7 @@
 from functools import wraps
 
 import onediff_shared
+from onediff_utils import check_structure_change_and_update
 import oneflow as flow
 import torch
 import torch as th
@@ -285,7 +286,10 @@ def hijacked_main_entry(self, p):
     sd_ldm = p.sd_model
     unet = sd_ldm.model.diffusion_model
 
-    if onediff_shared.controlnet_compiled is False:
+    structure_changed = check_structure_change_and_update(
+        onediff_shared.current_unet_type, sd_ldm
+    )
+    if onediff_shared.controlnet_compiled is False or structure_changed:
         onediff_model = TorchOnediffControlNetModel(unet)
         onediff_shared.current_unet_graph = compile_controlnet_ldm_unet(
             sd_ldm, onediff_model

From b47e8fe0bdec97ffd336c5e4f378116582054159 Mon Sep 17 00:00:00 2001
From: Wang Yi <53533850+marigoold@users.noreply.github.com>
Date: Fri, 21 Jun 2024 13:02:38 +0800
Subject: [PATCH 12/14] Update test_api.py

---
 tests/sd-webui/test_api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/sd-webui/test_api.py b/tests/sd-webui/test_api.py
index fa7550abe..00e74f9c2 100644
--- a/tests/sd-webui/test_api.py
+++ b/tests/sd-webui/test_api.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 import numpy as np
 import pytest
 from PIL import Image

From 16a38a2e1d6c491af8467bf90c185bd8fee47110 Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Fri, 21 Jun 2024 15:13:41 +0800
Subject: [PATCH 13/14] fix bug of sdxl

---
 onediff_sd_webui_extensions/onediff_controlnet.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/onediff_sd_webui_extensions/onediff_controlnet.py b/onediff_sd_webui_extensions/onediff_controlnet.py
index 13b8100a2..c444c2349 100644
--- a/onediff_sd_webui_extensions/onediff_controlnet.py
+++ b/onediff_sd_webui_extensions/onediff_controlnet.py
@@ -254,10 +254,14 @@ def wrapper(self, p, *arg, **kwargs):
 
 
 def compile_controlnet_ldm_unet(sd_model, unet_model, *, options=None):
+    from sgm.modules.attention import BasicTransformerBlock as BasicTransformerBlockSGM
+    from ldm.modules.attention import BasicTransformerBlock as BasicTransformerBlockLDM
+    from sgm.modules.diffusionmodules.openaimodel import ResBlock as ResBlockSGM
+    from ldm.modules.diffusionmodules.openaimodel import ResBlock as ResBlockLDM
     for module in unet_model.modules():
-        if isinstance(module, BasicTransformerBlock):
+        if isinstance(module, (BasicTransformerBlockLDM, BasicTransformerBlockSGM)):
             module.checkpoint = False
-        if isinstance(module, ResBlock):
+        if isinstance(module, (ResBlockLDM, ResBlockSGM)):
             module.use_checkpoint = False
     # TODO: refine here
     compiled_model = oneflow_compile(unet_model, options=options)

From 3ae1f7b9c46f0f4e78c80eab69dc4e64dc93cb28 Mon Sep 17 00:00:00 2001
From: WangYi <buaawangyi03@gmail.com>
Date: Fri, 21 Jun 2024 17:35:52 +0800
Subject: [PATCH 14/14] fix bug of switching model

---
 .../onediff_controlnet.py                     |  6 ++---
 onediff_sd_webui_extensions/onediff_shared.py |  2 +-
 onediff_sd_webui_extensions/onediff_utils.py  | 24 +++++++++----------
 .../scripts/onediff.py                        |  8 +++----
 onediff_sd_webui_extensions/ui_utils.py       | 14 -----------
 5 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/onediff_sd_webui_extensions/onediff_controlnet.py b/onediff_sd_webui_extensions/onediff_controlnet.py
index c444c2349..c5da15651 100644
--- a/onediff_sd_webui_extensions/onediff_controlnet.py
+++ b/onediff_sd_webui_extensions/onediff_controlnet.py
@@ -1,7 +1,7 @@
 from functools import wraps
 
 import onediff_shared
-from onediff_utils import check_structure_change_and_update
+from onediff_utils import check_structure_change
 import oneflow as flow
 import torch
 import torch as th
@@ -290,8 +290,8 @@ def hijacked_main_entry(self, p):
     sd_ldm = p.sd_model
     unet = sd_ldm.model.diffusion_model
 
-    structure_changed = check_structure_change_and_update(
-        onediff_shared.current_unet_type, sd_ldm
+    structure_changed = check_structure_change(
+        onediff_shared.previous_unet_type, sd_ldm
     )
     if onediff_shared.controlnet_compiled is False or structure_changed:
         onediff_model = TorchOnediffControlNetModel(unet)
diff --git a/onediff_sd_webui_extensions/onediff_shared.py b/onediff_sd_webui_extensions/onediff_shared.py
index 75da3d953..38f6ccbbf 100644
--- a/onediff_sd_webui_extensions/onediff_shared.py
+++ b/onediff_sd_webui_extensions/onediff_shared.py
@@ -2,7 +2,7 @@
 
 current_unet_graph = OneDiffCompiledGraph()
 current_quantization = False
-current_unet_type = {
+previous_unet_type = {
     "is_sdxl": False,
     "is_sd2": False,
     "is_sd1": False,
diff --git a/onediff_sd_webui_extensions/onediff_utils.py b/onediff_sd_webui_extensions/onediff_utils.py
index 1ea53bebe..dc3a42927 100644
--- a/onediff_sd_webui_extensions/onediff_utils.py
+++ b/onediff_sd_webui_extensions/onediff_utils.py
@@ -8,6 +8,7 @@
 import onediff_shared
 import oneflow as flow
 from modules.devices import torch_gc
+from modules import shared
 
 from onediff.infer_compiler import DeployableModule
 
@@ -60,18 +61,8 @@ def refresh_all_compiler_caches(path: Path = None):
     all_compiler_caches = [f.stem for f in Path(path).iterdir() if f.is_file()]
 
 
-def check_structure_change_and_update(current_type: dict[str, bool], model):
-    def get_model_type(model):
-        return {
-            "is_sdxl": model.is_sdxl,
-            "is_sd2": model.is_sd2,
-            "is_sd1": model.is_sd1,
-            "is_ssd": model.is_ssd,
-        }
-
-    changed = current_type != get_model_type(model)
-    current_type.update(**get_model_type(model))
-    return changed
+def check_structure_change(current_type: dict[str, bool], model):
+    return current_type != get_model_type(model)
 
 
 def load_graph(compiled_unet: DeployableModule, compiler_cache: str):
@@ -130,6 +121,7 @@ def wrapper(self, p, *arg, **kwargs):
             return func(self, p, *arg, **kwargs)
         finally:
             onediff_shared.onediff_enabled = False
+            onediff_shared.previous_unet_type.update(**get_model_type(shared.sd_model))
             torch_gc()
             flow.cuda.empty_cache()
 
@@ -146,3 +138,11 @@ def wrapper(*args, **kwargs):
             return func(*args, **kwargs)
 
     return wrapper
+
+def get_model_type(model):
+    return {
+        "is_sdxl": model.is_sdxl,
+        "is_sd2": model.is_sd2,
+        "is_sd1": model.is_sd1,
+        "is_ssd": model.is_ssd,
+    }
diff --git a/onediff_sd_webui_extensions/scripts/onediff.py b/onediff_sd_webui_extensions/scripts/onediff.py
index 180b31040..bd4049ed9 100644
--- a/onediff_sd_webui_extensions/scripts/onediff.py
+++ b/onediff_sd_webui_extensions/scripts/onediff.py
@@ -15,7 +15,7 @@
 from onediff_hijack import do_hijack as onediff_do_hijack
 from onediff_lora import HijackLoraActivate
 from onediff_utils import (
-    check_structure_change_and_update,
+    check_structure_change,
     get_all_compiler_caches,
     hints_message,
     load_graph,
@@ -119,8 +119,8 @@ def run(
             shared.sd_model.sd_checkpoint_info.name
             != onediff_shared.current_unet_graph.name
         )
-        structure_changed = check_structure_change_and_update(
-            onediff_shared.current_unet_type, shared.sd_model
+        structure_changed = check_structure_change(
+            onediff_shared.previous_unet_type, shared.sd_model
         )
         quantization_changed = (
             quantization != onediff_shared.current_unet_graph.quantized
@@ -145,7 +145,7 @@ def run(
                 load_graph(onediff_shared.current_unet_graph, compiler_cache)
         else:
             logger.info(
-                f"Model {current_checkpoint_name} has same sd type of graph type {onediff_shared.current_unet_type}, skip compile"
+                f"Model {current_checkpoint_name} has same sd type of graph type {onediff_shared.previous_unet_type}, skip compile"
             )
 
         with UnetCompileCtx(
diff --git a/onediff_sd_webui_extensions/ui_utils.py b/onediff_sd_webui_extensions/ui_utils.py
index bdb875a38..097f51853 100644
--- a/onediff_sd_webui_extensions/ui_utils.py
+++ b/onediff_sd_webui_extensions/ui_utils.py
@@ -57,20 +57,6 @@ def refresh_all_compiler_caches(path: Path = None):
     all_compiler_caches = [f.stem for f in Path(path).iterdir() if f.is_file()]
 
 
-def check_structure_change_and_update(current_type: dict[str, bool], model):
-    def get_model_type(model):
-        return {
-            "is_sdxl": model.is_sdxl,
-            "is_sd2": model.is_sd2,
-            "is_sd1": model.is_sd1,
-            "is_ssd": model.is_ssd,
-        }
-
-    changed = current_type != get_model_type(model)
-    current_type.update(**get_model_type(model))
-    return changed
-
-
 def load_graph(compiled_unet: DeployableModule, compiler_cache: str):
     from compile import OneDiffCompiledGraph