From 0105fc4ef323f4d86f64c6c129494e5664dec5cf Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 19 Dec 2023 07:20:24 +0000
Subject: [PATCH 01/89] update

---
 src/diffusers/loaders/single_file.py       |    3 +-
 src/diffusers/loaders/single_file_utils.py | 1303 ++++++++++++++++++++
 2 files changed, 1304 insertions(+), 2 deletions(-)
 create mode 100644 src/diffusers/loaders/single_file_utils.py

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 742984449e4f..327e3ba29d7e 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -28,6 +28,7 @@
     logging,
 )
 from ..utils.import_utils import BACKENDS_MAPPING
+from .single_file_utils import download_from_original_stable_diffusion_ckpt
 
 
 if is_transformers_available():
@@ -149,8 +150,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         >>> pipeline.to("cuda")
         ```
         """
-        # import here to avoid circular dependency
-        from ..pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt
 
         original_config_file = kwargs.pop("original_config_file", None)
         config_files = kwargs.pop("config_files", None)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
new file mode 100644
index 000000000000..3edcaca28b58
--- /dev/null
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -0,0 +1,1303 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conversion script for the Stable Diffusion checkpoints."""
+
+import re
+from contextlib import nullcontext
+from io import BytesIO
+from typing import Dict, Optional, Union
+
+import requests
+import torch
+from omegaconf import OmegaConf
+from safetensors.torch import load_file as safe_load
+from transformers import (
+    AutoFeatureExtractor,
+    BertTokenizerFast,
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from ...models import (
+    AutoencoderKL,
+    ControlNetModel,
+    PriorTransformer,
+    UNet2DConditionModel,
+)
+from ...schedulers import (
+    DDIMScheduler,
+    DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UnCLIPScheduler,
+)
+from ...utils import is_accelerate_available, is_omegaconf_available, logging
+from ...utils.import_utils import BACKENDS_MAPPING
+from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
+from ..paint_by_example import PaintByExampleImageEncoder
+from ..pipeline_utils import DiffusionPipeline
+from .safety_checker import StableDiffusionSafetyChecker
+from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+    from accelerate.utils import set_module_tensor_to_device
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Original (non-diffusers) inference configs, keyed by model family.
+CONFIG_URLS = {
+    "v1": "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml",
+    "v2": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml",
+    "xl": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml",
+    "xl_refiner": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml",
+    "upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml",
+}
+
+# State-dict keys whose presence is used to tell the model families apart.
+CHECKPOINT_KEY_NAMES = {
+    "v2": "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight",
+    "xl_base": "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias",
+    "xl_refiner": "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias",
+}
+
+
+def fetch_original_config_file_from_url(checkpoint):
+    """
+    Downloads the original config file that matches `checkpoint`.
+
+    The model family is inferred from marker keys in the state dict (see `CHECKPOINT_KEY_NAMES`). A checkpoint that
+    matches none of them falls back to the v1 config.
+
+    Args:
+        checkpoint (`dict`): State dict of the original checkpoint. Only key membership and, for the v2 marker, the
+            last dimension of the tensor are inspected.
+
+    Returns:
+        `io.BytesIO`: In-memory buffer holding the bytes of the downloaded YAML file.
+
+    Raises:
+        `requests.HTTPError`: If the server answers with an error status.
+    """
+    v2_key = CHECKPOINT_KEY_NAMES["v2"]
+
+    if v2_key in checkpoint and checkpoint[v2_key].shape[-1] == 1024:
+        config_url = CONFIG_URLS["v2"]
+
+    elif CHECKPOINT_KEY_NAMES["xl_base"] in checkpoint:
+        config_url = CONFIG_URLS["xl"]
+
+    elif CHECKPOINT_KEY_NAMES["xl_refiner"] in checkpoint:
+        config_url = CONFIG_URLS["xl_refiner"]
+
+    else:
+        config_url = CONFIG_URLS["v1"]
+
+    # TODO: Add upscale config
+
+    # Fail loudly on HTTP errors instead of handing an error page to the YAML parser, and never block forever on
+    # an unresponsive host.
+    response = requests.get(config_url, timeout=30)
+    response.raise_for_status()
+
+    return BytesIO(response.content)
+
+
+def fetch_original_config_file_from_file(checkpoint, config_files: dict):
+    """
+    Picks an original config file out of the user supplied `config_files` mapping.
+
+    Args:
+        checkpoint: State dict of the original checkpoint. Currently unused; the selection is driven purely by
+            which keys are present in `config_files`.
+        config_files (`dict`): Mapping from model family (`"v1"`, `"v2"`, `"xl"`, `"xl_refiner"`) to a config file.
+
+    Returns:
+        The entry of the first family found, checked in the order `"v1"`, `"v2"`, `"xl"`, `"xl_refiner"`, or `None`
+        when none of these keys is present.
+    """
+    # TODO: Add upscale config
+    for model_family in ("v1", "v2", "xl", "xl_refiner"):
+        if model_family in config_files:
+            return config_files[model_family]
+
+    return None
+
+
+def fetch_original_config(checkpoint, config_files: Optional[dict]):
+    """
+    Loads the original model config for `checkpoint`.
+
+    Args:
+        checkpoint: State dict of the original checkpoint.
+        config_files (`dict`, *optional*): Mapping from model family to a local config file. When `None`, the
+            config is downloaded based on the keys found in `checkpoint`.
+
+    Returns:
+        The config object produced by `OmegaConf.load`.
+
+    Raises:
+        `ValueError`: If `config_files` is given but contains none of the supported model family keys.
+    """
+    if config_files is not None:
+        original_config_file = fetch_original_config_file_from_file(checkpoint, config_files)
+        if original_config_file is None:
+            # Without this guard `None` would be handed to `OmegaConf.load` and fail with an unrelated message.
+            raise ValueError(
+                "`config_files` does not contain any of the supported keys: 'v1', 'v2', 'xl', 'xl_refiner'."
+            )
+    else:
+        original_config_file = fetch_original_config_file_from_url(checkpoint)
+
+    return OmegaConf.load(original_config_file)
+
+
+def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True):
+    """
+    Returns the state dict of an original checkpoint.
+
+    Args:
+        checkpoint_path_or_dict (`str`, `os.PathLike` or `dict`): Path to the checkpoint file, or an already
+            loaded state dict (returned without copying).
+        device (`str`, *optional*): `map_location` for `torch.load`. Defaults to `"cuda"` when available, else
+            `"cpu"`. Not used for safetensors files, which are always loaded on the CPU.
+        from_safetensors (`bool`, defaults to `True`): Whether the file is in safetensors format.
+
+    Returns:
+        `dict`: The state dict, with any enclosing `"state_dict"` wrapper removed.
+
+    Raises:
+        `TypeError`: If `checkpoint_path_or_dict` is neither a path nor a dict.
+    """
+    import os
+
+    if isinstance(checkpoint_path_or_dict, dict):
+        checkpoint = checkpoint_path_or_dict
+
+    elif isinstance(checkpoint_path_or_dict, (str, os.PathLike)):
+        if from_safetensors:
+            checkpoint = safe_load(checkpoint_path_or_dict, device="cpu")
+
+        else:
+            # Only resolve the default device when it is actually needed.
+            if device is None:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+
+            # NOTE: `torch.load` unpickles the file; only load checkpoints from a trusted source.
+            checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
+
+    else:
+        raise TypeError(
+            "`checkpoint_path_or_dict` must be a path or a state dict, "
+            f"got {type(checkpoint_path_or_dict).__name__}."
+        )
+
+    # Some checkpoints nest the weights under one or more "state_dict" keys.
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    return checkpoint
+
+
+def get_model_type(original_config, model_type=None):
+    """
+    Returns `model_type` if given, otherwise infers it from `original_config.model.params`.
+
+    With a `cond_stage_config` entry the type is the last dotted segment of its `target`. With a `network_config`
+    entry it is `"SDXL"` when `context_dim` equals 2048 and `"SDXL-Refiner"` otherwise. A `ValueError` is raised
+    when neither entry is set.
+    """
+    if model_type is not None:
+        return model_type
+
+    params = original_config.model.params
+
+    def _is_set(name):
+        # The entry must exist and must not be explicitly null.
+        return name in params and getattr(params, name) is not None
+
+    cond_stage_available = _is_set("cond_stage_config")
+    network_available = _is_set("network_config")
+
+    if not cond_stage_available and not network_available:
+        raise ValueError("Unable to infer model type from config")
+
+    if cond_stage_available:
+        model_type = params.cond_stage_config.target.split(".")[-1]
+    else:
+        is_base = params.network_config.params.context_dim == 2048
+        model_type = "SDXL" if is_base else "SDXL-Refiner"
+
+    logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
+
+    return model_type
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Drops dot-separated segments from `path`.
+
+    A non-negative count removes that many leading segments; a negative count removes that many trailing ones.
+    """
+    segments = path.split(".")
+    kept = segments[n_shave_prefix_segments:] if n_shave_prefix_segments >= 0 else segments[:n_shave_prefix_segments]
+    return ".".join(kept)
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Maps resnet parameter paths to the new naming scheme (local renaming).
+
+    Returns one `{"old": ..., "new": ...}` dict per entry of `old_list`, in the same order.
+    """
+    # Substring replacements, applied to every path in this order.
+    renames = (
+        ("in_layers.0", "norm1"),
+        ("in_layers.2", "conv1"),
+        ("out_layers.0", "norm2"),
+        ("out_layers.3", "conv2"),
+        ("emb_layers.1", "time_emb_proj"),
+        ("skip_connection", "conv_shortcut"),
+    )
+
+    def _rename(path):
+        for source, target in renames:
+            path = path.replace(source, target)
+        return shave_segments(path, n_shave_prefix_segments=n_shave_prefix_segments)
+
+    return [{"old": path, "new": _rename(path)} for path in old_list]
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Maps VAE resnet parameter paths to the new naming scheme (local renaming).
+
+    Only `nin_shortcut` is renamed (to `conv_shortcut`); afterwards the requested segments are shaved off.
+    """
+    return [
+        {
+            "old": path,
+            "new": shave_segments(
+                path.replace("nin_shortcut", "conv_shortcut"), n_shave_prefix_segments=n_shave_prefix_segments
+            ),
+        }
+        for path in old_list
+    ]
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Builds the old/new mapping for attention parameter paths (local renaming).
+
+    No local renaming is applied: every path maps to itself. `n_shave_prefix_segments` is accepted for signature
+    parity with the other `renew_*` helpers but is not used.
+    """
+    return [{"old": path, "new": path} for path in old_list]
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Maps VAE attention parameter paths to the new naming scheme (local renaming).
+
+    Returns one `{"old": ..., "new": ...}` dict per entry of `old_list`, in the same order.
+    """
+    # Substring replacements, applied to every path in this order.
+    renames = (
+        ("norm.weight", "group_norm.weight"),
+        ("norm.bias", "group_norm.bias"),
+        ("q.weight", "to_q.weight"),
+        ("q.bias", "to_q.bias"),
+        ("k.weight", "to_k.weight"),
+        ("k.bias", "to_k.bias"),
+        ("v.weight", "to_v.weight"),
+        ("v.bias", "to_v.bias"),
+        ("proj_out.weight", "to_out.0.weight"),
+        ("proj_out.bias", "to_out.0.bias"),
+    )
+
+    mapping = []
+    for old_path in old_list:
+        new_path = old_path
+        for source, target in renames:
+            new_path = new_path.replace(source, target)
+        new_path = shave_segments(new_path, n_shave_prefix_segments=n_shave_prefix_segments)
+        mapping.append({"old": old_path, "new": new_path})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    Final conversion step: copies tensors from `old_checkpoint` into `checkpoint` under their globally renamed keys.
+
+    Fused attention tensors listed in `attention_paths_to_split` are first split into query/key/value (this needs
+    `config["num_head_channels"]`). Every entry of `paths` is then renamed (`middle_block.*` -> `mid_block.*`, then
+    `additional_replacements`) and stored. Attention weights with three or four dimensions are reduced to two
+    dimensions by taking index 0 of the trailing axes.
+
+    `checkpoint` is modified in place; nothing is returned.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    split_targets = {} if attention_paths_to_split is None else attention_paths_to_split
+    extra_renames = [] if additional_replacements is None else additional_replacements
+
+    # Split each fused attention tensor into its query, key and value parts.
+    for fused_path, destinations in split_targets.items():
+        fused = old_checkpoint[fused_path]
+        channels = fused.shape[0] // 3
+
+        target_shape = (-1, channels) if len(fused.shape) == 3 else (-1)
+
+        num_heads = fused.shape[0] // config["num_head_channels"] // 3
+
+        fused = fused.reshape((num_heads, 3 * channels // num_heads) + fused.shape[1:])
+        query, key, value = fused.split(channels // num_heads, dim=1)
+
+        for role, part in (("query", query), ("key", key), ("value", value)):
+            checkpoint[destinations[role]] = part.reshape(target_shape)
+
+    # Renames that apply to every path.
+    global_renames = (
+        ("middle_block.0", "mid_block.resnets.0"),
+        ("middle_block.1", "mid_block.attentions.0"),
+        ("middle_block.2", "mid_block.resnets.1"),
+    )
+
+    for entry in paths:
+        target_key = entry["new"]
+
+        # Already written by the split above.
+        if target_key in split_targets:
+            continue
+
+        for source, target in global_renames:
+            target_key = target_key.replace(source, target)
+
+        for replacement in extra_renames:
+            target_key = target_key.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        is_attn_weight = "proj_attn.weight" in target_key or ("attentions" in target_key and "to_" in target_key)
+        tensor = old_checkpoint[entry["old"]]
+        rank = len(tensor.shape)
+
+        if is_attn_weight and rank == 3:
+            tensor = tensor[:, :, 0]
+        elif is_attn_weight and rank == 4:
+            tensor = tensor[:, :, 0, 0]
+
+        checkpoint[target_key] = tensor
+
+
+def conv_attn_to_linear(checkpoint):
+    """
+    Reduces attention weights that have more than two dimensions to two dimensions, in place.
+
+    Keys whose last two segments are `query.weight`, `key.weight` or `value.weight` are indexed with
+    `[:, :, 0, 0]`; other keys containing `proj_attn.weight` are indexed with `[:, :, 0]`.
+    """
+    qkv_suffixes = ("query.weight", "key.weight", "value.weight")
+
+    # Snapshot the keys up front; the values are reassigned during the loop.
+    for name in list(checkpoint.keys()):
+        suffix = ".".join(name.rsplit(".", 2)[-2:])
+
+        if suffix in qkv_suffixes:
+            index = (slice(None), slice(None), 0, 0)
+        elif "proj_attn.weight" in name:
+            index = (slice(None), slice(None), 0)
+        else:
+            continue
+
+        if checkpoint[name].ndim > 2:
+            checkpoint[name] = checkpoint[name][index]
+
+def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+
+    Args:
+        original_config: Parsed original config; everything is read from `original_config.model.params`.
+        image_size (`int`): Divided by the VAE scale factor to obtain `sample_size`.
+        controlnet (`bool`): When `True`, the parameters are read from `control_stage_config` and the result
+            carries `conditioning_channels` instead of `out_channels` / `up_block_types`.
+
+    Returns:
+        `dict`: The architecture settings assembled below.
+    """
+    # Pick the parameter section: ControlNet, `unet_config`, or `network_config` as a fallback.
+    if controlnet:
+        unet_params = original_config.model.params.control_stage_config.params
+    else:
+        if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None:
+            unet_params = original_config.model.params.unet_config.params
+        else:
+            unet_params = original_config.model.params.network_config.params
+
+    vae_params = original_config.model.params.first_stage_config.params.ddconfig
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    # A level gets cross-attention when its downsampling factor is listed in `attention_resolutions`.
+    # `resolution` doubles after every level except the last one.
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    # The up path continues from the final `resolution` of the loop above and halves it at every level.
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    # `transformer_depth` is either a single int or a per-level sequence; default to 1 when it is null.
+    if unet_params.transformer_depth is not None:
+        transformer_layers_per_block = (
+            unet_params.transformer_depth
+            if isinstance(unet_params.transformer_depth, int)
+            else list(unet_params.transformer_depth)
+        )
+    else:
+        transformer_layers_per_block = 1
+
+    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
+
+    # NOTE(review): `head_dim` stays `None` when `num_heads` is absent and linear projection is off; confirm
+    # that `attention_head_dim=None` is acceptable downstream.
+    head_dim = unet_params.num_heads if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim_mult = unet_params.model_channels // unet_params.num_head_channels
+            head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)]
+
+    class_embed_type = None
+    addition_embed_type = None
+    addition_time_embed_dim = None
+    projection_class_embeddings_input_dim = None
+    context_dim = None
+
+    # `context_dim` may be an int or a sequence; only the first element of a sequence is used.
+    if unet_params.context_dim is not None:
+        context_dim = (
+            unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0]
+        )
+
+    # `num_classes == "sequential"` selects either the text/time addition embedding or a projection class
+    # embedding; both take their input width from `adm_in_channels`.
+    if "num_classes" in unet_params:
+        if unet_params.num_classes == "sequential":
+            if context_dim in [2048, 1280]:
+                # SDXL
+                addition_embed_type = "text_time"
+                addition_time_embed_dim = 256
+            else:
+                class_embed_type = "projection"
+            assert "adm_in_channels" in unet_params
+            projection_class_embeddings_input_dim = unet_params.adm_in_channels
+
+    config = {
+        "sample_size": image_size // vae_scale_factor,
+        "in_channels": unet_params.in_channels,
+        "down_block_types": tuple(down_block_types),
+        "block_out_channels": tuple(block_out_channels),
+        "layers_per_block": unet_params.num_res_blocks,
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": head_dim,
+        "use_linear_projection": use_linear_projection,
+        "class_embed_type": class_embed_type,
+        "addition_embed_type": addition_embed_type,
+        "addition_time_embed_dim": addition_time_embed_dim,
+        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
+        "transformer_layers_per_block": transformer_layers_per_block,
+    }
+
+    # Optional entries, only emitted when the original config provides them.
+    if "disable_self_attentions" in unet_params:
+        config["only_cross_attention"] = unet_params.disable_self_attentions
+
+    if "num_classes" in unet_params and isinstance(unet_params.num_classes, int):
+        config["num_class_embeds"] = unet_params.num_classes
+
+    # The ControlNet variant gets `conditioning_channels` and no output conv / up blocks.
+    if controlnet:
+        config["conditioning_channels"] = unet_params.hint_channels
+    else:
+        config["out_channels"] = unet_params.out_channels
+        config["up_block_types"] = tuple(up_block_types)
+
+    return config
+
+
+def create_vae_diffusers_config(original_config, image_size: int):
+    """
+    Builds the diffusers VAE config from the `first_stage_config` section of the LDM config.
+
+    `image_size` is used as `sample_size` unchanged; all other entries come from `ddconfig`.
+    """
+    first_stage_params = original_config.model.params.first_stage_config.params
+    ddconfig = first_stage_params.ddconfig
+    # `embed_dim` is read but its value is not used.
+    _ = first_stage_params.embed_dim
+
+    channels = tuple(ddconfig.ch * multiplier for multiplier in ddconfig.ch_mult)
+    num_blocks = len(channels)
+
+    return {
+        "sample_size": image_size,
+        "in_channels": ddconfig.in_channels,
+        "out_channels": ddconfig.out_ch,
+        "down_block_types": ("DownEncoderBlock2D",) * num_blocks,
+        "up_block_types": ("UpDecoderBlock2D",) * num_blocks,
+        "block_out_channels": channels,
+        "latent_channels": ddconfig.z_channels,
+        "layers_per_block": ddconfig.num_res_blocks,
+    }
+
+
+def convert_ldm_unet_checkpoint(
+    checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
+):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+
+    Args:
+        checkpoint (`dict`): Original state dict. Unless `skip_extract_state_dict` is set, the UNet (or ControlNet)
+            entries are popped out of it, i.e. the argument is modified in place.
+        config (`dict`): Target config; `class_embed_type`, `addition_embed_type`, `layers_per_block` and
+            (optionally) `num_class_embeds` are read here.
+        path (`str`, *optional*): Only used in a log message.
+        extract_ema (`bool`): Take the `model_ema.*` weights when the checkpoint contains more than 100 of them.
+        controlnet (`bool`): Convert a ControlNet (`control_model.` prefix) instead of a UNet.
+        skip_extract_state_dict (`bool`): Treat `checkpoint` as an already extracted, prefix-free state dict.
+
+    Returns:
+        `dict`: New state dict keyed by the converted parameter names.
+    """
+
+    if skip_extract_state_dict:
+        unet_state_dict = checkpoint
+    else:
+        # extract state_dict for UNet
+        unet_state_dict = {}
+        keys = list(checkpoint.keys())
+
+        if controlnet:
+            unet_key = "control_model."
+        else:
+            unet_key = "model.diffusion_model."
+
+        # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+        if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+            logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
+            logger.warning(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            # NOTE(review): this branch matches on the literal "model.diffusion_model" prefix rather than on
+            # `unet_key`, so it looks UNet-only; confirm the behaviour intended for `controlnet=True`.
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    # EMA keys are stored flat: "model_ema." followed by the remaining segments without dots.
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            if sum(k.startswith("model_ema") for k in keys) > 100:
+                logger.warning(
+                    "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                    " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+                )
+
+            # Move every prefixed entry into `unet_state_dict`, dropping the prefix.
+            for key in keys:
+                if key.startswith(unet_key):
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    # Timestep embedding MLP.
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+
+    # The "text_time" addition embedding is read from the same `label_emb.*` source keys.
+    if config["addition_embed_type"] == "text_time":
+        new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+
+    # Relevant to StableDiffusionUpscalePipeline
+    if "num_class_embeds" in config:
+        if (config["num_class_embeds"] is not None) and ("label_emb.weight" in unet_state_dict):
+            new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    # A ControlNet has no output head.
+    if not controlnet:
+        new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+        new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+        new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+        new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # NOTE(review): the three groupings below use substring tests without a trailing dot, so e.g. the bucket
+    # for block 1 also receives keys of blocks 10, 11, ... The per-block filters further down re-check with
+    # "<prefix>.{i}.0" / "<prefix>.{i}.1"; confirm that this is sufficient for all supported layouts.
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    # Down path. `input_blocks.0` is `conv_in` (ported above), so start at 1.
+    for i in range(1, num_input_blocks):
+        # Presumably each down block spans `layers_per_block` resnet entries plus one downsampler entry.
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        # An `op` sub-module is ported as the block's downsampler and removed from the source dict.
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    # Middle block: resnet, attention, resnet. The resnet keys are renamed by `assign_to_checkpoint` itself.
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    # Up path.
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        # Group the remaining key suffixes by the sub-layer index that follows the block index.
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        # More than one sub-layer: a resnet plus an attention and/or an upsampler.
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            # NOTE(review): `resnet_0_paths` is not read in this branch; `paths` below holds the same result.
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            # A sub-layer consisting of exactly `conv.bias` and `conv.weight` is treated as the upsampler.
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            # Single sub-layer: a plain resnet, copied key by key.
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    if controlnet:
+        # conditioning embedding
+
+        # `orig_index` walks `input_hint_block` in steps of two: presumably the odd indices hold
+        # parameter-free layers (TODO confirm against the original ControlNet definition).
+        orig_index = 0
+
+        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        orig_index += 2
+
+        diffusers_index = 0
+
+        # Six intermediate blocks sit between `conv_in` and `conv_out`.
+        while diffusers_index < 6:
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.weight"
+            )
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.bias"
+            )
+            diffusers_index += 1
+            orig_index += 2
+
+        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+
+        # down blocks
+        for i in range(num_input_blocks):
+            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+
+        # mid block
+        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+        new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+
+    return new_checkpoint
+
+def download_from_original_stable_diffusion_ckpt(
+    checkpoint_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+    original_config_file: str = None,
+    image_size: Optional[int] = None,
+    prediction_type: str = None,
+    model_type: str = None,
+    extract_ema: bool = False,
+    scheduler_type: str = "pndm",
+    num_in_channels: Optional[int] = None,
+    upcast_attention: Optional[bool] = None,
+    device: str = None,
+    from_safetensors: bool = False,
+    stable_unclip: Optional[str] = None,
+    stable_unclip_prior: Optional[str] = None,
+    clip_stats_path: Optional[str] = None,
+    controlnet: Optional[bool] = None,
+    adapter: Optional[bool] = None,
+    load_safety_checker: bool = True,
+    pipeline_class: DiffusionPipeline = None,
+    local_files_only=False,
+    vae_path=None,
+    vae=None,
+    text_encoder=None,
+    text_encoder_2=None,
+    tokenizer=None,
+    tokenizer_2=None,
+    config_files=None,
+) -> DiffusionPipeline:
+    """
+    Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
+    config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    Args:
+        checkpoint_path_or_dict (`str` or `dict`): Path to `.ckpt` file, or the state dict.
+        original_config_file (`str`):
+            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
+            inferred by looking for a key that only exists in SD2.0 models.
+        image_size (`int`, *optional*, defaults to `None`, in which case 512 or 768 is inferred):
+            The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
+            Base. Use 768 for Stable Diffusion v2.
+        prediction_type (`str`, *optional*):
+            The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
+            Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
+        num_in_channels (`int`, *optional*, defaults to None):
+            The number of input channels. If `None`, it will be automatically inferred.
+        scheduler_type (`str`, *optional*, defaults to 'pndm'):
+            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+            "ddim"]`.
+        model_type (`str`, *optional*, defaults to `None`):
+            The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder",
+            "FrozenCLIPEmbedder", "PaintByExample"]`.
+        is_img2img (`bool`, *optional*, defaults to `False`):
+            Whether the model should be loaded as an img2img pipeline.
+        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
+            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
+            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
+            inference. Non-EMA weights are usually better to continue fine-tuning.
+        upcast_attention (`bool`, *optional*, defaults to `None`):
+            Whether the attention computation should always be upcasted. This is necessary when running stable
+            diffusion 2.1.
+        device (`str`, *optional*, defaults to `None`):
+            The device to use. Pass `None` to determine automatically.
+        from_safetensors (`bool`, *optional*, defaults to `False`):
+            If `checkpoint_path_or_dict` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
+        load_safety_checker (`bool`, *optional*, defaults to `True`):
+            Whether to load the safety checker or not. Defaults to `True`.
+        pipeline_class (`Type[DiffusionPipeline]`, *optional*, defaults to `None`):
+            The pipeline class to use. Pass `None` to determine automatically.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            Whether or not to only look at local files (i.e., do not try to download the model).
+        vae (`AutoencoderKL`, *optional*, defaults to `None`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. If
+            this parameter is `None`, the VAE is converted from the checkpoint, or loaded from `vae_path` if that is set.
+        text_encoder (`CLIPTextModel`, *optional*, defaults to `None`):
+            An instance of [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)
+            to use, specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)
+            variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
+        tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`):
+            An instance of
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
+            to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if
+            needed.
+        config_files (`Dict[str, str]`, *optional*, defaults to `None`):
+            A dictionary mapping from config file names to their contents. If this parameter is `None`, the function
+            will load the config files by itself, if needed. Valid keys are:
+                - `v1`: Config file for Stable Diffusion v1
+                - `v2`: Config file for Stable Diffusion v2
+                - `xl`: Config file for Stable Diffusion XL
+                - `xl_refiner`: Config file for Stable Diffusion XL Refiner
+        return: A `DiffusionPipeline` object representing the passed-in `.ckpt`/`.safetensors` file.
+    """
+
+    # import pipelines here to avoid circular import error when using from_single_file method
+    from diffusers import (
+        LDMTextToImagePipeline,
+        PaintByExamplePipeline,
+        StableDiffusionControlNetPipeline,
+        StableDiffusionInpaintPipeline,
+        StableDiffusionPipeline,
+        StableDiffusionUpscalePipeline,
+        StableDiffusionXLControlNetInpaintPipeline,
+        StableDiffusionXLImg2ImgPipeline,
+        StableDiffusionXLInpaintPipeline,
+        StableDiffusionXLPipeline,
+        StableUnCLIPImg2ImgPipeline,
+        StableUnCLIPPipeline,
+    )
+
+    if prediction_type == "v-prediction":
+        prediction_type = "v_prediction"
+
+    if not is_omegaconf_available():
+        raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
+
+    checkpoint = load_checkpoint(checkpoint_path_or_dict)
+    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
+
+    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    original_config = fetch_original_config(checkpoint, config_files)
+    model_type = get_model_type(original_config, model_type)
+
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+    diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=path, extract_ema=extract_ema
+    )
+
+    num_channels = get_num_channels()
+
+    if pipeline_class is None:
+        # Check if we have a SDXL or SD model and initialize default pipeline
+        if model_type not in ["SDXL", "SDXL-Refiner"]:
+            pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline
+        else:
+            pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline
+
+    if num_in_channels is None and pipeline_class in [
+        StableDiffusionInpaintPipeline,
+        StableDiffusionXLInpaintPipeline,
+        StableDiffusionXLControlNetInpaintPipeline,
+    ]:
+        num_in_channels = 9
+    if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline:
+        num_in_channels = 7
+    elif num_in_channels is None:
+        num_in_channels = 4
+
+    if "unet_config" in original_config.model.params:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
+            # as it relies on a brittle global step parameter here
+            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
+        if image_size is None:
+            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+            # as it relies on a brittle global step parameter here
+            image_size = 512 if global_step == 875000 else 768
+    else:
+        if prediction_type is None:
+            prediction_type = "epsilon"
+        if image_size is None:
+            image_size = 512
+
+    if controlnet is None and "control_stage_config" in original_config.model.params:
+        path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+        controlnet = convert_controlnet_checkpoint(
+            checkpoint, original_config, path, image_size, upcast_attention, extract_ema
+        )
+
+    num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
+
+    if model_type in ["SDXL", "SDXL-Refiner"]:
+        scheduler_dict = {
+            "beta_schedule": "scaled_linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "interpolation_type": "linear",
+            "num_train_timesteps": num_train_timesteps,
+            "prediction_type": "epsilon",
+            "sample_max_value": 1.0,
+            "set_alpha_to_one": False,
+            "skip_prk_steps": True,
+            "steps_offset": 1,
+            "timestep_spacing": "leading",
+        }
+        scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
+        scheduler_type = "euler"
+    else:
+        beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
+        beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
+        scheduler = DDIMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            steps_offset=1,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type=prediction_type,
+        )
+    # make sure scheduler works correctly with DDIM
+    scheduler.register_to_config(clip_sample=False)
+
+    if scheduler_type == "pndm":
+        config = dict(scheduler.config)
+        config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(config)
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+    elif scheduler_type == "ddim":
+        scheduler = scheduler
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    if pipeline_class == StableDiffusionUpscalePipeline:
+        image_size = original_config.model.params.unet_config.params.image_size
+
+    # Convert the UNet2DConditionModel model.
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet_config["upcast_attention"] = upcast_attention
+
+    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=path, extract_ema=extract_ema
+    )
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        unet = UNet2DConditionModel(**unet_config)
+
+    if is_accelerate_available():
+        if model_type not in ["SDXL", "SDXL-Refiner"]:  # SBM Delay this.
+            for param_name, param in converted_unet_checkpoint.items():
+                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+    else:
+        unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model.
+    if vae_path is None and vae is None:
+        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+        if (
+            "model" in original_config
+            and "params" in original_config.model
+            and "scale_factor" in original_config.model.params
+        ):
+            vae_scaling_factor = original_config.model.params.scale_factor
+        else:
+            vae_scaling_factor = 0.18215  # default SD scaling factor
+
+        vae_config["scaling_factor"] = vae_scaling_factor
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            vae = AutoencoderKL(**vae_config)
+
+        if is_accelerate_available():
+            for param_name, param in converted_vae_checkpoint.items():
+                set_module_tensor_to_device(vae, param_name, "cpu", value=param)
+        else:
+            vae.load_state_dict(converted_vae_checkpoint)
+    elif vae is None:
+        vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only)
+
+    if model_type == "FrozenOpenCLIPEmbedder":
+        config_name = "stabilityai/stable-diffusion-2"
+        config_kwargs = {"subfolder": "text_encoder"}
+
+        if text_encoder is None:
+            text_model = convert_open_clip_checkpoint(
+                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
+            )
+        else:
+            text_model = text_encoder
+
+        try:
+            tokenizer = CLIPTokenizer.from_pretrained(
+                "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'stabilityai/stable-diffusion-2'."
+            )
+
+        if stable_unclip is None:
+            if controlnet:
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    controlnet=controlnet,
+                    safety_checker=None,
+                    feature_extractor=None,
+                )
+                if hasattr(pipe, "requires_safety_checker"):
+                    pipe.requires_safety_checker = False
+
+            elif pipeline_class == StableDiffusionUpscalePipeline:
+                scheduler = DDIMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
+                )
+                low_res_scheduler = DDPMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
+                )
+
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    low_res_scheduler=low_res_scheduler,
+                    safety_checker=None,
+                    feature_extractor=None,
+                )
+
+            else:
+                pipe = pipeline_class(
+                    vae=vae,
+                    text_encoder=text_model,
+                    tokenizer=tokenizer,
+                    unet=unet,
+                    scheduler=scheduler,
+                    safety_checker=None,
+                    feature_extractor=None,
+                )
+                if hasattr(pipe, "requires_safety_checker"):
+                    pipe.requires_safety_checker = False
+
+        else:
+            image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
+                original_config, clip_stats_path=clip_stats_path, device=device
+            )
+
+            if stable_unclip == "img2img":
+                feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
+
+                pipe = StableUnCLIPImg2ImgPipeline(
+                    # image encoding components
+                    feature_extractor=feature_extractor,
+                    image_encoder=image_encoder,
+                    # image noising components
+                    image_normalizer=image_normalizer,
+                    image_noising_scheduler=image_noising_scheduler,
+                    # regular denoising components
+                    tokenizer=tokenizer,
+                    text_encoder=text_model,
+                    unet=unet,
+                    scheduler=scheduler,
+                    # vae
+                    vae=vae,
+                )
+            elif stable_unclip == "txt2img":
+                if stable_unclip_prior is None or stable_unclip_prior == "karlo":
+                    karlo_model = "kakaobrain/karlo-v1-alpha"
+                    prior = PriorTransformer.from_pretrained(
+                        karlo_model, subfolder="prior", local_files_only=local_files_only
+                    )
+
+                    try:
+                        prior_tokenizer = CLIPTokenizer.from_pretrained(
+                            "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                        )
+                    except Exception:
+                        raise ValueError(
+                            f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+                        )
+                    prior_text_model = CLIPTextModelWithProjection.from_pretrained(
+                        "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                    )
+
+                    prior_scheduler = UnCLIPScheduler.from_pretrained(
+                        karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only
+                    )
+                    prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
+                else:
+                    raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
+
+                pipe = StableUnCLIPPipeline(
+                    # prior components
+                    prior_tokenizer=prior_tokenizer,
+                    prior_text_encoder=prior_text_model,
+                    prior=prior,
+                    prior_scheduler=prior_scheduler,
+                    # image noising components
+                    image_normalizer=image_normalizer,
+                    image_noising_scheduler=image_noising_scheduler,
+                    # regular denoising components
+                    tokenizer=tokenizer,
+                    text_encoder=text_model,
+                    unet=unet,
+                    scheduler=scheduler,
+                    # vae
+                    vae=vae,
+                )
+            else:
+                raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
+    elif model_type == "PaintByExample":
+        vision_model = convert_paint_by_example_checkpoint(checkpoint)
+        try:
+            tokenizer = CLIPTokenizer.from_pretrained(
+                "openai/clip-vit-large-patch14", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+            )
+        try:
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'."
+            )
+        pipe = PaintByExamplePipeline(
+            vae=vae,
+            image_encoder=vision_model,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            feature_extractor=feature_extractor,
+        )
+    elif model_type == "FrozenCLIPEmbedder":
+        text_model = convert_ldm_clip_checkpoint(
+            checkpoint, local_files_only=local_files_only, text_encoder=text_encoder
+        )
+        try:
+            tokenizer = (
+                CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+                if tokenizer is None
+                else tokenizer
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+            )
+
+        if load_safety_checker:
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+        else:
+            safety_checker = None
+            feature_extractor = None
+
+        if controlnet:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                controlnet=controlnet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+            )
+        else:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=safety_checker,
+                feature_extractor=feature_extractor,
+            )
+    elif model_type in ["SDXL", "SDXL-Refiner"]:
+        is_refiner = model_type == "SDXL-Refiner"
+
+        if (is_refiner is False) and (tokenizer is None):
+            try:
+                tokenizer = CLIPTokenizer.from_pretrained(
+                    "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                )
+            except Exception:
+                raise ValueError(
+                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+                )
+
+        if (is_refiner is False) and (text_encoder is None):
+            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+
+        if tokenizer_2 is None:
+            try:
+                tokenizer_2 = CLIPTokenizer.from_pretrained(
+                    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only
+                )
+            except Exception:
+                raise ValueError(
+                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'."
+                )
+
+        if text_encoder_2 is None:
+            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            config_kwargs = {"projection_dim": 1280}
+            prefix = "conditioner.embedders.0.model." if is_refiner else "conditioner.embedders.1.model."
+
+            text_encoder_2 = convert_open_clip_checkpoint(
+                checkpoint,
+                config_name,
+                prefix=prefix,
+                has_projection=True,
+                local_files_only=local_files_only,
+                **config_kwargs,
+            )
+
+        if is_accelerate_available():  # SBM Now move model to cpu.
+            for param_name, param in converted_unet_checkpoint.items():
+                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+
+        if controlnet:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                unet=unet,
+                controlnet=controlnet,
+                scheduler=scheduler,
+                force_zeros_for_empty_prompt=True,
+            )
+        elif adapter:
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                unet=unet,
+                adapter=adapter,
+                scheduler=scheduler,
+                force_zeros_for_empty_prompt=True,
+            )
+
+        else:
+            pipeline_kwargs = {
+                "vae": vae,
+                "text_encoder": text_encoder,
+                "tokenizer": tokenizer,
+                "text_encoder_2": text_encoder_2,
+                "tokenizer_2": tokenizer_2,
+                "unet": unet,
+                "scheduler": scheduler,
+            }
+
+            if (pipeline_class == StableDiffusionXLImg2ImgPipeline) or (
+                pipeline_class == StableDiffusionXLInpaintPipeline
+            ):
+                pipeline_kwargs.update({"requires_aesthetics_score": is_refiner})
+
+            if is_refiner:
+                pipeline_kwargs.update({"force_zeros_for_empty_prompt": False})
+
+            pipe = pipeline_class(**pipeline_kwargs)
+    else:
+        text_config = create_ldm_bert_config(original_config)
+        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
+        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    return pipe
+

From 2686fddbf1e358dd017ccd43cfc587f9c481463d Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Dec 2023 13:54:32 +0000
Subject: [PATCH 02/89] update

---
 src/diffusers/loaders/single_file.py       |  96 +++++---
 src/diffusers/loaders/single_file_utils.py | 260 +++++++++++++++++++--
 2 files changed, 302 insertions(+), 54 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 327e3ba29d7e..4fb539f853ef 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -40,6 +40,62 @@
 logger = logging.get_logger(__name__)
 
 
+DIFFUSER_PIPELINE_CONFIGS = {
+    "StableDiffusionPipeline": None,
+    "StableDiffusionImg2ImgPipeline": None,
+    "StableDiffusionInpaintPipeline": None,
+    "StableDiffusionControlNetPipeline": None,
+}
+
+VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
+MODEL_TYPE_FROM_PIPELINE_CLASS = {
+    "StableUnCLIPPipeline": "FrozenOpenCLIPEmbedder",
+    "StableUnCLIPImg2ImgPipeline": "FrozenOpenCLIPEmbedder",
+}
+
+
+
+def check_valid_url(pretrained_model_link_or_path):
+    # check for a huggingface url prefix; NOTE(review): the stripped path is only rebound locally and is not returned
+    has_valid_url_prefix = False
+    for prefix in VALID_URL_PREFIXES:
+        if pretrained_model_link_or_path.startswith(prefix):
+            pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
+            has_valid_url_prefix = True
+
+    return has_valid_url_prefix
+
+
+def fetch_model_checkpoint(ckpt_path, cache_dir=None, resume_download=False, force_download=False, proxies=None, local_files_only=None, token=None, revision=None):
+    # get repo_id and (potentially nested) file path of ckpt in repo
+    repo_id = "/".join(ckpt_path.parts[:2])
+    file_path = "/".join(ckpt_path.parts[2:])
+
+    if file_path.startswith("blob/"):
+        file_path = file_path[len("blob/") :]
+
+    if file_path.startswith("main/"):
+        file_path = file_path[len("main/") :]
+
+    path = hf_hub_download(
+        repo_id,
+        filename=file_path,
+        cache_dir=cache_dir,
+        resume_download=resume_download,
+        proxies=proxies,
+        local_files_only=local_files_only,
+        token=token,
+        revision=revision,
+        force_download=force_download,
+    )
+
+    return path
+
+
+def infer_model_type(pipeline_class_name):
+    return MODEL_TYPE_FROM_PIPELINE_CLASS.get(pipeline_class_name, None)
+
+
 class FromSingleFileMixin:
     """
     Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`].
@@ -150,12 +206,10 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         >>> pipeline.to("cuda")
         ```
         """
-
         original_config_file = kwargs.pop("original_config_file", None)
         config_files = kwargs.pop("config_files", None)
         cache_dir = kwargs.pop("cache_dir", None)
         resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", None)
         token = kwargs.pop("token", None)
@@ -221,43 +275,15 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         else:
             raise ValueError(f"Unhandled pipeline class: {pipeline_name}")
 
-        # remove huggingface url
-        has_valid_url_prefix = False
-        valid_url_prefixes = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
-        for prefix in valid_url_prefixes:
-            if pretrained_model_link_or_path.startswith(prefix):
-                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-                has_valid_url_prefix = True
+        has_valid_url_prefix =  check_valid_url(pretrained_model_link_or_path)
 
         # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
         ckpt_path = Path(pretrained_model_link_or_path)
-        if not ckpt_path.is_file():
-            if not has_valid_url_prefix:
-                raise ValueError(
-                    f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(valid_url_prefixes)}"
-                )
-
-            # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id = "/".join(ckpt_path.parts[:2])
-            file_path = "/".join(ckpt_path.parts[2:])
-
-            if file_path.startswith("blob/"):
-                file_path = file_path[len("blob/") :]
-
-            if file_path.startswith("main/"):
-                file_path = file_path[len("main/") :]
-
-            pretrained_model_link_or_path = hf_hub_download(
-                repo_id,
-                filename=file_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                force_download=force_download,
+        if (not ckpt_path.is_file()) and (not has_valid_url_prefix):
+            raise ValueError(
+                f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
             )
+        pretrained_model_link_or_path = fetch_model_checkpoint(ckpt_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, revision=revision)
 
         pipe = download_from_original_stable_diffusion_ckpt(
             pretrained_model_link_or_path,
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 3edcaca28b58..0effa4d826bd 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Conversion script for the Stable Diffusion checkpoints."""
 
-import re
 from contextlib import nullcontext
 from io import BytesIO
 from typing import Dict, Optional, Union
@@ -26,18 +25,12 @@
 from transformers import (
     AutoFeatureExtractor,
     BertTokenizerFast,
-    CLIPImageProcessor,
-    CLIPTextConfig,
-    CLIPTextModel,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
-    CLIPVisionConfig,
-    CLIPVisionModelWithProjection,
 )
 
 from ...models import (
     AutoencoderKL,
-    ControlNetModel,
     PriorTransformer,
     UNet2DConditionModel,
 )
@@ -54,11 +47,8 @@
 )
 from ...utils import is_accelerate_available, is_omegaconf_available, logging
 from ...utils.import_utils import BACKENDS_MAPPING
-from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
-from ..paint_by_example import PaintByExampleImageEncoder
 from ..pipeline_utils import DiffusionPipeline
 from .safety_checker import StableDiffusionSafetyChecker
-from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 
 
 if is_accelerate_available():
@@ -147,7 +137,7 @@ def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True)
     return checkpoint
 
 
-def get_model_type(original_config, model_type=None):
+def set_model_type(original_config, model_type=None):
     if model_type is not None:
         return model_type
 
@@ -710,6 +700,242 @@ def convert_ldm_unet_checkpoint(
 
     return new_checkpoint
 
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    vae_key = "first_stage_model." if any(k.startswith("first_stage_model.") for k in keys) else ""
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def convert_ldm_bert_checkpoint(checkpoint, config):
+    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
+        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
+        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
+        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
+
+        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
+        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
+
+    def _copy_linear(hf_linear, pt_linear):
+        hf_linear.weight = pt_linear.weight
+        hf_linear.bias = pt_linear.bias
+
+    def _copy_layer(hf_layer, pt_layer):
+        # copy layer norms
+        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
+        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
+
+        # copy attn
+        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
+
+        # copy MLP
+        pt_mlp = pt_layer[1][1]
+        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
+        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
+
+    def _copy_layers(hf_layers, pt_layers):
+        for i, hf_layer in enumerate(hf_layers):
+            if i != 0:
+                i += i
+            pt_layer = pt_layers[i : i + 2]
+            _copy_layer(hf_layer, pt_layer)
+
+    hf_model = LDMBertModel(config).eval()
+
+    # copy  embeds
+    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
+    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
+
+    # copy layer norm
+    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
+
+    # copy hidden layers
+    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
+
+    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
+
+    return hf_model
+
+
+def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
+    if text_encoder is None:
+        config_name = "openai/clip-vit-large-patch14"
+        try:
+            config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'."
+            )
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            text_model = CLIPTextModel(config)
+    else:
+        text_model = text_encoder
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    remove_prefixes = ["cond_stage_model.transformer", "conditioner.embedders.0.transformer"]
+
+    for key in keys:
+        for prefix in remove_prefixes:
+            if key.startswith(prefix):
+                text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]
+
+    if is_accelerate_available():
+        for param_name, param in text_model_dict.items():
+            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    else:
+        if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
+            text_model_dict.pop("text_model.embeddings.position_ids", None)
+
+        text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+def create_unet_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs):
+    extract_ema = kwargs.get("extract_ema", False)
+
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+    diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=path, extract_ema=extract_ema
+    )
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        unet = UNet2DConditionModel(**unet_config)
+
+    if is_accelerate_available():
+        for param_name, param in diffusers_format_unet_checkpoint.items():
+            set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+    else:
+        unet.load_state_dict(diffusers_format_unet_checkpoint)
+
+    return unet
+
+
+def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs):
+    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
+    diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        vae = AutoencoderKL(**vae_config)
+
+    if is_accelerate_available():
+        for param_name, param in diffusers_format_vae_checkpoint.items():
+            set_module_tensor_to_device(vae, param_name, "cpu", value=param)
+    else:
+        vae.load_state_dict(diffusers_format_vae_checkpoint)
+
+    return vae
+
+
+
 def download_from_original_stable_diffusion_ckpt(
     checkpoint_path_or_dict: Union[str, Dict[str, torch.Tensor]],
     original_config_file: str = None,
@@ -737,6 +963,7 @@ def download_from_original_stable_diffusion_ckpt(
     tokenizer=None,
     tokenizer_2=None,
     config_files=None,
+    **kwargs
 ) -> DiffusionPipeline:
     """
     Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
@@ -837,15 +1064,10 @@ def download_from_original_stable_diffusion_ckpt(
         checkpoint = checkpoint["state_dict"]
 
     original_config = fetch_original_config(checkpoint, config_files)
-    model_type = get_model_type(original_config, model_type)
-
-    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
-    diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(
-        checkpoint, unet_config, path=path, extract_ema=extract_ema
-    )
+    model_type = set_model_type(original_config, model_type)
 
-    num_channels = get_num_channels()
+    unet = create_unet_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
+    vae = create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
 
     if pipeline_class is None:
         # Check if we have a SDXL or SD model and initialize default pipeline

From ef656d73cde3f81a2a49db3c1e2fe0a1565fb365 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Dec 2023 14:07:54 +0000
Subject: [PATCH 03/89] Collect pipeline component names in from_single_file

---
 src/diffusers/loaders/single_file.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 4fb539f853ef..451592763e54 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
 from contextlib import nullcontext
 from io import BytesIO
 from pathlib import Path
@@ -54,6 +55,10 @@
 }
 
 
+def extract_pipeline_compoments(pipeline_class):
+    components = inspect.signature(pipeline_class).parameters.keys()
+    return components
+
 
 def check_valid_url(pretrained_model_link_or_path):
     # remove huggingface url
@@ -284,6 +289,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                 f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
             )
         pretrained_model_link_or_path = fetch_model_checkpoint(ckpt_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, revision=revision)
+        components = extract_pipeline_compoments(cls)
 
         pipe = download_from_original_stable_diffusion_ckpt(
             pretrained_model_link_or_path,

From daf4d05b1fccf84af21d7ce1db1e86ff46d45a24 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 25 Dec 2023 05:49:40 +0000
Subject: [PATCH 04/89] Add load_checkpoint and a build_component stub

---
 src/diffusers/loaders/single_file.py | 32 ++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 451592763e54..236936277787 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -20,6 +20,7 @@
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import validate_hf_hub_args
+from safetensors.torch import load_file as safe_load
 
 from ..utils import (
     deprecate,
@@ -55,7 +56,7 @@
 }
 
 
-def extract_pipeline_compoments(pipeline_class):
+def extract_pipeline_component_names(pipeline_class):
     components = inspect.signature(pipeline_class).parameters.keys()
     return components
 
@@ -97,10 +98,31 @@ def fetch_model_checkpoint(ckpt_path, cache_dir=None, resume_download=False, for
     return path
 
 
+def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True):
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if isinstance(checkpoint_path_or_dict, str):
+        if from_safetensors:
+            checkpoint = safe_load(checkpoint_path_or_dict, device="cpu")
+
+        else:
+            checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
+
+    elif isinstance(checkpoint_path_or_dict, dict):
+        checkpoint = checkpoint_path_or_dict
+
+    return checkpoint
+
+
 def infer_model_type(pipeline_class_name):
     return MODEL_TYPE_FROM_PIPELINE_CLASS.get(pipeline_class_name, None)
 
 
+def build_component(component_name, **kwargs):
+    return
+
+
 class FromSingleFileMixin:
     """
     Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`].
@@ -289,7 +311,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                 f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
             )
         pretrained_model_link_or_path = fetch_model_checkpoint(ckpt_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, revision=revision)
-        components = extract_pipeline_compoments(cls)
+        checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
+
+        component_names = extract_pipeline_component_names(cls)
+
+        pipeline_components = {}
+        for component in component_names:
+            pipeline_components[component] = build_component(component, checkpoint, **kwargs)
 
         pipe = download_from_original_stable_diffusion_ckpt(
             pretrained_model_link_or_path,

From 8b7eecd4d4bc196f916b737e518fb93d1f625ff7 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 26 Dec 2023 03:13:43 +0000
Subject: [PATCH 05/89] Unwrap state_dict and build pipeline from components

---
 src/diffusers/loaders/single_file.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 236936277787..9f5c3d48d109 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import inspect
 from contextlib import nullcontext
 from io import BytesIO
@@ -30,7 +31,7 @@
     logging,
 )
 from ..utils.import_utils import BACKENDS_MAPPING
-from .single_file_utils import download_from_original_stable_diffusion_ckpt
+from .single_file_utils import download_from_original_stable_diffusion_ckpt, fetch_original_config
 
 
 if is_transformers_available():
@@ -72,7 +73,7 @@ def check_valid_url(pretrained_model_link_or_path):
     return has_valid_url_prefix
 
 
-def fetch_model_checkpoint(ckpt_path, cache_dir=None, resume_download=False, force_download=False, proxies=None, local_files_only=None, token=None, revision=None):
+def download_model_checkpoint(ckpt_path, cache_dir=None, resume_download=False, force_download=False, proxies=None, local_files_only=None, token=None, revision=None):
     # get repo_id and (potentially nested) file path of ckpt in repo
     repo_id = "/".join(ckpt_path.parts[:2])
     file_path = "/".join(ckpt_path.parts[2:])
@@ -119,7 +120,14 @@ def infer_model_type(pipeline_class_name):
     return MODEL_TYPE_FROM_PIPELINE_CLASS.get(pipeline_class_name, None)
 
 
-def build_component(component_name, **kwargs):
+def build_component(component_name, original_config, checkpoint, **kwargs):
+    if component_name in kwargs:
+        return kwargs.pop(component_name, None)
+
+    component_class = getattr(importlib.import_module("diffusers"), component_name)
+
+
+
     return
 
 
@@ -310,14 +318,21 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             raise ValueError(
                 f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
             )
-        pretrained_model_link_or_path = fetch_model_checkpoint(ckpt_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, revision=revision)
+        pretrained_model_link_or_path = download_model_checkpoint(ckpt_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, revision=revision)
         checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
+        global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
 
+        # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+        # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+        while "state_dict" in checkpoint:
+            checkpoint = checkpoint["state_dict"]
+
+        original_config = fetch_original_config(checkpoint, config_files)
         component_names = extract_pipeline_component_names(cls)
 
         pipeline_components = {}
         for component in component_names:
-            pipeline_components[component] = build_component(component, checkpoint, **kwargs)
+            pipeline_components[component] = build_component(component, checkpoint, original_config, **kwargs)
 
         pipe = download_from_original_stable_diffusion_ckpt(
             pretrained_model_link_or_path,
@@ -344,6 +359,8 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             local_files_only=local_files_only,
         )
 
+        pipe = cls(**pipeline_components, **kwargs)
+
         if torch_dtype is not None:
             pipe.to(dtype=torch_dtype)
 

From 0cd1be42d3b40d6804ded59437220d9079580f45 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 26 Dec 2023 05:09:03 +0000
Subject: [PATCH 06/89] Dispatch build_component by name; add controlnet conversion

---
 src/diffusers/loaders/single_file.py       | 46 ++++++++++++--
 src/diffusers/loaders/single_file_utils.py | 72 +++++++++++++++++++++-
 2 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 9f5c3d48d109..809040b62eab 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -55,6 +55,15 @@
     "StableUnCLIPPipeline": "FrozenOpenCLIPEmbedder",
     "StableUnCLIPImg2ImgPipeline": "FrozenOpenCLIPEmbedder",
 }
+PIPELINE_COMPONENTS = {
+    "unet": ,
+    "vae": "AutoencoderKL",
+    "text_encoder": "CLIPTextModel",
+    "text_encoder_2": "CLIPTextModel",
+    "tokenizer": "CLIPTokenizer",
+    "tokenizer_2": "CLIPTokenizer",
+    "scheduler": "DiffusionScheduler",
+}
 
 
 def extract_pipeline_component_names(pipeline_class):
@@ -120,12 +129,41 @@ def infer_model_type(pipeline_class_name):
     return MODEL_TYPE_FROM_PIPELINE_CLASS.get(pipeline_class_name, None)
 
 
-def build_component(component_name, original_config, checkpoint, **kwargs):
+def build_component(pipeline_class_name, component_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     if component_name in kwargs:
         return kwargs.pop(component_name, None)
 
-    component_class = getattr(importlib.import_module("diffusers"), component_name)
+    if component_name == "unet":
+        unet = create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
+        return unet
+
+    if component_name == "controlnet":
+        controlnet = create_controlnet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
+        return controlnet
+
+    if component_name == "vae":
+        vae = create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
+        return vae
+
+    if component_name in ["text_encoder", "text_encoder_2"]:
+        text_encoder = create_text_encoder_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
+        return text_encoder
+
+    if component_name in ["tokenizer", "tokenizer_2"]:
+        tokenizer = create_tokenizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
+        return tokenizer
+
+    if component_name == "scheduler":
+        scheduler = create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
+        return scheduler
+
+    if component_name == "image_normalizer":
+        image_normalizer = create_image_normalizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
+        return image_normalizer
 
+    if component_name == "image_normalizer":
+        image_normalizer = create_image_normalizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
+        return image_normalizer
 
 
     return
@@ -332,7 +370,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         pipeline_components = {}
         for component in component_names:
-            pipeline_components[component] = build_component(component, checkpoint, original_config, **kwargs)
+            pipeline_components[component] = build_component(pipeline_class_name, component, checkpoint, original_config, **kwargs)
 
         pipe = download_from_original_stable_diffusion_ckpt(
             pretrained_model_link_or_path,
@@ -359,7 +397,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             local_files_only=local_files_only,
         )
 
-        pipe = cls(**pipeline_components, **kwargs)
+        pipe = cls(**pipeline_components)
 
         if torch_dtype is not None:
             pipe.to(dtype=torch_dtype)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 0effa4d826bd..76949f42af56 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -156,7 +156,7 @@ def set_model_type(original_config, model_type=None):
     else:
         raise ValueError("Unable to infer model type from config")
 
-    logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
+    logger.debug(f"No `model_type` given, `model_type` inferred as: {model_type}")
 
     return model_type
 
@@ -897,10 +897,78 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder
     return text_model
 
 
-def create_unet_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs):
+def convert_controlnet_checkpoint(
+    checkpoint,
+    original_config,
+    checkpoint_path,
+    image_size,
+    upcast_attention,
+    extract_ema,
+    use_linear_projection=None,
+    cross_attention_dim=None,
+):
+    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
+    ctrlnet_config["upcast_attention"] = upcast_attention
+
+    ctrlnet_config.pop("sample_size")
+
+    if use_linear_projection is not None:
+        ctrlnet_config["use_linear_projection"] = use_linear_projection
+
+    if cross_attention_dim is not None:
+        ctrlnet_config["cross_attention_dim"] = cross_attention_dim
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        controlnet = ControlNetModel(**ctrlnet_config)
+
+    # Some controlnet ckpt files are distributed independently from the rest of the
+    # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
+    if "time_embed.0.weight" in checkpoint:
+        skip_extract_state_dict = True
+    else:
+        skip_extract_state_dict = False
+
+    converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint,
+        ctrlnet_config,
+        path=checkpoint_path,
+        extract_ema=extract_ema,
+        controlnet=True,
+        skip_extract_state_dict=skip_extract_state_dict,
+    )
+
+    if is_accelerate_available():
+        for param_name, param in converted_ctrl_checkpoint.items():
+            set_module_tensor_to_device(controlnet, param_name, "cpu", value=param)
+    else:
+        controlnet.load_state_dict(converted_ctrl_checkpoint)
+
+    return controlnet
+
+
+def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
+    if "num_in_channels" in kwargs:
+        num_in_channels = kwargs.pop("num_in_channels")
+    elif pipeline_class_name in [
+        "StableDiffusionInpaintPipeline",
+        "StableDiffusionXLInpaintPipeline",
+        "StableDiffusionXLControlNetInpaintPipeline"]:
+        num_in_channels = 9
+    elif pipeline_class_name == "StableDiffusionUpscalePipeline":
+        num_in_channels = 7
+    else:
+        num_in_channels = 4
+
+    if "upcast_attention" in kwargs:
+        upcast_attention = kwargs.pop("upcast_attention")
+
     extract_ema = kwargs.get("extract_ema", False)
 
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet_config["num_in_channels"] = num_in_channels
+    unet_config["upcast_attention"] = upcast_attention
+
     path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
     diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(
         checkpoint, unet_config, path=path, extract_ema=extract_ema

From 16a80d3793f3aa0665e18e7a452426b523caddc8 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 26 Dec 2023 15:51:11 +0000
Subject: [PATCH 07/89] Add OpenCLIP key maps and convert_open_clip_checkpoint

---
 src/diffusers/loaders/single_file.py       |   2 +-
 src/diffusers/loaders/single_file_utils.py | 106 +++++++++++++++++++++
 2 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 809040b62eab..0a8f69433ad4 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -370,7 +370,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         pipeline_components = {}
         for component in component_names:
-            pipeline_components[component] = build_component(pipeline_class_name, component, checkpoint, original_config, **kwargs)
+            pipeline_components[component] = build_component(pipeline_name, component, checkpoint, original_config, **kwargs)
 
         pipe = download_from_original_stable_diffusion_ckpt(
             pretrained_model_link_or_path,
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 76949f42af56..db78981e24f6 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """ Conversion script for the Stable Diffusion checkpoints."""
 
+import re
 from contextlib import nullcontext
 from io import BytesIO
 from typing import Dict, Optional, Union
@@ -70,6 +71,30 @@
     "xl_refiner": "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias",
 }
 
+textenc_conversion_lst = [
+    ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
+    ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("ln_final.weight", "text_model.final_layer_norm.weight"),
+    ("ln_final.bias", "text_model.final_layer_norm.bias"),
+    ("text_projection", "text_projection.weight"),
+]
+textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
+
+textenc_transformer_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
+textenc_pattern = re.compile("|".join(protected.keys()))
+
 
 def fetch_original_config_file_from_url(checkpoint):
     if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
@@ -946,6 +971,87 @@ def convert_controlnet_checkpoint(
 
     return controlnet
 
+def convert_open_clip_checkpoint(
+    checkpoint,
+    config_name,
+    prefix="cond_stage_model.model.",
+    has_projection=False,
+    local_files_only=False,
+    **config_kwargs,
+):
+    """
+    Convert the OpenCLIP text-encoder weights stored under `prefix` in `checkpoint` into a `CLIPTextModel`
+    (or a `CLIPTextModelWithProjection` when `has_projection` is True) built from the `config_name` config.
+    """
+    try:
+        config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only)
+    except Exception as err:
+        raise ValueError(
+            f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'."
+        ) from err
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
+
+    keys = list(checkpoint.keys())
+
+    keys_to_ignore = []
+    if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
+        # make sure to remove all keys > 22
+        keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
+        keys_to_ignore += ["cond_stage_model.model.text_projection"]
+
+    text_model_dict = {}
+    # d_model: number of rows per projection, used below to split the fused in_proj tensors into q/k/v
+    if prefix + "text_projection" in checkpoint:
+        d_model = int(checkpoint[prefix + "text_projection"].shape[0])
+    else:
+        d_model = 1024
+
+    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
+
+    for key in keys:
+        if key in keys_to_ignore:
+            continue
+        if key[len(prefix) :] in textenc_conversion_map:
+            if key.endswith("text_projection"):
+                value = checkpoint[key].T.contiguous()
+            else:
+                value = checkpoint[key]
+
+            text_model_dict[textenc_conversion_map[key[len(prefix) :]]] = value
+
+        if key.startswith(prefix + "transformer."):
+            new_key = key[len(prefix + "transformer.") :]
+            if new_key.endswith(".in_proj_weight"):
+                new_key = new_key[: -len(".in_proj_weight")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+                text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+                text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
+            elif new_key.endswith(".in_proj_bias"):
+                new_key = new_key[: -len(".in_proj_bias")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+                text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+                text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
+            else:
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+
+                text_model_dict[new_key] = checkpoint[key]
+
+    if is_accelerate_available():
+        for param_name, param in text_model_dict.items():
+            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    else:
+        if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings, "position_ids")):
+            text_model_dict.pop("text_model.embeddings.position_ids", None)
+
+        text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
 
 def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
     if "num_in_channels" in kwargs:

From 7289be17375577f4dca00a708c65c0739b44f932 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 28 Dec 2023 08:52:07 +0000
Subject: [PATCH 08/89] update

---
 src/diffusers/loaders/single_file.py       | 244 +++++-----
 src/diffusers/loaders/single_file_utils.py | 496 +++++++++++++++++++--
 2 files changed, 591 insertions(+), 149 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 0a8f69433ad4..1e670c954f1c 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -31,7 +31,14 @@
     logging,
 )
 from ..utils.import_utils import BACKENDS_MAPPING
-from .single_file_utils import download_from_original_stable_diffusion_ckpt, fetch_original_config
+from .single_file_utils import (
+    create_scheduler_components,
+    create_stable_unclip_components,
+    create_unet_model,
+    create_vae_model,
+    download_from_original_stable_diffusion_ckpt,
+    fetch_original_config,
+)
 
 
 if is_transformers_available():
@@ -43,26 +50,13 @@
 logger = logging.get_logger(__name__)
 
 
-DIFFUSER_PIPELINE_CONFIGS = {
-    "StableDiffusionPipeline": None,
-    "StableDiffusionImg2ImgPipeline": None,
-    "StableDiffusionInpaintPipeline": None,
-    "StableDiffusionControlNetPipeline": None,
-}
-
 VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
-MODEL_TYPE_FROM_PIPELINE_CLASS = {
+TEXT_ENCODER_FROM_PIPELINE_CLASS = {
     "StableUnCLIPPipeline": "FrozenOpenCLIPEmbedder",
     "StableUnCLIPImg2ImgPipeline": "FrozenOpenCLIPEmbedder",
-}
-PIPELINE_COMPONENTS = {
-    "unet": ,
-    "vae": "AutoencoderKL",
-    "text_encoder": "CLIPTextModel",
-    "text_encoder_2": "CLIPTextModel",
-    "tokenizer": "CLIPTokenizer",
-    "tokenizer_2": "CLIPTokenizer",
-    "scheduler": "DiffusionScheduler",
+    "LDMTextToImagePipeline": "LDMTextToImage",
+    "PaintByExamplePipeline": "PaintByExample",
+    "StableDiffusion": "stable-diffusion",
 }
 
 
@@ -82,7 +76,16 @@ def check_valid_url(pretrained_model_link_or_path):
     return has_valid_url_prefix
 
 
-def download_model_checkpoint(ckpt_path, cache_dir=None, resume_download=False, force_download=False, proxies=None, local_files_only=None, token=None, revision=None):
+def download_model_checkpoint(
+    ckpt_path,
+    cache_dir=None,
+    resume_download=False,
+    force_download=False,
+    proxies=None,
+    local_files_only=None,
+    token=None,
+    revision=None,
+):
     # get repo_id and (potentially nested) file path of ckpt in repo
     repo_id = "/".join(ckpt_path.parts[:2])
     file_path = "/".join(ckpt_path.parts[2:])
@@ -125,48 +128,94 @@ def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True)
     return checkpoint
 
 
-def infer_model_type(pipeline_class_name):
-    return MODEL_TYPE_FROM_PIPELINE_CLASS.get(pipeline_class_name, None)
-
-
-def build_component(pipeline_class_name, component_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+def build_component(
+    pipeline_components,
+    pipeline_class_name,
+    component_name,
+    original_config,
+    checkpoint,
+    checkpoint_path_or_dict,
+    **kwargs,
+):
     if component_name in kwargs:
         return kwargs.pop(component_name, None)
 
-    if component_name == "unet":
-        unet = create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
-        return unet
+    if component_name in pipeline_components:
+        return {}
 
-    if component_name == "controlnet":
-        controlnet = create_controlnet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
-        return controlnet
+    if component_name == "unet":
+        unet_components = create_unet_model(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return unet_components
 
     if component_name == "vae":
-        vae = create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
-        return vae
+        vae_components = create_vae_model(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return vae_components
 
-    if component_name in ["text_encoder", "text_encoder_2"]:
-        text_encoder = create_text_encoder_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
-        return text_encoder
+    if component_name == "controlnet":
+        controlnet_components = create_controlnet_model(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return controlnet_components
 
-    if component_name in ["tokenizer", "tokenizer_2"]:
-        tokenizer = create_tokenizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
-        return tokenizer
+    if component_name == "adapter":
+        adapter_components = create_adapter_model(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return adapter_components
 
     if component_name == "scheduler":
-        scheduler = create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
-        return scheduler
+        scheduler_components = create_scheduler(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return scheduler_components
 
-    if component_name == "image_normalizer":
-        image_normalizer = create_image_normalizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
-        return image_normalizer
+    if component_name in ["text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2"]:
+        text_encoder_components = create_text_encoders_and_tokenizers(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return text_encoder_components
 
-    if component_name == "image_normalizer":
-        image_normalizer = create_image_normalizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs)
-        return image_normalizer
+    return
 
 
-    return
+def build_additional_components(
+    pipeline_components,
+    pipeline_class_name,
+    component_name,
+    original_config,
+    checkpoint,
+    checkpoint_path_or_dict,
+    **kwargs,
+):
+    if component_name in kwargs:
+        return kwargs.pop(component_name, None)
+    # A component that is already in pipeline_components contributes nothing new to the caller's dict.
+    if component_name in pipeline_components:
+        return {}
+
+    local_files_only = kwargs.pop("local_files_only", False)
+    # Membership test: comparing the class name to the list itself with == would never match.
+    if pipeline_class_name in ["StableUnCLIPPipeline", "StableUnCLIPImg2ImgPipeline"]:
+        stable_unclip_components = create_stable_unclip_components(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return stable_unclip_components
+
+    if pipeline_class_name == "LDMTextToImagePipeline":
+        ldm_text_to_image_components = create_ldm_text_to_image_components(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return ldm_text_to_image_components
+
+    if pipeline_class_name == "PaintByExamplePipeline":
+        paint_by_example_components = create_paint_by_example_components(
+            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        )
+        return paint_by_example_components
 
 
 class FromSingleFileMixin:
@@ -281,30 +330,22 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         """
         original_config_file = kwargs.pop("original_config_file", None)
         config_files = kwargs.pop("config_files", None)
-        cache_dir = kwargs.pop("cache_dir", None)
         resume_download = kwargs.pop("resume_download", False)
         proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", None)
         token = kwargs.pop("token", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        local_files_only = kwargs.pop("local_files_only", None)
         revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        load_safety_checker = kwargs.pop("load_safety_checker", True)
+
         extract_ema = kwargs.pop("extract_ema", False)
         image_size = kwargs.pop("image_size", None)
         scheduler_type = kwargs.pop("scheduler_type", "pndm")
         num_in_channels = kwargs.pop("num_in_channels", None)
         upcast_attention = kwargs.pop("upcast_attention", None)
-        load_safety_checker = kwargs.pop("load_safety_checker", True)
         prediction_type = kwargs.pop("prediction_type", None)
-        text_encoder = kwargs.pop("text_encoder", None)
-        text_encoder_2 = kwargs.pop("text_encoder_2", None)
-        vae = kwargs.pop("vae", None)
-        controlnet = kwargs.pop("controlnet", None)
-        adapter = kwargs.pop("adapter", None)
-        tokenizer = kwargs.pop("tokenizer", None)
-        tokenizer_2 = kwargs.pop("tokenizer_2", None)
-
-        torch_dtype = kwargs.pop("torch_dtype", None)
-
-        use_safetensors = kwargs.pop("use_safetensors", None)
 
         pipeline_name = cls.__name__
         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
@@ -313,42 +354,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         if from_safetensors and use_safetensors is False:
             raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
 
-        # TODO: For now we only support stable diffusion
-        stable_unclip = None
-        model_type = None
-
-        if pipeline_name in [
-            "StableDiffusionControlNetPipeline",
-            "StableDiffusionControlNetImg2ImgPipeline",
-            "StableDiffusionControlNetInpaintPipeline",
-        ]:
-            from ..models.controlnet import ControlNetModel
-            from ..pipelines.controlnet.multicontrolnet import MultiControlNetModel
-
-            #  list/tuple or a single instance of ControlNetModel or MultiControlNetModel
-            if not (
-                isinstance(controlnet, (ControlNetModel, MultiControlNetModel))
-                or isinstance(controlnet, (list, tuple))
-                and isinstance(controlnet[0], ControlNetModel)
-            ):
-                raise ValueError("ControlNet needs to be passed if loading from ControlNet pipeline.")
-        elif "StableDiffusion" in pipeline_name:
-            # Model type will be inferred from the checkpoint.
-            pass
-        elif pipeline_name == "StableUnCLIPPipeline":
-            model_type = "FrozenOpenCLIPEmbedder"
-            stable_unclip = "txt2img"
-        elif pipeline_name == "StableUnCLIPImg2ImgPipeline":
-            model_type = "FrozenOpenCLIPEmbedder"
-            stable_unclip = "img2img"
-        elif pipeline_name == "PaintByExamplePipeline":
-            model_type = "PaintByExample"
-        elif pipeline_name == "LDMTextToImagePipeline":
-            model_type = "LDMTextToImage"
-        else:
-            raise ValueError(f"Unhandled pipeline class: {pipeline_name}")
-
-        has_valid_url_prefix =  check_valid_url(pretrained_model_link_or_path)
+        has_valid_url_prefix = check_valid_url(pretrained_model_link_or_path)
 
         # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
         ckpt_path = Path(pretrained_model_link_or_path)
@@ -356,9 +362,16 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             raise ValueError(
                 f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
             )
-        pretrained_model_link_or_path = download_model_checkpoint(ckpt_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, revision=revision)
+        pretrained_model_link_or_path = download_model_checkpoint(
+            ckpt_path,
+            cache_dir=cache_dir,
+            resume_download=resume_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+        )
         checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
-        global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
 
         # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
         # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
@@ -370,32 +383,15 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         pipeline_components = {}
         for component in component_names:
-            pipeline_components[component] = build_component(pipeline_name, component, checkpoint, original_config, **kwargs)
+            components = build_component(
+                pipeline_components, pipeline_name, component, checkpoint, original_config, **kwargs
+            )
+            pipeline_components.update(components)
 
-        pipe = download_from_original_stable_diffusion_ckpt(
-            pretrained_model_link_or_path,
-            pipeline_class=cls,
-            model_type=model_type,
-            stable_unclip=stable_unclip,
-            controlnet=controlnet,
-            adapter=adapter,
-            from_safetensors=from_safetensors,
-            extract_ema=extract_ema,
-            image_size=image_size,
-            scheduler_type=scheduler_type,
-            num_in_channels=num_in_channels,
-            upcast_attention=upcast_attention,
-            load_safety_checker=load_safety_checker,
-            prediction_type=prediction_type,
-            text_encoder=text_encoder,
-            text_encoder_2=text_encoder_2,
-            vae=vae,
-            tokenizer=tokenizer,
-            tokenizer_2=tokenizer_2,
-            original_config_file=original_config_file,
-            config_files=config_files,
-            local_files_only=local_files_only,
-        )
+        additional_components = set(pipeline_components.keys() - component_names)
+        if additional_components:
+            components = build_additional_components(pipeline_name, component, checkpoint, original_config, **kwargs)
+            pipeline_components.update(components)
 
         pipe = cls(**pipeline_components)
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index db78981e24f6..cd1153e90f5c 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -26,16 +26,21 @@
 from transformers import (
     AutoFeatureExtractor,
     BertTokenizerFast,
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    CLIPVisionModel,
+    CLIPVisionModelWithProjection,
+    CLIPVisionTextModel,
+    CLIPVisionTextModelWithProjection,
 )
 
-from ...models import (
-    AutoencoderKL,
-    PriorTransformer,
-    UNet2DConditionModel,
-)
-from ...schedulers import (
+from ..models import AutoencoderKL, ControlNetModel, PriorTransformer, UNet2DConditionModel
+from ..pipelines.pipeline_utils import DiffusionPipeline
+from ..pipelines.stable_unclip.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+from ..schedulers import (
     DDIMScheduler,
     DDPMScheduler,
     DPMSolverMultistepScheduler,
@@ -46,9 +51,8 @@
     PNDMScheduler,
     UnCLIPScheduler,
 )
-from ...utils import is_accelerate_available, is_omegaconf_available, logging
-from ...utils.import_utils import BACKENDS_MAPPING
-from ..pipeline_utils import DiffusionPipeline
+from ..utils import is_accelerate_available, is_omegaconf_available, logging
+from ..utils.import_utils import BACKENDS_MAPPING
 from .safety_checker import StableDiffusionSafetyChecker
 
 
@@ -62,7 +66,7 @@
     "v1": "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml",
     "v2": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml",
     "xl": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml",
-    "upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml"
+    "upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml",
 }
 
 CHECKPOINT_KEY_NAMES = {
@@ -71,6 +75,20 @@
     "xl_refiner": "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias",
 }
 
+SCHEDULER_DEFAULT_CONFIG = {
+    "beta_schedule": "scaled_linear",
+    "beta_start": 0.00085,
+    "beta_end": 0.012,
+    "interpolation_type": "linear",
+    "num_train_timesteps": 1000,
+    "prediction_type": "epsilon",
+    "sample_max_value": 1.0,
+    "set_alpha_to_one": False,
+    "skip_prk_steps": True,
+    "steps_offset": 1,
+    "timestep_spacing": "leading",
+}
+
 textenc_conversion_lst = [
     ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
     ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
@@ -109,7 +127,7 @@ def fetch_original_config_file_from_url(checkpoint):
     else:
         config_url = CONFIG_URLS["v1"]
 
-    #TODO: Add upscale config
+    # TODO: Add upscale config
 
     original_config_file = BytesIO(requests.get(config_url).content)
 
@@ -129,7 +147,7 @@ def fetch_original_config_file_from_file(checkpoint, config_files: list):
     if "xl_refiner" in config_files:
         return config_files["xl_refiner"]
 
-    #TODO: Add upscale config
+    # TODO: Add upscale config
 
     return
 
@@ -162,12 +180,21 @@ def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True)
     return checkpoint
 
 
-def set_model_type(original_config, model_type=None):
+def infer_model_type(pipeline_class_name, original_config, model_type=None):
     if model_type is not None:
         return model_type
 
-    has_cond_stage_config = "cond_stage_config" in original_config.model.params and original_config.model.params.cond_stage_config is not None
-    has_network_config = "network_config" in original_config.model.params and original_config.model.params.network_config is not None
+    if pipeline_class_name in ["StableUnCLIPPipeline", "StableUnCLIPImg2ImgPipeline"]:
+        model_type = "FrozenOpenCLIPEmbedder"
+        return model_type
+
+    has_cond_stage_config = (
+        "cond_stage_config" in original_config.model.params
+        and original_config.model.params.cond_stage_config is not None
+    )
+    has_network_config = (
+        "network_config" in original_config.model.params and original_config.model.params.network_config is not None
+    )
 
     if has_cond_stage_config:
         model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
@@ -185,6 +212,11 @@ def set_model_type(original_config, model_type=None):
 
     return model_type
 
+
+def get_default_scheduler_config():
+    return dict(SCHEDULER_DEFAULT_CONFIG)  # copy, so callers can mutate it without changing the defaults
+
+
 def shave_segments(path, n_shave_prefix_segments=1):
     """
     Removes segments. Positive values shave the first segments, negative shave the last segments.
@@ -350,6 +382,7 @@ def conv_attn_to_linear(checkpoint):
             if checkpoint[key].ndim > 2:
                 checkpoint[key] = checkpoint[key][:, :, 0]
 
+
 def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
     """
     Creates a config for the diffusers based on the config of the LDM model.
@@ -971,6 +1004,7 @@ def convert_controlnet_checkpoint(
 
     return controlnet
 
+
 def convert_open_clip_checkpoint(
     checkpoint,
     config_name,
@@ -1053,22 +1087,173 @@ def convert_open_clip_checkpoint(
     return text_model
 
 
+def stable_unclip_image_encoder(original_config, local_files_only=False):
+    """
+    Returns the image processor and clip image encoder for the img2img unclip pipeline.
+
+    We currently know of two types of stable unclip models which separately use the clip and the openclip image
+    encoders.
+    """
+
+    image_embedder_config = original_config.model.params.embedder_config
+
+    sd_clip_image_embedder_class = image_embedder_config.target
+    sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1]
+
+    if sd_clip_image_embedder_class == "ClipImageEmbedder":
+        clip_model_name = image_embedder_config.params.model
+
+        if clip_model_name == "ViT-L/14":
+            feature_extractor = CLIPImageProcessor()
+            image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                "openai/clip-vit-large-patch14", local_files_only=local_files_only
+            )
+        else:
+            raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
+
+    elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
+        feature_extractor = CLIPImageProcessor()
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only
+        )
+    else:
+        raise NotImplementedError(
+            f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
+        )
+
+    return feature_extractor, image_encoder
+
+
+def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False):
+    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+    model = PaintByExampleImageEncoder(config)
+    # NOTE(review): CLIPVisionConfig / PaintByExampleImageEncoder are not in this patch's import hunk - confirm
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    for key in keys:
+        if key.startswith("cond_stage_model.transformer"):
+            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
+
+    # load clip vision: the "cond_stage_model.transformer." entries with that prefix stripped
+    model.model.load_state_dict(text_model_dict)
+
+    # load mapper: collect the "cond_stage_model.mapper" entries, dropping the leading "cond_stage_model.mapper.res"
+    keys_mapper = {
+        k[len("cond_stage_model.mapper.res") :]: v
+        for k, v in checkpoint.items()
+        if k.startswith("cond_stage_model.mapper")
+    }
+    # source sub-module name -> target name(s); a tensor with several targets is split evenly along dim 0
+    MAPPING = {
+        "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
+        "attn.c_proj": ["attn1.to_out.0"],
+        "ln_1": ["norm1"],
+        "ln_2": ["norm3"],
+        "mlp.c_fc": ["ff.net.0.proj"],
+        "mlp.c_proj": ["ff.net.2"],
+    }
+
+    mapped_weights = {}
+    for key, value in keys_mapper.items():
+        prefix = key[: len("blocks.i")]
+        suffix = key.split(prefix)[-1].split(".")[-1]
+        name = key.split(prefix)[-1].split(suffix)[0][1:-1]
+        mapped_names = MAPPING[name]
+
+        num_splits = len(mapped_names)
+        for i, mapped_name in enumerate(mapped_names):
+            new_name = ".".join([prefix, mapped_name, suffix])
+            shape = value.shape[0] // num_splits
+            mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
+
+    model.mapper.load_state_dict(mapped_weights)
+
+    # load final layer norm
+    model.final_layer_norm.load_state_dict(
+        {
+            "bias": checkpoint["cond_stage_model.final_ln.bias"],
+            "weight": checkpoint["cond_stage_model.final_ln.weight"],
+        }
+    )
+
+    # load final proj
+    model.proj_out.load_state_dict(
+        {
+            "bias": checkpoint["proj_out.bias"],
+            "weight": checkpoint["proj_out.weight"],
+        }
+    )
+
+    # load uncond vector from the checkpoint's "learnable_vector" entry
+    model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
+    return model
+
+
+def stable_unclip_image_noising_components(
+    original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None
+):
+    """
+    Returns the noising components for the img2img and txt2img unclip pipelines.
+
+    Converts the stability noise augmentor into
+    1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats
+    2. a `DDPMScheduler` for holding the noise schedule
+
+    If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.
+    """
+    noise_aug_config = original_config.model.params.noise_aug_config
+    noise_aug_class = noise_aug_config.target
+    noise_aug_class = noise_aug_class.split(".")[-1]
+
+    if noise_aug_class == "CLIPEmbeddingNoiseAugmentation":
+        noise_aug_config = noise_aug_config.params
+        embedding_dim = noise_aug_config.timestep_dim
+        max_noise_level = noise_aug_config.noise_schedule_config.timesteps
+        beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule
+
+        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)
+        image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
+
+        if "clip_stats_path" in noise_aug_config:
+            if clip_stats_path is None:
+                raise ValueError("This stable unclip config requires a `clip_stats_path`")
+
+            clip_mean, clip_std = torch.load(clip_stats_path, map_location=device)
+            clip_mean = clip_mean[None, :]
+            clip_std = clip_std[None, :]
+
+            clip_stats_state_dict = {
+                "mean": clip_mean,
+                "std": clip_std,
+            }
+
+            image_normalizer.load_state_dict(clip_stats_state_dict)
+    else:
+        raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}")
+
+    return image_normalizer, image_noising_scheduler
+
+
 def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
     if "num_in_channels" in kwargs:
-        num_in_channels = kwargs.pop("num_in_channels")
+        num_in_channels = kwargs.get("num_in_channels")
+
     elif pipeline_class_name in [
         "StableDiffusionInpaintPipeline",
         "StableDiffusionXLInpaintPipeline",
-        "StableDiffusionXLControlNetInpaintPipeline"]:
+        "StableDiffusionXLControlNetInpaintPipeline",
+    ]:
         num_in_channels = 9
+
     elif pipeline_class_name == "StableDiffusionUpscalePipeline":
         num_in_channels = 7
+
     else:
         num_in_channels = 4
 
-    if "upcast_attention" in kwargs:
-        upcast_attention = kwargs.pop("upcast_attention")
-
+    upcast_attention = kwargs.get("upcast_attention", False)
     extract_ema = kwargs.get("extract_ema", False)
 
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
@@ -1092,9 +1277,8 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     return unet
 
 
-def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs):
+def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
     diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
     with ctx():
@@ -1109,6 +1293,269 @@ def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, model
     return vae
 
 
+def create_text_encoder_components(
+    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+):
+    model_type = infer_model_type(pipeline_class_name, original_config)
+    local_files_only = kwargs.get("local_files_only", False)
+
+    if model_type == "FrozenOpenCLIPEmbedder":
+        config_name = "stabilityai/stable-diffusion-2"
+        config_kwargs = {"subfolder": "text_encoder"}
+
+        try:
+            text_encoder = convert_open_clip_checkpoint(
+                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
+            )
+            tokenizer = CLIPTokenizer.from_pretrained(
+                config_name, subfolder="tokenizer", local_files_only=local_files_only
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder in the following path: '{config_name}'."
+            )
+        else:
+            return {"text_encoder": text_encoder, "tokenizer": tokenizer}
+
+    elif model_type == "FrozenCLIPEmbedder":
+        try:
+            config_name = "openai/clip-vit-large-patch14"
+            text_encoder = convert_ldm_clip_checkpoint(
+                checkpoint, local_files_only=local_files_only, text_encoder=None
+            )
+            tokenizer = CLIPTokenizer.from_pretrained(  # loaded from the repo root, same as the SDXL branch below
+                config_name, local_files_only=local_files_only
+            )
+
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'."
+            )
+        else:
+            return {"text_encoder": text_encoder, "tokenizer": tokenizer}
+
+    elif model_type == "SDXL-Refiner":
+        config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+        config_kwargs = {"projection_dim": 1280}
+        prefix = "conditioner.embedders.0.model."
+
+        try:
+            tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only)
+            text_encoder_2 = convert_open_clip_checkpoint(
+                checkpoint,
+                config_name,
+                prefix=prefix,
+                has_projection=True,
+                local_files_only=local_files_only,
+                **config_kwargs,
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'."
+            )
+
+        else:
+            return {
+                "tokenizer_2": tokenizer_2,
+                "text_encoder_2": text_encoder_2,
+            }
+
+    elif model_type == "SDXL":
+        try:
+            config_name = "openai/clip-vit-large-patch14"
+            tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
+            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+            )
+
+        try:
+            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            prefix = "conditioner.embedders.1.model."
+            tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only)
+            text_encoder_2 = convert_open_clip_checkpoint(
+                checkpoint,
+                config_name,
+                prefix=prefix,
+                has_projection=True,
+                local_files_only=local_files_only,
+                projection_dim=1280,  # passed explicitly: config_kwargs is not defined in this branch (was a swallowed NameError)
+            )
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'."
+            )
+
+        return {
+            "tokenizer": tokenizer,
+            "text_encoder": text_encoder,
+            "tokenizer_2": tokenizer_2,
+            "text_encoder_2": text_encoder_2,
+        }
+
+    return
+
+
+def create_scheduler_component(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+    scheduler_config = get_default_scheduler_config()
+    model_type = infer_model_type(pipeline_class_name, original_config)
+
+    scheduler_type = kwargs.get("scheduler_type", "ddim")
+    prediction_type = kwargs.get("prediction_type", None)
+    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
+
+    num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
+    scheduler_config["num_train_timesteps"] = num_train_timesteps
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
+            # as it relies on a brittle global step parameter here
+            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
+
+    else:
+        prediction_type = prediction_type or "epsilon"
+
+    scheduler_config["prediction_type"] = prediction_type
+
+    if model_type in ["SDXL", "SDXL-Refiner"]:
+        scheduler_type = "euler"  # SDXL variants always use the Euler scheduler, regardless of kwargs
+
+    else:
+        beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
+        beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
+        scheduler_config["beta_start"] = beta_start
+        scheduler_config["beta_end"] = beta_end
+        scheduler_config["beta_schedule"] = "scaled_linear"
+        scheduler_config["clip_sample"] = False
+        scheduler_config["set_alpha_to_one"] = False
+
+        # scheduler_type keeps the caller's value (default "ddim"); forcing "ddim" here made the branches below unreachable
+
+    if scheduler_type == "pndm":
+        scheduler_config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)
+
+    elif scheduler_type == "ddim":
+        scheduler = DDIMScheduler.from_config(scheduler_config)
+
+    else:
+        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+    return {"scheduler": scheduler}
+
+
+def create_stable_unclip_components(
+    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+):
+    components = {}
+
+    local_files_only = kwargs.get("local_files_only", False)
+    clip_stats_path = kwargs.get("clip_stats_path", None)
+
+    image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
+        original_config,
+        clip_stats_path=clip_stats_path,
+    )
+
+    if pipeline_class_name == "StableUnCLIPPipeline":
+        stable_unclip_prior = kwargs.get("stable_unclip_prior", None)
+        if stable_unclip_prior is not None and stable_unclip_prior != "karlo":  # None falls back to the karlo prior
+            raise NotImplementedError(f"Unknown prior for Stable UnCLIP model: {stable_unclip_prior}")
+
+        try:
+            config_name = "kakaobrain/karlo-v1-alpha"
+            prior = PriorTransformer.from_pretrained(config_name, subfolder="prior", local_files_only=local_files_only)
+        except Exception as e:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the prior in the following path: '{config_name}'."
+            )
+
+        try:
+            config_name = "openai/clip-vit-large-patch14"
+            prior_tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
+            prior_text_encoder = CLIPTextModelWithProjection.from_pretrained(
+                config_name, local_files_only=local_files_only
+            )
+            prior_scheduler = DDPMScheduler.from_pretrained(  # read from the karlo repo that also provides the prior
+                "kakaobrain/karlo-v1-alpha", subfolder="prior_scheduler", local_files_only=local_files_only
+            )
+
+        except Exception:
+            raise ValueError(
+                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'."
+            )
+        else:
+            return {
+                "prior": prior,
+                "prior_tokenizer": prior_tokenizer,
+                "prior_text_encoder": prior_text_encoder,
+                "prior_scheduler": prior_scheduler,
+                "image_normalizer": image_normalizer,
+                "image_noising_scheduler": image_noising_scheduler,  # same key as the image-to-image branch below
+            }
+
+    else:
+        feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
+
+        return {
+            "feature_extractor": feature_extractor,
+            "image_encoder": image_encoder,
+            "image_normalizer": image_normalizer,
+            "image_noising_scheduler": image_noising_scheduler,
+        }
+
+    return
+
+
+def create_paint_by_example_components(
+    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+):
+    local_files_only = kwargs.get("local_files_only", False)  # optional flag, forwarded to both from_pretrained calls
+    image_encoder = convert_paint_by_example_checkpoint(checkpoint)  # image encoder comes from the checkpoint itself
+
+    try:
+        config_name = "openai/clip-vit-large-patch14"
+        tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
+    except Exception:  # any loading failure is re-raised as a ValueError about locally saved files
+        raise ValueError(
+            f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
+        )
+
+    try:
+        config_name = "CompVis/stable-diffusion-safety-checker"
+        feature_extractor = AutoFeatureExtractor.from_pretrained(config_name, local_files_only=local_files_only)
+    except Exception:  # same policy as above: surface a ValueError naming the expected path
+        raise ValueError(
+            f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'."
+        )
+
+    return {  # the three extra components this helper contributes, keyed by component name
+        "image_encoder": image_encoder,
+        "tokenizer": tokenizer,
+        "feature_extractor": feature_extractor,
+    }
+
 
 def download_from_original_stable_diffusion_ckpt(
     checkpoint_path_or_dict: Union[str, Dict[str, torch.Tensor]],
@@ -1137,7 +1584,7 @@ def download_from_original_stable_diffusion_ckpt(
     tokenizer=None,
     tokenizer_2=None,
     config_files=None,
-    **kwargs
+    **kwargs,
 ) -> DiffusionPipeline:
     """
     Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
@@ -1238,7 +1685,7 @@ def download_from_original_stable_diffusion_ckpt(
         checkpoint = checkpoint["state_dict"]
 
     original_config = fetch_original_config(checkpoint, config_files)
-    model_type = set_model_type(original_config, model_type)
+    model_type = infer_model_type(original_config, model_type)
 
     unet = create_unet_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
     vae = create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
@@ -1696,4 +2143,3 @@ def download_from_original_stable_diffusion_ckpt(
         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
 
     return pipe
-

From 0012dd23092208a8b481f98e6a558c782c2f4034 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 28 Dec 2023 11:43:35 +0000
Subject: [PATCH 09/89] update

---
 src/diffusers/loaders/single_file.py       | 57 +++-------------------
 src/diffusers/loaders/single_file_utils.py | 37 +++++++++-----
 2 files changed, 32 insertions(+), 62 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 1e670c954f1c..f2cda38c82b2 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import importlib
 import inspect
 from contextlib import nullcontext
 from io import BytesIO
@@ -32,11 +31,15 @@
 )
 from ..utils.import_utils import BACKENDS_MAPPING
 from .single_file_utils import (
-    create_scheduler_components,
+    create_adapter_model,
+    create_controlnet_model,
+    create_ldm_text_to_image_components,
+    create_paint_by_example_components,
+    create_scheduler,
     create_stable_unclip_components,
+    create_text_encoders_and_tokenizers,
     create_unet_model,
     create_vae_model,
-    download_from_original_stable_diffusion_ckpt,
     fetch_original_config,
 )
 
@@ -197,8 +200,6 @@ def build_additional_components(
     if component_name in pipeline_components:
         return {}
 
-    local_files_only = kwargs.pop("local_files_only", False)
-
     if pipeline_class_name == ["StableUnCLIPPipeline", "StableUnCLIPImg2ImgPipeline"]:
         stable_unclip_components = create_stable_unclip_components(
             pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
@@ -270,42 +271,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                 If set to `None`, the safetensors weights are downloaded if they're available **and** if the
                 safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
                 weights. If set to `False`, safetensors weights are not loaded.
-            extract_ema (`bool`, *optional*, defaults to `False`):
-                Whether to extract the EMA weights or not. Pass `True` to extract the EMA weights which usually yield
-                higher quality images for inference. Non-EMA weights are usually better for continuing finetuning.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            prediction_type (`str`, *optional*):
-                The prediction type the model was trained on. Use `'epsilon'` for all Stable Diffusion v1 models and
-                the Stable Diffusion v2 base model. Use `'v_prediction'` for Stable Diffusion v2.
-            num_in_channels (`int`, *optional*, defaults to `None`):
-                The number of input channels. If `None`, it is automatically inferred.
-            scheduler_type (`str`, *optional*, defaults to `"pndm"`):
-                Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
-                "ddim"]`.
-            load_safety_checker (`bool`, *optional*, defaults to `True`):
-                Whether to load the safety checker or not.
-            text_encoder ([`~transformers.CLIPTextModel`], *optional*, defaults to `None`):
-                An instance of `CLIPTextModel` to use, specifically the
-                [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. If this
-                parameter is `None`, the function loads a new instance of `CLIPTextModel` by itself if needed.
-            vae (`AutoencoderKL`, *optional*, defaults to `None`):
-                Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. If
-                this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
-            tokenizer ([`~transformers.CLIPTokenizer`], *optional*, defaults to `None`):
-                An instance of `CLIPTokenizer` to use. If this parameter is `None`, the function loads a new instance
-                of `CLIPTokenizer` by itself if needed.
-            original_config_file (`str`):
-                Path to `.yaml` config file corresponding to the original architecture. If `None`, will be
-                automatically inferred by looking for a key that only exists in SD2.0 models.
-            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load and saveable variables (for example the pipeline components of the
-                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
-                method. See example below for more information.
-
         Examples:
 
         ```py
@@ -338,14 +303,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         revision = kwargs.pop("revision", None)
         torch_dtype = kwargs.pop("torch_dtype", None)
         use_safetensors = kwargs.pop("use_safetensors", None)
-        load_safety_checker = kwargs.pop("load_safety_checker", True)
-
-        extract_ema = kwargs.pop("extract_ema", False)
-        image_size = kwargs.pop("image_size", None)
-        scheduler_type = kwargs.pop("scheduler_type", "pndm")
-        num_in_channels = kwargs.pop("num_in_channels", None)
-        upcast_attention = kwargs.pop("upcast_attention", None)
-        prediction_type = kwargs.pop("prediction_type", None)
 
         pipeline_name = cls.__name__
         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
@@ -378,7 +335,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
-        original_config = fetch_original_config(checkpoint, config_files)
+        original_config = fetch_original_config(original_config_file, checkpoint, config_files)
         component_names = extract_pipeline_component_names(cls)
 
         pipeline_components = {}
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index cd1153e90f5c..38557bbb1054 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -31,15 +31,15 @@
     CLIPTextModel,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
-    CLIPVisionModel,
+    CLIPVisionConfig,
     CLIPVisionModelWithProjection,
-    CLIPVisionTextModel,
-    CLIPVisionTextModelWithProjection,
 )
 
 from ..models import AutoencoderKL, ControlNetModel, PriorTransformer, UNet2DConditionModel
+from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
+from ..pipelines.paint_by_example import PaintByExampleImageEncoder
 from ..pipelines.pipeline_utils import DiffusionPipeline
-from ..pipelines.stable_unclip.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+from ..pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 from ..schedulers import (
     DDIMScheduler,
     DDPMScheduler,
@@ -1236,6 +1236,16 @@ def stable_unclip_image_noising_components(
     return image_normalizer, image_noising_scheduler
 
 
+def create_ldm_bert_config(original_config):
+    bert_params = original_config.model.params.cond_stage_config.params  # cond-stage (text encoder) parameters
+    config = LDMBertConfig(
+        d_model=bert_params.n_embed,
+        encoder_layers=bert_params.n_layer,
+        encoder_ffn_dim=bert_params.n_embed * 4,  # feed-forward width is set to four times the embedding width
+    )
+    return config
+
+
 def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
     if "num_in_channels" in kwargs:
         num_in_channels = kwargs.get("num_in_channels")
@@ -1278,7 +1288,7 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
 
 
 def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
-    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+    vae_config = create_vae_diffusers_config(original_config)
     diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
     with ctx():
@@ -1293,9 +1303,7 @@ def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, **kwa
     return vae
 
 
-def create_text_encoder_components(
-    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-):
+def create_text_encoder_tokenizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     model_type = infer_model_type(pipeline_class_name, original_config)
     local_files_only = kwargs.get("local_files_only", False)
 
@@ -1395,10 +1403,17 @@ def create_text_encoder_components(
             "text_encoder_2": text_encoder_2,
         }
 
+    elif model_type == "LDMText2Image":
+        text_config = create_ldm_bert_config(original_config)
+        text_encoder = convert_ldm_bert_checkpoint(checkpoint, text_config)
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
+
+        return {"text_encoder": text_encoder, "tokenizer": tokenizer}
+
     return
 
 
-def create_scheduler_component(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     scheduler_config = get_default_scheduler_config()
     model_type = infer_model_type(pipeline_class_name, original_config)
 
@@ -1468,8 +1483,6 @@ def create_scheduler_component(pipeline_class_name, original_config, checkpoint,
 def create_stable_unclip_components(
     pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
 ):
-    components = {}
-
     local_files_only = kwargs.get("local_files_only", False)
     clip_stats_path = kwargs.get("clip_stats_path", None)
 
@@ -1486,7 +1499,7 @@ def create_stable_unclip_components(
         try:
             config_name = "kakaobrain/karlo-v1-alpha"
             prior = PriorTransformer.from_pretrained(config_name, subfolder="prior", local_files_only=local_files_only)
-        except Exception as e:
+        except Exception:
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the prior in the following path: '{config_name}'."
             )

From 2616e03062bd269ed247ef99763e2dabf256d50e Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 28 Dec 2023 12:36:05 +0000
Subject: [PATCH 10/89] update

---
 src/diffusers/loaders/single_file.py       |  9 ++++++++
 src/diffusers/loaders/single_file_utils.py | 25 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index f2cda38c82b2..93a6e1544978 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -41,6 +41,7 @@
     create_unet_model,
     create_vae_model,
     fetch_original_config,
+    infer_model_type,
 )
 
 
@@ -218,6 +219,14 @@ def build_additional_components(
         )
         return paint_by_example_components
 
+    if pipeline_class_name == "StableDiffusionXLImg2ImgPipeline":
+        model_type = infer_model_type(pipeline_class_name, original_config)
+        is_refiner = model_type == "SDXL-Refiner"
+        return {
+            "requires_aesthetics_score": is_refiner,
+            "force_zeros_for_empty_prompt": False if is_refiner else True,
+        }
+
 
 class FromSingleFileMixin:
     """
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 38557bbb1054..b916ff39e743 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -1287,6 +1287,21 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     return unet
 
 
+def create_controlnet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
+    if "control_stage_config" not in original_config.model.params:  # the config must describe a ControlNet
+        raise ValueError("Config does not have controlnet information")
+
+    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""  # "" when a state dict was passed
+    extract_ema = kwargs.get("extract_ema", False)
+    upcast_attention = kwargs.get("upcast_attention", False)
+
+    controlnet = convert_controlnet_checkpoint(
+        checkpoint, original_config, path, image_size, upcast_attention, extract_ema
+    )
+
+    return {"controlnet": controlnet}  # returned as a dict keyed by component name
+
+
 def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     vae_config = create_vae_diffusers_config(original_config)
     diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
@@ -1477,6 +1492,16 @@ def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoin
     else:
         raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
 
+    """
+    elif model_type == "UpScale":
+        elif pipeline_class == StableDiffusionUpscalePipeline:
+            scheduler = DDIMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
+                )
+                low_res_scheduler = DDPMScheduler.from_pretrained(
+                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
+                )
+    """
     return {"scheduler": scheduler}
 
 

From 7db4f50e8577aa49b9e266e0bf61e1bd22ad808b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 28 Dec 2023 12:40:31 +0000
Subject: [PATCH 11/89] update

---
 src/diffusers/loaders/single_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 93a6e1544978..5fef0f361e9d 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -219,7 +219,7 @@ def build_additional_components(
         )
         return paint_by_example_components
 
-    if pipeline_class_name == "StableDiffusionXLImg2ImgPipeline":
+    if pipeline_class_name in ["StableDiffusionXLImg2ImgPipeline", "StableDiffusionXLInpaintPipeline"]:
         model_type = infer_model_type(pipeline_class_name, original_config)
         is_refiner = model_type == "SDXL-Refiner"
         return {

From 872aa6cf4f31b891c10dd437b6aafafa48c9e98d Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 28 Dec 2023 12:58:49 +0000
Subject: [PATCH 12/89] update

---
 src/diffusers/loaders/single_file_utils.py | 37 ++++++++++++++++------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index b916ff39e743..950b8d52aebd 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -1263,6 +1263,20 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     else:
         num_in_channels = 4
 
+    image_size = kwargs.get("image_size", 512)
+    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
+
+    if pipeline_class_name == "StableDiffusionUpscalePipeline":
+        image_size = image_size or original_config.model.params.unet_config.params.image_size
+
+    elif (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+            # as it relies on a brittle global step parameter here
+            image_size = 512 if global_step == 875000 else 768
+
     upcast_attention = kwargs.get("upcast_attention", False)
     extract_ema = kwargs.get("extract_ema", False)
 
@@ -1492,16 +1506,19 @@ def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoin
     else:
         raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
 
-    """
-    elif model_type == "UpScale":
-        elif pipeline_class == StableDiffusionUpscalePipeline:
-            scheduler = DDIMScheduler.from_pretrained(
-                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
-                )
-                low_res_scheduler = DDPMScheduler.from_pretrained(
-                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
-                )
-    """
+    if pipeline_class_name == "StableDiffusionUpscalePipeline":
+        scheduler = DDIMScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
+        )
+        low_res_scheduler = DDPMScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
+        )
+
+        return {
+            "scheduler": scheduler,
+            "low_res_scheduler": low_res_scheduler,
+        }
+
     return {"scheduler": scheduler}
 
 

From 83c5b8e124068ae6b11e9652facd756c3df106a2 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 13:37:05 +0000
Subject: [PATCH 13/89] update

---
 src/diffusers/loaders/__init__.py          |   5 +-
 src/diffusers/loaders/autoencoder.py       | 226 +++++++++++++
 src/diffusers/loaders/controlnet.py        | 180 +++++++++++
 src/diffusers/loaders/single_file.py       | 353 ---------------------
 src/diffusers/loaders/single_file_utils.py |   4 +-
 5 files changed, 411 insertions(+), 357 deletions(-)
 create mode 100644 src/diffusers/loaders/autoencoder.py
 create mode 100644 src/diffusers/loaders/controlnet.py

diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py
index 45c8c97c76eb..6b3f0074db8c 100644
--- a/src/diffusers/loaders/__init__.py
+++ b/src/diffusers/loaders/__init__.py
@@ -54,12 +54,13 @@ def text_encoder_attn_modules(text_encoder):
 _import_structure = {}
 
 if is_torch_available():
-    _import_structure["single_file"] = ["FromOriginalControlnetMixin", "FromOriginalVAEMixin"]
+    _import_structure["autoencoder"] = ["FromOriginalVAEMixin"]
+    _import_structure["controlnet"] = ["FromOriginalControlnetMixin"]
     _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
     _import_structure["utils"] = ["AttnProcsLayers"]
 
     if is_transformers_available():
-        _import_structure["single_file"].extend(["FromSingleFileMixin"])
+        _import_structure["single_file"] = ["FromSingleFileMixin"]
         _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"]
         _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
         _import_structure["ip_adapter"] = ["IPAdapterMixin"]
diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
new file mode 100644
index 000000000000..2b044491b1ae
--- /dev/null
+++ b/src/diffusers/loaders/autoencoder.py
@@ -0,0 +1,226 @@
+import inspect
+from contextlib import nullcontext
+from io import BytesIO
+from pathlib import Path
+
+import requests
+import torch
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import validate_hf_hub_args
+from safetensors.torch import load_file as safe_load
+
+from ..utils import (
+    deprecate,
+    is_accelerate_available,
+    is_omegaconf_available,
+    is_transformers_available,
+    logging,
+)
+from ..utils.import_utils import BACKENDS_MAPPING
+
+
+if is_transformers_available():
+    pass
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+logger = logging.get_logger(__name__)
+
+
+class FromOriginalVAEMixin:
+    """
+    Load pretrained VAE weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
+    """
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
+        r"""
+        Instantiate a [`AutoencoderKL`] from pretrained VAE weights saved in the original `.ckpt` or
+        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+                    - A link to the `.ckpt` file (for example
+                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
+                    - A path to a *file* containing all pipeline weights.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to True, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            image_size (`int`, *optional*, defaults to 512):
+                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
+                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            upcast_attention (`bool`, *optional*, defaults to `None`):
+                Whether the attention computation should always be upcasted.
+            scaling_factor (`float`, *optional*, defaults to 0.18215):
+                The component-wise standard deviation of the trained latent space computed using the first batch of the
+                training set. This is used to scale the latent space to have unit variance when training the diffusion
+                model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+                diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z
+                = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution
+                Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (for example the pipeline components of the
+                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
+                method. See example below for more information.
+
+        <Tip warning={true}>
+
+            Make sure to pass both `image_size` and `scaling_factor` to `from_single_file()` if you're loading
+            a VAE from SDXL or a Stable Diffusion v2 model or higher.
+
+        </Tip>
+
+        Examples:
+
+        ```py
+        from diffusers import AutoencoderKL
+
+        url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"  # can also be local file
+        model = AutoencoderKL.from_single_file(url)
+        ```
+        """
+        if not is_omegaconf_available():
+            raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
+
+        from omegaconf import OmegaConf
+
+        from ..models import AutoencoderKL
+
+        # import here to avoid circular dependency
+        from ..pipelines.stable_diffusion.convert_from_ckpt import (
+            convert_ldm_vae_checkpoint,
+            create_vae_diffusers_config,
+        )
+
+        config_file = kwargs.pop("config_file", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        resume_download = kwargs.pop("resume_download", False)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        image_size = kwargs.pop("image_size", None)
+        scaling_factor = kwargs.pop("scaling_factor", None)
+        kwargs.pop("upcast_attention", None)
+
+        torch_dtype = kwargs.pop("torch_dtype", None)
+
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
+        from_safetensors = file_extension == "safetensors"
+
+        if from_safetensors and use_safetensors is False:
+            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
+
+        # remove huggingface url
+        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
+            if pretrained_model_link_or_path.startswith(prefix):
+                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
+
+        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
+        ckpt_path = Path(pretrained_model_link_or_path)
+        if not ckpt_path.is_file():
+            # get repo_id and (potentially nested) file path of ckpt in repo
+            repo_id = "/".join(ckpt_path.parts[:2])
+            file_path = "/".join(ckpt_path.parts[2:])
+
+            if file_path.startswith("blob/"):
+                file_path = file_path[len("blob/") :]
+
+            if file_path.startswith("main/"):
+                file_path = file_path[len("main/") :]
+
+            pretrained_model_link_or_path = hf_hub_download(
+                repo_id,
+                filename=file_path,
+                cache_dir=cache_dir,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                force_download=force_download,
+            )
+
+        if from_safetensors:
+            from safetensors import safe_open
+
+            checkpoint = {}
+            with safe_open(pretrained_model_link_or_path, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    checkpoint[key] = f.get_tensor(key)
+        else:
+            checkpoint = torch.load(pretrained_model_link_or_path, map_location="cpu")
+
+        if "state_dict" in checkpoint:
+            checkpoint = checkpoint["state_dict"]
+
+        if config_file is None:
+            config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+            config_file = BytesIO(requests.get(config_url).content)
+
+        original_config = OmegaConf.load(config_file)
+
+        # default to sd-v1-5
+        image_size = image_size or 512
+
+        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+        if scaling_factor is None:  # derive a value only when the caller did not pass one
+            if (
+                "model" in original_config
+                and "params" in original_config.model
+                and "scale_factor" in original_config.model.params
+            ):
+                scaling_factor = original_config.model.params.scale_factor
+            else:
+                scaling_factor = 0.18215  # default SD scaling factor
+
+        vae_config["scaling_factor"] = scaling_factor  # always bound: caller value, config value or default
+
+        ctx = init_empty_weights if is_accelerate_available() else nullcontext
+        with ctx():
+            vae = AutoencoderKL(**vae_config)
+
+        if is_accelerate_available():
+            from ..models.modeling_utils import load_model_dict_into_meta
+
+            load_model_dict_into_meta(vae, converted_vae_checkpoint, device="cpu")
+        else:
+            vae.load_state_dict(converted_vae_checkpoint)
+
+        if torch_dtype is not None:
+            vae.to(dtype=torch_dtype)
+
+        return vae
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
new file mode 100644
index 000000000000..5fdf36481c5a
--- /dev/null
+++ b/src/diffusers/loaders/controlnet.py
@@ -0,0 +1,180 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from contextlib import nullcontext
+from io import BytesIO
+from pathlib import Path
+
+import requests
+import torch
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import validate_hf_hub_args
+from safetensors.torch import load_file as safe_load
+
+from ..utils import (
+    deprecate,
+    is_accelerate_available,
+    is_omegaconf_available,
+    is_transformers_available,
+    logging,
+)
+from ..utils.import_utils import BACKENDS_MAPPING
+
+
+class FromOriginalControlnetMixin:
+    """
+    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
+    """
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
+        r"""
+        Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or
+        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+                    - A link to the `.ckpt` file (for example
+                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
+                    - A path to a *file* containing all pipeline weights.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to True, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            image_size (`int`, *optional*, defaults to 512):
+                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
+                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
+            upcast_attention (`bool`, *optional*, defaults to `None`):
+                Whether the attention computation should always be upcasted.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (for example the pipeline components of the
+                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
+                method. See example below for more information.
+
+        Examples:
+
+        ```py
+        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+
+        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
+        controlnet = ControlNetModel.from_single_file(url)
+
+        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
+        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
+        ```
+        """
+        # import here to avoid circular dependency
+        from ..pipelines.stable_diffusion.convert_from_ckpt import download_controlnet_from_original_ckpt
+
+        config_file = kwargs.pop("config_file", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        resume_download = kwargs.pop("resume_download", False)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        num_in_channels = kwargs.pop("num_in_channels", None)
+        use_linear_projection = kwargs.pop("use_linear_projection", None)
+        revision = kwargs.pop("revision", None)
+        extract_ema = kwargs.pop("extract_ema", False)
+        image_size = kwargs.pop("image_size", None)
+        upcast_attention = kwargs.pop("upcast_attention", None)
+
+        torch_dtype = kwargs.pop("torch_dtype", None)
+
+        use_safetensors = kwargs.pop("use_safetensors", None)
+
+        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
+        from_safetensors = file_extension == "safetensors"
+
+        if from_safetensors and use_safetensors is False:
+            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
+
+        # remove huggingface url
+        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
+            if pretrained_model_link_or_path.startswith(prefix):
+                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
+
+        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
+        ckpt_path = Path(pretrained_model_link_or_path)
+        if not ckpt_path.is_file():
+            # get repo_id and (potentially nested) file path of ckpt in repo
+            repo_id = "/".join(ckpt_path.parts[:2])
+            file_path = "/".join(ckpt_path.parts[2:])
+
+            if file_path.startswith("blob/"):
+                file_path = file_path[len("blob/") :]
+
+            if file_path.startswith("main/"):
+                file_path = file_path[len("main/") :]
+
+            pretrained_model_link_or_path = hf_hub_download(
+                repo_id,
+                filename=file_path,
+                cache_dir=cache_dir,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                force_download=force_download,
+            )
+
+        if config_file is None:  # no config supplied: fetch the upstream `cldm_v15.yaml` over HTTP
+            config_url = "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml"
+            config_file = BytesIO(requests.get(config_url).content)
+
+        image_size = image_size or 512  # a missing (or zero) image_size falls back to 512
+
+        controlnet = download_controlnet_from_original_ckpt(
+            pretrained_model_link_or_path,
+            original_config_file=config_file,
+            image_size=image_size,
+            extract_ema=extract_ema,
+            num_in_channels=num_in_channels,
+            upcast_attention=upcast_attention,
+            from_safetensors=from_safetensors,
+            use_linear_projection=use_linear_projection,
+        )
+
+        if torch_dtype is not None:
+            controlnet.to(dtype=torch_dtype)
+
+        return controlnet
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 5fef0f361e9d..a35b1fb1ec37 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -31,9 +31,7 @@
 )
 from ..utils.import_utils import BACKENDS_MAPPING
 from .single_file_utils import (
-    create_adapter_model,
     create_controlnet_model,
-    create_ldm_text_to_image_components,
     create_paint_by_example_components,
     create_scheduler,
     create_stable_unclip_components,
@@ -233,12 +231,6 @@ class FromSingleFileMixin:
     Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`].
     """
 
-    @classmethod
-    def from_ckpt(cls, *args, **kwargs):
-        deprecation_message = "The function `from_ckpt` is deprecated in favor of `from_single_file` and will be removed in diffusers v.0.21. Please make sure to use `StableDiffusionPipeline.from_single_file(...)` instead."
-        deprecate("from_ckpt", "0.21.0", deprecation_message, standard_warn=False)
-        return cls.from_single_file(*args, **kwargs)
-
     @classmethod
     @validate_hf_hub_args
     def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
@@ -366,348 +358,3 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         return pipe
 
-
-class FromOriginalVAEMixin:
-    """
-    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
-    """
-
-    @classmethod
-    @validate_hf_hub_args
-    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
-        r"""
-        Instantiate a [`AutoencoderKL`] from pretrained ControlNet weights saved in the original `.ckpt` or
-        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
-
-        Parameters:
-            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-                    - A link to the `.ckpt` file (for example
-                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
-                    - A path to a *file* containing all pipeline weights.
-            torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to True, the model
-                won't be downloaded from the Hub.
-            token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
-                `diffusers-cli login` (stored in `~/.huggingface`) is used.
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
-                allowed by Git.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            use_safetensors (`bool`, *optional*, defaults to `None`):
-                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
-                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
-                weights. If set to `False`, safetensors weights are not loaded.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
-            scaling_factor (`float`, *optional*, defaults to 0.18215):
-                The component-wise standard deviation of the trained latent space computed using the first batch of the
-                training set. This is used to scale the latent space to have unit variance when training the diffusion
-                model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
-                diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z
-                = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution
-                Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
-            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load and saveable variables (for example the pipeline components of the
-                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
-                method. See example below for more information.
-
-        <Tip warning={true}>
-
-            Make sure to pass both `image_size` and `scaling_factor` to `from_single_file()` if you're loading
-            a VAE from SDXL or a Stable Diffusion v2 model or higher.
-
-        </Tip>
-
-        Examples:
-
-        ```py
-        from diffusers import AutoencoderKL
-
-        url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"  # can also be local file
-        model = AutoencoderKL.from_single_file(url)
-        ```
-        """
-        if not is_omegaconf_available():
-            raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-        from omegaconf import OmegaConf
-
-        from ..models import AutoencoderKL
-
-        # import here to avoid circular dependency
-        from ..pipelines.stable_diffusion.convert_from_ckpt import (
-            convert_ldm_vae_checkpoint,
-            create_vae_diffusers_config,
-        )
-
-        config_file = kwargs.pop("config_file", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", None)
-        token = kwargs.pop("token", None)
-        revision = kwargs.pop("revision", None)
-        image_size = kwargs.pop("image_size", None)
-        scaling_factor = kwargs.pop("scaling_factor", None)
-        kwargs.pop("upcast_attention", None)
-
-        torch_dtype = kwargs.pop("torch_dtype", None)
-
-        use_safetensors = kwargs.pop("use_safetensors", None)
-
-        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
-        from_safetensors = file_extension == "safetensors"
-
-        if from_safetensors and use_safetensors is False:
-            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
-
-        # remove huggingface url
-        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
-            if pretrained_model_link_or_path.startswith(prefix):
-                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-
-        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
-        ckpt_path = Path(pretrained_model_link_or_path)
-        if not ckpt_path.is_file():
-            # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id = "/".join(ckpt_path.parts[:2])
-            file_path = "/".join(ckpt_path.parts[2:])
-
-            if file_path.startswith("blob/"):
-                file_path = file_path[len("blob/") :]
-
-            if file_path.startswith("main/"):
-                file_path = file_path[len("main/") :]
-
-            pretrained_model_link_or_path = hf_hub_download(
-                repo_id,
-                filename=file_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                force_download=force_download,
-            )
-
-        if from_safetensors:
-            from safetensors import safe_open
-
-            checkpoint = {}
-            with safe_open(pretrained_model_link_or_path, framework="pt", device="cpu") as f:
-                for key in f.keys():
-                    checkpoint[key] = f.get_tensor(key)
-        else:
-            checkpoint = torch.load(pretrained_model_link_or_path, map_location="cpu")
-
-        if "state_dict" in checkpoint:
-            checkpoint = checkpoint["state_dict"]
-
-        if config_file is None:
-            config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
-            config_file = BytesIO(requests.get(config_url).content)
-
-        original_config = OmegaConf.load(config_file)
-
-        # default to sd-v1-5
-        image_size = image_size or 512
-
-        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-        if scaling_factor is None:
-            if (
-                "model" in original_config
-                and "params" in original_config.model
-                and "scale_factor" in original_config.model.params
-            ):
-                vae_scaling_factor = original_config.model.params.scale_factor
-            else:
-                vae_scaling_factor = 0.18215  # default SD scaling factor
-
-        vae_config["scaling_factor"] = vae_scaling_factor
-
-        ctx = init_empty_weights if is_accelerate_available() else nullcontext
-        with ctx():
-            vae = AutoencoderKL(**vae_config)
-
-        if is_accelerate_available():
-            from ..models.modeling_utils import load_model_dict_into_meta
-
-            load_model_dict_into_meta(vae, converted_vae_checkpoint, device="cpu")
-        else:
-            vae.load_state_dict(converted_vae_checkpoint)
-
-        if torch_dtype is not None:
-            vae.to(dtype=torch_dtype)
-
-        return vae
-
-
-class FromOriginalControlnetMixin:
-    """
-    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
-    """
-
-    @classmethod
-    @validate_hf_hub_args
-    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
-        r"""
-        Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or
-        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
-
-        Parameters:
-            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-                    - A link to the `.ckpt` file (for example
-                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
-                    - A path to a *file* containing all pipeline weights.
-            torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to True, the model
-                won't be downloaded from the Hub.
-            token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
-                `diffusers-cli login` (stored in `~/.huggingface`) is used.
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
-                allowed by Git.
-            use_safetensors (`bool`, *optional*, defaults to `None`):
-                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
-                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
-                weights. If set to `False`, safetensors weights are not loaded.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
-            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load and saveable variables (for example the pipeline components of the
-                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
-                method. See example below for more information.
-
-        Examples:
-
-        ```py
-        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
-
-        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
-        model = ControlNetModel.from_single_file(url)
-
-        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
-        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
-        ```
-        """
-        # import here to avoid circular dependency
-        from ..pipelines.stable_diffusion.convert_from_ckpt import download_controlnet_from_original_ckpt
-
-        config_file = kwargs.pop("config_file", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", None)
-        token = kwargs.pop("token", None)
-        num_in_channels = kwargs.pop("num_in_channels", None)
-        use_linear_projection = kwargs.pop("use_linear_projection", None)
-        revision = kwargs.pop("revision", None)
-        extract_ema = kwargs.pop("extract_ema", False)
-        image_size = kwargs.pop("image_size", None)
-        upcast_attention = kwargs.pop("upcast_attention", None)
-
-        torch_dtype = kwargs.pop("torch_dtype", None)
-
-        use_safetensors = kwargs.pop("use_safetensors", None)
-
-        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
-        from_safetensors = file_extension == "safetensors"
-
-        if from_safetensors and use_safetensors is False:
-            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
-
-        # remove huggingface url
-        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
-            if pretrained_model_link_or_path.startswith(prefix):
-                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-
-        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
-        ckpt_path = Path(pretrained_model_link_or_path)
-        if not ckpt_path.is_file():
-            # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id = "/".join(ckpt_path.parts[:2])
-            file_path = "/".join(ckpt_path.parts[2:])
-
-            if file_path.startswith("blob/"):
-                file_path = file_path[len("blob/") :]
-
-            if file_path.startswith("main/"):
-                file_path = file_path[len("main/") :]
-
-            pretrained_model_link_or_path = hf_hub_download(
-                repo_id,
-                filename=file_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                force_download=force_download,
-            )
-
-        if config_file is None:
-            config_url = "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml"
-            config_file = BytesIO(requests.get(config_url).content)
-
-        image_size = image_size or 512
-
-        controlnet = download_controlnet_from_original_ckpt(
-            pretrained_model_link_or_path,
-            original_config_file=config_file,
-            image_size=image_size,
-            extract_ema=extract_ema,
-            num_in_channels=num_in_channels,
-            upcast_attention=upcast_attention,
-            from_safetensors=from_safetensors,
-            use_linear_projection=use_linear_projection,
-        )
-
-        if torch_dtype is not None:
-            controlnet.to(dtype=torch_dtype)
-
-        return controlnet
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 950b8d52aebd..0015767967d4 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -39,6 +39,7 @@
 from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from ..pipelines.paint_by_example import PaintByExampleImageEncoder
 from ..pipelines.pipeline_utils import DiffusionPipeline
+from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 from ..schedulers import (
     DDIMScheduler,
@@ -53,7 +54,6 @@
 )
 from ..utils import is_accelerate_available, is_omegaconf_available, logging
 from ..utils.import_utils import BACKENDS_MAPPING
-from .safety_checker import StableDiffusionSafetyChecker
 
 
 if is_accelerate_available():
@@ -1332,7 +1332,7 @@ def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, **kwa
     return vae
 
 
-def create_text_encoder_tokenizer(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+def create_text_encoders_and_tokenizers(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     model_type = infer_model_type(pipeline_class_name, original_config)
     local_files_only = kwargs.get("local_files_only", False)
 

From 5a8e10e3f41a79fa55e3cafa9945df7461e0374c Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 13:40:36 +0000
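Subject: [PATCH 14/89] update

Remove the "adapter" branch from build_component in
loaders/single_file.py, so create_adapter_model is no longer called
from there.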

---
 src/diffusers/loaders/single_file.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index a35b1fb1ec37..da0a5e9c130e 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -163,12 +163,6 @@ def build_component(
         )
         return controlnet_components
 
-    if component_name == "adapter":
-        adapter_components = create_adapter_model(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-        )
-        return adapter_components
-
     if component_name == "scheduler":
         scheduler_components = create_scheduler(
             pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs

From 7a8c72200a3b2838781ea745b7da2f84f92a9119 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 14:05:18 +0000
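Subject: [PATCH 15/89] update

* from_single_file: call download_model_checkpoint only when the given
  path is not an existing local file; local files are passed straight
  to load_checkpoint.
* fetch_original_config: new signature
  (checkpoint, original_config_file=None, config_files=None). When
  original_config_file is given it is loaded with OmegaConf and
  returned directly; otherwise the config is looked up from
  config_files or, failing that, from a URL.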

---
 src/diffusers/loaders/single_file.py       | 25 ++++++++++++----------
 src/diffusers/loaders/single_file_utils.py |  9 ++++++--
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index da0a5e9c130e..cc211f0a4b37 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -314,23 +314,26 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             raise ValueError(
                 f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
             )
-        pretrained_model_link_or_path = download_model_checkpoint(
-            ckpt_path,
-            cache_dir=cache_dir,
-            resume_download=resume_download,
-            proxies=proxies,
-            local_files_only=local_files_only,
-            token=token,
-            revision=revision,
-        )
-        checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
+        if not ckpt_path.is_file():
+            pretrained_model_link_or_path = download_model_checkpoint(
+                ckpt_path,
+                cache_dir=cache_dir,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+            )
+            checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
+        else:
+            checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
 
         # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
         # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
-        original_config = fetch_original_config(original_config_file, checkpoint, config_files)
+        original_config = fetch_original_config(checkpoint, original_config_file, config_files)
         component_names = extract_pipeline_component_names(cls)
 
         pipeline_components = {}
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 0015767967d4..242bbc2d9dfb 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -152,9 +152,14 @@ def fetch_original_config_file_from_file(checkpoint, config_files: list):
     return
 
 
-def fetch_original_config(checkpoint, config_files: list):
-    if config_files is not None:
+def fetch_original_config(checkpoint, original_config_file=None, config_files=None):
+    if original_config_file:
+        original_config = OmegaConf.load(original_config_file)
+        return original_config
+
+    elif config_files:
         original_config_file = fetch_original_config_file_from_file(checkpoint, config_files)
+
     else:
         original_config_file = fetch_original_config_file_from_url(checkpoint)
 

From ccf8d62c2298ff41d02ec8dd67739ac81e48cfb5 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 14:24:27 +0000
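Subject: [PATCH 16/89] update

* Add determine_image_size() to single_file_utils.py, moving the
  image-size logic out of create_unet_model.
* create_unet_model and create_vae_model now return {"unet": unet} and
  {"vae": vae} instead of the bare model.
* create_vae_model takes pipeline_class_name and passes the determined
  image_size to create_vae_diffusers_config.
* from_single_file passes original_config, checkpoint and
  pretrained_model_link_or_path to build_component, in that order.
* Temporarily print the exception raised while loading text_encoder_2.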

---
 src/diffusers/loaders/single_file.py       |  9 ++++-
 src/diffusers/loaders/single_file_utils.py | 45 +++++++++++++---------
 2 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index cc211f0a4b37..00fa73a852e8 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -339,7 +339,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         pipeline_components = {}
         for component in component_names:
             components = build_component(
-                pipeline_components, pipeline_name, component, checkpoint, original_config, **kwargs
+                pipeline_components,
+                pipeline_name,
+                component,
+                original_config,
+                checkpoint,
+                pretrained_model_link_or_path,
+                **kwargs,
             )
             pipeline_components.update(components)
 
@@ -354,4 +360,3 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             pipe.to(dtype=torch_dtype)
 
         return pipe
-
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 242bbc2d9dfb..e6ab6160876f 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -222,6 +222,23 @@ def get_default_scheduler_config():
     return SCHEDULER_DEFAULT_CONFIG
 
 
+def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs):
+    image_size = kwargs.get("image_size", 512)
+    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
+
+    if pipeline_class_name == "StableDiffusionUpscalePipeline":
+        image_size = image_size or original_config.model.params.unet_config.params.image_size
+
+    elif (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+            # as it relies on a brittle global step parameter here
+            image_size = 512 if global_step == 875000 else 768
+
+    return image_size
+
 def shave_segments(path, n_shave_prefix_segments=1):
     """
     Removes segments. Positive values shave the first segments, negative shave the last segments.
@@ -1268,19 +1285,7 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     else:
         num_in_channels = 4
 
-    image_size = kwargs.get("image_size", 512)
-    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
-
-    if pipeline_class_name == "StableDiffusionUpscalePipeline":
-        image_size = image_size or original_config.model.params.unet_config.params.image_size
-
-    elif (
-        "parameterization" in original_config["model"]["params"]
-        and original_config["model"]["params"]["parameterization"] == "v"
-    ):
-            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
-            # as it relies on a brittle global step parameter here
-            image_size = 512 if global_step == 875000 else 768
+    image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
 
     upcast_attention = kwargs.get("upcast_attention", False)
     extract_ema = kwargs.get("extract_ema", False)
@@ -1303,7 +1308,7 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     else:
         unet.load_state_dict(diffusers_format_unet_checkpoint)
 
-    return unet
+    return {"unet": unet}
 
 
 def create_controlnet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
@@ -1321,10 +1326,13 @@ def create_controlnet_model(pipeline_class_name, original_config, checkpoint, ch
     return {"controlnet": controlnet}
 
 
-def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
-    vae_config = create_vae_diffusers_config(original_config)
+def create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+    image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
+
+    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
     diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
+
     with ctx():
         vae = AutoencoderKL(**vae_config)
 
@@ -1334,7 +1342,7 @@ def create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, **kwa
     else:
         vae.load_state_dict(diffusers_format_vae_checkpoint)
 
-    return vae
+    return {"vae": vae}
 
 
 def create_text_encoders_and_tokenizers(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
@@ -1425,7 +1433,8 @@ def create_text_encoders_and_tokenizers(pipeline_class_name, original_config, ch
                 local_files_only=local_files_only,
                 **config_kwargs,
             )
-        except Exception:
+        except Exception as e:
+            print(e)
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'."
             )

From da9c9d52ebc2daa19014c50b1edd607b8d13a760 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 14:31:34 +0000
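Subject: [PATCH 17/89] update

* from_single_file: skip components for which build_component returns
  nothing instead of calling dict.update() on them.
* create_unet_model: drop the image_size parameter and set
  unet_config["in_channels"] rather than "num_in_channels".
* create_text_encoders_and_tokenizers: define config_kwargs
  (projection_dim=1280) before converting text_encoder_2, and remove
  the debugging print.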

---
 src/diffusers/loaders/single_file.py       | 2 ++
 src/diffusers/loaders/single_file_utils.py | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 00fa73a852e8..629c5d8cfa23 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -347,6 +347,8 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                 pretrained_model_link_or_path,
                 **kwargs,
             )
+            if not components:
+                continue
             pipeline_components.update(components)
 
         additional_components = set(pipeline_components.keys() - component_names)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index e6ab6160876f..110453458e38 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -1268,7 +1268,7 @@ def create_ldm_bert_config(original_config):
     return config
 
 
-def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
+def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     if "num_in_channels" in kwargs:
         num_in_channels = kwargs.get("num_in_channels")
 
@@ -1291,7 +1291,7 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     extract_ema = kwargs.get("extract_ema", False)
 
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-    unet_config["num_in_channels"] = num_in_channels
+    unet_config["in_channels"] = num_in_channels
     unet_config["upcast_attention"] = upcast_attention
 
     path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
@@ -1423,6 +1423,7 @@ def create_text_encoders_and_tokenizers(pipeline_class_name, original_config, ch
 
         try:
             config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            config_kwargs = {"projection_dim": 1280}
             prefix = "conditioner.embedders.1.model."
             tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only)
             text_encoder_2 = convert_open_clip_checkpoint(
@@ -1434,7 +1435,6 @@ def create_text_encoders_and_tokenizers(pipeline_class_name, original_config, ch
                 **config_kwargs,
             )
         except Exception as e:
-            print(e)
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'."
             )

From b791a713e55a010840a159da5c52dc808e0de083 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 14:35:09 +0000
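Subject: [PATCH 18/89] up

determine_image_size: infer the model type and use image_size 1024 for
"SDXL" and "SDXL-Refiner" models.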

---
 src/diffusers/loaders/single_file_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 110453458e38..f915285882a4 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -225,10 +225,14 @@ def get_default_scheduler_config():
 def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs):
     image_size = kwargs.get("image_size", 512)
     global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
+    model_type = infer_model_type(pipeline_class_name, original_config, **kwargs)
 
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
         image_size = image_size or original_config.model.params.unet_config.params.image_size
 
+    if model_type in ["SDXL", "SDXL-Refiner"]:
+        image_size = 1024
+
     elif (
         "parameterization" in original_config["model"]["params"]
         and original_config["model"]["params"]["parameterization"] == "v"

From c6c8fc7fde908fbc9bf7d87673a2b72f863e04a1 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 14:38:32 +0000
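Subject: [PATCH 19/89] update

* build_additional_components: remove the LDMTextToImagePipeline
  branch.
* create_text_encoders_and_tokenizers: select the LDM BERT text encoder
  by pipeline_class_name == "LDMTextToImagePipeline" instead of by
  model_type == "LDMText2Image".
* Formatting only: fix comment indentation in determine_image_size,
  wrap long signatures, and join the DDIMScheduler.from_pretrained call
  onto one line.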

---
 src/diffusers/loaders/single_file.py       |  6 ------
 src/diffusers/loaders/single_file_utils.py | 21 ++++++++++++---------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 629c5d8cfa23..940d340ec5fd 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -199,12 +199,6 @@ def build_additional_components(
         )
         return stable_unclip_components
 
-    if pipeline_class_name == "LDMTextToImagePipeline":
-        ldm_text_to_image_components = create_ldm_text_to_image_components(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-        )
-        return ldm_text_to_image_components
-
     if pipeline_class_name == "PaintByExamplePipeline":
         paint_by_example_components = create_paint_by_example_components(
             pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index f915285882a4..2d2a034bf56b 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -237,12 +237,13 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
         "parameterization" in original_config["model"]["params"]
         and original_config["model"]["params"]["parameterization"] == "v"
     ):
-            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
-            # as it relies on a brittle global step parameter here
-            image_size = 512 if global_step == 875000 else 768
+        # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+        # as it relies on a brittle global step parameter here
+        image_size = 512 if global_step == 875000 else 768
 
     return image_size
 
+
 def shave_segments(path, n_shave_prefix_segments=1):
     """
     Removes segments. Positive values shave the first segments, negative shave the last segments.
@@ -1315,7 +1316,9 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     return {"unet": unet}
 
 
-def create_controlnet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs):
+def create_controlnet_model(
+    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs
+):
     if "control_stage_config" not in original_config.model.params:
         raise ValueError("Config does not have controlnet information")
 
@@ -1349,7 +1352,9 @@ def create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoin
     return {"vae": vae}
 
 
-def create_text_encoders_and_tokenizers(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+def create_text_encoders_and_tokenizers(
+    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+):
     model_type = infer_model_type(pipeline_class_name, original_config)
     local_files_only = kwargs.get("local_files_only", False)
 
@@ -1450,7 +1455,7 @@ def create_text_encoders_and_tokenizers(pipeline_class_name, original_config, ch
             "text_encoder_2": text_encoder_2,
         }
 
-    elif model_type == "LDMText2Image":
+    elif pipeline_class_name == "LDMTextToImagePipeline":
         text_config = create_ldm_bert_config(original_config)
         text_encoder = convert_ldm_bert_checkpoint(checkpoint, text_config)
         tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
@@ -1525,9 +1530,7 @@ def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoin
         raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
 
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
-        scheduler = DDIMScheduler.from_pretrained(
-            "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
-        )
+        scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler")
         low_res_scheduler = DDPMScheduler.from_pretrained(
             "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
         )

From 6ba7a50a796f836193d28841ff599e85e394df86 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Dec 2023 14:39:12 +0000
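Subject: [PATCH 20/89] update

build_additional_components: return None instead of an empty dict when
the component is already present in pipeline_components.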

---
 src/diffusers/loaders/single_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 940d340ec5fd..cd925e3df942 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -191,7 +191,7 @@ def build_additional_components(
         return kwargs.pop(component_name, None)
 
     if component_name in pipeline_components:
-        return {}
+        return None
 
     if pipeline_class_name == ["StableUnCLIPPipeline", "StableUnCLIPImg2ImgPipeline"]:
         stable_unclip_components = create_stable_unclip_components(

From b44d2b41633190589c4d27821255d98e8bb0b171 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 06:28:47 +0000
Subject: [PATCH 21/89] update

---
 src/diffusers/loaders/autoencoder.py       |   3 -
 src/diffusers/loaders/controlnet.py        |  13 -
 src/diffusers/loaders/single_file.py       |  15 +-
 src/diffusers/loaders/single_file_utils.py | 605 +--------------------
 4 files changed, 10 insertions(+), 626 deletions(-)

diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
index 2b044491b1ae..4f1df1310ec2 100644
--- a/src/diffusers/loaders/autoencoder.py
+++ b/src/diffusers/loaders/autoencoder.py
@@ -1,4 +1,3 @@
-import inspect
 from contextlib import nullcontext
 from io import BytesIO
 from pathlib import Path
@@ -7,10 +6,8 @@
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import validate_hf_hub_args
-from safetensors.torch import load_file as safe_load
 
 from ..utils import (
-    deprecate,
     is_accelerate_available,
     is_omegaconf_available,
     is_transformers_available,
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
index 5fdf36481c5a..4f709d75be71 100644
--- a/src/diffusers/loaders/controlnet.py
+++ b/src/diffusers/loaders/controlnet.py
@@ -11,25 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import inspect
-from contextlib import nullcontext
 from io import BytesIO
 from pathlib import Path
 
 import requests
-import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import validate_hf_hub_args
-from safetensors.torch import load_file as safe_load
-
-from ..utils import (
-    deprecate,
-    is_accelerate_available,
-    is_omegaconf_available,
-    is_transformers_available,
-    logging,
-)
-from ..utils.import_utils import BACKENDS_MAPPING
 
 
 class FromOriginalControlnetMixin:
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index cd925e3df942..f1efa29c2a71 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -12,24 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
-from contextlib import nullcontext
-from io import BytesIO
 from pathlib import Path
 
-import requests
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import validate_hf_hub_args
 from safetensors.torch import load_file as safe_load
 
 from ..utils import (
-    deprecate,
     is_accelerate_available,
-    is_omegaconf_available,
     is_transformers_available,
     logging,
 )
-from ..utils.import_utils import BACKENDS_MAPPING
 from .single_file_utils import (
     create_controlnet_model,
     create_paint_by_example_components,
@@ -47,19 +41,12 @@
     pass
 
 if is_accelerate_available():
-    from accelerate import init_empty_weights
+    pass
 
 logger = logging.get_logger(__name__)
 
 
 VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
-TEXT_ENCODER_FROM_PIPELINE_CLASS = {
-    "StableUnCLIPPipeline": "FrozenOpenCLIPEmbedder",
-    "StableUnCLIPImg2ImgPipeline": "FrozenOpenCLIPEmbedder",
-    "LDMTextToImagePipeline": "LDMTextToImage",
-    "PaintByExamplePipeline": "PaintByExample",
-    "StableDiffusion": "stable-diffusion",
-}
 
 
 def extract_pipeline_component_names(pipeline_class):
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 2d2a034bf56b..c127a6b0964b 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -17,7 +17,7 @@
 import re
 from contextlib import nullcontext
 from io import BytesIO
-from typing import Dict, Optional, Union
+from typing import Optional
 
 import requests
 import torch
@@ -38,8 +38,6 @@
 from ..models import AutoencoderKL, ControlNetModel, PriorTransformer, UNet2DConditionModel
 from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from ..pipelines.paint_by_example import PaintByExampleImageEncoder
-from ..pipelines.pipeline_utils import DiffusionPipeline
-from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 from ..schedulers import (
     DDIMScheduler,
@@ -50,10 +48,8 @@
     HeunDiscreteScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
-    UnCLIPScheduler,
 )
-from ..utils import is_accelerate_available, is_omegaconf_available, logging
-from ..utils.import_utils import BACKENDS_MAPPING
+from ..utils import is_accelerate_available, logging
 
 
 if is_accelerate_available():
@@ -1327,7 +1323,12 @@ def create_controlnet_model(
     upcast_attention = kwargs.get("upcast_attention", False)
 
     controlnet = convert_controlnet_checkpoint(
-        checkpoint, original_config, path, image_size, upcast_attention, extract_ema
+        checkpoint,
+        original_config,
+        path,
+        image_size,
+        upcast_attention,
+        extract_ema,
     )
 
     return {"controlnet": controlnet}
@@ -1443,7 +1444,7 @@ def create_text_encoders_and_tokenizers(
                 local_files_only=local_files_only,
                 **config_kwargs,
             )
-        except Exception as e:
+        except Exception:
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'."
             )
@@ -1631,591 +1632,3 @@ def create_paint_by_example_components(
         "tokenizer": tokenizer,
         "feature_extractor": feature_extractor,
     }
-
-
-def download_from_original_stable_diffusion_ckpt(
-    checkpoint_path_or_dict: Union[str, Dict[str, torch.Tensor]],
-    original_config_file: str = None,
-    image_size: Optional[int] = None,
-    prediction_type: str = None,
-    model_type: str = None,
-    extract_ema: bool = False,
-    scheduler_type: str = "pndm",
-    num_in_channels: Optional[int] = None,
-    upcast_attention: Optional[bool] = None,
-    device: str = None,
-    from_safetensors: bool = False,
-    stable_unclip: Optional[str] = None,
-    stable_unclip_prior: Optional[str] = None,
-    clip_stats_path: Optional[str] = None,
-    controlnet: Optional[bool] = None,
-    adapter: Optional[bool] = None,
-    load_safety_checker: bool = True,
-    pipeline_class: DiffusionPipeline = None,
-    local_files_only=False,
-    vae_path=None,
-    vae=None,
-    text_encoder=None,
-    text_encoder_2=None,
-    tokenizer=None,
-    tokenizer_2=None,
-    config_files=None,
-    **kwargs,
-) -> DiffusionPipeline:
-    """
-    Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
-    config file.
-
-    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
-    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
-    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
-
-    Args:
-        checkpoint_path_or_dict (`str` or `dict`): Path to `.ckpt` file, or the state dict.
-        original_config_file (`str`):
-            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
-            inferred by looking for a key that only exists in SD2.0 models.
-        image_size (`int`, *optional*, defaults to 512):
-            The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
-            Base. Use 768 for Stable Diffusion v2.
-        prediction_type (`str`, *optional*):
-            The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
-            Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
-        num_in_channels (`int`, *optional*, defaults to None):
-            The number of input channels. If `None`, it will be automatically inferred.
-        scheduler_type (`str`, *optional*, defaults to 'pndm'):
-            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
-            "ddim"]`.
-        model_type (`str`, *optional*, defaults to `None`):
-            The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder",
-            "FrozenCLIPEmbedder", "PaintByExample"]`.
-        is_img2img (`bool`, *optional*, defaults to `False`):
-            Whether the model should be loaded as an img2img pipeline.
-        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
-            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
-            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
-            inference. Non-EMA weights are usually better to continue fine-tuning.
-        upcast_attention (`bool`, *optional*, defaults to `None`):
-            Whether the attention computation should always be upcasted. This is necessary when running stable
-            diffusion 2.1.
-        device (`str`, *optional*, defaults to `None`):
-            The device to use. Pass `None` to determine automatically.
-        from_safetensors (`str`, *optional*, defaults to `False`):
-            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
-        load_safety_checker (`bool`, *optional*, defaults to `True`):
-            Whether to load the safety checker or not. Defaults to `True`.
-        pipeline_class (`str`, *optional*, defaults to `None`):
-            The pipeline class to use. Pass `None` to determine automatically.
-        local_files_only (`bool`, *optional*, defaults to `False`):
-            Whether or not to only look at local files (i.e., do not try to download the model).
-        vae (`AutoencoderKL`, *optional*, defaults to `None`):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. If
-            this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
-        text_encoder (`CLIPTextModel`, *optional*, defaults to `None`):
-            An instance of [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)
-            to use, specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)
-            variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed.
-        tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`):
-            An instance of
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
-            to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if
-            needed.
-        config_files (`Dict[str, str]`, *optional*, defaults to `None`):
-            A dictionary mapping from config file names to their contents. If this parameter is `None`, the function
-            will load the config files by itself, if needed. Valid keys are:
-                - `v1`: Config file for Stable Diffusion v1
-                - `v2`: Config file for Stable Diffusion v2
-                - `xl`: Config file for Stable Diffusion XL
-                - `xl_refiner`: Config file for Stable Diffusion XL Refiner
-        return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
-    """
-
-    # import pipelines here to avoid circular import error when using from_single_file method
-    from diffusers import (
-        LDMTextToImagePipeline,
-        PaintByExamplePipeline,
-        StableDiffusionControlNetPipeline,
-        StableDiffusionInpaintPipeline,
-        StableDiffusionPipeline,
-        StableDiffusionUpscalePipeline,
-        StableDiffusionXLControlNetInpaintPipeline,
-        StableDiffusionXLImg2ImgPipeline,
-        StableDiffusionXLInpaintPipeline,
-        StableDiffusionXLPipeline,
-        StableUnCLIPImg2ImgPipeline,
-        StableUnCLIPPipeline,
-    )
-
-    if prediction_type == "v-prediction":
-        prediction_type = "v_prediction"
-
-    if not is_omegaconf_available():
-        raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-    checkpoint = load_checkpoint(checkpoint_path_or_dict)
-    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
-
-    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
-    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
-    while "state_dict" in checkpoint:
-        checkpoint = checkpoint["state_dict"]
-
-    original_config = fetch_original_config(checkpoint, config_files)
-    model_type = infer_model_type(original_config, model_type)
-
-    unet = create_unet_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
-    vae = create_vae_model(original_config, checkpoint, checkpoint_path_or_dict, model_type, image_size, **kwargs)
-
-    if pipeline_class is None:
-        # Check if we have a SDXL or SD model and initialize default pipeline
-        if model_type not in ["SDXL", "SDXL-Refiner"]:
-            pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline
-        else:
-            pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline
-
-    if num_in_channels is None and pipeline_class in [
-        StableDiffusionInpaintPipeline,
-        StableDiffusionXLInpaintPipeline,
-        StableDiffusionXLControlNetInpaintPipeline,
-    ]:
-        num_in_channels = 9
-    if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline:
-        num_in_channels = 7
-    elif num_in_channels is None:
-        num_in_channels = 4
-
-    if "unet_config" in original_config.model.params:
-        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
-
-    if (
-        "parameterization" in original_config["model"]["params"]
-        and original_config["model"]["params"]["parameterization"] == "v"
-    ):
-        if prediction_type is None:
-            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
-            # as it relies on a brittle global step parameter here
-            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
-        if image_size is None:
-            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
-            # as it relies on a brittle global step parameter here
-            image_size = 512 if global_step == 875000 else 768
-    else:
-        if prediction_type is None:
-            prediction_type = "epsilon"
-        if image_size is None:
-            image_size = 512
-
-    if controlnet is None and "control_stage_config" in original_config.model.params:
-        path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
-        controlnet = convert_controlnet_checkpoint(
-            checkpoint, original_config, path, image_size, upcast_attention, extract_ema
-        )
-
-    num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
-
-    if model_type in ["SDXL", "SDXL-Refiner"]:
-        scheduler_dict = {
-            "beta_schedule": "scaled_linear",
-            "beta_start": 0.00085,
-            "beta_end": 0.012,
-            "interpolation_type": "linear",
-            "num_train_timesteps": num_train_timesteps,
-            "prediction_type": "epsilon",
-            "sample_max_value": 1.0,
-            "set_alpha_to_one": False,
-            "skip_prk_steps": True,
-            "steps_offset": 1,
-            "timestep_spacing": "leading",
-        }
-        scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
-        scheduler_type = "euler"
-    else:
-        beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
-        beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
-        scheduler = DDIMScheduler(
-            beta_end=beta_end,
-            beta_schedule="scaled_linear",
-            beta_start=beta_start,
-            num_train_timesteps=num_train_timesteps,
-            steps_offset=1,
-            clip_sample=False,
-            set_alpha_to_one=False,
-            prediction_type=prediction_type,
-        )
-    # make sure scheduler works correctly with DDIM
-    scheduler.register_to_config(clip_sample=False)
-
-    if scheduler_type == "pndm":
-        config = dict(scheduler.config)
-        config["skip_prk_steps"] = True
-        scheduler = PNDMScheduler.from_config(config)
-    elif scheduler_type == "lms":
-        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "heun":
-        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "euler":
-        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "euler-ancestral":
-        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "dpm":
-        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
-    elif scheduler_type == "ddim":
-        scheduler = scheduler
-    else:
-        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
-
-    if pipeline_class == StableDiffusionUpscalePipeline:
-        image_size = original_config.model.params.unet_config.params.image_size
-
-    # Convert the UNet2DConditionModel model.
-    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-    unet_config["upcast_attention"] = upcast_attention
-
-    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
-    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
-        checkpoint, unet_config, path=path, extract_ema=extract_ema
-    )
-
-    ctx = init_empty_weights if is_accelerate_available() else nullcontext
-    with ctx():
-        unet = UNet2DConditionModel(**unet_config)
-
-    if is_accelerate_available():
-        if model_type not in ["SDXL", "SDXL-Refiner"]:  # SBM Delay this.
-            for param_name, param in converted_unet_checkpoint.items():
-                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
-    else:
-        unet.load_state_dict(converted_unet_checkpoint)
-
-    # Convert the VAE model.
-    if vae_path is None and vae is None:
-        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-        if (
-            "model" in original_config
-            and "params" in original_config.model
-            and "scale_factor" in original_config.model.params
-        ):
-            vae_scaling_factor = original_config.model.params.scale_factor
-        else:
-            vae_scaling_factor = 0.18215  # default SD scaling factor
-
-        vae_config["scaling_factor"] = vae_scaling_factor
-
-        ctx = init_empty_weights if is_accelerate_available() else nullcontext
-        with ctx():
-            vae = AutoencoderKL(**vae_config)
-
-        if is_accelerate_available():
-            for param_name, param in converted_vae_checkpoint.items():
-                set_module_tensor_to_device(vae, param_name, "cpu", value=param)
-        else:
-            vae.load_state_dict(converted_vae_checkpoint)
-    elif vae is None:
-        vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only)
-
-    if model_type == "FrozenOpenCLIPEmbedder":
-        config_name = "stabilityai/stable-diffusion-2"
-        config_kwargs = {"subfolder": "text_encoder"}
-
-        if text_encoder is None:
-            text_model = convert_open_clip_checkpoint(
-                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
-            )
-        else:
-            text_model = text_encoder
-
-        try:
-            tokenizer = CLIPTokenizer.from_pretrained(
-                "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only
-            )
-        except Exception:
-            raise ValueError(
-                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'stabilityai/stable-diffusion-2'."
-            )
-
-        if stable_unclip is None:
-            if controlnet:
-                pipe = pipeline_class(
-                    vae=vae,
-                    text_encoder=text_model,
-                    tokenizer=tokenizer,
-                    unet=unet,
-                    scheduler=scheduler,
-                    controlnet=controlnet,
-                    safety_checker=None,
-                    feature_extractor=None,
-                )
-                if hasattr(pipe, "requires_safety_checker"):
-                    pipe.requires_safety_checker = False
-
-            elif pipeline_class == StableDiffusionUpscalePipeline:
-                scheduler = DDIMScheduler.from_pretrained(
-                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler"
-                )
-                low_res_scheduler = DDPMScheduler.from_pretrained(
-                    "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler"
-                )
-
-                pipe = pipeline_class(
-                    vae=vae,
-                    text_encoder=text_model,
-                    tokenizer=tokenizer,
-                    unet=unet,
-                    scheduler=scheduler,
-                    low_res_scheduler=low_res_scheduler,
-                    safety_checker=None,
-                    feature_extractor=None,
-                )
-
-            else:
-                pipe = pipeline_class(
-                    vae=vae,
-                    text_encoder=text_model,
-                    tokenizer=tokenizer,
-                    unet=unet,
-                    scheduler=scheduler,
-                    safety_checker=None,
-                    feature_extractor=None,
-                )
-                if hasattr(pipe, "requires_safety_checker"):
-                    pipe.requires_safety_checker = False
-
-        else:
-            image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
-                original_config, clip_stats_path=clip_stats_path, device=device
-            )
-
-            if stable_unclip == "img2img":
-                feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
-
-                pipe = StableUnCLIPImg2ImgPipeline(
-                    # image encoding components
-                    feature_extractor=feature_extractor,
-                    image_encoder=image_encoder,
-                    # image noising components
-                    image_normalizer=image_normalizer,
-                    image_noising_scheduler=image_noising_scheduler,
-                    # regular denoising components
-                    tokenizer=tokenizer,
-                    text_encoder=text_model,
-                    unet=unet,
-                    scheduler=scheduler,
-                    # vae
-                    vae=vae,
-                )
-            elif stable_unclip == "txt2img":
-                if stable_unclip_prior is None or stable_unclip_prior == "karlo":
-                    karlo_model = "kakaobrain/karlo-v1-alpha"
-                    prior = PriorTransformer.from_pretrained(
-                        karlo_model, subfolder="prior", local_files_only=local_files_only
-                    )
-
-                    try:
-                        prior_tokenizer = CLIPTokenizer.from_pretrained(
-                            "openai/clip-vit-large-patch14", local_files_only=local_files_only
-                        )
-                    except Exception:
-                        raise ValueError(
-                            f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
-                        )
-                    prior_text_model = CLIPTextModelWithProjection.from_pretrained(
-                        "openai/clip-vit-large-patch14", local_files_only=local_files_only
-                    )
-
-                    prior_scheduler = UnCLIPScheduler.from_pretrained(
-                        karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only
-                    )
-                    prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
-                else:
-                    raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
-
-                pipe = StableUnCLIPPipeline(
-                    # prior components
-                    prior_tokenizer=prior_tokenizer,
-                    prior_text_encoder=prior_text_model,
-                    prior=prior,
-                    prior_scheduler=prior_scheduler,
-                    # image noising components
-                    image_normalizer=image_normalizer,
-                    image_noising_scheduler=image_noising_scheduler,
-                    # regular denoising components
-                    tokenizer=tokenizer,
-                    text_encoder=text_model,
-                    unet=unet,
-                    scheduler=scheduler,
-                    # vae
-                    vae=vae,
-                )
-            else:
-                raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
-    elif model_type == "PaintByExample":
-        vision_model = convert_paint_by_example_checkpoint(checkpoint)
-        try:
-            tokenizer = CLIPTokenizer.from_pretrained(
-                "openai/clip-vit-large-patch14", local_files_only=local_files_only
-            )
-        except Exception:
-            raise ValueError(
-                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
-            )
-        try:
-            feature_extractor = AutoFeatureExtractor.from_pretrained(
-                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
-            )
-        except Exception:
-            raise ValueError(
-                f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'."
-            )
-        pipe = PaintByExamplePipeline(
-            vae=vae,
-            image_encoder=vision_model,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=None,
-            feature_extractor=feature_extractor,
-        )
-    elif model_type == "FrozenCLIPEmbedder":
-        text_model = convert_ldm_clip_checkpoint(
-            checkpoint, local_files_only=local_files_only, text_encoder=text_encoder
-        )
-        try:
-            tokenizer = (
-                CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
-                if tokenizer is None
-                else tokenizer
-            )
-        except Exception:
-            raise ValueError(
-                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
-            )
-
-        if load_safety_checker:
-            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
-                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
-            )
-            feature_extractor = AutoFeatureExtractor.from_pretrained(
-                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
-            )
-        else:
-            safety_checker = None
-            feature_extractor = None
-
-        if controlnet:
-            pipe = pipeline_class(
-                vae=vae,
-                text_encoder=text_model,
-                tokenizer=tokenizer,
-                unet=unet,
-                controlnet=controlnet,
-                scheduler=scheduler,
-                safety_checker=safety_checker,
-                feature_extractor=feature_extractor,
-            )
-        else:
-            pipe = pipeline_class(
-                vae=vae,
-                text_encoder=text_model,
-                tokenizer=tokenizer,
-                unet=unet,
-                scheduler=scheduler,
-                safety_checker=safety_checker,
-                feature_extractor=feature_extractor,
-            )
-    elif model_type in ["SDXL", "SDXL-Refiner"]:
-        is_refiner = model_type == "SDXL-Refiner"
-
-        if (is_refiner is False) and (tokenizer is None):
-            try:
-                tokenizer = CLIPTokenizer.from_pretrained(
-                    "openai/clip-vit-large-patch14", local_files_only=local_files_only
-                )
-            except Exception:
-                raise ValueError(
-                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
-                )
-
-        if (is_refiner is False) and (text_encoder is None):
-            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
-
-        if tokenizer_2 is None:
-            try:
-                tokenizer_2 = CLIPTokenizer.from_pretrained(
-                    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only
-                )
-            except Exception:
-                raise ValueError(
-                    f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'."
-                )
-
-        if text_encoder_2 is None:
-            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
-            config_kwargs = {"projection_dim": 1280}
-            prefix = "conditioner.embedders.0.model." if is_refiner else "conditioner.embedders.1.model."
-
-            text_encoder_2 = convert_open_clip_checkpoint(
-                checkpoint,
-                config_name,
-                prefix=prefix,
-                has_projection=True,
-                local_files_only=local_files_only,
-                **config_kwargs,
-            )
-
-        if is_accelerate_available():  # SBM Now move model to cpu.
-            for param_name, param in converted_unet_checkpoint.items():
-                set_module_tensor_to_device(unet, param_name, "cpu", value=param)
-
-        if controlnet:
-            pipe = pipeline_class(
-                vae=vae,
-                text_encoder=text_encoder,
-                tokenizer=tokenizer,
-                text_encoder_2=text_encoder_2,
-                tokenizer_2=tokenizer_2,
-                unet=unet,
-                controlnet=controlnet,
-                scheduler=scheduler,
-                force_zeros_for_empty_prompt=True,
-            )
-        elif adapter:
-            pipe = pipeline_class(
-                vae=vae,
-                text_encoder=text_encoder,
-                tokenizer=tokenizer,
-                text_encoder_2=text_encoder_2,
-                tokenizer_2=tokenizer_2,
-                unet=unet,
-                adapter=adapter,
-                scheduler=scheduler,
-                force_zeros_for_empty_prompt=True,
-            )
-
-        else:
-            pipeline_kwargs = {
-                "vae": vae,
-                "text_encoder": text_encoder,
-                "tokenizer": tokenizer,
-                "text_encoder_2": text_encoder_2,
-                "tokenizer_2": tokenizer_2,
-                "unet": unet,
-                "scheduler": scheduler,
-            }
-
-            if (pipeline_class == StableDiffusionXLImg2ImgPipeline) or (
-                pipeline_class == StableDiffusionXLInpaintPipeline
-            ):
-                pipeline_kwargs.update({"requires_aesthetics_score": is_refiner})
-
-            if is_refiner:
-                pipeline_kwargs.update({"force_zeros_for_empty_prompt": False})
-
-            pipe = pipeline_class(**pipeline_kwargs)
-    else:
-        text_config = create_ldm_bert_config(original_config)
-        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
-        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
-
-    return pipe

From 41e97e0d106e04957092aa074730b0138aa682db Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 07:27:25 +0000
Subject: [PATCH 22/89] update

---
 src/diffusers/loaders/single_file_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index c127a6b0964b..755c42ff4f67 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -1384,10 +1384,10 @@ def create_text_encoders_and_tokenizers(
                 checkpoint, local_files_only=local_files_only, text_encoder=None
             )
             tokenizer = CLIPTokenizer.from_pretrained(
-                config_name, subfolder="tokenizer", local_files_only=local_files_only
+                config_name, local_files_only=local_files_only
             )
 
-        except Exception:
+        except Exception as e:
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'."
             )

From 658d80f3cf89d51b1217953b200313439c9d86e5 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 08:00:17 +0000
Subject: [PATCH 23/89] update

---
 src/diffusers/loaders/single_file.py       | 67 ++++++++++++++++------
 src/diffusers/loaders/single_file_utils.py |  6 +-
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index f1efa29c2a71..e8e21edb7183 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -18,7 +18,9 @@
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import validate_hf_hub_args
 from safetensors.torch import load_file as safe_load
+from transformers import AutoFeatureExtractor
 
+from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..utils import (
     is_accelerate_available,
     is_transformers_available,
@@ -166,39 +168,69 @@ def build_component(
 
 
 def build_additional_components(
-    pipeline_components,
     pipeline_class_name,
-    component_name,
     original_config,
     checkpoint,
     checkpoint_path_or_dict,
     **kwargs,
 ):
-    if component_name in kwargs:
-        return kwargs.pop(component_name, None)
-
-    if component_name in pipeline_components:
-        return None
+    components = {}
+    load_safety_checker = kwargs.get("load_safety_checker", False)
+    local_files_only = kwargs.get("local_files_only", False)
 
     if pipeline_class_name == ["StableUnCLIPPipeline", "StableUnCLIPImg2ImgPipeline"]:
         stable_unclip_components = create_stable_unclip_components(
             pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
         )
-        return stable_unclip_components
+        components.update(stable_unclip_components)
 
     if pipeline_class_name == "PaintByExamplePipeline":
         paint_by_example_components = create_paint_by_example_components(
             pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
         )
-        return paint_by_example_components
+        return components.update(paint_by_example_components)
 
     if pipeline_class_name in ["StableDiffusionXLImg2ImgPipeline", "StableDiffusionXLInpaintPipeline"]:
         model_type = infer_model_type(pipeline_class_name, original_config)
         is_refiner = model_type == "SDXL-Refiner"
-        return {
-            "requires_aesthetics_score": is_refiner,
-            "force_zeros_for_empty_prompt": False if is_refiner else True,
-        }
+        components.update(
+            {
+                "requires_aesthetics_score": is_refiner,
+                "force_zeros_for_empty_prompt": False if is_refiner else True,
+            }
+        )
+
+    if pipeline_class_name in [
+        "StableDiffusionPipeline",
+        "StableDiffusionImg2ImgPipeline",
+        "StableDiffusionInpaintPipeline",
+        "StableDiffusionUpscalePipeline",
+        "StableDiffusionControlNetPipeline",
+        "StableDiffusionControlNetImg2ImgPipeline",
+        "StableDiffusionControlNetInpaintPipeline",
+        "StableDiffusionLDM3DPipeline",
+        "LatentConsistencyModelPipeline",
+        "LatentConsistencyModelImg2ImgPipeline",
+    ]:
+        if load_safety_checker:
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+        else:
+            safety_checker = None
+            feature_extractor = None
+
+        components.update(
+            {
+                "safety_checker": safety_checker,
+                "feature_extractor": feature_extractor,
+            }
+        )
+
+    return components
 
 
 class FromSingleFileMixin:
@@ -332,10 +364,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                 continue
             pipeline_components.update(components)
 
-        additional_components = set(pipeline_components.keys() - component_names)
+        additional_components = set(component_names - pipeline_components.keys())
         if additional_components:
-            components = build_additional_components(pipeline_name, component, checkpoint, original_config, **kwargs)
-            pipeline_components.update(components)
+            components = build_additional_components(
+                pipeline_name, original_config, checkpoint, pretrained_model_link_or_path, **kwargs
+            )
+            if components:
+                pipeline_components.update(components)
 
         pipe = cls(**pipeline_components)
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 755c42ff4f67..5598c786baaa 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -1383,11 +1383,9 @@ def create_text_encoders_and_tokenizers(
             text_encoder = convert_ldm_clip_checkpoint(
                 checkpoint, local_files_only=local_files_only, text_encoder=None
             )
-            tokenizer = CLIPTokenizer.from_pretrained(
-                config_name, local_files_only=local_files_only
-            )
+            tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
 
-        except Exception as e:
+        except Exception:
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'."
             )

From 5daf61a34288cc24b5936434df3be6772084071c Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 08:24:26 +0000
Subject: [PATCH 24/89] update

---
 src/diffusers/loaders/single_file_utils.py                    | 2 +-
 src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 5598c786baaa..af3b2b7fa878 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -224,7 +224,7 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
     model_type = infer_model_type(pipeline_class_name, original_config, **kwargs)
 
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
-        image_size = image_size or original_config.model.params.unet_config.params.image_size
+        image_size = original_config.model.params.unet_config.params.image_size
 
     if model_type in ["SDXL", "SDXL-Refiner"]:
         image_size = 1024
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index 5aa23252b86a..d80f4dc8631d 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -1436,6 +1436,7 @@ def download_from_original_stable_diffusion_ckpt(
 
     if pipeline_class == StableDiffusionUpscalePipeline:
         image_size = original_config.model.params.unet_config.params.image_size
+    import ipdb; ipdb.set_trace()
 
     # Convert the UNet2DConditionModel model.
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)

From af6cd361e2d122cac419d15efa07e21c45422699 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 08:45:48 +0000
Subject: [PATCH 25/89] update

---
 src/diffusers/loaders/single_file.py          |  2 +-
 src/diffusers/loaders/single_file_utils.py    | 27 +++++++++----------
 .../stable_diffusion/convert_from_ckpt.py     |  1 -
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index e8e21edb7183..f827b8ca4535 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -346,7 +346,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
-        original_config = fetch_original_config(checkpoint, original_config_file, config_files)
+        original_config = fetch_original_config(pipeline_name, checkpoint, original_config_file, config_files)
         component_names = extract_pipeline_component_names(cls)
 
         pipeline_components = {}
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index af3b2b7fa878..fe8e7d3099b7 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -110,7 +110,7 @@
 textenc_pattern = re.compile("|".join(protected.keys()))
 
 
-def fetch_original_config_file_from_url(checkpoint):
+def fetch_original_config_file_from_url(pipeline_class_name, checkpoint):
     if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
         config_url = CONFIG_URLS["v2"]
 
@@ -120,44 +120,41 @@ def fetch_original_config_file_from_url(checkpoint):
     elif CHECKPOINT_KEY_NAMES["xl_refiner"] in checkpoint:
         config_url = CONFIG_URLS["xl_refiner"]
 
+    elif pipeline_class_name == "StableDiffusionUpscalePipeline":
+        config_url = CONFIG_URLS["upscale"]
+
     else:
         config_url = CONFIG_URLS["v1"]
 
-    # TODO: Add upscale config
-
     original_config_file = BytesIO(requests.get(config_url).content)
 
     return original_config_file
 
 
-def fetch_original_config_file_from_file(checkpoint, config_files: list):
-    if "v1" in config_files:
-        return config_files["v1"]
-
+def fetch_original_config_file_from_file(config_files: list):
     if "v2" in config_files:
         return config_files["v2"]
 
-    if "xl" in config_files:
+    elif "xl" in config_files:
         return config_files["xl"]
 
-    if "xl_refiner" in config_files:
+    elif "xl_refiner" in config_files:
         return config_files["xl_refiner"]
 
-    # TODO: Add upscale config
-
-    return
+    else:
+        return config_files["v1"]
 
 
-def fetch_original_config(checkpoint, original_config_file=None, config_files=None):
+def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None, config_files=None):
     if original_config_file:
         original_config = OmegaConf.load(original_config_file)
         return original_config
 
     elif config_files:
-        original_config_file = fetch_original_config_file_from_file(checkpoint, config_files)
+        original_config_file = fetch_original_config_file_from_file(config_files)
 
     else:
-        original_config_file = fetch_original_config_file_from_url(checkpoint)
+        original_config_file = fetch_original_config_file_from_url(pipeline_class_name, checkpoint)
 
     original_config = OmegaConf.load(original_config_file)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index d80f4dc8631d..5aa23252b86a 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -1436,7 +1436,6 @@ def download_from_original_stable_diffusion_ckpt(
 
     if pipeline_class == StableDiffusionUpscalePipeline:
         image_size = original_config.model.params.unet_config.params.image_size
-    import ipdb; ipdb.set_trace()
 
     # Convert the UNet2DConditionModel model.
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)

From 6d743eff0ba5783dc9ab6c614da405c96d4b128e Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 08:48:00 +0000
Subject: [PATCH 26/89] update

---
 src/diffusers/loaders/single_file_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index fe8e7d3099b7..d9c8b12d532c 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -223,7 +223,7 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
         image_size = original_config.model.params.unet_config.params.image_size
 
-    if model_type in ["SDXL", "SDXL-Refiner"]:
+    elif model_type in ["SDXL", "SDXL-Refiner"]:
         image_size = 1024
 
     elif (

From b7732a07efd8fbb260c3bcd3a93b2a34e4dbc2d2 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 09:21:12 +0000
Subject: [PATCH 27/89] update

---
 src/diffusers/loaders/single_file_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index d9c8b12d532c..9ee22ad43f90 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -222,9 +222,11 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
 
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
         image_size = original_config.model.params.unet_config.params.image_size
+        return image_size
 
     elif model_type in ["SDXL", "SDXL-Refiner"]:
         image_size = 1024
+        return image_size
 
     elif (
         "parameterization" in original_config["model"]["params"]
@@ -233,6 +235,7 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
         # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
         # as it relies on a brittle global step parameter here
         image_size = 512 if global_step == 875000 else 768
+        return image_size
 
     return image_size
 

From 9d10d2d28a3e5a4f0b2a8008366e5c1825730e30 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 09:23:18 +0000
Subject: [PATCH 28/89] update

---
 src/diffusers/loaders/single_file.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index f827b8ca4535..43924cb4ecdd 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -146,12 +146,6 @@ def build_component(
         )
         return vae_components
 
-    if component_name == "controlnet":
-        controlnet_components = create_controlnet_model(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-        )
-        return controlnet_components
-
     if component_name == "scheduler":
         scheduler_components = create_scheduler(
             pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs

From 820313b8f40b12979031983c4eab7b4446d98bfe Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 09:30:59 +0000
Subject: [PATCH 29/89] update

---
 src/diffusers/loaders/single_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 43924cb4ecdd..4d93ea74587a 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -304,7 +304,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         local_files_only = kwargs.pop("local_files_only", None)
         revision = kwargs.pop("revision", None)
         torch_dtype = kwargs.pop("torch_dtype", None)
-        use_safetensors = kwargs.pop("use_safetensors", None)
+        use_safetensors = kwargs.pop("use_safetensors", True)
 
         pipeline_name = cls.__name__
         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]

From efc638061562e34d888965aeb52ef76040cd243c Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 09:52:49 +0000
Subject: [PATCH 30/89] update

---
 src/diffusers/loaders/single_file.py | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 4d93ea74587a..e5cfb2933246 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -27,10 +27,7 @@
     logging,
 )
 from .single_file_utils import (
-    create_controlnet_model,
-    create_paint_by_example_components,
     create_scheduler,
-    create_stable_unclip_components,
     create_text_encoders_and_tokenizers,
     create_unet_model,
     create_vae_model,
@@ -164,27 +161,17 @@ def build_component(
 def build_additional_components(
     pipeline_class_name,
     original_config,
-    checkpoint,
-    checkpoint_path_or_dict,
     **kwargs,
 ):
     components = {}
     load_safety_checker = kwargs.get("load_safety_checker", False)
     local_files_only = kwargs.get("local_files_only", False)
 
-    if pipeline_class_name == ["StableUnCLIPPipeline", "StableUnCLIPImg2ImgPipeline"]:
-        stable_unclip_components = create_stable_unclip_components(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-        )
-        components.update(stable_unclip_components)
-
-    if pipeline_class_name == "PaintByExamplePipeline":
-        paint_by_example_components = create_paint_by_example_components(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-        )
-        return components.update(paint_by_example_components)
-
-    if pipeline_class_name in ["StableDiffusionXLImg2ImgPipeline", "StableDiffusionXLInpaintPipeline"]:
+    if pipeline_class_name in [
+        "StableDiffusionXLImg2ImgPipeline",
+        "StableDiffusionXLInpaintPipeline",
+        "StableDiffusionXLControlNetImg2ImgPipeline",
+    ]:
         model_type = infer_model_type(pipeline_class_name, original_config)
         is_refiner = model_type == "SDXL-Refiner"
         components.update(
@@ -360,9 +347,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         additional_components = set(component_names - pipeline_components.keys())
         if additional_components:
-            components = build_additional_components(
-                pipeline_name, original_config, checkpoint, pretrained_model_link_or_path, **kwargs
-            )
+            components = build_additional_components(pipeline_name, original_config, **kwargs)
             if components:
                 pipeline_components.update(components)
 

From 94536262cb8939dd974f1c34f76f4b5e87912055 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 10:45:09 +0000
Subject: [PATCH 31/89] up

---
 src/diffusers/loaders/single_file.py | 48 ++++++++++++++++------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index e5cfb2933246..4ed845057ff0 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -47,6 +47,27 @@
 
 VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
 
+# Pipelines where safety_checker is a required argument
+SAFETY_CHECKER_PIPELINES = [
+    "StableDiffusionPipeline",
+    "StableDiffusionImg2ImgPipeline",
+    "StableDiffusionInpaintPipeline",
+    "StableDiffusionUpscalePipeline",
+    "StableDiffusionControlNetPipeline",
+    "StableDiffusionControlNetImg2ImgPipeline",
+    "StableDiffusionControlNetInpaintPipeline",
+    "StableDiffusionLDM3DPipeline",
+    "LatentConsistencyModelPipeline",
+    "LatentConsistencyModelImg2ImgPipeline",
+]
+
+# Pipelines that support the SDXL Refiner checkpoint
+REFINER_PIPELINES = [
+    "StableDiffusionXLImg2ImgPipeline",
+    "StableDiffusionXLInpaintPipeline",
+    "StableDiffusionXLControlNetImg2ImgPipeline",
+]
+
 
 def extract_pipeline_component_names(pipeline_class):
     components = inspect.signature(pipeline_class).parameters.keys()
@@ -167,11 +188,7 @@ def build_additional_components(
     load_safety_checker = kwargs.get("load_safety_checker", False)
     local_files_only = kwargs.get("local_files_only", False)
 
-    if pipeline_class_name in [
-        "StableDiffusionXLImg2ImgPipeline",
-        "StableDiffusionXLInpaintPipeline",
-        "StableDiffusionXLControlNetImg2ImgPipeline",
-    ]:
+    if pipeline_class_name in REFINER_PIPELINES:
         model_type = infer_model_type(pipeline_class_name, original_config)
         is_refiner = model_type == "SDXL-Refiner"
         components.update(
@@ -181,18 +198,7 @@ def build_additional_components(
             }
         )
 
-    if pipeline_class_name in [
-        "StableDiffusionPipeline",
-        "StableDiffusionImg2ImgPipeline",
-        "StableDiffusionInpaintPipeline",
-        "StableDiffusionUpscalePipeline",
-        "StableDiffusionControlNetPipeline",
-        "StableDiffusionControlNetImg2ImgPipeline",
-        "StableDiffusionControlNetInpaintPipeline",
-        "StableDiffusionLDM3DPipeline",
-        "LatentConsistencyModelPipeline",
-        "LatentConsistencyModelImg2ImgPipeline",
-    ]:
+    if pipeline_class_name in SAFETY_CHECKER_PIPELINES:
         if load_safety_checker:
             safety_checker = StableDiffusionSafetyChecker.from_pretrained(
                 "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
@@ -293,7 +299,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         torch_dtype = kwargs.pop("torch_dtype", None)
         use_safetensors = kwargs.pop("use_safetensors", True)
 
-        pipeline_name = cls.__name__
+        pipeline_class_name = cls.__name__
         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
         from_safetensors = file_extension == "safetensors"
 
@@ -327,14 +333,14 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
-        original_config = fetch_original_config(pipeline_name, checkpoint, original_config_file, config_files)
+        original_config = fetch_original_config(pipeline_class_name, checkpoint, original_config_file, config_files)
         component_names = extract_pipeline_component_names(cls)
 
         pipeline_components = {}
         for component in component_names:
             components = build_component(
                 pipeline_components,
-                pipeline_name,
+                pipeline_class_name,
                 component,
                 original_config,
                 checkpoint,
@@ -347,7 +353,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         additional_components = set(component_names - pipeline_components.keys())
         if additional_components:
-            components = build_additional_components(pipeline_name, original_config, **kwargs)
+            components = build_additional_components(pipeline_class_name, original_config, **kwargs)
             if components:
                 pipeline_components.update(components)
 

From afa62e6fa82fdcdfa4aaeabe5b855c85406391e7 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 11:11:25 +0000
Subject: [PATCH 32/89] update

---
 src/diffusers/loaders/single_file.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 4ed845057ff0..82d29ce73f68 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -75,14 +75,15 @@ def extract_pipeline_component_names(pipeline_class):
 
 
 def check_valid_url(pretrained_model_link_or_path):
-    # remove huggingface url
+    # check if url prefix is valid
+    # remove huggingface url prefix from model path
     has_valid_url_prefix = False
     for prefix in VALID_URL_PREFIXES:
         if pretrained_model_link_or_path.startswith(prefix):
             pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
             has_valid_url_prefix = True
 
-    return has_valid_url_prefix
+    return has_valid_url_prefix, pretrained_model_link_or_path
 
 
 def download_model_checkpoint(
@@ -306,7 +307,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         if from_safetensors and use_safetensors is False:
             raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
 
-        has_valid_url_prefix = check_valid_url(pretrained_model_link_or_path)
+        has_valid_url_prefix, pretrained_model_link_or_path = check_valid_url(pretrained_model_link_or_path)
 
         # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
         ckpt_path = Path(pretrained_model_link_or_path)
@@ -314,7 +315,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             raise ValueError(
                 f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
             )
-        if not ckpt_path.is_file():
+        if ckpt_path.is_file():
+            checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
+        else:
             pretrained_model_link_or_path = download_model_checkpoint(
                 ckpt_path,
                 cache_dir=cache_dir,
@@ -325,8 +328,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                 revision=revision,
             )
             checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
-        else:
-            checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
 
         # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
         # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21

From e033f9f6084c4bedd6530fc20ac534b15d74d086 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 11:28:21 +0000
Subject: [PATCH 33/89] update

---
 src/diffusers/loaders/single_file.py       | 39 ++++++++++------------
 src/diffusers/loaders/single_file_utils.py |  2 +-
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 82d29ce73f68..fd5a955316ee 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -153,6 +153,9 @@ def build_component(
     if component_name in pipeline_components:
         return {}
 
+    load_safety_checker = kwargs.get("load_safety_checker", False)
+    local_files_only = kwargs.get("local_files_only", False)
+
     if component_name == "unet":
         unet_components = create_unet_model(
             pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
@@ -177,6 +180,20 @@ def build_component(
         )
         return text_encoder_components
 
+    if component_name == "safety_checker":
+        if load_safety_checker:
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+            )
+        else:
+            safety_checker = None
+            feature_extractor = None
+
+        return {"safety_checker": safety_checker, "feature_extractor": feature_extractor}
+
     return
 
 
@@ -186,9 +203,6 @@ def build_additional_components(
     **kwargs,
 ):
     components = {}
-    load_safety_checker = kwargs.get("load_safety_checker", False)
-    local_files_only = kwargs.get("local_files_only", False)
-
     if pipeline_class_name in REFINER_PIPELINES:
         model_type = infer_model_type(pipeline_class_name, original_config)
         is_refiner = model_type == "SDXL-Refiner"
@@ -199,25 +213,6 @@ def build_additional_components(
             }
         )
 
-    if pipeline_class_name in SAFETY_CHECKER_PIPELINES:
-        if load_safety_checker:
-            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
-                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
-            )
-            feature_extractor = AutoFeatureExtractor.from_pretrained(
-                "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
-            )
-        else:
-            safety_checker = None
-            feature_extractor = None
-
-        components.update(
-            {
-                "safety_checker": safety_checker,
-                "feature_extractor": feature_extractor,
-            }
-        )
-
     return components
 
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 9ee22ad43f90..f6e94ebc25d8 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -178,7 +178,7 @@ def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True)
     return checkpoint
 
 
-def infer_model_type(pipeline_class_name, original_config, model_type=None):
+def infer_model_type(pipeline_class_name, original_config, model_type=None, **kwargs):
     if model_type is not None:
         return model_type
 

From c0d62ac97efbcd19de85d8e0e834ff232440d981 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 11:39:00 +0000
Subject: [PATCH 34/89] update

---
 src/diffusers/loaders/single_file.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index fd5a955316ee..002f867653ea 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -148,7 +148,8 @@ def build_component(
     **kwargs,
 ):
     if component_name in kwargs:
-        return kwargs.pop(component_name, None)
+        component = kwargs.pop(component_name, None)
+        return {component_name: component}
 
     if component_name in pipeline_components:
         return {}

From 9605db5517167428aa54b0f2b1419ce7dac5f747 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 11:50:44 +0000
Subject: [PATCH 35/89] update

---
 src/diffusers/loaders/single_file.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 002f867653ea..228345cd80cb 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -46,21 +46,6 @@
 
 
 VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
-
-# Pipelines where safety_checker is a required argument
-SAFETY_CHECKER_PIPELINES = [
-    "StableDiffusionPipeline",
-    "StableDiffusionImg2ImgPipeline",
-    "StableDiffusionInpaintPipeline",
-    "StableDiffusionUpscalePipeline",
-    "StableDiffusionControlNetPipeline",
-    "StableDiffusionControlNetImg2ImgPipeline",
-    "StableDiffusionControlNetInpaintPipeline",
-    "StableDiffusionLDM3DPipeline",
-    "LatentConsistencyModelPipeline",
-    "LatentConsistencyModelImg2ImgPipeline",
-]
-
 # Pipelines that support the SDXL Refiner checkpoint
 REFINER_PIPELINES = [
     "StableDiffusionXLImg2ImgPipeline",

From e945e18637e926ce8181a1b823748c223379b988 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Sat, 30 Dec 2023 12:24:53 +0000
Subject: [PATCH 36/89] update

---
 .../test_stable_diffusion_xl.py               | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
index 80bff3663a98..1cd8636df317 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
@@ -1049,3 +1049,26 @@ def test_stable_diffusion_lcm(self):
         max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
 
         assert max_diff < 1e-2
+
+    def test_download_ckpt_diff_format_is_same(self):
+        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+
+        pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_ckpt = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
+
+        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
+
+        assert max_diff < 1e-3

From fa3a0d66345e8883f89784ee94757a0c475467b5 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 10:44:24 +0000
Subject: [PATCH 37/89] update

---
 .../controlnet/test_controlnet_sdxl.py        | 44 ++++++++++++++++++-
 .../test_stable_diffusion_xl.py               | 20 +++++----
 .../test_stable_diffusion_xl_img2img.py       | 42 ++++++++++++++++++
 3 files changed, 97 insertions(+), 9 deletions(-)

diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py
index ba129e763c22..f91208c56fb4 100644
--- a/tests/pipelines/controlnet/test_controlnet_sdxl.py
+++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py
@@ -31,7 +31,14 @@
 from diffusers.models.unet_2d_blocks import UNetMidBlock2D
 from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_image,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
 from diffusers.utils.torch_utils import randn_tensor
 
 from ..pipeline_params import (
@@ -819,6 +826,41 @@ def test_depth(self):
         expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853])
         assert np.allclose(original_image, expected_image, atol=1e-04)
 
+    def test_download_ckpt_diff_format_is_same(self):
+        controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0")
+        single_file_url = (
+            "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
+        )
+        pipe_single_file = StableDiffusionXLControlNetPipeline.from_single_file(
+            single_file_url, controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe_single_file.unet.set_default_attn_processor()
+        pipe_single_file.enable_model_cpu_offload()
+        pipe_single_file.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "Stormtrooper's lecture"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
+        )
+        single_file_images = pipe_single_file(
+            prompt, image=image, generator=generator, output_type="np", num_inference_steps=3
+        ).images
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images[0].shape == (512, 512, 3)
+        assert single_file_images[0].shape == (512, 512, 3)
+
+        max_diff = numpy_cosine_similarity_distance(images[0].flatten(), single_file_images[0].flatten())
+        assert max_diff < 1e-4
+
 
 class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNetPipelineFastTests):
     def test_controlnet_sdxl_guess(self):
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
index 1cd8636df317..70a1c0d86fd5 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
@@ -1051,24 +1051,28 @@ def test_stable_diffusion_lcm(self):
         assert max_diff < 1e-2
 
     def test_download_ckpt_diff_format_is_same(self):
-        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+        ckpt_path = (
+            "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
+        )
 
-        pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path)
+        pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_attn_processor(AttnProcessor())
-        pipe.to("cuda")
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
 
         generator = torch.Generator(device="cpu").manual_seed(0)
         image_ckpt = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
 
-        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        )
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_attn_processor(AttnProcessor())
-        pipe.to("cuda")
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
 
         generator = torch.Generator(device="cpu").manual_seed(0)
         image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
 
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
 
-        assert max_diff < 1e-3
+        assert max_diff < 6e-3
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index 0a7d4d0de4ca..893118080803 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import random
 import unittest
 
@@ -31,15 +32,19 @@
 from diffusers import (
     AutoencoderKL,
     AutoencoderTiny,
+    DDIMScheduler,
     EulerDiscreteScheduler,
     LCMScheduler,
     StableDiffusionXLImg2ImgPipeline,
     UNet2DConditionModel,
 )
+from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     floats_tensor,
+    numpy_cosine_similarity_distance,
     require_torch_gpu,
+    slow,
     torch_device,
 )
 
@@ -763,3 +768,40 @@ def test_inference_batch_single_identical(self):
 
     def test_save_load_optional_components(self):
         self._test_save_load_optional_components()
+
+
+@slow
+class StableDiffusionXLImg2ImgIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_download_ckpt_diff_format_is_same(self):
+        ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors"
+        init_image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+            "/stable_diffusion_img2img/sketch-mountains-input.png"
+        )
+
+        pipe = StableDiffusionXLImg2ImgPipeline.from_single_file(ckpt_path)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_ckpt = pipe(
+            "mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
+        ).images[0]
+
+        pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.enable_model_cpu_offload()
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image = pipe(
+            "mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
+        ).images[0]
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
+
+        assert max_diff < 1e-3

From bbc60be3b67b825a098d631301ef7b2a629d37dd Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 11:32:10 +0000
Subject: [PATCH 38/89] update

---
 .../controlnet/test_controlnet_sdxl.py        |  8 ++--
 .../test_stable_diffusion_xl_adapter.py       | 37 +++++++++++++++++++
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py
index f91208c56fb4..c31c728c5bde 100644
--- a/tests/pipelines/controlnet/test_controlnet_sdxl.py
+++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py
@@ -827,7 +827,7 @@ def test_depth(self):
         assert np.allclose(original_image, expected_image, atol=1e-04)
 
     def test_download_ckpt_diff_format_is_same(self):
-        controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0")
+        controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16)
         single_file_url = (
             "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
         )
@@ -844,7 +844,7 @@ def test_download_ckpt_diff_format_is_same(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
         )
         single_file_images = pipe_single_file(
-            prompt, image=image, generator=generator, output_type="np", num_inference_steps=3
+            prompt, image=image, generator=generator, output_type="np", num_inference_steps=2
         ).images
 
         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -853,13 +853,13 @@ def test_download_ckpt_diff_format_is_same(self):
         )
         pipe.unet.set_default_attn_processor()
         pipe.enable_model_cpu_offload()
-        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=2).images
 
         assert images[0].shape == (512, 512, 3)
         assert single_file_images[0].shape == (512, 512, 3)
 
         max_diff = numpy_cosine_similarity_distance(images[0].flatten(), single_file_images[0].flatten())
-        assert max_diff < 1e-4
+        assert max_diff < 5e-2
 
 
 class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNetPipelineFastTests):
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py
index d1920d59b447..b500612116d1 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py
@@ -697,3 +697,40 @@ def test_canny_lora(self):
         image_slice = images[0, -3:, -3:, -1].flatten()
         expected_slice = np.array([0.4284, 0.4337, 0.4319, 0.4255, 0.4329, 0.4280, 0.4338, 0.4420, 0.4226])
         assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4
+
+    def test_download_ckpt_diff_format_is_same(self):
+        ckpt_path = (
+            "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
+        )
+        adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16)
+        prompt = "toy"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/toy_canny.png"
+        )
+        pipe_single_file = StableDiffusionXLAdapterPipeline.from_single_file(
+            ckpt_path,
+            adapter=adapter,
+            torch_dtype=torch.float16,
+        )
+        pipe_single_file.enable_model_cpu_offload()
+        pipe_single_file.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        images_single_file = pipe_single_file(
+            prompt, image=image, generator=generator, output_type="np", num_inference_steps=3
+        ).images
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            adapter=adapter,
+            torch_dtype=torch.float16,
+        )
+        pipe.enable_model_cpu_offload()
+        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
+
+        assert images_single_file[0].shape == (768, 512, 3)
+        assert images[0].shape == (768, 512, 3)
+
+        max_diff = numpy_cosine_similarity_distance(images[0].flatten(), images_single_file[0].flatten())
+        assert max_diff < 5e-3

From b69cddb0550cbab718ec5fe956ce47b6891ed087 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 11:44:25 +0000
Subject: [PATCH 39/89] update

---
 src/diffusers/loaders/single_file_utils.py | 118 +++------------------
 1 file changed, 15 insertions(+), 103 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index f6e94ebc25d8..4b7f4cafac31 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -24,7 +24,6 @@
 from omegaconf import OmegaConf
 from safetensors.torch import load_file as safe_load
 from transformers import (
-    AutoFeatureExtractor,
     BertTokenizerFast,
     CLIPImageProcessor,
     CLIPTextConfig,
@@ -35,7 +34,7 @@
     CLIPVisionModelWithProjection,
 )
 
-from ..models import AutoencoderKL, ControlNetModel, PriorTransformer, UNet2DConditionModel
+from ..models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
 from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from ..pipelines.paint_by_example import PaintByExampleImageEncoder
 from ..pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
@@ -240,6 +239,7 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
     return image_size
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
 def shave_segments(path, n_shave_prefix_segments=1):
     """
     Removes segments. Positive values shave the first segments, negative shave the last segments.
@@ -250,6 +250,7 @@ def shave_segments(path, n_shave_prefix_segments=1):
         return ".".join(path.split(".")[:n_shave_prefix_segments])
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths
 def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside resnets to the new naming scheme (local renaming)
@@ -272,6 +273,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
     return mapping
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
 def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside resnets to the new naming scheme (local renaming)
@@ -288,6 +290,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
     return mapping
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths
 def renew_attention_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside attentions to the new naming scheme (local renaming)
@@ -309,6 +312,7 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0):
     return mapping
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths
 def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside attentions to the new naming scheme (local renaming)
@@ -339,6 +343,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
     return mapping
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint
 def assign_to_checkpoint(
     paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
 ):
@@ -394,6 +399,7 @@ def assign_to_checkpoint(
             checkpoint[new_path] = old_checkpoint[path["old"]]
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
 def conv_attn_to_linear(checkpoint):
     keys = list(checkpoint.keys())
     attn_keys = ["query.weight", "key.weight", "value.weight"]
@@ -406,6 +412,7 @@ def conv_attn_to_linear(checkpoint):
                 checkpoint[key] = checkpoint[key][:, :, 0]
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_unet_diffusers_config
 def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
     """
     Creates a config for the diffusers based on the config of the LDM model.
@@ -510,6 +517,7 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa
     return config
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config
 def create_vae_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.
@@ -534,6 +542,7 @@ def create_vae_diffusers_config(original_config, image_size: int):
     return config
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_unet_checkpoint
 def convert_ldm_unet_checkpoint(
     checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
 ):
@@ -782,6 +791,7 @@ def convert_ldm_unet_checkpoint(
     return new_checkpoint
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint
 def convert_ldm_vae_checkpoint(checkpoint, config):
     # extract state dict for VAE
     vae_state_dict = {}
@@ -889,6 +899,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     return new_checkpoint
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_bert_checkpoint
 def convert_ldm_bert_checkpoint(checkpoint, config):
     def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
         hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
@@ -939,6 +950,7 @@ def _copy_layers(hf_layers, pt_layers):
     return hf_model
 
 
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_clip_checkpoint
 def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
     if text_encoder is None:
         config_name = "openai/clip-vit-large-patch14"
@@ -978,56 +990,7 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder
     return text_model
 
 
-def convert_controlnet_checkpoint(
-    checkpoint,
-    original_config,
-    checkpoint_path,
-    image_size,
-    upcast_attention,
-    extract_ema,
-    use_linear_projection=None,
-    cross_attention_dim=None,
-):
-    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
-    ctrlnet_config["upcast_attention"] = upcast_attention
-
-    ctrlnet_config.pop("sample_size")
-
-    if use_linear_projection is not None:
-        ctrlnet_config["use_linear_projection"] = use_linear_projection
-
-    if cross_attention_dim is not None:
-        ctrlnet_config["cross_attention_dim"] = cross_attention_dim
-
-    ctx = init_empty_weights if is_accelerate_available() else nullcontext
-    with ctx():
-        controlnet = ControlNetModel(**ctrlnet_config)
-
-    # Some controlnet ckpt files are distributed independently from the rest of the
-    # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
-    if "time_embed.0.weight" in checkpoint:
-        skip_extract_state_dict = True
-    else:
-        skip_extract_state_dict = False
-
-    converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(
-        checkpoint,
-        ctrlnet_config,
-        path=checkpoint_path,
-        extract_ema=extract_ema,
-        controlnet=True,
-        skip_extract_state_dict=skip_extract_state_dict,
-    )
-
-    if is_accelerate_available():
-        for param_name, param in converted_ctrl_checkpoint.items():
-            set_module_tensor_to_device(controlnet, param_name, "cpu", value=param)
-    else:
-        controlnet.load_state_dict(converted_ctrl_checkpoint)
-
-    return controlnet
-
-
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_open_clip_checkpoint
 def convert_open_clip_checkpoint(
     checkpoint,
     config_name,
@@ -1312,28 +1275,6 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     return {"unet": unet}
 
 
-def create_controlnet_model(
-    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, image_size, **kwargs
-):
-    if "control_stage_config" not in original_config.model.params:
-        raise ValueError("Config does not have controlnet information")
-
-    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
-    extract_ema = kwargs.get("extract_ema", False)
-    upcast_attention = kwargs.get("upcast_attention", False)
-
-    controlnet = convert_controlnet_checkpoint(
-        checkpoint,
-        original_config,
-        path,
-        image_size,
-        upcast_attention,
-        extract_ema,
-    )
-
-    return {"controlnet": controlnet}
-
-
 def create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
 
@@ -1601,32 +1542,3 @@ def create_stable_unclip_components(
         }
 
     return
-
-
-def create_paint_by_example_components(
-    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-):
-    local_files_only = kwargs.get("local_files_only", False)
-    image_encoder = convert_paint_by_example_checkpoint(checkpoint)
-
-    try:
-        config_name = "openai/clip-vit-large-patch14"
-        tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
-    except Exception:
-        raise ValueError(
-            f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'."
-        )
-
-    try:
-        config_name = "CompVis/stable-diffusion-safety-checker"
-        feature_extractor = AutoFeatureExtractor.from_pretrained(config_name, local_files_only=local_files_only)
-    except Exception:
-        raise ValueError(
-            f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'."
-        )
-
-    return {
-        "image_encoder": image_encoder,
-        "tokenizer": tokenizer,
-        "feature_extractor": feature_extractor,
-    }

From 3ae0b8375c7355d9debf76499722e5177af7ee06 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 12:10:43 +0000
Subject: [PATCH 40/89] update

---
 .../stable_diffusion_xl/test_stable_diffusion_xl.py         | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
index 70a1c0d86fd5..d5ad5ee0d72b 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import copy
+import gc
 import tempfile
 import unittest
 
@@ -1024,6 +1025,11 @@ def callback_on_step_end(pipe, i, t, callback_kwargs):
 
 @slow
 class StableDiffusionXLPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
     def test_stable_diffusion_lcm(self):
         torch.manual_seed(0)
         unet = UNet2DConditionModel.from_pretrained(

From 6c19f0a6bc5b8b2f7501ee1ec381bacb99f4fe84 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 12:38:43 +0000
Subject: [PATCH 41/89] update

---
 src/diffusers/loaders/single_file_utils.py    |  3 +++
 .../test_stable_diffusion_xl_img2img.py       | 20 +++++++++----------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 4b7f4cafac31..b8c2dcba3430 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -61,6 +61,7 @@
     "v1": "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml",
     "v2": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml",
     "xl": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml",
+    "xl_refiner": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml",
     "upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml",
 }
 
@@ -1355,6 +1356,8 @@ def create_text_encoders_and_tokenizers(
 
         else:
             return {
+                "text_encoder": None,
+                "tokenizer": None,
                 "tokenizer_2": tokenizer_2,
                 "text_encoder_2": text_encoder_2,
             }
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index 893118080803..33986b077ac6 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -784,24 +784,24 @@ def test_download_ckpt_diff_format_is_same(self):
             "/stable_diffusion_img2img/sketch-mountains-input.png"
         )
 
-        pipe = StableDiffusionXLImg2ImgPipeline.from_single_file(ckpt_path)
+        pipe_single_file = StableDiffusionXLImg2ImgPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
+        pipe_single_file.scheduler = DDIMScheduler.from_config(pipe_single_file.scheduler.config)
+        pipe_single_file.enable_model_cpu_offload()
+
+        pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16)
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
         pipe.enable_model_cpu_offload()
 
         generator = torch.Generator(device="cpu").manual_seed(0)
-        image_ckpt = pipe(
-            "mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
+        image = pipe(
+            prompt="mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
         ).images[0]
 
-        pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0")
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-
         generator = torch.Generator(device="cpu").manual_seed(0)
-        image = pipe(
-            "mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
+        image_single_file = pipe_single_file(
+            prompt="mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
         ).images[0]
 
-        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
 
         assert max_diff < 1e-3

From ba704fd4dd544760f3bf4571f02f1d5296bc99bf Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 13:49:47 +0000
Subject: [PATCH 42/89] update

---
 .../test_stable_diffusion_xl_img2img.py        | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index 33986b077ac6..6d3d42b7a638 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -778,30 +778,32 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     def test_download_ckpt_diff_format_is_same(self):
-        ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors"
+        ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0_0.9vae.safetensors"
         init_image = load_image(
             "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
             "/stable_diffusion_img2img/sketch-mountains-input.png"
         )
 
-        pipe_single_file = StableDiffusionXLImg2ImgPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
-        pipe_single_file.scheduler = DDIMScheduler.from_config(pipe_single_file.scheduler.config)
-        pipe_single_file.enable_model_cpu_offload()
-
         pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16)
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_default_attn_processor()
         pipe.enable_model_cpu_offload()
 
         generator = torch.Generator(device="cpu").manual_seed(0)
         image = pipe(
-            prompt="mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
+            prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np"
         ).images[0]
 
+        pipe_single_file = StableDiffusionXLImg2ImgPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
+        pipe_single_file.scheduler = DDIMScheduler.from_config(pipe_single_file.scheduler.config)
+        pipe_single_file.unet.set_default_attn_processor()
+        pipe_single_file.enable_model_cpu_offload()
+
         generator = torch.Generator(device="cpu").manual_seed(0)
         image_single_file = pipe_single_file(
-            prompt="mountains", image=init_image, num_inference_steps=2, generator=generator, output_type="np"
+            prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np"
         ).images[0]
 
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
 
-        assert max_diff < 1e-3
+        assert max_diff < 5e-3

From f3045286f5506b88c86700167f4104a2e03528b4 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 13:49:56 +0000
Subject: [PATCH 43/89] tests: wrap long from_pretrained call in SDXL img2img single-file test

---
 .../stable_diffusion_xl/test_stable_diffusion_xl_img2img.py   | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index 6d3d42b7a638..97910b124eff 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -784,7 +784,9 @@ def test_download_ckpt_diff_format_is_same(self):
             "/stable_diffusion_img2img/sketch-mountains-input.png"
         )
 
-        pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16)
+        pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16
+        )
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
         pipe.unet.set_default_attn_processor()
         pipe.enable_model_cpu_offload()

From 3c806be1cd37831487690223d395d0bd4be88683 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 13:51:48 +0000
Subject: [PATCH 44/89] tests: switch SDXL img2img single-file checkpoint back to sd_xl_refiner_1.0.safetensors

---
 .../stable_diffusion_xl/test_stable_diffusion_xl_img2img.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index 97910b124eff..830ecc5e6baf 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -778,7 +778,7 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     def test_download_ckpt_diff_format_is_same(self):
-        ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0_0.9vae.safetensors"
+        ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors"
         init_image = load_image(
             "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
             "/stable_diffusion_img2img/sketch-mountains-input.png"

From f86ba553161471d52b38fb1ee95f31184e0385ea Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 2 Jan 2024 13:57:18 +0000
Subject: [PATCH 45/89] tests: relax SDXL img2img single-file comparison tolerance to 5e-2

---
 .../stable_diffusion_xl/test_stable_diffusion_xl_img2img.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index 830ecc5e6baf..e505630cf6e1 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -808,4 +808,4 @@ def test_download_ckpt_diff_format_is_same(self):
 
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
 
-        assert max_diff < 5e-3
+        assert max_diff < 5e-2

From cf560a715e7aae4b8268244a9d83c62d3eb0bdfd Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 15 Jan 2024 12:54:10 +0000
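Subject: [PATCH 46/89] single_file_utils: parse original config with yaml instead of OmegaConf; drop controlnet, paint-by-example and unCLIP helpers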

---
 src/diffusers/loaders/single_file_utils.py | 405 ++++++---------------
 1 file changed, 111 insertions(+), 294 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index b8c2dcba3430..6f71133b5850 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -17,11 +17,10 @@
 import re
 from contextlib import nullcontext
 from io import BytesIO
-from typing import Optional
 
 import requests
 import torch
-from omegaconf import OmegaConf
+import yaml
 from safetensors.torch import load_file as safe_load
 from transformers import (
     BertTokenizerFast,
@@ -30,14 +29,11 @@
     CLIPTextModel,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
-    CLIPVisionConfig,
     CLIPVisionModelWithProjection,
 )
 
-from ..models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
+from ..models import AutoencoderKL, UNet2DConditionModel
 from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
-from ..pipelines.paint_by_example import PaintByExampleImageEncoder
-from ..pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 from ..schedulers import (
     DDIMScheduler,
     DDPMScheduler,
@@ -85,6 +81,53 @@
     "timestep_spacing": "leading",
 }
 
+DIFFUSERS_TO_LDM_MAPPING = {
+    "unet": {
+        "time_embedding.linear_1.weight": "time_embed.0.weight",
+        "time_embedding.linear_1.bias": "time_embed.0.bias",
+        "time_embedding.linear_2.weight": "time_embed.2.weight",
+        "time_embedding.linear_2.bias": "time_embed.2.bias",
+        "conv_in.weight": "input_blocks.0.0.weight",
+        "conv_in.bias": "input_blocks.0.0.bias",
+        "class_embed_type": {
+            "timestep": {
+                "class_embedding.linear_1.weight": "label_emb.0.0.weight",
+                "class_embedding.linear_1.bias": "label_emb.0.0.bias",
+                "class_embedding.linear_2.weight": "label_emb.0.2.weight",
+                "class_embedding.linear_2.bias": "label_emb.0.2.bias",
+            },
+            "text_time": {
+                "class_embedding.linear_1.weight": "label_emb.0.0.weight",
+                "class_embedding.linear_1.bias": "label_emb.0.0.bias",
+                "class_embedding.linear_2.weight": "label_emb.0.2.weight",
+                "class_embedding.linear_2.bias": "label_emb.0.2.bias",
+            },
+        },
+    },
+    "vae": {
+        "encoder.conv_in.weight": "encoder.conv_in.weight",
+        "encoder.conv_in.bias": "encoder.conv_in.bias",
+        "encoder.conv_out.weight": "encoder.conv_out.weight",
+        "encoder.conv_out.bias": "encoder.conv_out.bias",
+        "encoder.conv_norm_out.weight": "encoder.conv_norm_out.weight",
+        "encoder.conv_norm_out.bias": "encoder.conv_norm_out.bias",
+        "decoder.conv_in.weight": "decoder.conv_in.weight",
+        "decoder.conv_in.bias": "decoder.conv_in.bias",
+        "decoder.conv_out.weight": "decoder.conv_out.weight",
+        "decoder.conv_out.bias": "decoder.conv_out.bias",
+        "decoder.conv_norm_out.weight": "decoder.conv_norm_out.weight",
+        "decoder.conv_norm_out.bias": "decoder.conv_norm_out.bias",
+        "quant_conv.weight": "quant_conv.weight",
+        "quant_conv.bias": "quant_conv.bias",
+        "post_quant_conv.weight": "post_quant_conv.weight",
+        "post_quant_conv.bias": "post_quant_conv.bias",
+    },
+}
+
+
+UNET_TIME_EMBEDDING_LAYERS = []
+
+
 textenc_conversion_lst = [
     ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
     ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
@@ -147,7 +190,7 @@ def fetch_original_config_file_from_file(config_files: list):
 
 def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None, config_files=None):
     if original_config_file:
-        original_config = OmegaConf.load(original_config_file)
+        original_config = yaml.safe_load(original_config_file)
         return original_config
 
     elif config_files:
@@ -156,7 +199,7 @@ def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=
     else:
         original_config_file = fetch_original_config_file_from_url(pipeline_class_name, checkpoint)
 
-    original_config = OmegaConf.load(original_config_file)
+    original_config = yaml.safe_load(original_config_file)
 
     return original_config
 
@@ -187,18 +230,19 @@ def infer_model_type(pipeline_class_name, original_config, model_type=None, **kw
         return model_type
 
     has_cond_stage_config = (
-        "cond_stage_config" in original_config.model.params
-        and original_config.model.params.cond_stage_config is not None
+        "cond_stage_config" in original_config["model"]["params"]
+        and original_config["model"]["params"]["cond_stage_config"] is not None
     )
     has_network_config = (
-        "network_config" in original_config.model.params and original_config.model.params.network_config is not None
+        "network_config" in original_config["model"]["params"]
+        and original_config["model"]["params"]["network_config"] is not None
     )
 
     if has_cond_stage_config:
-        model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+        model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1]
 
     elif has_network_config:
-        context_dim = original_config.model.params.network_config.params.context_dim
+        context_dim = original_config["model"]["params"]["network_config"]["params"]["context_dim"]
         if context_dim == 2048:
             model_type = "SDXL"
         else:
@@ -221,7 +265,7 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
     model_type = infer_model_type(pipeline_class_name, original_config, **kwargs)
 
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
-        image_size = original_config.model.params.unet_config.params.image_size
+        image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"]
         return image_size
 
     elif model_type in ["SDXL", "SDXL-Refiner"]:
@@ -413,57 +457,55 @@ def conv_attn_to_linear(checkpoint):
                 checkpoint[key] = checkpoint[key][:, :, 0]
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_unet_diffusers_config
-def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
+def create_unet_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
-    if controlnet:
-        unet_params = original_config.model.params.control_stage_config.params
+    if (
+        "unet_config" in original_config["model"]["params"]
+        and original_config["model"]["params"]["unet_config"] is not None
+    ):
+        unet_params = original_config["model"]["params"]["unet_config"]["params"]
     else:
-        if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None:
-            unet_params = original_config.model.params.unet_config.params
-        else:
-            unet_params = original_config.model.params.network_config.params
-
-    vae_params = original_config.model.params.first_stage_config.params.ddconfig
+        unet_params = original_config["model"]["params"]["network_config"]["params"]
 
-    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
 
     down_block_types = []
     resolution = 1
     for i in range(len(block_out_channels)):
-        block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
         down_block_types.append(block_type)
         if i != len(block_out_channels) - 1:
             resolution *= 2
 
     up_block_types = []
     for i in range(len(block_out_channels)):
-        block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
         up_block_types.append(block_type)
         resolution //= 2
 
-    if unet_params.transformer_depth is not None:
+    if unet_params["transformer_depth"] is not None:
         transformer_layers_per_block = (
-            unet_params.transformer_depth
-            if isinstance(unet_params.transformer_depth, int)
-            else list(unet_params.transformer_depth)
+            unet_params["transformer_depth"]
+            if isinstance(unet_params["transformer_depth"], int)
+            else list(unet_params["transformer_depth"])
         )
     else:
         transformer_layers_per_block = 1
 
-    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)
 
-    head_dim = unet_params.num_heads if "num_heads" in unet_params else None
+    head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
     use_linear_projection = (
-        unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+        unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False
     )
     if use_linear_projection:
         # stable diffusion 2-base-512 and 2-768
         if head_dim is None:
-            head_dim_mult = unet_params.model_channels // unet_params.num_head_channels
-            head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)]
+            head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"]
+            head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])]
 
     class_embed_type = None
     addition_embed_type = None
@@ -471,13 +513,15 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa
     projection_class_embeddings_input_dim = None
     context_dim = None
 
-    if unet_params.context_dim is not None:
+    if unet_params["context_dim"] is not None:
         context_dim = (
-            unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0]
+            unet_params["context_dim"]
+            if isinstance(unet_params["context_dim"], int)
+            else unet_params["context_dim"][0]
         )
 
     if "num_classes" in unet_params:
-        if unet_params.num_classes == "sequential":
+        if unet_params["num_classes"] == "sequential":
             if context_dim in [2048, 1280]:
                 # SDXL
                 addition_embed_type = "text_time"
@@ -485,14 +529,14 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa
             else:
                 class_embed_type = "projection"
             assert "adm_in_channels" in unet_params
-            projection_class_embeddings_input_dim = unet_params.adm_in_channels
+            projection_class_embeddings_input_dim = unet_params["adm_in_channels"]
 
     config = {
         "sample_size": image_size // vae_scale_factor,
-        "in_channels": unet_params.in_channels,
+        "in_channels": unet_params["in_channels"],
         "down_block_types": tuple(down_block_types),
         "block_out_channels": tuple(block_out_channels),
-        "layers_per_block": unet_params.num_res_blocks,
+        "layers_per_block": unet_params["num_res_blocks"],
         "cross_attention_dim": context_dim,
         "attention_head_dim": head_dim,
         "use_linear_projection": use_linear_projection,
@@ -504,49 +548,42 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa
     }
 
     if "disable_self_attentions" in unet_params:
-        config["only_cross_attention"] = unet_params.disable_self_attentions
+        config["only_cross_attention"] = unet_params["disable_self_attentions"]
 
-    if "num_classes" in unet_params and isinstance(unet_params.num_classes, int):
-        config["num_class_embeds"] = unet_params.num_classes
+    if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int):
+        config["num_class_embeds"] = unet_params["num_classes"]
 
-    if controlnet:
-        config["conditioning_channels"] = unet_params.hint_channels
-    else:
-        config["out_channels"] = unet_params.out_channels
-        config["up_block_types"] = tuple(up_block_types)
+    config["out_channels"] = unet_params["out_channels"]
+    config["up_block_types"] = tuple(up_block_types)
 
     return config
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config
 def create_vae_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
-    vae_params = original_config.model.params.first_stage_config.params.ddconfig
-    _ = original_config.model.params.first_stage_config.params.embed_dim
+    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
 
-    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
     down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
     up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
 
     config = {
         "sample_size": image_size,
-        "in_channels": vae_params.in_channels,
-        "out_channels": vae_params.out_ch,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
         "down_block_types": tuple(down_block_types),
         "up_block_types": tuple(up_block_types),
         "block_out_channels": tuple(block_out_channels),
-        "latent_channels": vae_params.z_channels,
-        "layers_per_block": vae_params.num_res_blocks,
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
     }
+
     return config
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_unet_checkpoint
-def convert_ldm_unet_checkpoint(
-    checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
-):
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
     """
@@ -558,10 +595,7 @@ def convert_ldm_unet_checkpoint(
         unet_state_dict = {}
         keys = list(checkpoint.keys())
 
-        if controlnet:
-            unet_key = "control_model."
-        else:
-            unet_key = "model.diffusion_model."
+        unet_key = "model.diffusion_model."
 
         # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
         if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
@@ -617,12 +651,10 @@ def convert_ldm_unet_checkpoint(
     new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
     new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
 
-    if not controlnet:
-        new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
-        new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
-        new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
-        new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
-
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
     # Retrieves the keys for the input blocks only
     num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
     input_blocks = {
@@ -747,48 +779,6 @@ def convert_ldm_unet_checkpoint(
 
                 new_checkpoint[new_path] = unet_state_dict[old_path]
 
-    if controlnet:
-        # conditioning embedding
-
-        orig_index = 0
-
-        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
-            f"input_hint_block.{orig_index}.weight"
-        )
-        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
-            f"input_hint_block.{orig_index}.bias"
-        )
-
-        orig_index += 2
-
-        diffusers_index = 0
-
-        while diffusers_index < 6:
-            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
-                f"input_hint_block.{orig_index}.weight"
-            )
-            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
-                f"input_hint_block.{orig_index}.bias"
-            )
-            diffusers_index += 1
-            orig_index += 2
-
-        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
-            f"input_hint_block.{orig_index}.weight"
-        )
-        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
-            f"input_hint_block.{orig_index}.bias"
-        )
-
-        # down blocks
-        for i in range(num_input_blocks):
-            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
-            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
-
-        # mid block
-        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
-        new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
-
     return new_checkpoint
 
 
@@ -824,13 +814,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
 
     # Retrieves the keys for the encoder down blocks only
-    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    num_down_blocks = len(config["down_block_types"])
     down_blocks = {
         layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
     }
 
     # Retrieves the keys for the decoder up blocks only
-    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    num_up_blocks = len(config["up_block_types"])
     up_blocks = {
         layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
     }
@@ -1082,7 +1072,7 @@ def stable_unclip_image_encoder(original_config, local_files_only=False):
     encoders.
     """
 
-    image_embedder_config = original_config.model.params.embedder_config
+    image_embedder_config = original_config["model"]["params"].embedder_config
 
     sd_clip_image_embedder_class = image_embedder_config.target
     sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1]
@@ -1111,120 +1101,8 @@ def stable_unclip_image_encoder(original_config, local_files_only=False):
     return feature_extractor, image_encoder
 
 
-def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False):
-    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
-    model = PaintByExampleImageEncoder(config)
-
-    keys = list(checkpoint.keys())
-
-    text_model_dict = {}
-
-    for key in keys:
-        if key.startswith("cond_stage_model.transformer"):
-            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
-
-    # load clip vision
-    model.model.load_state_dict(text_model_dict)
-
-    # load mapper
-    keys_mapper = {
-        k[len("cond_stage_model.mapper.res") :]: v
-        for k, v in checkpoint.items()
-        if k.startswith("cond_stage_model.mapper")
-    }
-
-    MAPPING = {
-        "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
-        "attn.c_proj": ["attn1.to_out.0"],
-        "ln_1": ["norm1"],
-        "ln_2": ["norm3"],
-        "mlp.c_fc": ["ff.net.0.proj"],
-        "mlp.c_proj": ["ff.net.2"],
-    }
-
-    mapped_weights = {}
-    for key, value in keys_mapper.items():
-        prefix = key[: len("blocks.i")]
-        suffix = key.split(prefix)[-1].split(".")[-1]
-        name = key.split(prefix)[-1].split(suffix)[0][1:-1]
-        mapped_names = MAPPING[name]
-
-        num_splits = len(mapped_names)
-        for i, mapped_name in enumerate(mapped_names):
-            new_name = ".".join([prefix, mapped_name, suffix])
-            shape = value.shape[0] // num_splits
-            mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
-
-    model.mapper.load_state_dict(mapped_weights)
-
-    # load final layer norm
-    model.final_layer_norm.load_state_dict(
-        {
-            "bias": checkpoint["cond_stage_model.final_ln.bias"],
-            "weight": checkpoint["cond_stage_model.final_ln.weight"],
-        }
-    )
-
-    # load final proj
-    model.proj_out.load_state_dict(
-        {
-            "bias": checkpoint["proj_out.bias"],
-            "weight": checkpoint["proj_out.weight"],
-        }
-    )
-
-    # load uncond vector
-    model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
-    return model
-
-
-def stable_unclip_image_noising_components(
-    original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None
-):
-    """
-    Returns the noising components for the img2img and txt2img unclip pipelines.
-
-    Converts the stability noise augmentor into
-    1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats
-    2. a `DDPMScheduler` for holding the noise schedule
-
-    If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.
-    """
-    noise_aug_config = original_config.model.params.noise_aug_config
-    noise_aug_class = noise_aug_config.target
-    noise_aug_class = noise_aug_class.split(".")[-1]
-
-    if noise_aug_class == "CLIPEmbeddingNoiseAugmentation":
-        noise_aug_config = noise_aug_config.params
-        embedding_dim = noise_aug_config.timestep_dim
-        max_noise_level = noise_aug_config.noise_schedule_config.timesteps
-        beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule
-
-        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)
-        image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
-
-        if "clip_stats_path" in noise_aug_config:
-            if clip_stats_path is None:
-                raise ValueError("This stable unclip config requires a `clip_stats_path`")
-
-            clip_mean, clip_std = torch.load(clip_stats_path, map_location=device)
-            clip_mean = clip_mean[None, :]
-            clip_std = clip_std[None, :]
-
-            clip_stats_state_dict = {
-                "mean": clip_mean,
-                "std": clip_std,
-            }
-
-            image_normalizer.load_state_dict(clip_stats_state_dict)
-    else:
-        raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}")
-
-    return image_normalizer, image_noising_scheduler
-
-
 def create_ldm_bert_config(original_config):
-    bert_params = original_config.model.params.cond_stage_config.params
+    bert_params = original_config["model"]["params"].cond_stage_config.params
     config = LDMBertConfig(
         d_model=bert_params.n_embed,
         encoder_layers=bert_params.n_layer,
@@ -1416,7 +1294,7 @@ def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoin
     prediction_type = kwargs.get("prediction_type", None)
     global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
 
-    num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
+    num_train_timesteps = original_config["model"]["params"].get("timesteps", None) or 1000
     scheduler_config["num_train_timesteps"] = num_train_timesteps
 
     if (
@@ -1437,8 +1315,8 @@ def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoin
         scheduler_type = "euler"
 
     else:
-        beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
-        beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
+        beta_start = original_config["model"]["params"].get("linear_start", None) or 0.02
+        beta_end = original_config["model"]["params"].get("linear_end", None) or 0.085
         scheduler_config["beta_start"] = beta_start
         scheduler_config["beta_end"] = beta_end
         scheduler_config["beta_schedule"] = "scaled_linear"
@@ -1484,64 +1362,3 @@ def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoin
         }
 
     return {"scheduler": scheduler}
-
-
-def create_stable_unclip_components(
-    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
-):
-    local_files_only = kwargs.get("local_files_only", False)
-    clip_stats_path = kwargs.get("clip_stats_path", None)
-
-    image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
-        original_config,
-        clip_stats_path=clip_stats_path,
-    )
-
-    if pipeline_class_name == "StableUnCLIPPipeline":
-        stable_unclip_prior = kwargs.get("stable_unclip_prior", None)
-        if stable_unclip_prior is None and stable_unclip_prior != "karlo":
-            raise NotImplementedError(f"Unknown prior for Stable UnCLIP model: {stable_unclip_prior}")
-
-        try:
-            config_name = "kakaobrain/karlo-v1-alpha"
-            prior = PriorTransformer.from_pretrained(config_name, subfolder="prior", local_files_only=local_files_only)
-        except Exception:
-            raise ValueError(
-                f"With local_files_only set to {local_files_only}, you must first locally save the prior in the following path: '{config_name}'."
-            )
-
-        try:
-            config_name = "openai/clip-vit-large-patch14"
-            prior_tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
-            prior_text_encoder = CLIPTextModelWithProjection.from_pretrained(
-                config_name, local_files_only=local_files_only
-            )
-            prior_scheduler = DDPMScheduler.from_pretrained(
-                config_name, subfolder="prior_scheduler", local_files_only=local_files_only
-            )
-
-        except Exception:
-            raise ValueError(
-                f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'."
-            )
-        else:
-            return {
-                "prior": prior,
-                "prior_tokenizer": prior_tokenizer,
-                "prior_text_encoder": prior_text_encoder,
-                "prior_scheduler": prior_scheduler,
-                "image_normalizer": image_normalizer,
-                "image_noise_scheduler": image_noising_scheduler,
-            }
-
-    else:
-        feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
-
-        return {
-            "feature_extractor": feature_extractor,
-            "image_encoder": image_encoder,
-            "image_normalizer": image_normalizer,
-            "image_noising_scheduler": image_noising_scheduler,
-        }
-
-    return

From 0ec1ed7a915354eca9cd296f57c9ab06c5c06cb3 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 16 Jan 2024 07:52:32 +0000
Subject: [PATCH 47/89] update

---
 src/diffusers/loaders/single_file_utils.py | 157 ++++++++++++---------
 1 file changed, 89 insertions(+), 68 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 6f71133b5850..f164375bf78c 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -124,6 +124,9 @@
     },
 }
 
+LDM_VAE_KEY = "first_stage_model."
+LDM_UNET_KEY = "model.diffusion_model."
+LDM_CLIP_CONFIG_NAME = "openai/clip-vit-large-patch14"
 
 UNET_TIME_EMBEDDING_LAYERS = []
 
@@ -782,36 +785,54 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
     return new_checkpoint
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint
+def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in keys:
+        diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut")
+        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
+
+
+def update_vae_attentions_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in keys:
+        diffusers_key = (
+            ldm_key.replace(mapping["old"], mapping["new"])
+            .replace("norm.weight", "group_norm.weight")
+            .replace("norm.bias", "group_norm.bias")
+            .replace("q.weight", "to_q.weight")
+            .replace("q.bias", "to_q.bias")
+            .replace("k.weight", "to_k.weight")
+            .replace("k.bias", "to_k.bias")
+            .replace("v.weight", "to_v.weight")
+            .replace("v.bias", "to_v.bias")
+            .replace("proj_out.weight", "to_out.0.weight")
+            .replace("proj_out.bias", "to_out.0.bias")
+        )
+        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        shape = new_checkpoint[diffusers_key].shape
+
+        if len(shape) == 3:
+            new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0]
+        elif len(shape) == 4:
+            new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0, 0]
+
+
 def convert_ldm_vae_checkpoint(checkpoint, config):
     # extract state dict for VAE
+    # remove the LDM_VAE_KEY prefix from the ldm checkpoint keys so that it is easier to map them to diffusers keys
     vae_state_dict = {}
     keys = list(checkpoint.keys())
-    vae_key = "first_stage_model." if any(k.startswith("first_stage_model.") for k in keys) else ""
+    vae_key = LDM_VAE_KEY if any(k.startswith(LDM_VAE_KEY) for k in keys) else ""
     for key in keys:
         if key.startswith(vae_key):
             vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
 
     new_checkpoint = {}
-
-    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
-    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
-    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
-    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
-    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
-    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
-
-    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
-    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
-    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
-    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
-    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
-    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
-
-    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
-    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
-    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
-    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+    vae_diffusers_ldm_map = DIFFUSERS_TO_LDM_MAPPING["vae"]
+    for diffusers_key, ldm_key in vae_diffusers_ldm_map.items():
+        if ldm_key not in vae_state_dict:
+            continue
+        new_checkpoint[diffusers_key] = vae_state_dict[ldm_key]
 
     # Retrieves the keys for the encoder down blocks only
     num_down_blocks = len(config["down_block_types"])
@@ -819,15 +840,14 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
         layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
     }
 
-    # Retrieves the keys for the decoder up blocks only
-    num_up_blocks = len(config["up_block_types"])
-    up_blocks = {
-        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
-    }
-
     for i in range(num_down_blocks):
         resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
-
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"},
+        )
         if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
             new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
                 f"encoder.down.{i}.downsample.conv.weight"
@@ -836,31 +856,39 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
                 f"encoder.down.{i}.downsample.conv.bias"
             )
 
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
     mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
     num_mid_res_blocks = 2
     for i in range(1, num_mid_res_blocks + 1):
         resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"},
+        )
 
     mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
-    paths = renew_vae_attention_paths(mid_attentions)
-    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-    conv_attn_to_linear(new_checkpoint)
+    update_vae_attentions_ldm_to_diffusers(
+        mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    )
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len(config["up_block_types"])
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
 
     for i in range(num_up_blocks):
         block_id = num_up_blocks - 1 - i
         resnets = [
             key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
         ]
-
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"},
+        )
         if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
             new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
                 f"decoder.up.{block_id}.upsample.conv.weight"
@@ -869,24 +897,23 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
                 f"decoder.up.{block_id}.upsample.conv.bias"
             ]
 
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
     mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
     num_mid_res_blocks = 2
     for i in range(1, num_mid_res_blocks + 1):
         resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"},
+        )
 
     mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
-    paths = renew_vae_attention_paths(mid_attentions)
-    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    update_vae_attentions_ldm_to_diffusers(
+        mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    )
     conv_attn_to_linear(new_checkpoint)
+
     return new_checkpoint
 
 
@@ -941,25 +968,19 @@ def _copy_layers(hf_layers, pt_layers):
     return hf_model
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_clip_checkpoint
-def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
-    if text_encoder is None:
-        config_name = "openai/clip-vit-large-patch14"
-        try:
-            config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
-        except Exception:
-            raise ValueError(
-                f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'."
-            )
+def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False):
+    try:
+        config = CLIPTextConfig.from_pretrained(LDM_CLIP_CONFIG_NAME, local_files_only=local_files_only)
+    except Exception:
+        raise ValueError(
+            f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'."
+        )
 
-        ctx = init_empty_weights if is_accelerate_available() else nullcontext
-        with ctx():
-            text_model = CLIPTextModel(config)
-    else:
-        text_model = text_encoder
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        text_model = CLIPTextModel(config)
 
     keys = list(checkpoint.keys())
-
     text_model_dict = {}
 
     remove_prefixes = ["cond_stage_model.transformer", "conditioner.embedders.0.transformer"]

From 4bb4ed4711c76b096fb7ceba7cf421f7b6d8da42 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 16 Jan 2024 13:51:38 +0000
Subject: [PATCH 48/89] update

---
 src/diffusers/loaders/single_file_utils.py | 189 +++++++++++++--------
 1 file changed, 117 insertions(+), 72 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index f164375bf78c..4bc6064b6cfc 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -83,25 +83,29 @@
 
 DIFFUSERS_TO_LDM_MAPPING = {
     "unet": {
-        "time_embedding.linear_1.weight": "time_embed.0.weight",
-        "time_embedding.linear_1.bias": "time_embed.0.bias",
-        "time_embedding.linear_2.weight": "time_embed.2.weight",
-        "time_embedding.linear_2.bias": "time_embed.2.bias",
-        "conv_in.weight": "input_blocks.0.0.weight",
-        "conv_in.bias": "input_blocks.0.0.bias",
+        "layers": {
+            "time_embedding.linear_1.weight": "time_embed.0.weight",
+            "time_embedding.linear_1.bias": "time_embed.0.bias",
+            "time_embedding.linear_2.weight": "time_embed.2.weight",
+            "time_embedding.linear_2.bias": "time_embed.2.bias",
+            "conv_in.weight": "input_blocks.0.0.weight",
+            "conv_in.bias": "input_blocks.0.0.bias",
+            "conv_norm_out.weight": "out.0.weight",
+            "conv_norm_out.bias": "out.0.bias",
+            "conv_out.weight": "out.2.weight",
+            "conv_out.bias": "out.2.bias",
+        },
         "class_embed_type": {
-            "timestep": {
-                "class_embedding.linear_1.weight": "label_emb.0.0.weight",
-                "class_embedding.linear_1.bias": "label_emb.0.0.bias",
-                "class_embedding.linear_2.weight": "label_emb.0.2.weight",
-                "class_embedding.linear_2.bias": "label_emb.0.2.bias",
-            },
-            "text_time": {
-                "class_embedding.linear_1.weight": "label_emb.0.0.weight",
-                "class_embedding.linear_1.bias": "label_emb.0.0.bias",
-                "class_embedding.linear_2.weight": "label_emb.0.2.weight",
-                "class_embedding.linear_2.bias": "label_emb.0.2.bias",
-            },
+            "class_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "class_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "class_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "class_embedding.linear_2.bias": "label_emb.0.2.bias",
+        },
+        "addition_embed_type": {
+            "add_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "add_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "add_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "add_embedding.linear_2.bias": "label_emb.0.2.bias",
         },
     },
     "vae": {
@@ -586,6 +590,27 @@ def create_vae_diffusers_config(original_config, image_size: int):
     return config
 
 
+def update_unet_resnet_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping=None):
+    for ldm_key in ldm_keys:
+        diffusers_key = (
+            ldm_key.replace("in_layers.0", "norm1")
+            .replace("in_layers.2", "conv1")
+            .replace("out_layers.0", "norm2")
+            .replace("out_layers.3", "conv2")
+            .replace("emb_layers.1", "time_emb_proj")
+            .replace("skip_connection", "conv_shortcut")
+        )
+        if mapping:
+            diffusers_key = diffusers_key.replace(mapping["old"], mapping["new"])
+        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
+
+
+def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in ldm_keys:
+        diffusers_key = ldm_key.replace(mapping["old"], mapping["new"])
+        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
+
+
 def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
@@ -598,7 +623,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
         unet_state_dict = {}
         keys = list(checkpoint.keys())
 
-        unet_key = "model.diffusion_model."
+        unet_key = LDM_UNET_KEY
 
         # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
         if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
@@ -623,41 +648,25 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
                     unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
 
     new_checkpoint = {}
+    ldm_unet_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["layers"]
+    for diffusers_key, ldm_key in ldm_unet_keys.items():
+        new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
 
-    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
-    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
-    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
-    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
-
-    if config["class_embed_type"] is None:
-        # No parameters to port
-        ...
-    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
-        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
-        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
-        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
-        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
-    else:
-        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+    if config["class_embed_type"] in ["timestep", "projection"]:
+        class_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["class_embed_type"]
+        for diffusers_key, ldm_key in class_embed_keys.items():
+            new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
 
     if config["addition_embed_type"] == "text_time":
-        new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
-        new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
-        new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
-        new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+        addition_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["addition_embed_type"]
+        for diffusers_key, ldm_key in addition_embed_keys.items():
+            new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
 
     # Relevant to StableDiffusionUpscalePipeline
     if "num_class_embeds" in config:
         if (config["num_class_embeds"] is not None) and ("label_emb.weight" in unet_state_dict):
             new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"]
 
-    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
-    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
-
-    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
-    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
-    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
-    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
     # Retrieves the keys for the input blocks only
     num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
     input_blocks = {
@@ -679,6 +688,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
         for layer_id in range(num_output_blocks)
     }
 
+    # Down blocks
     for i in range(1, num_input_blocks):
         block_id = (i - 1) // (config["layers_per_block"] + 1)
         layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
@@ -686,7 +696,12 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
         resnets = [
             key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
         ]
-        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        update_unet_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            unet_state_dict,
+            {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"},
+        )
 
         if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
             new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
@@ -696,48 +711,77 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
                 f"input_blocks.{i}.0.op.bias"
             )
 
-        paths = renew_resnet_paths(resnets)
-        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
-        assign_to_checkpoint(
-            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-        )
-
-        if len(attentions):
-            paths = renew_attention_paths(attentions)
-
-            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
-            assign_to_checkpoint(
-                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        if attentions:
+            update_unet_attention_ldm_to_diffusers(
+                attentions,
+                new_checkpoint,
+                unet_state_dict,
+                {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"},
             )
 
+    # Mid blocks
     resnet_0 = middle_blocks[0]
     attentions = middle_blocks[1]
     resnet_1 = middle_blocks[2]
 
-    resnet_0_paths = renew_resnet_paths(resnet_0)
-    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
-
-    resnet_1_paths = renew_resnet_paths(resnet_1)
-    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
-
-    attentions_paths = renew_attention_paths(attentions)
-    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(
-        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    update_unet_resnet_ldm_to_diffusers(
+        resnet_0, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.0", "new": "mid_block.resnets.0"}
+    )
+    update_unet_resnet_ldm_to_diffusers(
+        resnet_1, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.2", "new": "mid_block.resnets.1"}
+    )
+    update_unet_attention_ldm_to_diffusers(
+        attentions, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.1", "new": "mid_block.attentions.0"}
     )
 
+    # Up Blocks
     for i in range(num_output_blocks):
         block_id = i // (config["layers_per_block"] + 1)
         layer_in_block_id = i % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in output_blocks[i] if f"output_blocks.{i}.0" in key and f"output_blocks.{i}.0.op" not in key
+        ]
+        update_unet_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            unet_state_dict,
+            {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"},
+        )
+
+        attentions = [key for key in input_blocks[i] if f"output_blocks.{i}.1" in key]
+        if attentions:
+            update_unet_attention_ldm_to_diffusers(
+                attentions,
+                new_checkpoint,
+                unet_state_dict,
+                {"old": f"output_blocks.{i}.1", "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}"},
+            )
+
+        if f"output_blocks.{i}.1.conv.weight" in unet_state_dict:
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                f"output_blocks.{i}.1.conv.weight"
+            ]
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                f"output_blocks.{i}.1.conv.bias"
+            ]
+        if f"output_blocks.{i}.2.conv.weight" in unet_state_dict:
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                f"output_blocks.{i}.2.conv.weight"
+            ]
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                f"output_blocks.{i}.2.conv.bias"
+            ]
+
+        """
         output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
         output_block_list = {}
 
         for layer in output_block_layers:
             layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
-            if layer_id in output_block_list:
-                output_block_list[layer_id].append(layer_name)
-            else:
-                output_block_list[layer_id] = [layer_name]
+            output_block_list.setdefault(layer_id, [])
+            output_block_list[layer_id].append(layer_name)
 
         if len(output_block_list) > 1:
             resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
@@ -781,6 +825,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
                 new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
 
                 new_checkpoint[new_path] = unet_state_dict[old_path]
+        """
 
     return new_checkpoint
 

From 68a49b1b458bcf203332e633ce9a913e8de7108b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 16 Jan 2024 16:03:37 +0000
Subject: [PATCH 49/89] update

---
 src/diffusers/loaders/single_file_utils.py | 227 ++-------------------
 1 file changed, 18 insertions(+), 209 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 4bc6064b6cfc..1f1f147f72f7 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -113,19 +113,27 @@
         "encoder.conv_in.bias": "encoder.conv_in.bias",
         "encoder.conv_out.weight": "encoder.conv_out.weight",
         "encoder.conv_out.bias": "encoder.conv_out.bias",
-        "encoder.conv_norm_out.weight": "encoder.conv_norm_out.weight",
-        "encoder.conv_norm_out.bias": "encoder.conv_norm_out.bias",
+        "encoder.conv_norm_out.weight": "encoder.norm_out.weight",
+        "encoder.conv_norm_out.bias": "encoder.norm_out.bias",
         "decoder.conv_in.weight": "decoder.conv_in.weight",
         "decoder.conv_in.bias": "decoder.conv_in.bias",
         "decoder.conv_out.weight": "decoder.conv_out.weight",
         "decoder.conv_out.bias": "decoder.conv_out.bias",
-        "decoder.conv_norm_out.weight": "decoder.conv_norm_out.weight",
-        "decoder.conv_norm_out.bias": "decoder.conv_norm_out.bias",
+        "decoder.conv_norm_out.weight": "decoder.norm_out.weight",
+        "decoder.conv_norm_out.bias": "decoder.norm_out.bias",
         "quant_conv.weight": "quant_conv.weight",
         "quant_conv.bias": "quant_conv.bias",
         "post_quant_conv.weight": "post_quant_conv.weight",
         "post_quant_conv.bias": "post_quant_conv.bias",
     },
+    "openclip": {
+        "positional_embedding": "text_model.embeddings.position_embedding.weight",
+        "token_embedding.weight": "text_model.embeddings.token_embedding.weight",
+        "ln_final.weight": "text_model.final_layer_norm.weight",
+        "ln_final.bias": "text_model.final_layer_norm.bias",
+        "text_projection": "text_projection.weight",
+        ""
+    }
 }
 
 LDM_VAE_KEY = "first_stage_model."
@@ -302,155 +310,6 @@ def shave_segments(path, n_shave_prefix_segments=1):
         return ".".join(path.split(".")[:n_shave_prefix_segments])
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths
-def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item.replace("in_layers.0", "norm1")
-        new_item = new_item.replace("in_layers.2", "conv1")
-
-        new_item = new_item.replace("out_layers.0", "norm2")
-        new_item = new_item.replace("out_layers.3", "conv2")
-
-        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
-        new_item = new_item.replace("skip_connection", "conv_shortcut")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
-def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths
-def renew_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
-        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
-
-        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
-        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
-
-        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths
-def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        new_item = new_item.replace("norm.weight", "group_norm.weight")
-        new_item = new_item.replace("norm.bias", "group_norm.bias")
-
-        new_item = new_item.replace("q.weight", "to_q.weight")
-        new_item = new_item.replace("q.bias", "to_q.bias")
-
-        new_item = new_item.replace("k.weight", "to_k.weight")
-        new_item = new_item.replace("k.bias", "to_k.bias")
-
-        new_item = new_item.replace("v.weight", "to_v.weight")
-        new_item = new_item.replace("v.bias", "to_v.bias")
-
-        new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
-        new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint
-def assign_to_checkpoint(
-    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
-):
-    """
-    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
-    attention layers, and takes into account additional replacements that may arise.
-
-    Assigns the weights to the new checkpoint.
-    """
-    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
-
-    # Splits the attention layers into three variables.
-    if attention_paths_to_split is not None:
-        for path, path_map in attention_paths_to_split.items():
-            old_tensor = old_checkpoint[path]
-            channels = old_tensor.shape[0] // 3
-
-            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
-
-            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
-
-            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
-            query, key, value = old_tensor.split(channels // num_heads, dim=1)
-
-            checkpoint[path_map["query"]] = query.reshape(target_shape)
-            checkpoint[path_map["key"]] = key.reshape(target_shape)
-            checkpoint[path_map["value"]] = value.reshape(target_shape)
-
-    for path in paths:
-        new_path = path["new"]
-
-        # These have already been assigned
-        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
-            continue
-
-        # Global renaming happens here
-        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
-        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
-        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
-
-        if additional_replacements is not None:
-            for replacement in additional_replacements:
-                new_path = new_path.replace(replacement["old"], replacement["new"])
-
-        # proj_attn.weight has to be converted from conv 1D to linear
-        is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path)
-        shape = old_checkpoint[path["old"]].shape
-        if is_attn_weight and len(shape) == 3:
-            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
-        elif is_attn_weight and len(shape) == 4:
-            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
-        else:
-            checkpoint[new_path] = old_checkpoint[path["old"]]
-
-
 # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
 def conv_attn_to_linear(checkpoint):
     keys = list(checkpoint.keys())
@@ -602,13 +461,13 @@ def update_unet_resnet_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, ma
         )
         if mapping:
             diffusers_key = diffusers_key.replace(mapping["old"], mapping["new"])
-        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
 
 
 def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping):
     for ldm_key in ldm_keys:
         diffusers_key = ldm_key.replace(mapping["old"], mapping["new"])
-        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
 
 
 def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False):
@@ -750,7 +609,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
             {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"},
         )
 
-        attentions = [key for key in input_blocks[i] if f"output_blocks.{i}.1" in key]
+        attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
         if attentions:
             update_unet_attention_ldm_to_diffusers(
                 attentions,
@@ -774,59 +633,6 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
                 f"output_blocks.{i}.2.conv.bias"
             ]
 
-        """
-        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
-        output_block_list = {}
-
-        for layer in output_block_layers:
-            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
-            output_block_list.setdefault(layer_id, [])
-            output_block_list[layer_id].append(layer_name)
-
-        if len(output_block_list) > 1:
-            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
-            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
-
-            resnet_0_paths = renew_resnet_paths(resnets)
-            paths = renew_resnet_paths(resnets)
-
-            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
-            assign_to_checkpoint(
-                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-            )
-
-            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
-            if ["conv.bias", "conv.weight"] in output_block_list.values():
-                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
-                    f"output_blocks.{i}.{index}.conv.weight"
-                ]
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
-                    f"output_blocks.{i}.{index}.conv.bias"
-                ]
-
-                # Clear attentions as they have been attributed above.
-                if len(attentions) == 2:
-                    attentions = []
-
-            if len(attentions):
-                paths = renew_attention_paths(attentions)
-                meta_path = {
-                    "old": f"output_blocks.{i}.1",
-                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
-                }
-                assign_to_checkpoint(
-                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-                )
-        else:
-            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
-            for path in resnet_0_paths:
-                old_path = ".".join(["output_blocks", str(i), path["old"]])
-                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
-
-                new_checkpoint[new_path] = unet_state_dict[old_path]
-        """
-
     return new_checkpoint
 
 
@@ -1211,6 +1017,9 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     with ctx():
         unet = UNet2DConditionModel(**unet_config)
 
+    print('difference')
+    print(set(unet.state_dict().keys()).difference(set(diffusers_format_unet_checkpoint.keys())))
+
     if is_accelerate_available():
         for param_name, param in diffusers_format_unet_checkpoint.items():
             set_module_tensor_to_device(unet, param_name, "cpu", value=param)

From e37abaf1ee4f41d4686831c99f5e8015cce403a5 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 17 Jan 2024 06:55:59 +0000
Subject: [PATCH 50/89] update

---
 src/diffusers/loaders/single_file_utils.py | 44 +++++++++++-----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 1f1f147f72f7..cc25eaac0490 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -127,21 +127,27 @@
         "post_quant_conv.bias": "post_quant_conv.bias",
     },
     "openclip": {
-        "positional_embedding": "text_model.embeddings.position_embedding.weight",
-        "token_embedding.weight": "text_model.embeddings.token_embedding.weight",
+        # "positional_embedding": "text_model.embeddings.position_embedding.weight",
+        # "token_embedding.weight": "text_model.embeddings.token_embedding.weight",
         "ln_final.weight": "text_model.final_layer_norm.weight",
         "ln_final.bias": "text_model.final_layer_norm.bias",
         "text_projection": "text_projection.weight",
-        ""
-    }
+        "resblocks.": "text_model.encoder.layers.",
+        "ln_1": "layer_norm1",
+        "ln_2": "layer_norm2",
+        ".c_fc.": ".fc1.",
+        ".c_proj.": ".fc2.",
+        ".attn": ".self_attn",
+        "ln_final.": "transformer.text_model.final_layer_norm.",
+        "token_embedding.weight": "transformer.text_model.embeddings.token_embedding.weight",
+        "positional_embedding": "transformer.text_model.embeddings.position_embedding.weight",
+    },
 }
 
 LDM_VAE_KEY = "first_stage_model."
 LDM_UNET_KEY = "model.diffusion_model."
 LDM_CLIP_CONFIG_NAME = "openai/clip-vit-large-patch14"
-
-UNET_TIME_EMBEDDING_LAYERS = []
-
+LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
 
 textenc_conversion_lst = [
     ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
@@ -461,13 +467,13 @@ def update_unet_resnet_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, ma
         )
         if mapping:
             diffusers_key = diffusers_key.replace(mapping["old"], mapping["new"])
-        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
 
 
 def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping):
     for ldm_key in ldm_keys:
         diffusers_key = ldm_key.replace(mapping["old"], mapping["new"])
-        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+        new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
 
 
 def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False):
@@ -609,7 +615,9 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
             {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"},
         )
 
-        attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+        attentions = [
+            key for key in output_blocks[i] if f"output_blocks.{i}.1" in key and f"output_blocks.{i}.1.conv" not in key
+        ]
         if attentions:
             update_unet_attention_ldm_to_diffusers(
                 attentions,
@@ -834,12 +842,13 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False):
     keys = list(checkpoint.keys())
     text_model_dict = {}
 
-    remove_prefixes = ["cond_stage_model.transformer", "conditioner.embedders.0.transformer"]
+    remove_prefixes = LDM_CLIP_PREFIX_TO_REMOVE
 
     for key in keys:
         for prefix in remove_prefixes:
             if key.startswith(prefix):
-                text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]
+                diffusers_key = key.replace(prefix, "")
+                text_model_dict[diffusers_key] = checkpoint[key]
 
     if is_accelerate_available():
         for param_name, param in text_model_dict.items():
@@ -862,10 +871,6 @@ def convert_open_clip_checkpoint(
     local_files_only=False,
     **config_kwargs,
 ):
-    # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
-    # text_model = CLIPTextModelWithProjection.from_pretrained(
-    #    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
-    # )
     try:
         config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only)
     except Exception:
@@ -1017,9 +1022,6 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     with ctx():
         unet = UNet2DConditionModel(**unet_config)
 
-    print('difference')
-    print(set(unet.state_dict().keys()).difference(set(diffusers_format_unet_checkpoint.keys())))
-
     if is_accelerate_available():
         for param_name, param in diffusers_format_unet_checkpoint.items():
             set_module_tensor_to_device(unet, param_name, "cpu", value=param)
@@ -1075,9 +1077,7 @@ def create_text_encoders_and_tokenizers(
     elif model_type == "FrozenCLIPEmbedder":
         try:
             config_name = "openai/clip-vit-large-patch14"
-            text_encoder = convert_ldm_clip_checkpoint(
-                checkpoint, local_files_only=local_files_only, text_encoder=None
-            )
+            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
 
         except Exception:

From 1bd8ba3de16211ca835e9cf5c08fbeb001747828 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 17 Jan 2024 08:03:40 +0000
Subject: [PATCH 51/89] update

---
 src/diffusers/loaders/single_file_utils.py           |  4 ++--
 .../stable_diffusion/test_stable_diffusion.py        | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index cc25eaac0490..e4165d31568a 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -1190,8 +1190,8 @@ def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoin
         scheduler_type = "euler"
 
     else:
-        beta_start = getattr(original_config["model"]["params"], "linear_start", None) or 0.02
-        beta_end = getattr(original_config["model"]["params"], "linear_end", None) or 0.085
+        beta_start = original_config["model"]["params"].get("linear_start", 0.02)
+        beta_end = original_config["model"]["params"].get("linear_end", 0.085)
         scheduler_config["beta_start"] = beta_start
         scheduler_config["beta_end"] = beta_end
         scheduler_config["beta_schedule"] = "scaled_linear"
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 8854b482dec7..eb053a523835 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -1256,13 +1256,13 @@ def test_download_local(self):
     def test_download_ckpt_diff_format_is_same(self):
         ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
 
-        pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_attn_processor(AttnProcessor())
-        pipe.to("cuda")
+        sf_pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
+        sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config)
+        sf_pipe.unet.set_attn_processor(AttnProcessor())
+        sf_pipe.to("cuda")
 
         generator = torch.Generator(device="cpu").manual_seed(0)
-        image_ckpt = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
+        image_single_file = sf_pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
 
         pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
@@ -1272,7 +1272,7 @@ def test_download_ckpt_diff_format_is_same(self):
         generator = torch.Generator(device="cpu").manual_seed(0)
         image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
 
-        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
 
         assert max_diff < 1e-3
 

From 1cce591cd186cb26ed31eef01eeed73e0839814f Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 17 Jan 2024 08:50:37 +0000
Subject: [PATCH 52/89] update

---
 src/diffusers/loaders/single_file_utils.py | 73 ----------------------
 1 file changed, 73 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index e4165d31568a..eb84d8852e27 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -246,10 +246,6 @@ def infer_model_type(pipeline_class_name, original_config, model_type=None, **kw
     if model_type is not None:
         return model_type
 
-    if pipeline_class_name in ["StableUnCLIPPipeline", "StableUnCLIPImg2ImgPipeline"]:
-        model_type = "FrozenOpenCLIPEmbedder"
-        return model_type
-
     has_cond_stage_config = (
         "cond_stage_config" in original_config["model"]["params"]
         and original_config["model"]["params"]["cond_stage_config"] is not None
@@ -305,17 +301,6 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
     return image_size
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
-def shave_segments(path, n_shave_prefix_segments=1):
-    """
-    Removes segments. Positive values shave the first segments, negative shave the last segments.
-    """
-    if n_shave_prefix_segments >= 0:
-        return ".".join(path.split(".")[n_shave_prefix_segments:])
-    else:
-        return ".".join(path.split(".")[:n_shave_prefix_segments])
-
-
 # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
 def conv_attn_to_linear(checkpoint):
     keys = list(checkpoint.keys())
@@ -776,57 +761,6 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     return new_checkpoint
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_bert_checkpoint
-def convert_ldm_bert_checkpoint(checkpoint, config):
-    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
-        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
-        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
-        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
-
-        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
-        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
-
-    def _copy_linear(hf_linear, pt_linear):
-        hf_linear.weight = pt_linear.weight
-        hf_linear.bias = pt_linear.bias
-
-    def _copy_layer(hf_layer, pt_layer):
-        # copy layer norms
-        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
-        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
-
-        # copy attn
-        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
-
-        # copy MLP
-        pt_mlp = pt_layer[1][1]
-        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
-        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
-
-    def _copy_layers(hf_layers, pt_layers):
-        for i, hf_layer in enumerate(hf_layers):
-            if i != 0:
-                i += i
-            pt_layer = pt_layers[i : i + 2]
-            _copy_layer(hf_layer, pt_layer)
-
-    hf_model = LDMBertModel(config).eval()
-
-    # copy  embeds
-    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
-    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
-
-    # copy layer norm
-    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
-
-    # copy hidden layers
-    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
-
-    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
-
-    return hf_model
-
-
 def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False):
     try:
         config = CLIPTextConfig.from_pretrained(LDM_CLIP_CONFIG_NAME, local_files_only=local_files_only)
@@ -1151,13 +1085,6 @@ def create_text_encoders_and_tokenizers(
             "text_encoder_2": text_encoder_2,
         }
 
-    elif pipeline_class_name == "LDMTextToImagePipeline":
-        text_config = create_ldm_bert_config(original_config)
-        text_encoder = convert_ldm_bert_checkpoint(checkpoint, text_config)
-        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
-
-        return {"text_encoder": text_encoder, "tokenizer": tokenizer}
-
     return
 
 

From df4a8eaf37751a95d43509787dc9046befeb108b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 17 Jan 2024 10:24:26 +0000
Subject: [PATCH 53/89] update

---
 src/diffusers/loaders/single_file_utils.py | 62 ++--------------------
 1 file changed, 3 insertions(+), 59 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index eb84d8852e27..1b43f8c285c1 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -23,17 +23,13 @@
 import yaml
 from safetensors.torch import load_file as safe_load
 from transformers import (
-    BertTokenizerFast,
-    CLIPImageProcessor,
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
-    CLIPVisionModelWithProjection,
 )
 
 from ..models import AutoencoderKL, UNet2DConditionModel
-from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from ..schedulers import (
     DDIMScheduler,
     DDPMScheduler,
@@ -149,6 +145,8 @@
 LDM_CLIP_CONFIG_NAME = "openai/clip-vit-large-patch14"
 LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
 
+SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = ['cond_stage_model.model.transformer.resblocks.23.attn.in_proj_bias', 'cond_stage_model.model.transformer.resblocks.23.attn.in_proj_weight', 'cond_stage_model.model.transformer.resblocks.23.attn.out_proj.bias', 'cond_stage_model.model.transformer.resblocks.23.attn.out_proj.weight', 'cond_stage_model.model.transformer.resblocks.23.ln_1.bias', 'cond_stage_model.model.transformer.resblocks.23.ln_1.weight', 'cond_stage_model.model.transformer.resblocks.23.ln_2.bias', 'cond_stage_model.model.transformer.resblocks.23.ln_2.weight', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.bias', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.weight', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.bias', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.weight', 'cond_stage_model.model.text_projection']
+
 textenc_conversion_lst = [
     ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
     ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
@@ -796,7 +794,6 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False):
     return text_model
 
 
-# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_open_clip_checkpoint
 def convert_open_clip_checkpoint(
     checkpoint,
     config_name,
@@ -818,12 +815,7 @@ def convert_open_clip_checkpoint(
 
     keys = list(checkpoint.keys())
 
-    keys_to_ignore = []
-    if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
-        # make sure to remove all keys > 22
-        keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
-        keys_to_ignore += ["cond_stage_model.model.text_projection"]
-
+    keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE
     text_model_dict = {}
 
     if prefix + "text_projection" in checkpoint:
@@ -832,7 +824,6 @@ def convert_open_clip_checkpoint(
         d_model = 1024
 
     text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
-
     for key in keys:
         if key in keys_to_ignore:
             continue
@@ -875,53 +866,6 @@ def convert_open_clip_checkpoint(
     return text_model
 
 
-def stable_unclip_image_encoder(original_config, local_files_only=False):
-    """
-    Returns the image processor and clip image encoder for the img2img unclip pipeline.
-
-    We currently know of two types of stable unclip models which separately use the clip and the openclip image
-    encoders.
-    """
-
-    image_embedder_config = original_config["model"]["params"].embedder_config
-
-    sd_clip_image_embedder_class = image_embedder_config.target
-    sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1]
-
-    if sd_clip_image_embedder_class == "ClipImageEmbedder":
-        clip_model_name = image_embedder_config.params.model
-
-        if clip_model_name == "ViT-L/14":
-            feature_extractor = CLIPImageProcessor()
-            image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-                "openai/clip-vit-large-patch14", local_files_only=local_files_only
-            )
-        else:
-            raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
-
-    elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
-        feature_extractor = CLIPImageProcessor()
-        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-            "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only
-        )
-    else:
-        raise NotImplementedError(
-            f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
-        )
-
-    return feature_extractor, image_encoder
-
-
-def create_ldm_bert_config(original_config):
-    bert_params = original_config["model"]["params"].cond_stage_config.params
-    config = LDMBertConfig(
-        d_model=bert_params.n_embed,
-        encoder_layers=bert_params.n_layer,
-        encoder_ffn_dim=bert_params.n_embed * 4,
-    )
-    return config
-
-
 def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
     if "num_in_channels" in kwargs:
         num_in_channels = kwargs.get("num_in_channels")

From 249f78e1c85906cadc558cd9b7532b0c0227728d Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 17 Jan 2024 11:55:18 +0000
Subject: [PATCH 54/89] update

---
 src/diffusers/loaders/single_file.py       | 18 +++--
 src/diffusers/loaders/single_file_utils.py | 93 +++++++++++++++++++++-
 2 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 228345cd80cb..2e89de85b541 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -281,7 +281,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         torch_dtype = kwargs.pop("torch_dtype", None)
         use_safetensors = kwargs.pop("use_safetensors", True)
 
-        pipeline_class_name = cls.__name__
+        class_name = cls.__name__
         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
         from_safetensors = file_extension == "safetensors"
 
@@ -315,14 +315,22 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
-        original_config = fetch_original_config(pipeline_class_name, checkpoint, original_config_file, config_files)
-        component_names = extract_pipeline_component_names(cls)
+        original_config = fetch_original_config(class_name, checkpoint, original_config_file, config_files)
+
+        if class_name == "AutoencoderKL":
+            component = build_component({}, "vae", original_config, checkpoint, pretrained_model_link_or_path)
+            return component["vae"]
 
+        if class_name == "ControlNetModel":
+            component = build_component({}, "controlnet", original_config, checkpoint, pretrained_model_link_or_path)
+            return component["controlnet"]
+
+        component_names = extract_pipeline_component_names(cls)
         pipeline_components = {}
         for component in component_names:
             components = build_component(
                 pipeline_components,
-                pipeline_class_name,
+                class_name,
                 component,
                 original_config,
                 checkpoint,
@@ -335,7 +343,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         additional_components = set(component_names - pipeline_components.keys())
         if additional_components:
-            components = build_additional_components(pipeline_class_name, original_config, **kwargs)
+            components = build_additional_components(class_name, original_config, **kwargs)
             if components:
                 pipeline_components.update(components)
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 1b43f8c285c1..71fa08c189d3 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -55,6 +55,7 @@
     "xl": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml",
     "xl_refiner": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml",
     "upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml",
+    "controlnet": "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml",
 }
 
 CHECKPOINT_KEY_NAMES = {
@@ -172,7 +173,7 @@
 textenc_pattern = re.compile("|".join(protected.keys()))
 
 
-def fetch_original_config_file_from_url(pipeline_class_name, checkpoint):
+def fetch_original_config_file_from_url(class_name, checkpoint):
     if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
         config_url = CONFIG_URLS["v2"]
 
@@ -182,9 +183,12 @@ def fetch_original_config_file_from_url(pipeline_class_name, checkpoint):
     elif CHECKPOINT_KEY_NAMES["xl_refiner"] in checkpoint:
         config_url = CONFIG_URLS["xl_refiner"]
 
-    elif pipeline_class_name == "StableDiffusionUpscalePipeline":
+    elif class_name == "StableDiffusionUpscalePipeline":
         config_url = CONFIG_URLS["upscale"]
 
+    elif class_name == "ControlNetModel":
+        config_url = CONFIG_URLS["controlnet"]
+
     else:
         config_url = CONFIG_URLS["v1"]
 
@@ -414,6 +418,14 @@ def create_unet_diffusers_config(original_config, image_size: int):
     return config
 
 
+def create_controlnet_diffusers_config(original_config, image_size: int):
+    unet_params = original_config["model"]["params"]["control_stage_config"]["params"]
+    config = create_unet_diffusers_config(original_config, image_size=image_size)
+
+    config["conditioning_channels"] = unet_params["hint_channels"]
+
+    return config
+
 def create_vae_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.
@@ -627,6 +639,83 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
     return new_checkpoint
 
 
+def convert_controlnet_checkpoint(
+    checkpoint,
+    original_config,
+    checkpoint_path,
+    image_size,
+    upcast_attention,
+    extract_ema,
+    use_linear_projection=None,
+    cross_attention_dim=None,
+):
+
+    """"
+    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    ctrlnet_config["upcast_attention"] = upcast_attention
+
+    ctrlnet_config.pop("sample_size")
+
+    if use_linear_projection is not None:
+        ctrlnet_config["use_linear_projection"] = use_linear_projection
+
+    if cross_attention_dim is not None:
+        ctrlnet_config["cross_attention_dim"] = cross_attention_dim
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        controlnet = ControlNetModel(**ctrlnet_config)
+    """
+
+    # Some controlnet ckpt files are distributed independently from the rest of the
+    # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
+    if "time_embed.0.weight" in checkpoint:
+        skip_extract_state_dict = True
+    else:
+        skip_extract_state_dict = False
+
+    new_checkpoint = convert_ldm_unet_checkpoint(checkpoint, original_config)
+    orig_index = 0
+
+    new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+        f"input_hint_block.{orig_index}.weight"
+    )
+    new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+        f"input_hint_block.{orig_index}.bias"
+    )
+
+    orig_index += 2
+    diffusers_index = 0
+
+    while diffusers_index < 6:
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+        diffusers_index += 1
+        orig_index += 2
+
+    new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+        f"input_hint_block.{orig_index}.weight"
+    )
+    new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+        f"input_hint_block.{orig_index}.bias"
+    )
+
+    # down blocks
+    for i in range(num_input_blocks):
+        new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+        new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+
+    # mid block
+    new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+    new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+
+    return new_checkpoint
+
+
 def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
     for ldm_key in keys:
         diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut")

From 8a24733654ebdcfc107c307a58fbdb7610aad653 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 17 Jan 2024 16:02:08 +0000
Subject: [PATCH 55/89] update

---
 src/diffusers/loaders/single_file.py       |   5 +-
 src/diffusers/loaders/single_file_utils.py | 114 ++++++++++++++++-----
 src/diffusers/models/controlnet.py         |   4 +-
 3 files changed, 93 insertions(+), 30 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 2e89de85b541..78659de3326f 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -27,6 +27,7 @@
     logging,
 )
 from .single_file_utils import (
+    create_controlnet_model,
     create_scheduler,
     create_text_encoders_and_tokenizers,
     create_unet_model,
@@ -318,11 +319,11 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         original_config = fetch_original_config(class_name, checkpoint, original_config_file, config_files)
 
         if class_name == "AutoencoderKL":
-            component = build_component({}, "vae", original_config, checkpoint, pretrained_model_link_or_path)
+            component = create_vae_model(class_name, original_config, checkpoint, pretrained_model_link_or_path)
             return component["vae"]
 
         if class_name == "ControlNetModel":
-            component = build_component({}, "controlnet", original_config, checkpoint, pretrained_model_link_or_path)
+            component = create_controlnet_model(class_name, original_config, checkpoint, **kwargs)
             return component["controlnet"]
 
         component_names = extract_pipeline_component_names(cls)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 71fa08c189d3..fae615951592 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -139,6 +139,10 @@
         "token_embedding.weight": "transformer.text_model.embeddings.token_embedding.weight",
         "positional_embedding": "transformer.text_model.embeddings.position_embedding.weight",
     },
+    "controlnet" : {
+        "controlnet_cond_embedding.conv_in.weight": "input_hint_block.0.weight",
+        "controlnet_cond_embedding.conv_in.bias": "input_hint_block.0.bias"
+    }
 }
 
 LDM_VAE_KEY = "first_stage_model."
@@ -510,14 +514,16 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
     new_checkpoint = {}
     ldm_unet_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["layers"]
     for diffusers_key, ldm_key in ldm_unet_keys.items():
+        if ldm_key not in unet_state_dict:
+            continue
         new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
 
-    if config["class_embed_type"] in ["timestep", "projection"]:
+    if ("class_embed_type" in config) and (config["class_embed_type"] in ["timestep", "projection"]):
         class_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["class_embed_type"]
         for diffusers_key, ldm_key in class_embed_keys.items():
             new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
 
-    if config["addition_embed_type"] == "text_time":
+    if ("addition_embed_type" in config) and (config["addition_embed_type"] == "text_time"):
         addition_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["addition_embed_type"]
         for diffusers_key, ldm_key in addition_embed_keys.items():
             new_checkpoint[diffusers_key] = unet_state_dict[ldm_key]
@@ -641,16 +647,10 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
 
 def convert_controlnet_checkpoint(
     checkpoint,
-    original_config,
-    checkpoint_path,
-    image_size,
-    upcast_attention,
-    extract_ema,
-    use_linear_projection=None,
-    cross_attention_dim=None,
+    config,
 ):
 
-    """"
+    """
     ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size)
     ctrlnet_config["upcast_attention"] = upcast_attention
 
@@ -674,48 +674,108 @@ def convert_controlnet_checkpoint(
     else:
         skip_extract_state_dict = False
 
-    new_checkpoint = convert_ldm_unet_checkpoint(checkpoint, original_config)
-    orig_index = 0
+    new_checkpoint = {}
+    ldm_controlnet_keys = DIFFUSERS_TO_LDM_MAPPING["controlnet"]
+    for diffusers_key, ldm_key in ldm_controlnet_keys.items():
+        if ldm_key not in checkpoint:
+            continue
+        new_checkpoint[diffusers_key] = checkpoint[ldm_key]
 
-    new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
-        f"input_hint_block.{orig_index}.weight"
-    )
-    new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
-        f"input_hint_block.{orig_index}.bias"
-    )
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in checkpoint if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
 
+    # Down blocks
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        update_unet_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            checkpoint,
+            {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"},
+        )
+
+        if f"input_blocks.{i}.0.op.weight" in checkpoint:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        if attentions:
+            update_unet_attention_ldm_to_diffusers(
+                attentions,
+                new_checkpoint,
+                checkpoint,
+                {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"},
+            )
+
+    orig_index = 0
     orig_index += 2
     diffusers_index = 0
 
     while diffusers_index < 6:
-        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = checkpoint.pop(
             f"input_hint_block.{orig_index}.weight"
         )
-        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = checkpoint.pop(
             f"input_hint_block.{orig_index}.bias"
         )
         diffusers_index += 1
         orig_index += 2
 
-    new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+    new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = checkpoint.pop(
         f"input_hint_block.{orig_index}.weight"
     )
-    new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+    new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = checkpoint.pop(
         f"input_hint_block.{orig_index}.bias"
     )
 
     # down blocks
     for i in range(num_input_blocks):
-        new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
-        new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+        new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = checkpoint.pop(f"zero_convs.{i}.0.weight")
+        new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = checkpoint.pop(f"zero_convs.{i}.0.bias")
 
     # mid block
-    new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
-    new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+    new_checkpoint["controlnet_mid_block.weight"] = checkpoint.pop("middle_block_out.0.weight")
+    new_checkpoint["controlnet_mid_block.bias"] = checkpoint.pop("middle_block_out.0.bias")
 
     return new_checkpoint
 
 
+def create_controlnet_model(
+    pipeline_class_name, original_config, checkpoint, **kwargs
+):
+    from ..models import ControlNetModel
+
+    image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
+    config = create_controlnet_diffusers_config(original_config, image_size=image_size)
+    diffusers_format_controlnet_checkpoint = convert_controlnet_checkpoint(checkpoint, original_config)
+
+    ctx = init_empty_weights if is_accelerate_available() else nullcontext
+    with ctx():
+        controlnet = ControlNetModel(**config)
+
+    if is_accelerate_available():
+        for param_name, param in diffusers_format_controlnet_checkpoint.items():
+            set_module_tensor_to_device(controlnet, param_name, "cpu", value=param)
+    else:
+        controlnet.load_state_dict(diffusers_format_controlnet_checkpoint)
+
+    return {"controlnet": controlnet}
+
+
+
 def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
     for ldm_key in keys:
         diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut")
@@ -999,6 +1059,8 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
 
 
 def create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+    from ..models import AutoencoderKL
+
     image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
 
     vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py
index 1102f4f9d36d..8af13a6ec7d2 100644
--- a/src/diffusers/models/controlnet.py
+++ b/src/diffusers/models/controlnet.py
@@ -19,7 +19,7 @@
 from torch.nn import functional as F
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import FromOriginalControlnetMixin
+from ..loaders import FromSingleFileMixin
 from ..utils import BaseOutput, logging
 from .attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -102,7 +102,7 @@ def forward(self, conditioning):
         return embedding
 
 
-class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
+class ControlNetModel(ModelMixin, ConfigMixin, FromSingleFileMixin):
     """
     A ControlNet model.
 

From de77ff6831da64d9d1f8652bfc23c13f7141f407 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 18 Jan 2024 04:12:20 +0000
Subject: [PATCH 56/89] update

---
 src/diffusers/loaders/single_file.py       |   1 +
 src/diffusers/loaders/single_file_utils.py | 182 +++++++++++++--------
 2 files changed, 111 insertions(+), 72 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 78659de3326f..723fe9462c49 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -26,6 +26,7 @@
     is_transformers_available,
     logging,
 )
+from ..utils.hub_utils import _get_model_file
 from .single_file_utils import (
     create_controlnet_model,
     create_scheduler,
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index fae615951592..7ad49cc58b83 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -29,7 +29,7 @@
     CLIPTokenizer,
 )
 
-from ..models import AutoencoderKL, UNet2DConditionModel
+from ..models import UNet2DConditionModel
 from ..schedulers import (
     DDIMScheduler,
     DDPMScheduler,
@@ -105,6 +105,26 @@
             "add_embedding.linear_2.bias": "label_emb.0.2.bias",
         },
     },
+    "controlnet": {
+        "layers": {
+            "controlnet_cond_embedding.conv_in.weight": "input_hint_block.0.weight",
+            "controlnet_cond_embedding.conv_in.bias": "input_hint_block.0.bias",
+            "controlnet_cond_embedding.conv_out.weight": "input_hint_block.14.weight",
+            "controlnet_cond_embedding.conv_out.bias": "input_hint_block.14.bias",
+        },
+        "class_embed_type": {
+            "class_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "class_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "class_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "class_embedding.linear_2.bias": "label_emb.0.2.bias",
+        },
+        "addition_embed_type": {
+            "add_embedding.linear_1.weight": "label_emb.0.0.weight",
+            "add_embedding.linear_1.bias": "label_emb.0.0.bias",
+            "add_embedding.linear_2.weight": "label_emb.0.2.weight",
+            "add_embedding.linear_2.bias": "label_emb.0.2.bias",
+        },
+    },
     "vae": {
         "encoder.conv_in.weight": "encoder.conv_in.weight",
         "encoder.conv_in.bias": "encoder.conv_in.bias",
@@ -139,18 +159,30 @@
         "token_embedding.weight": "transformer.text_model.embeddings.token_embedding.weight",
         "positional_embedding": "transformer.text_model.embeddings.position_embedding.weight",
     },
-    "controlnet" : {
-        "controlnet_cond_embedding.conv_in.weight": "input_hint_block.0.weight",
-        "controlnet_cond_embedding.conv_in.bias": "input_hint_block.0.bias"
-    }
 }
 
+
 LDM_VAE_KEY = "first_stage_model."
 LDM_UNET_KEY = "model.diffusion_model."
+LDM_CONTROLNET_KEY = "control_model."
 LDM_CLIP_CONFIG_NAME = "openai/clip-vit-large-patch14"
 LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
 
-SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = ['cond_stage_model.model.transformer.resblocks.23.attn.in_proj_bias', 'cond_stage_model.model.transformer.resblocks.23.attn.in_proj_weight', 'cond_stage_model.model.transformer.resblocks.23.attn.out_proj.bias', 'cond_stage_model.model.transformer.resblocks.23.attn.out_proj.weight', 'cond_stage_model.model.transformer.resblocks.23.ln_1.bias', 'cond_stage_model.model.transformer.resblocks.23.ln_1.weight', 'cond_stage_model.model.transformer.resblocks.23.ln_2.bias', 'cond_stage_model.model.transformer.resblocks.23.ln_2.weight', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.bias', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.weight', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.bias', 'cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.weight', 'cond_stage_model.model.text_projection']
+SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = [
+    "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_bias",
+    "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_weight",
+    "cond_stage_model.model.transformer.resblocks.23.attn.out_proj.bias",
+    "cond_stage_model.model.transformer.resblocks.23.attn.out_proj.weight",
+    "cond_stage_model.model.transformer.resblocks.23.ln_1.bias",
+    "cond_stage_model.model.transformer.resblocks.23.ln_1.weight",
+    "cond_stage_model.model.transformer.resblocks.23.ln_2.bias",
+    "cond_stage_model.model.transformer.resblocks.23.ln_2.weight",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.bias",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.weight",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.bias",
+    "cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.weight",
+    "cond_stage_model.model.text_projection",
+]
 
 textenc_conversion_lst = [
     ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
@@ -424,11 +456,26 @@ def create_unet_diffusers_config(original_config, image_size: int):
 
 def create_controlnet_diffusers_config(original_config, image_size: int):
     unet_params = original_config["model"]["params"]["control_stage_config"]["params"]
-    config = create_unet_diffusers_config(original_config, image_size=image_size)
+    diffusers_unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+
+    controlnet_config = {
+        "conditioning_channels": unet_params["hint_channels"],
+        "in_channels": diffusers_unet_config["in_channels"],
+        "down_block_types": diffusers_unet_config["down_block_types"],
+        "block_out_channels": diffusers_unet_config["block_out_channels"],
+        "layers_per_block": diffusers_unet_config["layers_per_block"],
+        "cross_attention_dim": diffusers_unet_config["cross_attention_dim"],
+        "attention_head_dim": diffusers_unet_config["attention_head_dim"],
+        "use_linear_projection": diffusers_unet_config["use_linear_projection"],
+        "class_embed_type": diffusers_unet_config["class_embed_type"],
+        "addition_embed_type": diffusers_unet_config["addition_embed_type"],
+        "addition_time_embed_dim": diffusers_unet_config["addition_time_embed_dim"],
+        "projection_class_embeddings_input_dim": diffusers_unet_config["projection_class_embeddings_input_dim"],
+        "transformer_layers_per_block": diffusers_unet_config["transformer_layers_per_block"],
+    }
 
-    config["conditioning_channels"] = unet_params["hint_channels"]
+    return controlnet_config
 
-    return config
 
 def create_vae_diffusers_config(original_config, image_size: int):
     """
@@ -475,7 +522,9 @@ def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint,
         new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
 
 
-def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False):
+def convert_ldm_unet_checkpoint(
+    checkpoint, config, unet_key, path=None, extract_ema=False, skip_extract_state_dict=False
+):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
     """
@@ -649,42 +698,32 @@ def convert_controlnet_checkpoint(
     checkpoint,
     config,
 ):
-
-    """
-    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-    ctrlnet_config["upcast_attention"] = upcast_attention
-
-    ctrlnet_config.pop("sample_size")
-
-    if use_linear_projection is not None:
-        ctrlnet_config["use_linear_projection"] = use_linear_projection
-
-    if cross_attention_dim is not None:
-        ctrlnet_config["cross_attention_dim"] = cross_attention_dim
-
-    ctx = init_empty_weights if is_accelerate_available() else nullcontext
-    with ctx():
-        controlnet = ControlNetModel(**ctrlnet_config)
-    """
-
     # Some controlnet ckpt files are distributed independently from the rest of the
     # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
     if "time_embed.0.weight" in checkpoint:
-        skip_extract_state_dict = True
+        controlnet_state_dict = checkpoint
+
     else:
-        skip_extract_state_dict = False
+        controlnet_state_dict = {}
+        keys = list(checkpoint.keys())
+        controlnet_key = LDM_CONTROLNET_KEY
+        for key in keys:
+            if key.startswith(controlnet_key):
+                controlnet_state_dict[key.replace(controlnet_key, "")] = checkpoint.pop(key)
 
     new_checkpoint = {}
     ldm_controlnet_keys = DIFFUSERS_TO_LDM_MAPPING["controlnet"]
     for diffusers_key, ldm_key in ldm_controlnet_keys.items():
-        if ldm_key not in checkpoint:
+        if ldm_key not in controlnet_state_dict:
             continue
-        new_checkpoint[diffusers_key] = checkpoint[ldm_key]
+        new_checkpoint[diffusers_key] = controlnet_state_dict[ldm_key]
 
     # Retrieves the keys for the input blocks only
-    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "input_blocks" in layer})
+    num_input_blocks = len(
+        {".".join(layer.split(".")[:2]) for layer in controlnet_state_dict if "input_blocks" in layer}
+    )
     input_blocks = {
-        layer_id: [key for key in checkpoint if f"input_blocks.{layer_id}" in key]
+        layer_id: [key for key in controlnet_state_dict if f"input_blocks.{layer_id}" in key]
         for layer_id in range(num_input_blocks)
     }
 
@@ -699,15 +738,15 @@ def convert_controlnet_checkpoint(
         update_unet_resnet_ldm_to_diffusers(
             resnets,
             new_checkpoint,
-            checkpoint,
+            controlnet_state_dict,
             {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"},
         )
 
-        if f"input_blocks.{i}.0.op.weight" in checkpoint:
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint.pop(
+        if f"input_blocks.{i}.0.op.weight" in controlnet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = controlnet_state_dict.pop(
                 f"input_blocks.{i}.0.op.weight"
             )
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint.pop(
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = controlnet_state_dict.pop(
                 f"input_blocks.{i}.0.op.bias"
             )
 
@@ -716,55 +755,55 @@ def convert_controlnet_checkpoint(
             update_unet_attention_ldm_to_diffusers(
                 attentions,
                 new_checkpoint,
-                checkpoint,
+                controlnet_state_dict,
                 {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"},
             )
 
-    orig_index = 0
-    orig_index += 2
-    diffusers_index = 0
-
-    while diffusers_index < 6:
-        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = checkpoint.pop(
-            f"input_hint_block.{orig_index}.weight"
-        )
-        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = checkpoint.pop(
-            f"input_hint_block.{orig_index}.bias"
-        )
-        diffusers_index += 1
-        orig_index += 2
-
-    new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = checkpoint.pop(
-        f"input_hint_block.{orig_index}.weight"
-    )
-    new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = checkpoint.pop(
-        f"input_hint_block.{orig_index}.bias"
-    )
-
-    # down blocks
+    # controlnet down blocks
     for i in range(num_input_blocks):
-        new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = checkpoint.pop(f"zero_convs.{i}.0.weight")
-        new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = checkpoint.pop(f"zero_convs.{i}.0.bias")
+        new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.weight")
+        new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.bias")
 
     # mid block
-    new_checkpoint["controlnet_mid_block.weight"] = checkpoint.pop("middle_block_out.0.weight")
-    new_checkpoint["controlnet_mid_block.bias"] = checkpoint.pop("middle_block_out.0.bias")
+    new_checkpoint["controlnet_mid_block.weight"] = controlnet_state_dict.pop("middle_block_out.0.weight")
+    new_checkpoint["controlnet_mid_block.bias"] = controlnet_state_dict.pop("middle_block_out.0.bias")
+
+    # controlnet cond embedding blocks
+    cond_embedding_blocks = {
+        ".".join(layer.split(".")[:2])
+        for layer in controlnet_state_dict
+        if "input_hint_block" in layer and ("input_hint_block.0" not in layer) and ("input_hint_block.14" not in layer)
+    }
+    num_cond_embedding_blocks = len(cond_embedding_blocks)
+
+    for idx in range(1, num_cond_embedding_blocks):
+        diffusers_idx = idx - 1
+        cond_block_id = 2 * idx
+
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.weight"] = controlnet_state_dict.pop(
+            f"input_hint_block.{cond_block_id}.weight"
+        )
+        new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.bias"] = controlnet_state_dict.pop(
+            f"input_hint_block.{cond_block_id}.bias"
+        )
 
     return new_checkpoint
 
 
-def create_controlnet_model(
-    pipeline_class_name, original_config, checkpoint, **kwargs
-):
+def create_controlnet_model(pipeline_class_name, original_config, checkpoint, **kwargs):
     from ..models import ControlNetModel
 
     image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
-    config = create_controlnet_diffusers_config(original_config, image_size=image_size)
-    diffusers_format_controlnet_checkpoint = convert_controlnet_checkpoint(checkpoint, original_config)
+    upcast_attention = kwargs.get("upcast_attention", False)
+
+    diffusers_config = create_controlnet_diffusers_config(original_config, image_size=image_size)
+    diffusers_config["upcast_attention"] = upcast_attention
+
+    diffusers_format_controlnet_checkpoint = convert_controlnet_checkpoint(checkpoint, diffusers_config)
 
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
     with ctx():
-        controlnet = ControlNetModel(**config)
+        controlnet = ControlNetModel(**diffusers_config)
 
     if is_accelerate_available():
         for param_name, param in diffusers_format_controlnet_checkpoint.items():
@@ -775,7 +814,6 @@ def create_controlnet_model(
     return {"controlnet": controlnet}
 
 
-
 def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
     for ldm_key in keys:
         diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut")

From 0939565b29f4138a715232303569d8b40b7290a4 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 18 Jan 2024 05:55:26 +0000
Subject: [PATCH 57/89] update

---
 src/diffusers/loaders/single_file.py      | 68 ++++++++++++++++++++++-
 src/diffusers/pipelines/pipeline_utils.py |  9 ++-
 2 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 723fe9462c49..88cc06ba0bd4 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
+import os
+import re
 from pathlib import Path
 
 import torch
@@ -20,6 +22,8 @@
 from safetensors.torch import load_file as safe_load
 from transformers import AutoFeatureExtractor
 
+from ..models.modeling_utils import load_state_dict
+from ..pipelines.pipeline_utils import _get_pipeline_class
 from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..utils import (
     is_accelerate_available,
@@ -125,6 +129,22 @@ def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True)
     return checkpoint
 
 
+def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
+    pattern = r'([^/]+)/([^/]+)/(?:blob/main/)?(.+)'
+    weights_name = None
+    repo_id = None,
+    for prefix in VALID_URL_PREFIXES:
+        pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
+    match = re.match(pattern, pretrained_model_name_or_path)
+    if not match:
+        return repo_id, weights_name
+
+    repo_id = f"{match.group(1)}/{match.group(2)}"
+    weights_name = match.group(3)
+
+    return repo_id, weights_name
+
+
 def build_component(
     pipeline_components,
     pipeline_class_name,
@@ -292,6 +312,14 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         has_valid_url_prefix, pretrained_model_link_or_path = check_valid_url(pretrained_model_link_or_path)
 
+        """
+        if os.path.isfile(pretrained_model_link_or_path):
+            checkpoint = load_state_dict(pretrained_model_link_or_path)
+        else:
+            repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path)
+            checkpoint_path = _get_model_file(repo_id, weights_name=weights_name, use_safetensors=from_safetensors)
+            checkpoint = load_state_dict(checkpoint_path)
+        """
         # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
         ckpt_path = Path(pretrained_model_link_or_path)
         if (not ckpt_path.is_file()) and (not has_valid_url_prefix):
@@ -327,6 +355,44 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             component = create_controlnet_model(class_name, original_config, checkpoint, **kwargs)
             return component["controlnet"]
 
+        pipeline_class = _get_pipeline_class(cls, class_name=class_name)
+
+        # some modules can be passed directly to the init
+        # in this case they are already instantiated in `kwargs`
+        # extract them here
+        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+
+        expected_keys = cls._get_init_keys(cls)
+        expected_keys.remove("self")
+        # remove general kwargs if present in dict
+        if "kwargs" in expected_keys:
+            expected_keys.remove("kwargs")
+
+        init_dict = {}
+        for key in expected_keys:
+            if key in kwargs:
+                init_dict[key] = kwargs.pop(key)
+
+        # define init kwargs and make sure that optional component modules are filtered out
+        init_kwargs = {
+            k: init_dict.pop(k)
+            for k in kwargs
+            if k in init_dict and k not in pipeline_class._optional_components
+        }
+        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
+
+        # remove `null` components
+        def load_module(name, value):
+            if value[0] is None:
+                return False
+            if name in passed_class_obj and passed_class_obj[name] is None:
+                return False
+            return True
+
+        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
+
         component_names = extract_pipeline_component_names(cls)
         pipeline_components = {}
         for component in component_names:
@@ -349,7 +415,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             if components:
                 pipeline_components.update(components)
 
-        pipe = cls(**pipeline_components)
+        pipe = pipeline_class(**pipeline_components)
 
         if torch_dtype is not None:
             pipe.to(dtype=torch_dtype)
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index de5dea679ee9..4d0bc8b13a92 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -351,7 +351,7 @@ def get_class_obj_and_candidates(
 
 def _get_pipeline_class(
     class_obj,
-    config,
+    config=None,
     load_connected_pipeline=False,
     custom_pipeline=None,
     repo_id=None,
@@ -389,7 +389,12 @@ def _get_pipeline_class(
         return class_obj
 
     diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
-    class_name = config["_class_name"]
+    class_name = class_name or (config["_class_name"] if config is not None else None)
+    if not class_name:
+        raise ValueError(
+            "The class name could not be found in the configuration file. Please make sure to pass the correct `class_name`."
+        )
+
     class_name = class_name[4:] if class_name.startswith("Flax") else class_name
 
     pipeline_cls = getattr(diffusers_module, class_name)

From c22c2aa28bd5d351f8096d613b9e4025c1a1ded0 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 18 Jan 2024 06:58:06 +0000
Subject: [PATCH 58/89] update

---
 src/diffusers/loaders/single_file.py       | 130 +++------------------
 src/diffusers/loaders/single_file_utils.py |   7 +-
 src/diffusers/utils/hub_utils.py           |  18 +--
 3 files changed, 31 insertions(+), 124 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 88cc06ba0bd4..4b23ed8fa086 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -59,6 +59,14 @@
     "StableDiffusionXLControlNetImg2ImgPipeline",
 ]
 
+LOADABLE_CLASSES = {
+    "diffusers": {
+        "ControlNetModel": "create_controlnet_model",
+        "AutoencoderKL": "create_vae_model",
+        "UNet2DConditionModel": "create_unet_model",
+    }
+}
+
 
 def extract_pipeline_component_names(pipeline_class):
     components = inspect.signature(pipeline_class).parameters.keys()
@@ -77,58 +85,6 @@ def check_valid_url(pretrained_model_link_or_path):
     return has_valid_url_prefix, pretrained_model_link_or_path
 
 
-def download_model_checkpoint(
-    ckpt_path,
-    cache_dir=None,
-    resume_download=False,
-    force_download=False,
-    proxies=None,
-    local_files_only=None,
-    token=None,
-    revision=None,
-):
-    # get repo_id and (potentially nested) file path of ckpt in repo
-    repo_id = "/".join(ckpt_path.parts[:2])
-    file_path = "/".join(ckpt_path.parts[2:])
-
-    if file_path.startswith("blob/"):
-        file_path = file_path[len("blob/") :]
-
-    if file_path.startswith("main/"):
-        file_path = file_path[len("main/") :]
-
-    path = hf_hub_download(
-        repo_id,
-        filename=file_path,
-        cache_dir=cache_dir,
-        resume_download=resume_download,
-        proxies=proxies,
-        local_files_only=local_files_only,
-        token=token,
-        revision=revision,
-        force_download=force_download,
-    )
-
-    return path
-
-
-def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True):
-    if device is None:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    if isinstance(checkpoint_path_or_dict, str):
-        if from_safetensors:
-            checkpoint = safe_load(checkpoint_path_or_dict, device="cpu")
-
-        else:
-            checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
-
-    elif isinstance(checkpoint_path_or_dict, dict):
-        checkpoint = checkpoint_path_or_dict
-
-    return checkpoint
-
-
 def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
     pattern = r'([^/]+)/([^/]+)/(?:blob/main/)?(.+)'
     weights_name = None
@@ -295,6 +251,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         original_config_file = kwargs.pop("original_config_file", None)
         config_files = kwargs.pop("config_files", None)
         resume_download = kwargs.pop("resume_download", False)
+        force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         cache_dir = kwargs.pop("cache_dir", None)
@@ -310,40 +267,23 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         if from_safetensors and use_safetensors is False:
             raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
 
-        has_valid_url_prefix, pretrained_model_link_or_path = check_valid_url(pretrained_model_link_or_path)
-
-        """
         if os.path.isfile(pretrained_model_link_or_path):
             checkpoint = load_state_dict(pretrained_model_link_or_path)
         else:
             repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path)
-            checkpoint_path = _get_model_file(repo_id, weights_name=weights_name, use_safetensors=from_safetensors)
-            checkpoint = load_state_dict(checkpoint_path)
-        """
-        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
-        ckpt_path = Path(pretrained_model_link_or_path)
-        if (not ckpt_path.is_file()) and (not has_valid_url_prefix):
-            raise ValueError(
-                f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(VALID_URL_PREFIXES)}"
-            )
-        if ckpt_path.is_file():
-            checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
-        else:
-            pretrained_model_link_or_path = download_model_checkpoint(
-                ckpt_path,
+            checkpoint_path = _get_model_file(
+                repo_id,
+                weights_name=weights_name,
+                force_download=force_download,
                 cache_dir=cache_dir,
                 resume_download=resume_download,
                 proxies=proxies,
                 local_files_only=local_files_only,
                 token=token,
                 revision=revision,
-            )
-            checkpoint = load_checkpoint(pretrained_model_link_or_path, from_safetensors=from_safetensors)
 
-        # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
-        # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
-        while "state_dict" in checkpoint:
-            checkpoint = checkpoint["state_dict"]
+            )
+            checkpoint = load_state_dict(checkpoint_path)
 
         original_config = fetch_original_config(class_name, checkpoint, original_config_file, config_files)
 
@@ -355,44 +295,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             component = create_controlnet_model(class_name, original_config, checkpoint, **kwargs)
             return component["controlnet"]
 
-        pipeline_class = _get_pipeline_class(cls, class_name=class_name)
-
-        # some modules can be passed directly to the init
-        # in this case they are already instantiated in `kwargs`
-        # extract them here
-        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
-        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
-        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
-
-        expected_keys = cls._get_init_keys(cls)
-        expected_keys.remove("self")
-        # remove general kwargs if present in dict
-        if "kwargs" in expected_keys:
-            expected_keys.remove("kwargs")
-
-        init_dict = {}
-        for key in expected_keys:
-            if key in kwargs:
-                init_dict[key] = kwargs.pop(key)
-
-        # define init kwargs and make sure that optional component modules are filtered out
-        init_kwargs = {
-            k: init_dict.pop(k)
-            for k in kwargs
-            if k in init_dict and k not in pipeline_class._optional_components
-        }
-        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
-
-        # remove `null` components
-        def load_module(name, value):
-            if value[0] is None:
-                return False
-            if name in passed_class_obj and passed_class_obj[name] is None:
-                return False
-            return True
-
-        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
-
         component_names = extract_pipeline_component_names(cls)
         pipeline_components = {}
         for component in component_names:
@@ -415,7 +317,7 @@ def load_module(name, value):
             if components:
                 pipeline_components.update(components)
 
-        pipe = pipeline_class(**pipeline_components)
+        pipe = cls(**pipeline_components)
 
         if torch_dtype is not None:
             pipe.to(dtype=torch_dtype)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 7ad49cc58b83..92e74df660e6 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -523,7 +523,7 @@ def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint,
 
 
 def convert_ldm_unet_checkpoint(
-    checkpoint, config, unet_key, path=None, extract_ema=False, skip_extract_state_dict=False
+    checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False
 ):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
@@ -793,6 +793,11 @@ def convert_controlnet_checkpoint(
 def create_controlnet_model(pipeline_class_name, original_config, checkpoint, **kwargs):
     from ..models import ControlNetModel
 
+    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
+    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
     image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
     upcast_attention = kwargs.get("upcast_attention", False)
 
diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py
index d762f015a7bc..8a5d163af154 100644
--- a/src/diffusers/utils/hub_utils.py
+++ b/src/diffusers/utils/hub_utils.py
@@ -244,15 +244,15 @@ def _get_model_file(
     pretrained_model_name_or_path: Union[str, Path],
     *,
     weights_name: str,
-    subfolder: Optional[str],
-    cache_dir: Optional[str],
-    force_download: bool,
-    proxies: Optional[Dict],
-    resume_download: bool,
-    local_files_only: bool,
-    token: Optional[str],
-    user_agent: Union[Dict, str, None],
-    revision: Optional[str],
+    subfolder: Optional[str] = None,
+    cache_dir: Optional[str]= None,
+    force_download: bool = False,
+    proxies: Optional[Dict] = None,
+    resume_download: bool = False,
+    local_files_only: bool = False,
+    token: Optional[str] = None,
+    user_agent: Optional[Union[Dict, str]] = None,
+    revision: Optional[str] = None,
     commit_hash: Optional[str] = None,
 ):
     pretrained_model_name_or_path = str(pretrained_model_name_or_path)

From eb71c80448206e0e5eac20c867c222d357854ce7 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 18 Jan 2024 13:13:54 +0000
Subject: [PATCH 59/89] update

---
 src/diffusers/loaders/single_file.py       |  10 +-
 src/diffusers/loaders/single_file_utils.py | 119 ++++++++++++---------
 src/diffusers/utils/hub_utils.py           |   2 +-
 3 files changed, 69 insertions(+), 62 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 4b23ed8fa086..fd08b5abc22e 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -14,16 +14,11 @@
 import inspect
 import os
 import re
-from pathlib import Path
 
-import torch
-from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import validate_hf_hub_args
-from safetensors.torch import load_file as safe_load
 from transformers import AutoFeatureExtractor
 
 from ..models.modeling_utils import load_state_dict
-from ..pipelines.pipeline_utils import _get_pipeline_class
 from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..utils import (
     is_accelerate_available,
@@ -86,9 +81,9 @@ def check_valid_url(pretrained_model_link_or_path):
 
 
 def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
-    pattern = r'([^/]+)/([^/]+)/(?:blob/main/)?(.+)'
+    pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)"
     weights_name = None
-    repo_id = None,
+    repo_id = None  # sentinel returned when the path does not match the pattern
     for prefix in VALID_URL_PREFIXES:
         pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
     match = re.match(pattern, pretrained_model_name_or_path)
@@ -281,7 +276,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                 local_files_only=local_files_only,
                 token=token,
                 revision=revision,
-
             )
             checkpoint = load_state_dict(checkpoint_path)
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 92e74df660e6..52e196bd24d1 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -144,29 +144,33 @@
         "post_quant_conv.bias": "post_quant_conv.bias",
     },
     "openclip": {
-        # "positional_embedding": "text_model.embeddings.position_embedding.weight",
-        # "token_embedding.weight": "text_model.embeddings.token_embedding.weight",
-        "ln_final.weight": "text_model.final_layer_norm.weight",
-        "ln_final.bias": "text_model.final_layer_norm.bias",
-        "text_projection": "text_projection.weight",
-        "resblocks.": "text_model.encoder.layers.",
-        "ln_1": "layer_norm1",
-        "ln_2": "layer_norm2",
-        ".c_fc.": ".fc1.",
-        ".c_proj.": ".fc2.",
-        ".attn": ".self_attn",
-        "ln_final.": "transformer.text_model.final_layer_norm.",
-        "token_embedding.weight": "transformer.text_model.embeddings.token_embedding.weight",
-        "positional_embedding": "transformer.text_model.embeddings.position_embedding.weight",
+        "layers": {
+            "text_model.embeddings.position_embedding.weight": "positional_embedding",
+            "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
+            "text_model.final_layer_norm.weight": "ln_final.weight",
+            "text_model.final_layer_norm.bias": "ln_final.bias",
+            "text_projection.weight": "text_projection",
+        },
+        "transformer": {
+            "text_model.encoder.layers.": "resblocks.",
+            "layer_norm1": "ln_1",
+            "layer_norm2": "ln_2",
+            ".fc1.": ".c_fc.",
+            ".fc2.": ".c_proj.",
+            ".self_attn": ".attn",
+            "transformer.text_model.final_layer_norm.": "ln_final.",
+            "transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
+            "transformer.text_model.embeddings.position_embedding.weight": "positional_embedding",
+        },
     },
 }
 
-
 LDM_VAE_KEY = "first_stage_model."
 LDM_UNET_KEY = "model.diffusion_model."
 LDM_CONTROLNET_KEY = "control_model."
 LDM_CLIP_CONFIG_NAME = "openai/clip-vit-large-patch14"
 LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
+LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024
 
 SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = [
     "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_bias",
@@ -522,9 +526,7 @@ def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint,
         new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
 
 
-def convert_ldm_unet_checkpoint(
-    checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False
-):
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
     """
@@ -1005,46 +1007,55 @@ def convert_open_clip_checkpoint(
     with ctx():
         text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
 
-    keys = list(checkpoint.keys())
-
-    keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE
     text_model_dict = {}
+    text_proj_key = prefix + "text_projection"
+    text_proj_dim = (
+        int(checkpoint[text_proj_key].shape[0]) if text_proj_key in checkpoint else LDM_OPEN_CLIP_TEXT_PROJECTION_DIM
+    )
+    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
 
-    if prefix + "text_projection" in checkpoint:
-        d_model = int(checkpoint[prefix + "text_projection"].shape[0])
-    else:
-        d_model = 1024
+    openclip_diffusers_ldm_map = DIFFUSERS_TO_LDM_MAPPING["openclip"]["layers"]
+    for diffusers_key, ldm_key in openclip_diffusers_ldm_map.items():
+        ldm_key = prefix + ldm_key
+        if ldm_key not in checkpoint:
+            continue
+        if ldm_key.endswith("text_projection"):
+            text_model_dict[diffusers_key] = checkpoint[ldm_key].T.contiguous()
+        else:
+            text_model_dict[diffusers_key] = checkpoint[ldm_key]
+
+    keys = list(checkpoint.keys())
+    keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE
 
-    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
     for key in keys:
         if key in keys_to_ignore:
             continue
-        if key[len(prefix) :] in textenc_conversion_map:
-            if key.endswith("text_projection"):
-                value = checkpoint[key].T.contiguous()
-            else:
-                value = checkpoint[key]
-
-            text_model_dict[textenc_conversion_map[key[len(prefix) :]]] = value
-
-        if key.startswith(prefix + "transformer."):
-            new_key = key[len(prefix + "transformer.") :]
-            if new_key.endswith(".in_proj_weight"):
-                new_key = new_key[: -len(".in_proj_weight")]
-                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
-                text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
-                text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
-                text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
-            elif new_key.endswith(".in_proj_bias"):
-                new_key = new_key[: -len(".in_proj_bias")]
-                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
-                text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
-                text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
-                text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
-            else:
-                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
 
-                text_model_dict[new_key] = checkpoint[key]
+        if not key.startswith(prefix + "transformer."):
+            continue
+
+        diffusers_key = key.replace(prefix + "transformer.", "")
+        transformer_diffusers_to_ldm_map = DIFFUSERS_TO_LDM_MAPPING["openclip"]["transformer"]
+        for new_key, old_key in transformer_diffusers_to_ldm_map.items():
+            diffusers_key = (
+                diffusers_key.replace(old_key, new_key).replace(".in_proj_weight", "").replace(".in_proj_bias", "")
+            )
+
+        if key.endswith(".in_proj_weight"):
+            weight_value = checkpoint[key]
+
+            text_model_dict[diffusers_key + ".q_proj.weight"] = weight_value[:text_proj_dim, :]
+            text_model_dict[diffusers_key + ".k_proj.weight"] = weight_value[text_proj_dim : text_proj_dim * 2, :]
+            text_model_dict[diffusers_key + ".v_proj.weight"] = weight_value[text_proj_dim * 2 :, :]
+
+        elif key.endswith(".in_proj_bias"):
+            weight_value = checkpoint[key]
+            text_model_dict[diffusers_key + ".q_proj.bias"] = weight_value[:text_proj_dim]
+            text_model_dict[diffusers_key + ".k_proj.bias"] = weight_value[text_proj_dim : text_proj_dim * 2]
+            text_model_dict[diffusers_key + ".v_proj.bias"] = weight_value[text_proj_dim * 2 :]
+
+        else:
+            text_model_dict[diffusers_key] = checkpoint[key]
 
     if is_accelerate_available():
         for param_name, param in text_model_dict.items():
@@ -1174,7 +1185,8 @@ def create_text_encoders_and_tokenizers(
                 local_files_only=local_files_only,
                 **config_kwargs,
             )
-        except Exception:
+        except Exception as e:
+            raise e
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'."
             )
@@ -1193,7 +1205,8 @@ def create_text_encoders_and_tokenizers(
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
             text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
 
-        except Exception:
+        except Exception as e:
+            raise e
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'."
             )
diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py
index 8a5d163af154..6f311b957abf 100644
--- a/src/diffusers/utils/hub_utils.py
+++ b/src/diffusers/utils/hub_utils.py
@@ -245,7 +245,7 @@ def _get_model_file(
     *,
     weights_name: str,
     subfolder: Optional[str] = None,
-    cache_dir: Optional[str]= None,
+    cache_dir: Optional[str] = None,
     force_download: bool = False,
     proxies: Optional[Dict] = None,
     resume_download: bool = False,

From 32349c5ba5ebeaf9f765d89943e08a739751db14 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 18 Jan 2024 15:08:10 +0000
Subject: [PATCH 60/89] update

---
 src/diffusers/loaders/single_file.py       | 117 +++++++-----
 src/diffusers/loaders/single_file_utils.py | 196 ++++++++++-----------
 2 files changed, 166 insertions(+), 147 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index fd08b5abc22e..74f7a134337a 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -19,6 +19,7 @@
 from transformers import AutoFeatureExtractor
 
 from ..models.modeling_utils import load_state_dict
+from ..pipelines.pipeline_utils import _get_pipeline_class
 from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..utils import (
     is_accelerate_available,
@@ -27,11 +28,11 @@
 )
 from ..utils.hub_utils import _get_model_file
 from .single_file_utils import (
-    create_controlnet_model,
-    create_scheduler,
-    create_text_encoders_and_tokenizers,
-    create_unet_model,
-    create_vae_model,
+    create_diffusers_controlnet_model_from_ldm,
+    create_diffusers_unet_model_from_ldm,
+    create_diffusers_vae_model_from_ldm,
+    create_scheduler_from_ldm,
+    create_text_encoders_and_tokenizers_from_ldm,
     fetch_original_config,
     infer_model_type,
 )
@@ -96,46 +97,57 @@ def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
     return repo_id, weights_name
 
 
-def build_component(
+def build_sub_model_components(
     pipeline_components,
     pipeline_class_name,
     component_name,
     original_config,
     checkpoint,
     checkpoint_path_or_dict,
+    local_files_only=False,
+    load_safety_checker=False,
     **kwargs,
 ):
-    if component_name in kwargs:
-        component = kwargs.pop(component_name, None)
-        return {component_name: component}
-
     if component_name in pipeline_components:
         return {}
 
-    load_safety_checker = kwargs.get("load_safety_checker", False)
-    local_files_only = kwargs.get("local_files_only", False)
+    model_type = kwargs.get("model_type", None)
+    image_size = kwargs.pop("image_size", None)
 
     if component_name == "unet":
-        unet_components = create_unet_model(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        num_in_channels = kwargs.pop("num_in_channels", None)
+        unet_components = create_diffusers_unet_model_from_ldm(
+            pipeline_class_name, original_config, checkpoint, num_in_channels=num_in_channels, image_size=image_size
         )
         return unet_components
 
     if component_name == "vae":
-        vae_components = create_vae_model(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        vae_components = create_diffusers_vae_model_from_ldm(
+            pipeline_class_name, original_config, checkpoint, image_size
         )
         return vae_components
 
     if component_name == "scheduler":
-        scheduler_components = create_scheduler(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        scheduler_type = kwargs.get("scheduler_type", "ddim")
+        prediction_type = kwargs.get("prediction_type", None)
+
+        scheduler_components = create_scheduler_from_ldm(
+            pipeline_class_name,
+            original_config,
+            checkpoint,
+            scheduler_type=scheduler_type,
+            prediction_type=prediction_type,
+            model_type=model_type,
         )
+
         return scheduler_components
 
     if component_name in ["text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2"]:
-        text_encoder_components = create_text_encoders_and_tokenizers(
-            pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+        text_encoder_components = create_text_encoders_and_tokenizers_from_ldm(
+            original_config,
+            checkpoint,
+            model_type=model_type,
+            local_files_only=local_files_only,
         )
         return text_encoder_components
 
@@ -156,7 +168,7 @@ def build_component(
     return
 
 
-def build_additional_components(
+def set_additional_components(
     pipeline_class_name,
     original_config,
     **kwargs,
@@ -282,36 +294,57 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         original_config = fetch_original_config(class_name, checkpoint, original_config_file, config_files)
 
         if class_name == "AutoencoderKL":
-            component = create_vae_model(class_name, original_config, checkpoint, pretrained_model_link_or_path)
+            image_size = kwargs.pop("image_size", None)
+            component = create_diffusers_vae_model_from_ldm(
+                class_name, original_config, checkpoint, image_size=image_size
+            )
             return component["vae"]
 
         if class_name == "ControlNetModel":
-            component = create_controlnet_model(class_name, original_config, checkpoint, **kwargs)
-            return component["controlnet"]
+            upcast_attention = kwargs.pop("upcast_attention", False)
+            image_size = kwargs.pop("image_size", None)
 
-        component_names = extract_pipeline_component_names(cls)
-        pipeline_components = {}
-        for component in component_names:
-            components = build_component(
-                pipeline_components,
-                class_name,
-                component,
-                original_config,
-                checkpoint,
-                pretrained_model_link_or_path,
-                **kwargs,
+            component = create_diffusers_controlnet_model_from_ldm(
+                class_name, original_config, checkpoint, upcast_attention=upcast_attention, image_size=image_size
             )
-            if not components:
-                continue
-            pipeline_components.update(components)
+            return component["controlnet"]
+
+        pipeline_class = _get_pipeline_class(
+            cls,
+            config=None,
+            cache_dir=cache_dir,
+        )
 
-        additional_components = set(component_names - pipeline_components.keys())
+        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+
+        init_kwargs = {}
+        for name in expected_modules:
+            if name in passed_class_obj:
+                init_kwargs[name] = passed_class_obj[name]
+            else:
+                components = build_sub_model_components(
+                    init_kwargs,
+                    class_name,
+                    name,
+                    original_config,
+                    checkpoint,
+                    pretrained_model_link_or_path,
+                    **kwargs,
+                )
+                if not components:
+                    continue
+                init_kwargs.update(components)
+
+        additional_components = set(optional_kwargs - init_kwargs.keys())
         if additional_components:
-            components = build_additional_components(class_name, original_config, **kwargs)
+            components = set_additional_components(class_name, original_config, **kwargs)
             if components:
-                pipeline_components.update(components)
+                init_kwargs.update(components)
 
-        pipe = cls(**pipeline_components)
+        init_kwargs.update(passed_pipe_kwargs)
+        pipe = pipeline_class(**init_kwargs)
 
         if torch_dtype is not None:
             pipe.to(dtype=torch_dtype)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 52e196bd24d1..cbfd8ea0d40e 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Conversion script for the Stable Diffusion checkpoints."""
 
-import re
 from contextlib import nullcontext
 from io import BytesIO
 
@@ -188,30 +187,6 @@
     "cond_stage_model.model.text_projection",
 ]
 
-textenc_conversion_lst = [
-    ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
-    ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
-    ("ln_final.weight", "text_model.final_layer_norm.weight"),
-    ("ln_final.bias", "text_model.final_layer_norm.bias"),
-    ("text_projection", "text_projection.weight"),
-]
-textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
-
-textenc_transformer_conversion_lst = [
-    # (stable-diffusion, HF Diffusers)
-    ("resblocks.", "text_model.encoder.layers."),
-    ("ln_1", "layer_norm1"),
-    ("ln_2", "layer_norm2"),
-    (".c_fc.", ".fc1."),
-    (".c_proj.", ".fc2."),
-    (".attn", ".self_attn"),
-    ("ln_final.", "transformer.text_model.final_layer_norm."),
-    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
-    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
-]
-protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
-textenc_pattern = re.compile("|".join(protected.keys()))
-
 
 def fetch_original_config_file_from_url(class_name, checkpoint):
     if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
@@ -284,7 +259,7 @@ def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True)
     return checkpoint
 
 
-def infer_model_type(pipeline_class_name, original_config, model_type=None, **kwargs):
+def infer_model_type(original_config, model_type=None):
     if model_type is not None:
         return model_type
 
@@ -318,10 +293,12 @@ def get_default_scheduler_config():
     return SCHEDULER_DEFAULT_CONFIG
 
 
-def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs):
-    image_size = kwargs.get("image_size", 512)
+def set_image_size(pipeline_class_name, original_config, checkpoint, image_size=None, model_type=None):
+    if image_size:
+        return image_size
+
     global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
-    model_type = infer_model_type(pipeline_class_name, original_config, **kwargs)
+    model_type = infer_model_type(original_config, model_type)
 
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
         image_size = original_config["model"]["params"].unet_config.params.image_size
@@ -340,7 +317,9 @@ def determine_image_size(pipeline_class_name, original_config, checkpoint, **kwa
         image_size = 512 if global_step == 875000 else 768
         return image_size
 
-    return image_size
+    else:
+        image_size = 512
+        return image_size
 
 
 # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
@@ -526,41 +505,36 @@ def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint,
         new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key)
 
 
-def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, skip_extract_state_dict=False):
+def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
     """
 
-    if skip_extract_state_dict:
-        unet_state_dict = checkpoint
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+    unet_key = LDM_UNET_KEY
+
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        logger.warning("Checkpoint has both EMA and non-EMA weights.")
+        logger.warning(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
     else:
-        # extract state_dict for UNet
-        unet_state_dict = {}
-        keys = list(checkpoint.keys())
-
-        unet_key = LDM_UNET_KEY
-
-        # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
-        if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
-            logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        if sum(k.startswith("model_ema") for k in keys) > 100:
             logger.warning(
-                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
-                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
             )
-            for key in keys:
-                if key.startswith("model.diffusion_model"):
-                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
-                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
-        else:
-            if sum(k.startswith("model_ema") for k in keys) > 100:
-                logger.warning(
-                    "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
-                    " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
-                )
-
-            for key in keys:
-                if key.startswith(unet_key):
-                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+        for key in keys:
+            if key.startswith(unet_key):
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
 
     new_checkpoint = {}
     ldm_unet_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["layers"]
@@ -792,7 +766,10 @@ def convert_controlnet_checkpoint(
     return new_checkpoint
 
 
-def create_controlnet_model(pipeline_class_name, original_config, checkpoint, **kwargs):
+def create_diffusers_controlnet_model_from_ldm(
+    pipeline_class_name, original_config, checkpoint, upcast_attention=False, image_size=None
+):
+    # import here to avoid circular imports
     from ..models import ControlNetModel
 
     # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
@@ -800,8 +777,7 @@ def create_controlnet_model(pipeline_class_name, original_config, checkpoint, **
     while "state_dict" in checkpoint:
         checkpoint = checkpoint["state_dict"]
 
-    image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
-    upcast_attention = kwargs.get("upcast_attention", False)
+    image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size)
 
     diffusers_config = create_controlnet_diffusers_config(original_config, image_size=image_size)
     diffusers_config["upcast_attention"] = upcast_attention
@@ -953,7 +929,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     return new_checkpoint
 
 
-def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False):
+def create_text_encoder_from_ldm_clip_checkpoint(checkpoint, local_files_only=False):
     try:
         config = CLIPTextConfig.from_pretrained(LDM_CLIP_CONFIG_NAME, local_files_only=local_files_only)
     except Exception:
@@ -988,7 +964,7 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False):
     return text_model
 
 
-def convert_open_clip_checkpoint(
+def create_text_encoder_from_open_clip_checkpoint(
     checkpoint,
     config_name,
     prefix="cond_stage_model.model.",
@@ -1069,36 +1045,35 @@ def convert_open_clip_checkpoint(
     return text_model
 
 
-def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
-    if "num_in_channels" in kwargs:
-        num_in_channels = kwargs.get("num_in_channels")
-
-    elif pipeline_class_name in [
-        "StableDiffusionInpaintPipeline",
-        "StableDiffusionXLInpaintPipeline",
-        "StableDiffusionXLControlNetInpaintPipeline",
-    ]:
-        num_in_channels = 9
-
-    elif pipeline_class_name == "StableDiffusionUpscalePipeline":
-        num_in_channels = 7
-
-    else:
-        num_in_channels = 4
+def create_diffusers_unet_model_from_ldm(
+    pipeline_class_name,
+    original_config,
+    checkpoint,
+    num_in_channels=None,
+    upcast_attention=False,
+    extract_ema=False,
+    image_size=None,
+):
+    if num_in_channels is None:
+        if pipeline_class_name in [
+            "StableDiffusionInpaintPipeline",
+            "StableDiffusionXLInpaintPipeline",
+            "StableDiffusionXLControlNetInpaintPipeline",
+        ]:
+            num_in_channels = 9
 
-    image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
+        elif pipeline_class_name == "StableDiffusionUpscalePipeline":
+            num_in_channels = 7
 
-    upcast_attention = kwargs.get("upcast_attention", False)
-    extract_ema = kwargs.get("extract_ema", False)
+        else:
+            num_in_channels = 4
 
+    image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size)
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
     unet_config["in_channels"] = num_in_channels
     unet_config["upcast_attention"] = upcast_attention
 
-    path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else ""
-    diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(
-        checkpoint, unet_config, path=path, extract_ema=extract_ema
-    )
+    diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config, extract_ema=extract_ema)
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
     with ctx():
         unet = UNet2DConditionModel(**unet_config)
@@ -1112,10 +1087,16 @@ def create_unet_model(pipeline_class_name, original_config, checkpoint, checkpoi
     return {"unet": unet}
 
 
-def create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+def create_diffusers_vae_model_from_ldm(
+    pipeline_class_name,
+    original_config,
+    checkpoint,
+    image_size=None,
+):
+    # import here to avoid circular imports
     from ..models import AutoencoderKL
 
-    image_size = determine_image_size(pipeline_class_name, original_config, checkpoint, **kwargs)
+    image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size)
 
     vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
     diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
@@ -1133,18 +1114,20 @@ def create_vae_model(pipeline_class_name, original_config, checkpoint, checkpoin
     return {"vae": vae}
 
 
-def create_text_encoders_and_tokenizers(
-    pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs
+def create_text_encoders_and_tokenizers_from_ldm(
+    original_config,
+    checkpoint,
+    model_type=None,
+    local_files_only=False,
 ):
-    model_type = infer_model_type(pipeline_class_name, original_config)
-    local_files_only = kwargs.get("local_files_only", False)
+    model_type = infer_model_type(original_config, model_type=model_type)
 
     if model_type == "FrozenOpenCLIPEmbedder":
         config_name = "stabilityai/stable-diffusion-2"
         config_kwargs = {"subfolder": "text_encoder"}
 
         try:
-            text_encoder = convert_open_clip_checkpoint(
+            text_encoder = create_text_encoder_from_open_clip_checkpoint(
                 checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
             )
             tokenizer = CLIPTokenizer.from_pretrained(
@@ -1160,7 +1143,7 @@ def create_text_encoders_and_tokenizers(
     elif model_type == "FrozenCLIPEmbedder":
         try:
             config_name = "openai/clip-vit-large-patch14"
-            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
 
         except Exception:
@@ -1177,7 +1160,7 @@ def create_text_encoders_and_tokenizers(
 
         try:
             tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only)
-            text_encoder_2 = convert_open_clip_checkpoint(
+            text_encoder_2 = create_text_encoder_from_open_clip_checkpoint(
                 checkpoint,
                 config_name,
                 prefix=prefix,
@@ -1185,8 +1168,7 @@ def create_text_encoders_and_tokenizers(
                 local_files_only=local_files_only,
                 **config_kwargs,
             )
-        except Exception as e:
-            raise e
+        except Exception:
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'."
             )
@@ -1203,10 +1185,9 @@ def create_text_encoders_and_tokenizers(
         try:
             config_name = "openai/clip-vit-large-patch14"
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
-            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
 
-        except Exception as e:
-            raise e
+        except Exception:
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'."
             )
@@ -1216,7 +1197,7 @@ def create_text_encoders_and_tokenizers(
             config_kwargs = {"projection_dim": 1280}
             prefix = "conditioner.embedders.1.model."
             tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only)
-            text_encoder_2 = convert_open_clip_checkpoint(
+            text_encoder_2 = create_text_encoder_from_open_clip_checkpoint(
                 checkpoint,
                 config_name,
                 prefix=prefix,
@@ -1239,12 +1220,17 @@ def create_text_encoders_and_tokenizers(
     return
 
 
-def create_scheduler(pipeline_class_name, original_config, checkpoint, checkpoint_path_or_dict, **kwargs):
+def create_scheduler_from_ldm(
+    pipeline_class_name,
+    original_config,
+    checkpoint,
+    prediction_type=None,
+    scheduler_type="ddim",
+    model_type=None,
+):
     scheduler_config = get_default_scheduler_config()
-    model_type = infer_model_type(pipeline_class_name, original_config)
+    model_type = infer_model_type(original_config, model_type=model_type)
 
-    scheduler_type = kwargs.get("scheduler_type", "ddim")
-    prediction_type = kwargs.get("prediction_type", None)
     global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
 
     num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", None) or 1000

From a076513fd2762a52b5df7f194c6ba39ad7030b3f Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 18 Jan 2024 16:50:05 +0000
Subject: [PATCH 61/89] Unwrap nested state_dict; select checkpoint loader by file extension

---
 src/diffusers/loaders/single_file.py       | 3 +++
 src/diffusers/loaders/single_file_utils.py | 1 -
 src/diffusers/models/modeling_utils.py     | 8 +++++---
 src/diffusers/utils/__init__.py            | 1 +
 src/diffusers/utils/constants.py           | 1 +
 5 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 74f7a134337a..4bb7f330faf8 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -291,6 +291,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             )
             checkpoint = load_state_dict(checkpoint_path)
 
+        while "state_dict" in checkpoint:
+            checkpoint = checkpoint["state_dict"]
+
         original_config = fetch_original_config(class_name, checkpoint, original_config_file, config_files)
 
         if class_name == "AutoencoderKL":
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index cbfd8ea0d40e..bc3b28075286 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -509,7 +509,6 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
     """
-
     # extract state_dict for UNet
     unet_state_dict = {}
     keys = list(checkpoint.keys())
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 445c3ca71caf..90a700d9443f 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -32,6 +32,7 @@
 from ..utils import (
     CONFIG_NAME,
     FLAX_WEIGHTS_NAME,
+    SAFETENSORS_FILE_EXTENSION,
     SAFETENSORS_WEIGHTS_NAME,
     WEIGHTS_NAME,
     _add_variant,
@@ -102,10 +103,11 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
     Reads a checkpoint file, returning properly formatted errors if they arise.
     """
     try:
-        if os.path.basename(checkpoint_file) == _add_variant(WEIGHTS_NAME, variant):
-            return torch.load(checkpoint_file, map_location="cpu")
-        else:
+        file_extension = os.path.basename(checkpoint_file).split(".")[-1]
+        if file_extension == SAFETENSORS_FILE_EXTENSION:
             return safetensors.torch.load_file(checkpoint_file, device="cpu")
+        else:
+            return torch.load(checkpoint_file, map_location="cpu")
     except Exception as e:
         try:
             with open(checkpoint_file) as f:
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index adc1cb8a3dfb..54fde3e1f714 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -28,6 +28,7 @@
     MIN_PEFT_VERSION,
     ONNX_EXTERNAL_WEIGHTS_NAME,
     ONNX_WEIGHTS_NAME,
+    SAFETENSORS_FILE_EXTENSION,
     SAFETENSORS_WEIGHTS_NAME,
     USE_PEFT_BACKEND,
     WEIGHTS_NAME,
diff --git a/src/diffusers/utils/constants.py b/src/diffusers/utils/constants.py
index 8850da073e95..a397e8cf86d3 100644
--- a/src/diffusers/utils/constants.py
+++ b/src/diffusers/utils/constants.py
@@ -31,6 +31,7 @@
 FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
 ONNX_WEIGHTS_NAME = "model.onnx"
 SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors"
+SAFETENSORS_FILE_EXTENSION = "safetensors"
 ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
 HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
 DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"

From db3eb06ae4a968fcbc4d334583ddb6b52e6cd6d3 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 04:06:15 +0000
Subject: [PATCH 62/89] Use dict access for upscale image_size; drop controlnet state_dict unwrap

---
 src/diffusers/loaders/single_file_utils.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index bc3b28075286..390927f179d0 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -301,7 +301,7 @@ def set_image_size(pipeline_class_name, original_config, checkpoint, image_size=
     model_type = infer_model_type(original_config, model_type)
 
     if pipeline_class_name == "StableDiffusionUpscalePipeline":
-        image_size = original_config["model"]["params"].unet_config.params.image_size
+        image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"]
         return image_size
 
     elif model_type in ["SDXL", "SDXL-Refiner"]:
@@ -771,11 +771,6 @@ def create_diffusers_controlnet_model_from_ldm(
     # import here to avoid circular imports
     from ..models import ControlNetModel
 
-    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
-    # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
-    while "state_dict" in checkpoint:
-        checkpoint = checkpoint["state_dict"]
-
     image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size)
 
     diffusers_config = create_controlnet_diffusers_config(original_config, image_size=image_size)
@@ -1033,8 +1028,11 @@ def create_text_encoder_from_open_clip_checkpoint(
             text_model_dict[diffusers_key] = checkpoint[key]
 
     if is_accelerate_available():
-        for param_name, param in text_model_dict.items():
-            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+        try:
+            for param_name, param in text_model_dict.items():
+                set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+        except Exception as e:
+            raise e
     else:
         if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
             text_model_dict.pop("text_model.embeddings.position_ids", None)
@@ -1127,12 +1125,13 @@ def create_text_encoders_and_tokenizers_from_ldm(
 
         try:
             text_encoder = create_text_encoder_from_open_clip_checkpoint(
-                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
+                checkpoint, config_name, has_projection=True, local_files_only=local_files_only, **config_kwargs
             )
             tokenizer = CLIPTokenizer.from_pretrained(
                 config_name, subfolder="tokenizer", local_files_only=local_files_only
             )
-        except Exception:
+        except Exception as e:
+            raise e
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder in the following path: '{config_name}'."
             )

From 9b42fbfdbee2336ed2be4463eb708f74b072dcbf Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 04:27:37 +0000
Subject: [PATCH 63/89] Skip ignored open_clip keys in layer mapping; drop redundant try/except

---
 src/diffusers/loaders/single_file_utils.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 390927f179d0..4a0b10ffd160 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -984,19 +984,21 @@ def create_text_encoder_from_open_clip_checkpoint(
     )
     text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
 
+    keys = list(checkpoint.keys())
+    keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE
+
     openclip_diffusers_ldm_map = DIFFUSERS_TO_LDM_MAPPING["openclip"]["layers"]
     for diffusers_key, ldm_key in openclip_diffusers_ldm_map.items():
         ldm_key = prefix + ldm_key
         if ldm_key not in checkpoint:
             continue
+        if ldm_key in keys_to_ignore:
+            continue
         if ldm_key.endswith("text_projection"):
             text_model_dict[diffusers_key] = checkpoint[ldm_key].T.contiguous()
         else:
             text_model_dict[diffusers_key] = checkpoint[ldm_key]
 
-    keys = list(checkpoint.keys())
-    keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE
-
     for key in keys:
         if key in keys_to_ignore:
             continue
@@ -1028,11 +1030,9 @@ def create_text_encoder_from_open_clip_checkpoint(
             text_model_dict[diffusers_key] = checkpoint[key]
 
     if is_accelerate_available():
-        try:
-            for param_name, param in text_model_dict.items():
-                set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
-        except Exception as e:
-            raise e
+        for param_name, param in text_model_dict.items():
+            set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+
     else:
         if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
             text_model_dict.pop("text_model.embeddings.position_ids", None)
@@ -1125,7 +1125,7 @@ def create_text_encoders_and_tokenizers_from_ldm(
 
         try:
             text_encoder = create_text_encoder_from_open_clip_checkpoint(
-                checkpoint, config_name, has_projection=True, local_files_only=local_files_only, **config_kwargs
+                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
             )
             tokenizer = CLIPTokenizer.from_pretrained(
                 config_name, subfolder="tokenizer", local_files_only=local_files_only

From ffde1235fce6efbec98fe924659381a7324311ed Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 05:44:01 +0000
Subject: [PATCH 64/89] Remove is_omegaconf_available import from loaders/autoencoder.py

---
 src/diffusers/loaders/autoencoder.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
index 4f1df1310ec2..94240c0f4bcc 100644
--- a/src/diffusers/loaders/autoencoder.py
+++ b/src/diffusers/loaders/autoencoder.py
@@ -9,7 +9,6 @@
 
 from ..utils import (
     is_accelerate_available,
-    is_omegaconf_available,
     is_transformers_available,
     logging,
 )

From fd2ec363692f246cc331c6e6ab53dccf772e21cf Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 06:13:20 +0000
Subject: [PATCH 65/89] update

---
 src/diffusers/loaders/single_file.py          | 38 +++----------------
 .../pipeline_stable_diffusion_xl_img2img.py   |  1 -
 2 files changed, 6 insertions(+), 33 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 4bb7f330faf8..1896440b6cd0 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -55,31 +55,6 @@
     "StableDiffusionXLControlNetImg2ImgPipeline",
 ]
 
-LOADABLE_CLASSES = {
-    "diffusers": {
-        "ControlNetModel": "create_controlnet_model",
-        "AutoencoderKL": "create_vae_model",
-        "UNet2DConditionModel": "create_unet_model",
-    }
-}
-
-
-def extract_pipeline_component_names(pipeline_class):
-    components = inspect.signature(pipeline_class).parameters.keys()
-    return components
-
-
-def check_valid_url(pretrained_model_link_or_path):
-    # check if url prefix is valid
-    # remove huggingface url prefix from model path
-    has_valid_url_prefix = False
-    for prefix in VALID_URL_PREFIXES:
-        if pretrained_model_link_or_path.startswith(prefix):
-            pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-            has_valid_url_prefix = True
-
-    return has_valid_url_prefix, pretrained_model_link_or_path
-
 
 def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
     pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)"
@@ -174,8 +149,9 @@ def set_additional_components(
     **kwargs,
 ):
     components = {}
+    model_type = kwargs.get("model_type", None)
     if pipeline_class_name in REFINER_PIPELINES:
-        model_type = infer_model_type(pipeline_class_name, original_config)
+        model_type = infer_model_type(original_config, model_type=model_type)
         is_refiner = model_type == "SDXL-Refiner"
         components.update(
             {
@@ -206,8 +182,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                       `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
                     - A path to a *file* containing all pipeline weights.
             torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
+                Override the default `torch.dtype` and load the model with another dtype.
             force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
@@ -291,6 +266,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             )
             checkpoint = load_state_dict(checkpoint_path)
 
+        # some checkpoints contain the model state dict under a "state_dict" key
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
@@ -340,11 +316,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                     continue
                 init_kwargs.update(components)
 
-        additional_components = set(optional_kwargs - init_kwargs.keys())
+        additional_components = set_additional_components(class_name, original_config, **kwargs)
         if additional_components:
-            components = set_additional_components(class_name, original_config, **kwargs)
-            if components:
-                init_kwargs.update(components)
+            init_kwargs.update(additional_components)
 
         init_kwargs.update(passed_pipe_kwargs)
         pipe = pipeline_class(**init_kwargs)
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index 1c22affba1aa..4e95a876ce7a 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -792,7 +792,6 @@ def _get_add_time_ids(
             self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
         )
         expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
-
         if (
             expected_add_embed_dim > passed_add_embed_dim
             and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim

From aee8b5f5d588c2fbec55445e28930596ed2fc20b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 06:24:35 +0000
Subject: [PATCH 66/89] update

---
 src/diffusers/loaders/single_file_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 4a0b10ffd160..a35ec5b390d5 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -687,7 +687,7 @@ def convert_controlnet_checkpoint(
                 controlnet_state_dict[key.replace(controlnet_key, "")] = checkpoint.pop(key)
 
     new_checkpoint = {}
-    ldm_controlnet_keys = DIFFUSERS_TO_LDM_MAPPING["controlnet"]
+    ldm_controlnet_keys = DIFFUSERS_TO_LDM_MAPPING["controlnet"]["layers"]
     for diffusers_key, ldm_key in ldm_controlnet_keys.items():
         if ldm_key not in controlnet_state_dict:
             continue

From 2fb9baf934629db718f00d4eaeeddd9dac80dfdd Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 07:24:19 +0000
Subject: [PATCH 67/89] update

---
 src/diffusers/loaders/autoencoder.py          | 222 ------------------
 src/diffusers/loaders/controlnet.py           | 167 -------------
 src/diffusers/loaders/single_file.py          |   3 +-
 src/diffusers/loaders/single_file_utils.py    |  34 +--
 .../models/autoencoders/autoencoder_kl.py     |   4 +-
 5 files changed, 16 insertions(+), 414 deletions(-)
 delete mode 100644 src/diffusers/loaders/autoencoder.py
 delete mode 100644 src/diffusers/loaders/controlnet.py

diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
deleted file mode 100644
index 94240c0f4bcc..000000000000
--- a/src/diffusers/loaders/autoencoder.py
+++ /dev/null
@@ -1,222 +0,0 @@
-from contextlib import nullcontext
-from io import BytesIO
-from pathlib import Path
-
-import requests
-import torch
-from huggingface_hub import hf_hub_download
-from huggingface_hub.utils import validate_hf_hub_args
-
-from ..utils import (
-    is_accelerate_available,
-    is_transformers_available,
-    logging,
-)
-from ..utils.import_utils import BACKENDS_MAPPING
-
-
-if is_transformers_available():
-    pass
-
-if is_accelerate_available():
-    from accelerate import init_empty_weights
-
-logger = logging.get_logger(__name__)
-
-
-class FromOriginalVAEMixin:
-    """
-    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
-    """
-
-    @classmethod
-    @validate_hf_hub_args
-    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
-        r"""
-        Instantiate a [`AutoencoderKL`] from pretrained ControlNet weights saved in the original `.ckpt` or
-        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
-
-        Parameters:
-            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-                    - A link to the `.ckpt` file (for example
-                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
-                    - A path to a *file* containing all pipeline weights.
-            torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to True, the model
-                won't be downloaded from the Hub.
-            token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
-                `diffusers-cli login` (stored in `~/.huggingface`) is used.
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
-                allowed by Git.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            use_safetensors (`bool`, *optional*, defaults to `None`):
-                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
-                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
-                weights. If set to `False`, safetensors weights are not loaded.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
-            scaling_factor (`float`, *optional*, defaults to 0.18215):
-                The component-wise standard deviation of the trained latent space computed using the first batch of the
-                training set. This is used to scale the latent space to have unit variance when training the diffusion
-                model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
-                diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z
-                = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution
-                Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
-            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load and saveable variables (for example the pipeline components of the
-                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
-                method. See example below for more information.
-
-        <Tip warning={true}>
-
-            Make sure to pass both `image_size` and `scaling_factor` to `from_single_file()` if you're loading
-            a VAE from SDXL or a Stable Diffusion v2 model or higher.
-
-        </Tip>
-
-        Examples:
-
-        ```py
-        from diffusers import AutoencoderKL
-
-        url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"  # can also be local file
-        model = AutoencoderKL.from_single_file(url)
-        ```
-        """
-        if not is_omegaconf_available():
-            raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-        from omegaconf import OmegaConf
-
-        from ..models import AutoencoderKL
-
-        # import here to avoid circular dependency
-        from ..pipelines.stable_diffusion.convert_from_ckpt import (
-            convert_ldm_vae_checkpoint,
-            create_vae_diffusers_config,
-        )
-
-        config_file = kwargs.pop("config_file", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", None)
-        token = kwargs.pop("token", None)
-        revision = kwargs.pop("revision", None)
-        image_size = kwargs.pop("image_size", None)
-        scaling_factor = kwargs.pop("scaling_factor", None)
-        kwargs.pop("upcast_attention", None)
-
-        torch_dtype = kwargs.pop("torch_dtype", None)
-
-        use_safetensors = kwargs.pop("use_safetensors", None)
-
-        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
-        from_safetensors = file_extension == "safetensors"
-
-        if from_safetensors and use_safetensors is False:
-            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
-
-        # remove huggingface url
-        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
-            if pretrained_model_link_or_path.startswith(prefix):
-                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-
-        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
-        ckpt_path = Path(pretrained_model_link_or_path)
-        if not ckpt_path.is_file():
-            # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id = "/".join(ckpt_path.parts[:2])
-            file_path = "/".join(ckpt_path.parts[2:])
-
-            if file_path.startswith("blob/"):
-                file_path = file_path[len("blob/") :]
-
-            if file_path.startswith("main/"):
-                file_path = file_path[len("main/") :]
-
-            pretrained_model_link_or_path = hf_hub_download(
-                repo_id,
-                filename=file_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                force_download=force_download,
-            )
-
-        if from_safetensors:
-            from safetensors import safe_open
-
-            checkpoint = {}
-            with safe_open(pretrained_model_link_or_path, framework="pt", device="cpu") as f:
-                for key in f.keys():
-                    checkpoint[key] = f.get_tensor(key)
-        else:
-            checkpoint = torch.load(pretrained_model_link_or_path, map_location="cpu")
-
-        if "state_dict" in checkpoint:
-            checkpoint = checkpoint["state_dict"]
-
-        if config_file is None:
-            config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
-            config_file = BytesIO(requests.get(config_url).content)
-
-        original_config = OmegaConf.load(config_file)
-
-        # default to sd-v1-5
-        image_size = image_size or 512
-
-        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-        if scaling_factor is None:
-            if (
-                "model" in original_config
-                and "params" in original_config.model
-                and "scale_factor" in original_config.model.params
-            ):
-                vae_scaling_factor = original_config.model.params.scale_factor
-            else:
-                vae_scaling_factor = 0.18215  # default SD scaling factor
-
-        vae_config["scaling_factor"] = vae_scaling_factor
-
-        ctx = init_empty_weights if is_accelerate_available() else nullcontext
-        with ctx():
-            vae = AutoencoderKL(**vae_config)
-
-        if is_accelerate_available():
-            from ..models.modeling_utils import load_model_dict_into_meta
-
-            load_model_dict_into_meta(vae, converted_vae_checkpoint, device="cpu")
-        else:
-            vae.load_state_dict(converted_vae_checkpoint)
-
-        if torch_dtype is not None:
-            vae.to(dtype=torch_dtype)
-
-        return vae
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
deleted file mode 100644
index 4f709d75be71..000000000000
--- a/src/diffusers/loaders/controlnet.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from io import BytesIO
-from pathlib import Path
-
-import requests
-from huggingface_hub import hf_hub_download
-from huggingface_hub.utils import validate_hf_hub_args
-
-
-class FromOriginalControlnetMixin:
-    """
-    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
-    """
-
-    @classmethod
-    @validate_hf_hub_args
-    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
-        r"""
-        Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or
-        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
-
-        Parameters:
-            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-                    - A link to the `.ckpt` file (for example
-                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
-                    - A path to a *file* containing all pipeline weights.
-            torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to True, the model
-                won't be downloaded from the Hub.
-            token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
-                `diffusers-cli login` (stored in `~/.huggingface`) is used.
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
-                allowed by Git.
-            use_safetensors (`bool`, *optional*, defaults to `None`):
-                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
-                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
-                weights. If set to `False`, safetensors weights are not loaded.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
-            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load and saveable variables (for example the pipeline components of the
-                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
-                method. See example below for more information.
-
-        Examples:
-
-        ```py
-        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
-
-        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
-        model = ControlNetModel.from_single_file(url)
-
-        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
-        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
-        ```
-        """
-        # import here to avoid circular dependency
-        from ..pipelines.stable_diffusion.convert_from_ckpt import download_controlnet_from_original_ckpt
-
-        config_file = kwargs.pop("config_file", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", None)
-        token = kwargs.pop("token", None)
-        num_in_channels = kwargs.pop("num_in_channels", None)
-        use_linear_projection = kwargs.pop("use_linear_projection", None)
-        revision = kwargs.pop("revision", None)
-        extract_ema = kwargs.pop("extract_ema", False)
-        image_size = kwargs.pop("image_size", None)
-        upcast_attention = kwargs.pop("upcast_attention", None)
-
-        torch_dtype = kwargs.pop("torch_dtype", None)
-
-        use_safetensors = kwargs.pop("use_safetensors", None)
-
-        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
-        from_safetensors = file_extension == "safetensors"
-
-        if from_safetensors and use_safetensors is False:
-            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
-
-        # remove huggingface url
-        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
-            if pretrained_model_link_or_path.startswith(prefix):
-                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-
-        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
-        ckpt_path = Path(pretrained_model_link_or_path)
-        if not ckpt_path.is_file():
-            # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id = "/".join(ckpt_path.parts[:2])
-            file_path = "/".join(ckpt_path.parts[2:])
-
-            if file_path.startswith("blob/"):
-                file_path = file_path[len("blob/") :]
-
-            if file_path.startswith("main/"):
-                file_path = file_path[len("main/") :]
-
-            pretrained_model_link_or_path = hf_hub_download(
-                repo_id,
-                filename=file_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                force_download=force_download,
-            )
-
-        if config_file is None:
-            config_url = "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml"
-            config_file = BytesIO(requests.get(config_url).content)
-
-        image_size = image_size or 512
-
-        controlnet = download_controlnet_from_original_ckpt(
-            pretrained_model_link_or_path,
-            original_config_file=config_file,
-            image_size=image_size,
-            extract_ema=extract_ema,
-            num_in_channels=num_in_channels,
-            upcast_attention=upcast_attention,
-            from_safetensors=from_safetensors,
-            use_linear_projection=use_linear_projection,
-        )
-
-        if torch_dtype is not None:
-            controlnet.to(dtype=torch_dtype)
-
-        return controlnet
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 1896440b6cd0..d07b58bac5f6 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -231,7 +231,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         ```
         """
         original_config_file = kwargs.pop("original_config_file", None)
-        config_files = kwargs.pop("config_files", None)
         resume_download = kwargs.pop("resume_download", False)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
@@ -270,7 +269,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
-        original_config = fetch_original_config(class_name, checkpoint, original_config_file, config_files)
+        original_config = fetch_original_config(class_name, checkpoint, original_config_file)
 
         if class_name == "AutoencoderKL":
             image_size = kwargs.pop("image_size", None)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index a35ec5b390d5..3ab0aa716f21 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 """ Conversion script for the Stable Diffusion checkpoints."""
 
+import os
+import re
 from contextlib import nullcontext
 from io import BytesIO
 
@@ -188,7 +190,7 @@
 ]
 
 
-def fetch_original_config_file_from_url(class_name, checkpoint):
+def infer_original_config_file(class_name, checkpoint):
     if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
         config_url = CONFIG_URLS["v2"]
 
@@ -212,30 +214,20 @@ def fetch_original_config_file_from_url(class_name, checkpoint):
     return original_config_file
 
 
-def fetch_original_config_file_from_file(config_files: list):
-    if "v2" in config_files:
-        return config_files["v2"]
+def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None):
 
-    elif "xl" in config_files:
-        return config_files["xl"]
+    def is_valid_url(url):
+        pattern = r'^(http|https):\/\/([\w.-]+)(\.[\w.-]+)+([\/\w\.-]*)*\/?$'
+        return bool(re.match(pattern, url))
 
-    elif "xl_refiner" in config_files:
-        return config_files["xl_refiner"]
+    if os.path.isfile(original_config_file):
+        with open(original_config_file, "r") as fp:
+            original_config_file = fp.read()
 
+    elif is_valid_url(original_config_file):
+        original_config_file = BytesIO(requests.get(original_config_file).content)
     else:
-        return config_files["v1"]
-
-
-def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None, config_files=None):
-    if original_config_file:
-        original_config = yaml.safe_load(original_config_file)
-        return original_config
-
-    elif config_files:
-        original_config_file = fetch_original_config_file_from_file(config_files)
-
-    else:
-        original_config_file = fetch_original_config_file_from_url(pipeline_class_name, checkpoint)
+        original_config_file = infer_original_config_file(pipeline_class_name, checkpoint)
 
     original_config = yaml.safe_load(original_config_file)
 
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 10a3ae58de9f..92d12a220f76 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -17,7 +17,7 @@
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromOriginalVAEMixin
+from ...loaders import FromSingleFileMixin
 from ...utils.accelerate_utils import apply_forward_hook
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -32,7 +32,7 @@
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
-class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
+class AutoencoderKL(ModelMixin, ConfigMixin, FromSingleFileMixin):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
 

From bb8d317c7ed7a04f7fd35cfd3ee285e60cb873ad Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 07:31:16 +0000
Subject: [PATCH 68/89] clean

---
 src/diffusers/loaders/__init__.py                              | 3 ---
 .../models/autoencoders/autoencoder_kl_temporal_decoder.py     | 3 +--
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py
index b917cf528432..675246e408fa 100644
--- a/src/diffusers/loaders/__init__.py
+++ b/src/diffusers/loaders/__init__.py
@@ -54,8 +54,6 @@ def text_encoder_attn_modules(text_encoder):
 _import_structure = {}
 
 if is_torch_available():
-    _import_structure["autoencoder"] = ["FromOriginalVAEMixin"]
-    _import_structure["controlnet"] = ["FromOriginalControlnetMixin"]
     _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
     _import_structure["utils"] = ["AttnProcsLayers"]
 
@@ -70,7 +68,6 @@ def text_encoder_attn_modules(text_encoder):
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     if is_torch_available():
-        from .single_file import FromOriginalControlnetMixin, FromOriginalVAEMixin
         from .unet import UNet2DConditionLoadersMixin
         from .utils import AttnProcsLayers
 
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index dbafb4571d4a..5a4259577600 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -17,7 +17,6 @@
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromOriginalVAEMixin
 from ...utils import is_torch_version
 from ...utils.accelerate_utils import apply_forward_hook
 from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
@@ -162,7 +161,7 @@ def custom_forward(*inputs):
         return sample
 
 
-class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
+class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
 

From 480a4b4ae42aecef198a9a30fdc793e059b565fa Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 07:34:49 +0000
Subject: [PATCH 69/89] update

---
 src/diffusers/loaders/single_file.py                           | 1 -
 src/diffusers/loaders/single_file_utils.py                     | 3 +--
 .../pipeline_stable_diffusion_xl_img2img.py                    | 1 +
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index d07b58bac5f6..1e4678cfbee0 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import inspect
 import os
 import re
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 3ab0aa716f21..8002f1d9efd1 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -215,9 +215,8 @@ def infer_original_config_file(class_name, checkpoint):
 
 
 def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None):
-
     def is_valid_url(url):
-        pattern = r'^(http|https):\/\/([\w.-]+)(\.[\w.-]+)+([\/\w\.-]*)*\/?$'
+        pattern = r"^(http|https):\/\/([\w.-]+)(\.[\w.-]+)+([\/\w\.-]*)*\/?$"
         return bool(re.match(pattern, url))
 
     if os.path.isfile(original_config_file):
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index 4e95a876ce7a..1c22affba1aa 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -792,6 +792,7 @@ def _get_add_time_ids(
             self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
         )
         expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
         if (
             expected_add_embed_dim > passed_add_embed_dim
             and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim

From 2483d516abfa3b535fb95f8524cd16451e7f6cc5 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 07:42:12 +0000
Subject: [PATCH 70/89] update

---
 src/diffusers/loaders/single_file_utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 8002f1d9efd1..50ceb4b8cb19 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -219,14 +219,18 @@ def is_valid_url(url):
         pattern = r"^(http|https):\/\/([\w.-]+)(\.[\w.-]+)+([\/\w\.-]*)*\/?$"
         return bool(re.match(pattern, url))
 
-    if os.path.isfile(original_config_file):
+    if original_config_file is None:
+        original_config_file = infer_original_config_file(pipeline_class_name, checkpoint)
+
+    elif os.path.isfile(original_config_file):
         with open(original_config_file, "r") as fp:
             original_config_file = fp.read()
 
     elif is_valid_url(original_config_file):
         original_config_file = BytesIO(requests.get(original_config_file).content)
+
     else:
-        original_config_file = infer_original_config_file(pipeline_class_name, checkpoint)
+        raise ValueError("Invalid `original_config_file` provided. Please set it to a valid file path or URL.")
 
     original_config = yaml.safe_load(original_config_file)
 

From dab7f014a8a4cae1c6afea990ea302904b909adf Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 07:47:50 +0000
Subject: [PATCH 71/89] clean up

---
 src/diffusers/loaders/single_file_utils.py | 24 +++-------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 50ceb4b8cb19..7d95f97b8b3d 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -169,7 +169,6 @@
 LDM_VAE_KEY = "first_stage_model."
 LDM_UNET_KEY = "model.diffusion_model."
 LDM_CONTROLNET_KEY = "control_model."
-LDM_CLIP_CONFIG_NAME = "openai/clip-vit-large-patch14"
 LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
 LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024
 
@@ -237,23 +236,6 @@ def is_valid_url(url):
     return original_config
 
 
-def load_checkpoint(checkpoint_path_or_dict, device=None, from_safetensors=True):
-    if device is None:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    if isinstance(checkpoint_path_or_dict, str):
-        if from_safetensors:
-            checkpoint = safe_load(checkpoint_path_or_dict, device="cpu")
-
-        else:
-            checkpoint = torch.load(checkpoint_path_or_dict, map_location=device)
-
-    elif isinstance(checkpoint_path_or_dict, dict):
-        checkpoint = checkpoint_path_or_dict
-
-    return checkpoint
-
-
 def infer_model_type(original_config, model_type=None):
     if model_type is not None:
         return model_type
@@ -918,9 +900,9 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     return new_checkpoint
 
 
-def create_text_encoder_from_ldm_clip_checkpoint(checkpoint, local_files_only=False):
+def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=False):
     try:
-        config = CLIPTextConfig.from_pretrained(LDM_CLIP_CONFIG_NAME, local_files_only=local_files_only)
+        config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
     except Exception:
         raise ValueError(
             f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'."
@@ -1178,7 +1160,7 @@ def create_text_encoders_and_tokenizers_from_ldm(
         try:
             config_name = "openai/clip-vit-large-patch14"
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
-            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=local_files_only)
 
         except Exception:
             raise ValueError(

From 68ddb2559e616656301858d441a523ebd64a710f Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 08:16:52 +0000
Subject: [PATCH 72/89] clean up

---
 src/diffusers/loaders/single_file_utils.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 7d95f97b8b3d..55b438b0363e 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -936,8 +936,8 @@ def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_
 
 
 def create_text_encoder_from_open_clip_checkpoint(
-    checkpoint,
     config_name,
+    checkpoint,
     prefix="cond_stage_model.model.",
     has_projection=False,
     local_files_only=False,
@@ -1102,13 +1102,12 @@ def create_text_encoders_and_tokenizers_from_ldm(
 
         try:
             text_encoder = create_text_encoder_from_open_clip_checkpoint(
-                checkpoint, config_name, local_files_only=local_files_only, **config_kwargs
+                config_name, checkpoint, local_files_only=local_files_only, **config_kwargs
             )
             tokenizer = CLIPTokenizer.from_pretrained(
                 config_name, subfolder="tokenizer", local_files_only=local_files_only
             )
-        except Exception as e:
-            raise e
+        except Exception:
             raise ValueError(
                 f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder in the following path: '{config_name}'."
             )
@@ -1118,7 +1117,7 @@ def create_text_encoders_and_tokenizers_from_ldm(
     elif model_type == "FrozenCLIPEmbedder":
         try:
             config_name = "openai/clip-vit-large-patch14"
-            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
+            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=local_files_only)
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
 
         except Exception:
@@ -1136,8 +1135,8 @@ def create_text_encoders_and_tokenizers_from_ldm(
         try:
             tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only)
             text_encoder_2 = create_text_encoder_from_open_clip_checkpoint(
-                checkpoint,
                 config_name,
+                checkpoint,
                 prefix=prefix,
                 has_projection=True,
                 local_files_only=local_files_only,
@@ -1173,8 +1172,8 @@ def create_text_encoders_and_tokenizers_from_ldm(
             prefix = "conditioner.embedders.1.model."
             tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only)
             text_encoder_2 = create_text_encoder_from_open_clip_checkpoint(
-                checkpoint,
                 config_name,
+                checkpoint,
                 prefix=prefix,
                 has_projection=True,
                 local_files_only=local_files_only,

From 7395283f23a66b634d0d316fdceadf17ed5f4f97 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 08:28:10 +0000
Subject: [PATCH 73/89] update

---
 src/diffusers/loaders/single_file_utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 55b438b0363e..4dc4c710133c 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -20,9 +20,7 @@
 from io import BytesIO
 
 import requests
-import torch
 import yaml
-from safetensors.torch import load_file as safe_load
 from transformers import (
     CLIPTextConfig,
     CLIPTextModel,
@@ -1117,7 +1115,9 @@ def create_text_encoders_and_tokenizers_from_ldm(
     elif model_type == "FrozenCLIPEmbedder":
         try:
             config_name = "openai/clip-vit-large-patch14"
-            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=local_files_only)
+            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(
+                config_name, checkpoint, local_files_only=local_files_only
+            )
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
 
         except Exception:
@@ -1159,7 +1159,9 @@ def create_text_encoders_and_tokenizers_from_ldm(
         try:
             config_name = "openai/clip-vit-large-patch14"
             tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
-            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=local_files_only)
+            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(
+                config_name, checkpoint, local_files_only=local_files_only
+            )
 
         except Exception:
             raise ValueError(

From 153e746b705ab3554292e7a164de6bb787eef1ed Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 08:43:21 +0000
Subject: [PATCH 74/89] clean

---
 src/diffusers/loaders/single_file.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 1e4678cfbee0..21670385af10 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -18,7 +18,6 @@
 from transformers import AutoFeatureExtractor
 
 from ..models.modeling_utils import load_state_dict
-from ..pipelines.pipeline_utils import _get_pipeline_class
 from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..utils import (
     is_accelerate_available,
@@ -286,6 +285,8 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             )
             return component["controlnet"]
 
+        from ..pipelines.pipeline_utils import _get_pipeline_class
+
         pipeline_class = _get_pipeline_class(
             cls,
             config=None,

From a371c3b3827e2083beb4f6814ab0b93e831f4854 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 08:54:51 +0000
Subject: [PATCH 75/89] clean

---
 src/diffusers/loaders/single_file_utils.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 4dc4c710133c..475b7d3819fc 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -15,9 +15,9 @@
 """ Conversion script for the Stable Diffusion checkpoints."""
 
 import os
-import re
 from contextlib import nullcontext
 from io import BytesIO
+from urllib.parse import urlparse
 
 import requests
 import yaml
@@ -213,8 +213,11 @@ def infer_original_config_file(class_name, checkpoint):
 
 def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None):
     def is_valid_url(url):
-        pattern = r"^(http|https):\/\/([\w.-]+)(\.[\w.-]+)+([\/\w\.-]*)*\/?$"
-        return bool(re.match(pattern, url))
+        result = urlparse(url)
+        if result.scheme and result.netloc:
+            return True
+
+        return False
 
     if original_config_file is None:
         original_config_file = infer_original_config_file(pipeline_class_name, checkpoint)

From ba66fb81a0c8db48fed7abe833409f447b95708b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 09:08:23 +0000
Subject: [PATCH 76/89] update

---
 src/diffusers/loaders/single_file.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 21670385af10..d23b2b9e87c1 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -18,10 +18,7 @@
 from transformers import AutoFeatureExtractor
 
 from ..models.modeling_utils import load_state_dict
-from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ..utils import (
-    is_accelerate_available,
-    is_transformers_available,
     logging,
 )
 from ..utils.hub_utils import _get_model_file
@@ -36,12 +33,6 @@
 )
 
 
-if is_transformers_available():
-    pass
-
-if is_accelerate_available():
-    pass
-
 logger = logging.get_logger(__name__)
 
 
@@ -126,6 +117,8 @@ def build_sub_model_components(
 
     if component_name == "safety_checker":
         if load_safety_checker:
+            from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+
             safety_checker = StableDiffusionSafetyChecker.from_pretrained(
                 "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
             )

From b65861800e1cdc71ef8e666df56ef87ffbb11d86 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 09:52:08 +0000
Subject: [PATCH 77/89] update

---
 src/diffusers/loaders/__init__.py             |   4 +
 src/diffusers/loaders/autoencoder.py          | 123 +++++++++++++++++
 src/diffusers/loaders/controlnet.py           | 127 ++++++++++++++++++
 src/diffusers/loaders/single_file.py          |  89 +++---------
 src/diffusers/loaders/single_file_utils.py    |  87 ++++++++++--
 .../models/autoencoders/autoencoder_kl.py     |   4 +-
 src/diffusers/models/controlnet.py            |   4 +-
 7 files changed, 354 insertions(+), 84 deletions(-)
 create mode 100644 src/diffusers/loaders/autoencoder.py
 create mode 100644 src/diffusers/loaders/controlnet.py

diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py
index 675246e408fa..58e425359e6b 100644
--- a/src/diffusers/loaders/__init__.py
+++ b/src/diffusers/loaders/__init__.py
@@ -56,6 +56,8 @@ def text_encoder_attn_modules(text_encoder):
 if is_torch_available():
     _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
     _import_structure["utils"] = ["AttnProcsLayers"]
+    _import_structure["controlnet"] = ["FromOriginalControlnetMixin"]
+    _import_structure["autoencoder"] = ["FromOriginalVAEMixin"]
 
     if is_transformers_available():
         _import_structure["single_file"] = ["FromSingleFileMixin"]
@@ -68,6 +70,8 @@ def text_encoder_attn_modules(text_encoder):
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     if is_torch_available():
+        from .autoencoder import FromOriginalVAEMixin
+        from .controlnet import FromOriginalControlnetMixin
         from .unet import UNet2DConditionLoadersMixin
         from .utils import AttnProcsLayers
 
diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
new file mode 100644
index 000000000000..8936d4f0bea3
--- /dev/null
+++ b/src/diffusers/loaders/autoencoder.py
@@ -0,0 +1,123 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from huggingface_hub.utils import validate_hf_hub_args
+
+from .single_file_utils import (
+    create_diffusers_vae_model_from_ldm,
+    fetch_ldm_config_and_checkpoint,
+)
+
+
+class FromOriginalVAEMixin:
+    """
+    Load pretrained AutoencoderKL weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
+    """
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
+        r"""
+        Instantiate an [`AutoencoderKL`] from pretrained VAE weights saved in the original `.ckpt` or
+        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+                    - A link to the `.ckpt` file (for example
+                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
+                    - A path to a *file* containing all pipeline weights.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to True, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            image_size (`int`, *optional*, defaults to 512):
+                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
+                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
+            upcast_attention (`bool`, *optional*, defaults to `None`):
+                Whether the attention computation should always be upcasted.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (for example the pipeline components of the
+                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
+                method. See example below for more information.
+
+        Examples:
+
+        ```py
+        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+
+        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
+        model = ControlNetModel.from_single_file(url)
+
+        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
+        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
+        ```
+        """
+        original_config_file = kwargs.pop("original_config_file", None)
+        resume_download = kwargs.pop("resume_download", False)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        use_safetensors = kwargs.pop("use_safetensors", True)
+
+        class_name = cls.__name__
+        original_config, checkpoint = fetch_ldm_config_and_checkpoint(
+            pretrained_model_link_or_path=pretrained_model_link_or_path,
+            class_name=class_name,
+            original_config_file=original_config_file,
+            resume_download=resume_download,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            local_files_only=local_files_only,
+            use_safetensors=use_safetensors,
+            cache_dir=cache_dir,
+        )
+
+        image_size = kwargs.pop("image_size", None)
+        component = create_diffusers_vae_model_from_ldm(class_name, original_config, checkpoint, image_size=image_size)
+        vae = component["vae"]
+        if torch_dtype is not None:
+            vae = vae.to(torch_dtype)
+
+        return vae
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
new file mode 100644
index 000000000000..88008f006f89
--- /dev/null
+++ b/src/diffusers/loaders/controlnet.py
@@ -0,0 +1,127 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from huggingface_hub.utils import validate_hf_hub_args
+
+from .single_file_utils import (
+    create_diffusers_controlnet_model_from_ldm,
+    fetch_ldm_config_and_checkpoint,
+)
+
+
+class FromOriginalControlnetMixin:
+    """
+    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
+    """
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
+        r"""
+        Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or
+        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
+
+        Parameters:
+            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+                    - A link to the `.ckpt` file (for example
+                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
+                    - A path to a *file* containing all pipeline weights.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
+                dtype is automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to True, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
+            image_size (`int`, *optional*, defaults to 512):
+                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
+                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
+            upcast_attention (`bool`, *optional*, defaults to `None`):
+                Whether the attention computation should always be upcasted.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load and saveable variables (for example the pipeline components of the
+                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
+                method. See example below for more information.
+
+        Examples:
+
+        ```py
+        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+
+        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
+        controlnet = ControlNetModel.from_single_file(url)
+
+        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
+        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
+        ```
+        """
+        original_config_file = kwargs.pop("original_config_file", None)
+        resume_download = kwargs.pop("resume_download", False)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        use_safetensors = kwargs.pop("use_safetensors", True)
+
+        class_name = cls.__name__
+        original_config, checkpoint = fetch_ldm_config_and_checkpoint(
+            pretrained_model_link_or_path=pretrained_model_link_or_path,
+            class_name=class_name,
+            original_config_file=original_config_file,
+            resume_download=resume_download,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            local_files_only=local_files_only,
+            use_safetensors=use_safetensors,
+            cache_dir=cache_dir,
+        )
+
+        upcast_attention = kwargs.pop("upcast_attention", False)
+        image_size = kwargs.pop("image_size", None)
+
+        component = create_diffusers_controlnet_model_from_ldm(
+            class_name, original_config, checkpoint, upcast_attention=upcast_attention, image_size=image_size
+        )
+        controlnet = component["controlnet"]
+        if torch_dtype is not None:
+            controlnet = controlnet.to(torch_dtype)
+
+        return controlnet
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index d23b2b9e87c1..d747bfacde0b 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -11,32 +11,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import re
 
 from huggingface_hub.utils import validate_hf_hub_args
-from transformers import AutoFeatureExtractor
 
-from ..models.modeling_utils import load_state_dict
-from ..utils import (
-    logging,
-)
-from ..utils.hub_utils import _get_model_file
+from ..utils import logging
 from .single_file_utils import (
-    create_diffusers_controlnet_model_from_ldm,
     create_diffusers_unet_model_from_ldm,
     create_diffusers_vae_model_from_ldm,
     create_scheduler_from_ldm,
     create_text_encoders_and_tokenizers_from_ldm,
-    fetch_original_config,
+    fetch_ldm_config_and_checkpoint,
     infer_model_type,
 )
 
 
 logger = logging.get_logger(__name__)
 
-
-VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
 # Pipelines that support the SDXL Refiner checkpoint
 REFINER_PIPELINES = [
     "StableDiffusionXLImg2ImgPipeline",
@@ -45,29 +35,12 @@
 ]
 
 
-def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
-    pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)"
-    weights_name = None
-    repo_id = (None,)
-    for prefix in VALID_URL_PREFIXES:
-        pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
-    match = re.match(pattern, pretrained_model_name_or_path)
-    if not match:
-        return repo_id, weights_name
-
-    repo_id = f"{match.group(1)}/{match.group(2)}"
-    weights_name = match.group(3)
-
-    return repo_id, weights_name
-
-
 def build_sub_model_components(
     pipeline_components,
     pipeline_class_name,
     component_name,
     original_config,
     checkpoint,
-    checkpoint_path_or_dict,
     local_files_only=False,
     load_safety_checker=False,
     **kwargs,
@@ -117,6 +90,8 @@ def build_sub_model_components(
 
     if component_name == "safety_checker":
         if load_safety_checker:
+            from transformers import AutoFeatureExtractor
+
             from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 
             safety_checker = StableDiffusionSafetyChecker.from_pretrained(
@@ -233,50 +208,20 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         use_safetensors = kwargs.pop("use_safetensors", True)
 
         class_name = cls.__name__
-        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
-        from_safetensors = file_extension == "safetensors"
-
-        if from_safetensors and use_safetensors is False:
-            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
-
-        if os.path.isfile(pretrained_model_link_or_path):
-            checkpoint = load_state_dict(pretrained_model_link_or_path)
-        else:
-            repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path)
-            checkpoint_path = _get_model_file(
-                repo_id,
-                weights_name=weights_name,
-                force_download=force_download,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-            )
-            checkpoint = load_state_dict(checkpoint_path)
-
-        # some checkpoints contain the model state dict under a "state_dict" key
-        while "state_dict" in checkpoint:
-            checkpoint = checkpoint["state_dict"]
-
-        original_config = fetch_original_config(class_name, checkpoint, original_config_file)
-
-        if class_name == "AutoencoderKL":
-            image_size = kwargs.pop("image_size", None)
-            component = create_diffusers_vae_model_from_ldm(
-                class_name, original_config, checkpoint, image_size=image_size
-            )
-            return component["vae"]
 
-        if class_name == "ControlNetModel":
-            upcast_attention = kwargs.pop("upcast_attention", False)
-            image_size = kwargs.pop("image_size", None)
-
-            component = create_diffusers_controlnet_model_from_ldm(
-                class_name, original_config, checkpoint, upcast_attention=upcast_attention, image_size=image_size
-            )
-            return component["controlnet"]
+        original_config, checkpoint = fetch_ldm_config_and_checkpoint(
+            pretrained_model_link_or_path=pretrained_model_link_or_path,
+            class_name=class_name,
+            original_config_file=original_config_file,
+            resume_download=resume_download,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            local_files_only=local_files_only,
+            use_safetensors=use_safetensors,
+            cache_dir=cache_dir,
+        )
 
         from ..pipelines.pipeline_utils import _get_pipeline_class
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 475b7d3819fc..386ec0bd4657 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -15,20 +15,15 @@
 """ Conversion script for the Stable Diffusion checkpoints."""
 
 import os
+import re
 from contextlib import nullcontext
 from io import BytesIO
 from urllib.parse import urlparse
 
 import requests
 import yaml
-from transformers import (
-    CLIPTextConfig,
-    CLIPTextModel,
-    CLIPTextModelWithProjection,
-    CLIPTokenizer,
-)
 
-from ..models import UNet2DConditionModel
+from ..models.modeling_utils import load_state_dict
 from ..schedulers import (
     DDIMScheduler,
     DDPMScheduler,
@@ -39,8 +34,17 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ..utils import is_accelerate_available, logging
+from ..utils import is_accelerate_available, is_transformers_available, logging
+from ..utils.hub_utils import _get_model_file
+
 
+if is_transformers_available():
+    from transformers import (
+        CLIPTextConfig,
+        CLIPTextModel,
+        CLIPTextModelWithProjection,
+        CLIPTokenizer,
+    )
 
 if is_accelerate_available():
     from accelerate import init_empty_weights
@@ -187,6 +191,71 @@
 ]
 
 
+VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
+
+
+def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
+    pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)"
+    weights_name = None
+    repo_id = (None,)
+    for prefix in VALID_URL_PREFIXES:
+        pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
+    match = re.match(pattern, pretrained_model_name_or_path)
+    if not match:
+        return repo_id, weights_name
+
+    repo_id = f"{match.group(1)}/{match.group(2)}"
+    weights_name = match.group(3)
+
+    return repo_id, weights_name
+
+
+def fetch_ldm_config_and_checkpoint(
+    pretrained_model_link_or_path,
+    class_name,
+    original_config_file=None,
+    resume_download=False,
+    force_download=False,
+    proxies=None,
+    token=None,
+    cache_dir=None,
+    local_files_only=None,
+    revision=None,
+    use_safetensors=True,
+):
+    file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
+    from_safetensors = file_extension == "safetensors"
+
+    if from_safetensors and use_safetensors is False:
+        raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
+
+    if os.path.isfile(pretrained_model_link_or_path):
+        checkpoint = load_state_dict(pretrained_model_link_or_path)
+
+    else:
+        repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path)
+        checkpoint_path = _get_model_file(
+            repo_id,
+            weights_name=weights_name,
+            force_download=force_download,
+            cache_dir=cache_dir,
+            resume_download=resume_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+        )
+        checkpoint = load_state_dict(checkpoint_path)
+
+    # some checkpoints contain the model state dict under a "state_dict" key
+    while "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    original_config = fetch_original_config(class_name, checkpoint, original_config_file)
+
+    return original_config, checkpoint
+
+
 def infer_original_config_file(class_name, checkpoint):
     if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
         config_url = CONFIG_URLS["v2"]
@@ -1029,6 +1098,8 @@ def create_diffusers_unet_model_from_ldm(
     extract_ema=False,
     image_size=None,
 ):
+    from ..models import UNet2DConditionModel
+
     if num_in_channels is None:
         if pipeline_class_name in [
             "StableDiffusionInpaintPipeline",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 92d12a220f76..10a3ae58de9f 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -17,7 +17,7 @@
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromSingleFileMixin
+from ...loaders import FromOriginalVAEMixin
 from ...utils.accelerate_utils import apply_forward_hook
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -32,7 +32,7 @@
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
-class AutoencoderKL(ModelMixin, ConfigMixin, FromSingleFileMixin):
+class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
 
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py
index 8af13a6ec7d2..1102f4f9d36d 100644
--- a/src/diffusers/models/controlnet.py
+++ b/src/diffusers/models/controlnet.py
@@ -19,7 +19,7 @@
 from torch.nn import functional as F
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import FromSingleFileMixin
+from ..loaders import FromOriginalControlnetMixin
 from ..utils import BaseOutput, logging
 from .attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -102,7 +102,7 @@ def forward(self, conditioning):
         return embedding
 
 
-class ControlNetModel(ModelMixin, ConfigMixin, FromSingleFileMixin):
+class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
     """
     A ControlNet model.
 

From 36203576d5c8e58b0dc823207c6628599d70a343 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 10:07:50 +0000
Subject: [PATCH 78/89] clean up

---
 src/diffusers/loaders/__init__.py          |  8 +++---
 src/diffusers/loaders/autoencoder.py       | 29 ++++++++++++----------
 src/diffusers/loaders/controlnet.py        |  2 +-
 src/diffusers/loaders/single_file_utils.py | 10 +++-----
 src/diffusers/models/controlnet.py         |  4 +--
 5 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py
index 58e425359e6b..4da047435d8e 100644
--- a/src/diffusers/loaders/__init__.py
+++ b/src/diffusers/loaders/__init__.py
@@ -54,11 +54,11 @@ def text_encoder_attn_modules(text_encoder):
 _import_structure = {}
 
 if is_torch_available():
-    _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
-    _import_structure["utils"] = ["AttnProcsLayers"]
-    _import_structure["controlnet"] = ["FromOriginalControlnetMixin"]
     _import_structure["autoencoder"] = ["FromOriginalVAEMixin"]
 
+    _import_structure["controlnet"] = ["FromOriginalControlNetMixin"]
+    _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
+    _import_structure["utils"] = ["AttnProcsLayers"]
     if is_transformers_available():
         _import_structure["single_file"] = ["FromSingleFileMixin"]
         _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"]
@@ -71,7 +71,7 @@ def text_encoder_attn_modules(text_encoder):
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     if is_torch_available():
         from .autoencoder import FromOriginalVAEMixin
-        from .controlnet import FromOriginalControlnetMixin
+        from .controlnet import FromOriginalControlNetMixin
         from .unet import UNet2DConditionLoadersMixin
         from .utils import AttnProcsLayers
 
diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
index 8936d4f0bea3..e21f651b8d78 100644
--- a/src/diffusers/loaders/autoencoder.py
+++ b/src/diffusers/loaders/autoencoder.py
@@ -22,14 +22,14 @@
 
 class FromOriginalVAEMixin:
     """
-    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
+    Load pretrained AutoencoderKL weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
     """
 
     @classmethod
     @validate_hf_hub_args
     def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         r"""
-        Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or
+        Instantiate an [`AutoencoderKL`] from pretrained AutoencoderKL weights saved in the original `.ckpt` or
         `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
 
         Parameters:
@@ -62,32 +62,35 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                 allowed by Git.
+            image_size (`int`, *optional*, defaults to 512):
+                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
+                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
             use_safetensors (`bool`, *optional*, defaults to `None`):
                 If set to `None`, the safetensors weights are downloaded if they're available **and** if the
                 safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
                 weights. If set to `False`, safetensors weights are not loaded.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to overwrite load and saveable variables (for example the pipeline components of the
                 specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
                 method. See example below for more information.
 
+        <Tip warning={true}>
+
+            Make sure to pass both `image_size` and `scaling_factor` to `from_single_file()` if you're loading
+            a VAE from SDXL or a Stable Diffusion v2 model or higher.
+
+        </Tip>
+
         Examples:
 
         ```py
-        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+        from diffusers import AutoencoderKL
 
-        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
-        model = ControlNetModel.from_single_file(url)
-
-        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
-        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
+        url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"  # can also be local file
+        model = AutoencoderKL.from_single_file(url)
         ```
         """
+
         original_config_file = kwargs.pop("original_config_file", None)
         resume_download = kwargs.pop("resume_download", False)
         force_download = kwargs.pop("force_download", False)
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
index 88008f006f89..527a77109aae 100644
--- a/src/diffusers/loaders/controlnet.py
+++ b/src/diffusers/loaders/controlnet.py
@@ -20,7 +20,7 @@
 )
 
 
-class FromOriginalControlnetMixin:
+class FromOriginalControlNetMixin:
     """
     Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
     """
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 386ec0bd4657..e76ea516d8d4 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -507,7 +507,7 @@ def create_controlnet_diffusers_config(original_config, image_size: int):
     return controlnet_config
 
 
-def create_vae_diffusers_config(original_config, image_size: int):
+def create_vae_diffusers_config(original_config, image_size, scaling_factor=0.18125):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
@@ -526,6 +526,7 @@ def create_vae_diffusers_config(original_config, image_size: int):
         "block_out_channels": tuple(block_out_channels),
         "latent_channels": vae_params["z_channels"],
         "layers_per_block": vae_params["num_res_blocks"],
+        "scaling_factor": scaling_factor,
     }
 
     return config
@@ -1134,17 +1135,14 @@ def create_diffusers_unet_model_from_ldm(
 
 
 def create_diffusers_vae_model_from_ldm(
-    pipeline_class_name,
-    original_config,
-    checkpoint,
-    image_size=None,
+    pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=0.18125
 ):
     # import here to avoid circular imports
     from ..models import AutoencoderKL
 
     image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size)
 
-    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+    vae_config = create_vae_diffusers_config(original_config, image_size=image_size, scaling_factor=scaling_factor)
     diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
 
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py
index 1102f4f9d36d..7f30410c41a5 100644
--- a/src/diffusers/models/controlnet.py
+++ b/src/diffusers/models/controlnet.py
@@ -19,7 +19,7 @@
 from torch.nn import functional as F
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import FromOriginalControlnetMixin
+from ..loaders import FromOriginalControlNetMixin
 from ..utils import BaseOutput, logging
 from .attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -102,7 +102,7 @@ def forward(self, conditioning):
         return embedding
 
 
-class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
+class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
     """
     A ControlNet model.
 

From dae09d087470624c7d0daff4b45923eb7cb6621e Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 10:21:03 +0000
Subject: [PATCH 79/89] fix docs

---
 docs/source/en/api/loaders/single_file.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/api/loaders/single_file.md b/docs/source/en/api/loaders/single_file.md
index 52e44606455b..62dbc21067c5 100644
--- a/docs/source/en/api/loaders/single_file.md
+++ b/docs/source/en/api/loaders/single_file.md
@@ -30,8 +30,8 @@ To learn more about how to load single file weights, see the [Load different Sta
 
 ## FromOriginalVAEMixin
 
-[[autodoc]] loaders.single_file.FromOriginalVAEMixin
+[[autodoc]] loaders.autoencoder.FromOriginalVAEMixin
 
 ## FromOriginalControlnetMixin
 
-[[autodoc]] loaders.single_file.FromOriginalControlnetMixin
\ No newline at end of file
+[[autodoc]] loaders.controlnet.FromOriginalControlNetMixin
\ No newline at end of file

From 0746cf957a6c4ab047e6565d5628a2905d63a70b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 22 Jan 2024 06:56:30 +0000
Subject: [PATCH 80/89] update

---
 src/diffusers/loaders/single_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index d747bfacde0b..9da2402cfcde 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -48,7 +48,7 @@ def build_sub_model_components(
     if component_name in pipeline_components:
         return {}
 
-    model_type = kwargs.get("model_type", None)
+    model_type = kwargs.pop("model_type", None)
     image_size = kwargs.pop("image_size", None)
 
     if component_name == "unet":

From dbfb8f1ea9c61a2b4e02f926245be2b3d387e577 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 22 Jan 2024 09:13:08 +0000
Subject: [PATCH 81/89] update

---
 src/diffusers/loaders/single_file.py            | 17 ++++++++++-------
 src/diffusers/loaders/single_file_utils.py      |  4 ++--
 src/diffusers/models/unet_3d_condition.py       |  2 +-
 .../test_stable_diffusion_inpaint.py            | 10 +++++-----
 4 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 9da2402cfcde..2751c8969fdf 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -43,14 +43,13 @@ def build_sub_model_components(
     checkpoint,
     local_files_only=False,
     load_safety_checker=False,
-    **kwargs,
+    model_type=None,
+    image_size=None,
+    **kwargs
 ):
     if component_name in pipeline_components:
         return {}
 
-    model_type = kwargs.pop("model_type", None)
-    image_size = kwargs.pop("image_size", None)
-
     if component_name == "unet":
         num_in_channels = kwargs.pop("num_in_channels", None)
         unet_components = create_diffusers_unet_model_from_ldm(
@@ -112,10 +111,9 @@ def build_sub_model_components(
 def set_additional_components(
     pipeline_class_name,
     original_config,
-    **kwargs,
+    model_type=None,
 ):
     components = {}
-    model_type = kwargs.get("model_type", None)
     if pipeline_class_name in REFINER_PIPELINES:
         model_type = infer_model_type(original_config, model_type=model_type)
         is_refiner = model_type == "SDXL-Refiner"
@@ -235,6 +233,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
         passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
 
+        model_type = kwargs.pop("model_type", None)
+        image_size = kwargs.pop("image_size", None)
+
         init_kwargs = {}
         for name in expected_modules:
             if name in passed_class_obj:
@@ -247,13 +248,15 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                     original_config,
                     checkpoint,
                     pretrained_model_link_or_path,
+                    model_type=model_type,
+                    image_size=image_size,
                     **kwargs,
                 )
                 if not components:
                     continue
                 init_kwargs.update(components)
 
-        additional_components = set_additional_components(class_name, original_config, **kwargs)
+        additional_components = set_additional_components(class_name, original_config, model_type=model_type)
         if additional_components:
             init_kwargs.update(additional_components)
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index e76ea516d8d4..630b9f19eb89 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -191,14 +191,14 @@
 ]
 
 
-VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
+VALID_HF_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
 
 
 def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
     pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)"
     weights_name = None
     repo_id = (None,)
-    for prefix in VALID_URL_PREFIXES:
+    for prefix in VALID_HF_URL_PREFIXES:
         pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
     match = re.match(pattern, pretrained_model_name_or_path)
     if not match:
diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py
index fc8695e064b5..d403204d220c 100644
--- a/src/diffusers/models/unet_3d_condition.py
+++ b/src/diffusers/models/unet_3d_condition.py
@@ -533,7 +533,7 @@ def forward(
 
         Args:
             sample (`torch.FloatTensor`):
-                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
+                The noisy input tensor with the following shape `(batch, channel, num_frames, height, width)`.
             timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
             encoder_hidden_states (`torch.FloatTensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index fe664b21e271..43321c4f9eba 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -753,14 +753,14 @@ def test_download_local(self):
     def test_download_ckpt_diff_format_is_same(self):
         ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
 
-        pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_attn_processor(AttnProcessor())
-        pipe.to("cuda")
+        sf_pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
+        sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config)
+        sf_pipe.unet.set_attn_processor(AttnProcessor())
+        sf_pipe.to("cuda")
 
         inputs = self.get_inputs(torch_device)
         inputs["num_inference_steps"] = 5
-        image_ckpt = pipe(**inputs).images[0]
+        image_ckpt = sf_pipe(**inputs).images[0]
 
         pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

From 82ce94e1e39e4621899b6ef612f92561bb05373e Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 22 Jan 2024 10:03:03 +0000
Subject: [PATCH 82/89] Revert "update"

This reverts commit dbfb8f1ea9c61a2b4e02f926245be2b3d387e577.
---
 src/diffusers/loaders/single_file.py            | 17 +++++++----------
 src/diffusers/loaders/single_file_utils.py      |  4 ++--
 src/diffusers/models/unet_3d_condition.py       |  2 +-
 .../test_stable_diffusion_inpaint.py            | 10 +++++-----
 4 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 2751c8969fdf..9da2402cfcde 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -43,13 +43,14 @@ def build_sub_model_components(
     checkpoint,
     local_files_only=False,
     load_safety_checker=False,
-    model_type=None,
-    image_size=None,
-    **kwargs
+    **kwargs,
 ):
     if component_name in pipeline_components:
         return {}
 
+    model_type = kwargs.pop("model_type", None)
+    image_size = kwargs.pop("image_size", None)
+
     if component_name == "unet":
         num_in_channels = kwargs.pop("num_in_channels", None)
         unet_components = create_diffusers_unet_model_from_ldm(
@@ -111,9 +112,10 @@ def build_sub_model_components(
 def set_additional_components(
     pipeline_class_name,
     original_config,
-    model_type=None,
+    **kwargs,
 ):
     components = {}
+    model_type = kwargs.get("model_type", None)
     if pipeline_class_name in REFINER_PIPELINES:
         model_type = infer_model_type(original_config, model_type=model_type)
         is_refiner = model_type == "SDXL-Refiner"
@@ -233,9 +235,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
         passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
 
-        model_type = kwargs.pop("model_type", None)
-        image_size = kwargs.pop("image_size", None)
-
         init_kwargs = {}
         for name in expected_modules:
             if name in passed_class_obj:
@@ -248,15 +247,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                     original_config,
                     checkpoint,
                     pretrained_model_link_or_path,
-                    model_type=model_type,
-                    image_size=image_size,
                     **kwargs,
                 )
                 if not components:
                     continue
                 init_kwargs.update(components)
 
-        additional_components = set_additional_components(class_name, original_config, model_type=model_type)
+        additional_components = set_additional_components(class_name, original_config, **kwargs)
         if additional_components:
             init_kwargs.update(additional_components)
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 630b9f19eb89..e76ea516d8d4 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -191,14 +191,14 @@
 ]
 
 
-VALID_HF_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
+VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
 
 
 def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
     pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)"
     weights_name = None
     repo_id = (None,)
-    for prefix in VALID_HF_URL_PREFIXES:
+    for prefix in VALID_URL_PREFIXES:
         pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
     match = re.match(pattern, pretrained_model_name_or_path)
     if not match:
diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py
index d403204d220c..fc8695e064b5 100644
--- a/src/diffusers/models/unet_3d_condition.py
+++ b/src/diffusers/models/unet_3d_condition.py
@@ -533,7 +533,7 @@ def forward(
 
         Args:
             sample (`torch.FloatTensor`):
-                The noisy input tensor with the following shape `(batch, channel, num_frames, height, width)`.
+                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
             timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
             encoder_hidden_states (`torch.FloatTensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 43321c4f9eba..fe664b21e271 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -753,14 +753,14 @@ def test_download_local(self):
     def test_download_ckpt_diff_format_is_same(self):
         ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
 
-        sf_pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
-        sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config)
-        sf_pipe.unet.set_attn_processor(AttnProcessor())
-        sf_pipe.to("cuda")
+        pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
 
         inputs = self.get_inputs(torch_device)
         inputs["num_inference_steps"] = 5
-        image_ckpt = sf_pipe(**inputs).images[0]
+        image_ckpt = pipe(**inputs).images[0]
 
         pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

From 6f8446a51f2d2781f2d6bb5da700bec2b68f04c0 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 22 Jan 2024 10:24:41 +0000
Subject: [PATCH 83/89] update

---
 src/diffusers/loaders/single_file.py              | 15 +++++++++------
 .../test_stable_diffusion_inpaint.py              |  5 ++++-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 9da2402cfcde..b74a085aa999 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -43,14 +43,13 @@ def build_sub_model_components(
     checkpoint,
     local_files_only=False,
     load_safety_checker=False,
+    model_type=None,
+    image_size=None,
     **kwargs,
 ):
     if component_name in pipeline_components:
         return {}
 
-    model_type = kwargs.pop("model_type", None)
-    image_size = kwargs.pop("image_size", None)
-
     if component_name == "unet":
         num_in_channels = kwargs.pop("num_in_channels", None)
         unet_components = create_diffusers_unet_model_from_ldm(
@@ -112,10 +111,9 @@ def build_sub_model_components(
 def set_additional_components(
     pipeline_class_name,
     original_config,
-    **kwargs,
+    model_type=None,
 ):
     components = {}
-    model_type = kwargs.get("model_type", None)
     if pipeline_class_name in REFINER_PIPELINES:
         model_type = infer_model_type(original_config, model_type=model_type)
         is_refiner = model_type == "SDXL-Refiner"
@@ -235,6 +233,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
         passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
 
+        model_type = kwargs.pop("model_type", None)
+        image_size = kwargs.pop("image_size", None)
+
         init_kwargs = {}
         for name in expected_modules:
             if name in passed_class_obj:
@@ -247,13 +248,15 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                     original_config,
                     checkpoint,
                     pretrained_model_link_or_path,
+                    model_type=model_type,
+                    image_size=image_size,
                     **kwargs,
                 )
                 if not components:
                     continue
                 init_kwargs.update(components)
 
-        additional_components = set_additional_components(class_name, original_config, **kwargs)
+        additional_components = set_additional_components(class_name, original_config, model_type=model_type)
         if additional_components:
             init_kwargs.update(additional_components)
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index fe664b21e271..7ec6964b0688 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -43,6 +43,7 @@
     load_image,
     load_numpy,
     nightly,
+    numpy_cosine_similarity_distance,
     require_python39_or_higher,
     require_torch_2,
     require_torch_gpu,
@@ -771,7 +772,9 @@ def test_download_ckpt_diff_format_is_same(self):
         inputs["num_inference_steps"] = 5
         image = pipe(**inputs).images[0]
 
-        assert np.max(np.abs(image - image_ckpt)) < 5e-4
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
+
+        assert max_diff < 1e-4
 
 
 @slow

From b2c95612eb0a1c091aebcb65ee632c93b088ee18 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 22 Jan 2024 10:29:20 +0000
Subject: [PATCH 84/89] update

---
 src/diffusers/loaders/autoencoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
index e21f651b8d78..6e65bd1c0070 100644
--- a/src/diffusers/loaders/autoencoder.py
+++ b/src/diffusers/loaders/autoencoder.py
@@ -22,7 +22,7 @@
 
 class FromOriginalVAEMixin:
     """
-    Load pretrained AutoencoderKL weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
+    Load pretrained AutoencoderKL weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
     """
 
     @classmethod

From e297ac84e831359cb882f580a376e787a6d8bde3 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 23 Jan 2024 04:35:52 +0000
Subject: [PATCH 85/89] update

---
 src/diffusers/loaders/single_file.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index b74a085aa999..5416d2f135c5 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -14,7 +14,7 @@
 
 from huggingface_hub.utils import validate_hf_hub_args
 
-from ..utils import logging
+from ..utils import is_transformers_available, logging
 from .single_file_utils import (
     create_diffusers_unet_model_from_ldm,
     create_diffusers_vae_model_from_ldm,
@@ -34,6 +34,9 @@
     "StableDiffusionXLControlNetImg2ImgPipeline",
 ]
 
+if is_transformers_available():
+    from transformers import AutoFeatureExtractor
+
 
 def build_sub_model_components(
     pipeline_components,
@@ -89,8 +92,6 @@ def build_sub_model_components(
 
     if component_name == "safety_checker":
         if load_safety_checker:
-            from transformers import AutoFeatureExtractor
-
             from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 
             safety_checker = StableDiffusionSafetyChecker.from_pretrained(

From d1e3466a594737de2f8898b8e3c803b974ab5b88 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 23 Jan 2024 06:59:20 +0000
Subject: [PATCH 86/89] update

---
 src/diffusers/loaders/single_file.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 5416d2f135c5..034271aaba33 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -97,14 +97,18 @@ def build_sub_model_components(
             safety_checker = StableDiffusionSafetyChecker.from_pretrained(
                 "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
             )
+        else:
+            safety_checker = None
+        return {"safety_checker": safety_checker}
+
+    if component_name == "feature_extractor":
+        if load_safety_checker:
             feature_extractor = AutoFeatureExtractor.from_pretrained(
                 "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
             )
         else:
-            safety_checker = None
             feature_extractor = None
-
-        return {"safety_checker": safety_checker, "feature_extractor": feature_extractor}
+        return {"feature_extractor": feature_extractor}
 
     return
 
@@ -201,7 +205,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         cache_dir = kwargs.pop("cache_dir", None)
-        local_files_only = kwargs.pop("local_files_only", None)
+        local_files_only = kwargs.pop("local_files_only", False)
         revision = kwargs.pop("revision", None)
         torch_dtype = kwargs.pop("torch_dtype", None)
         use_safetensors = kwargs.pop("use_safetensors", True)
@@ -236,6 +240,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
 
         model_type = kwargs.pop("model_type", None)
         image_size = kwargs.pop("image_size", None)
+        load_safety_checker = (kwargs.pop("load_safety_checker", False)) or (
+            passed_class_obj.get("safety_checker", None) is not None
+        )
 
         init_kwargs = {}
         for name in expected_modules:
@@ -248,9 +255,10 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                     name,
                     original_config,
                     checkpoint,
-                    pretrained_model_link_or_path,
                     model_type=model_type,
                     image_size=image_size,
+                    load_safety_checker=load_safety_checker,
+                    local_files_only=local_files_only,
                     **kwargs,
                 )
                 if not components:

From 650a63255f43367b1ef51cd677443808a8be0655 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 23 Jan 2024 07:28:07 +0000
Subject: [PATCH 87/89] fix controlnet

---
 src/diffusers/loaders/single_file_utils.py | 40 +++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index e76ea516d8d4..d152c291dac3 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -110,6 +110,12 @@
     },
     "controlnet": {
         "layers": {
+            "time_embedding.linear_1.weight": "time_embed.0.weight",
+            "time_embedding.linear_1.bias": "time_embed.0.bias",
+            "time_embedding.linear_2.weight": "time_embed.2.weight",
+            "time_embedding.linear_2.bias": "time_embed.2.bias",
+            "conv_in.weight": "input_blocks.0.0.weight",
+            "conv_in.bias": "input_blocks.0.0.bias",
             "controlnet_cond_embedding.conv_in.weight": "input_hint_block.0.weight",
             "controlnet_cond_embedding.conv_in.bias": "input_hint_block.0.bias",
             "controlnet_cond_embedding.conv_out.weight": "input_hint_block.14.weight",
@@ -787,6 +793,38 @@ def convert_controlnet_checkpoint(
         new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.weight")
         new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.bias")
 
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len(
+        {".".join(layer.split(".")[:2]) for layer in controlnet_state_dict if "middle_block" in layer}
+    )
+    middle_blocks = {
+        layer_id: [key for key in controlnet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+    if middle_blocks:
+        resnet_0 = middle_blocks[0]
+        attentions = middle_blocks[1]
+        resnet_1 = middle_blocks[2]
+
+        update_unet_resnet_ldm_to_diffusers(
+            resnet_0,
+            new_checkpoint,
+            controlnet_state_dict,
+            mapping={"old": "middle_block.0", "new": "mid_block.resnets.0"},
+        )
+        update_unet_resnet_ldm_to_diffusers(
+            resnet_1,
+            new_checkpoint,
+            controlnet_state_dict,
+            mapping={"old": "middle_block.2", "new": "mid_block.resnets.1"},
+        )
+        update_unet_attention_ldm_to_diffusers(
+            attentions,
+            new_checkpoint,
+            controlnet_state_dict,
+            mapping={"old": "middle_block.1", "new": "mid_block.attentions.0"},
+        )
+
     # mid block
     new_checkpoint["controlnet_mid_block.weight"] = controlnet_state_dict.pop("middle_block_out.0.weight")
     new_checkpoint["controlnet_mid_block.bias"] = controlnet_state_dict.pop("middle_block_out.0.bias")
@@ -799,7 +837,7 @@ def convert_controlnet_checkpoint(
     }
     num_cond_embedding_blocks = len(cond_embedding_blocks)
 
-    for idx in range(1, num_cond_embedding_blocks):
+    for idx in range(1, num_cond_embedding_blocks + 1):
         diffusers_idx = idx - 1
         cond_block_id = 2 * idx
 

From 99fdba90648491c2ae2e51ccb65d41af3e77ea55 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 23 Jan 2024 07:47:13 +0000
Subject: [PATCH 88/89] fix scheduler

---
 src/diffusers/loaders/single_file_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index d152c291dac3..fb1ad14fd3e2 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -1348,8 +1348,6 @@ def create_scheduler_from_ldm(
         scheduler_config["clip_sample"] = False
         scheduler_config["set_alpha_to_one"] = False
 
-        scheduler_type = "ddim"
-
     if scheduler_type == "pndm":
         scheduler_config["skip_prk_steps"] = True
         scheduler = PNDMScheduler.from_config(scheduler_config)

From 8c9af6c1e7933f53a2874df4ec439e5f286140ba Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 23 Jan 2024 08:03:30 +0000
Subject: [PATCH 89/89] fix controlnet tests

---
 tests/pipelines/controlnet/test_controlnet.py | 49 +++++++++-------
 .../controlnet/test_controlnet_img2img.py     | 56 +++++++++++--------
 .../controlnet/test_controlnet_inpaint.py     |  4 +-
 3 files changed, 65 insertions(+), 44 deletions(-)

diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py
index c034a9b68bd8..05f3ade5089f 100644
--- a/tests/pipelines/controlnet/test_controlnet.py
+++ b/tests/pipelines/controlnet/test_controlnet.py
@@ -37,6 +37,7 @@
     enable_full_determinism,
     load_image,
     load_numpy,
+    numpy_cosine_similarity_distance,
     require_python39_or_higher,
     require_torch_2,
     require_torch_gpu,
@@ -1022,39 +1023,49 @@ def test_v11_shuffle_global_pool_conditions(self):
 
     def test_load_local(self):
         controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
-        pipe_1 = StableDiffusionControlNetPipeline.from_pretrained(
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
             "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
         )
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
 
         controlnet = ControlNetModel.from_single_file(
             "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"
         )
-        pipe_2 = StableDiffusionControlNetPipeline.from_single_file(
+        pipe_sf = StableDiffusionControlNetPipeline.from_single_file(
             "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
             safety_checker=None,
             controlnet=controlnet,
+            scheduler_type="pndm",
         )
-        pipes = [pipe_1, pipe_2]
-        images = []
-
-        for pipe in pipes:
-            pipe.enable_model_cpu_offload()
-            pipe.set_progress_bar_config(disable=None)
+        pipe_sf.unet.set_default_attn_processor()
+        pipe_sf.enable_model_cpu_offload()
 
-            generator = torch.Generator(device="cpu").manual_seed(0)
-            prompt = "bird"
-            image = load_image(
-                "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
-            )
+        control_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+        prompt = "bird"
 
-            output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
-            images.append(output.images[0])
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            prompt,
+            image=control_image,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+        ).images[0]
 
-            del pipe
-            gc.collect()
-            torch.cuda.empty_cache()
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output_sf = pipe_sf(
+            prompt,
+            image=control_image,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+        ).images[0]
 
-        assert np.abs(images[0] - images[1]).max() < 1e-3
+        max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten())
+        assert max_diff < 1e-3
 
 
 @slow
diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py
index b4b67e6476f6..939eb34ef0c6 100644
--- a/tests/pipelines/controlnet/test_controlnet_img2img.py
+++ b/tests/pipelines/controlnet/test_controlnet_img2img.py
@@ -39,6 +39,7 @@
     enable_full_determinism,
     floats_tensor,
     load_numpy,
+    numpy_cosine_similarity_distance,
     require_torch_gpu,
     slow,
     torch_device,
@@ -421,46 +422,53 @@ def test_canny(self):
 
     def test_load_local(self):
         controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
-        pipe_1 = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+        pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
             "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
         )
+        pipe.unet.set_default_attn_processor()
+        pipe.enable_model_cpu_offload()
 
         controlnet = ControlNetModel.from_single_file(
             "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"
         )
-        pipe_2 = StableDiffusionControlNetImg2ImgPipeline.from_single_file(
+        pipe_sf = StableDiffusionControlNetImg2ImgPipeline.from_single_file(
             "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
             safety_checker=None,
             controlnet=controlnet,
+            scheduler_type="pndm",
         )
+        pipe_sf.unet.set_default_attn_processor()
+        pipe_sf.enable_model_cpu_offload()
+
         control_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
         ).resize((512, 512))
         image = load_image(
             "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png"
         ).resize((512, 512))
+        prompt = "bird"
 
-        pipes = [pipe_1, pipe_2]
-        images = []
-        for pipe in pipes:
-            pipe.enable_model_cpu_offload()
-            pipe.set_progress_bar_config(disable=None)
-
-            generator = torch.Generator(device="cpu").manual_seed(0)
-            prompt = "bird"
-            output = pipe(
-                prompt,
-                image=image,
-                control_image=control_image,
-                strength=0.9,
-                generator=generator,
-                output_type="np",
-                num_inference_steps=3,
-            )
-            images.append(output.images[0])
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            prompt,
+            image=image,
+            control_image=control_image,
+            strength=0.9,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+        ).images[0]
 
-            del pipe
-            gc.collect()
-            torch.cuda.empty_cache()
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output_sf = pipe_sf(
+            prompt,
+            image=image,
+            control_image=control_image,
+            strength=0.9,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=3,
+        ).images[0]
 
-        assert np.abs(images[0] - images[1]).max() < 1e-3
+        max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten())
+        assert max_diff < 1e-3
diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py
index 7c3371c197d4..7db336df9448 100644
--- a/tests/pipelines/controlnet/test_controlnet_inpaint.py
+++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py
@@ -569,6 +569,7 @@ def test_load_local(self):
             "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
             safety_checker=None,
             controlnet=controlnet,
+            scheduler_type="pndm",
         )
         control_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
@@ -605,4 +606,5 @@ def test_load_local(self):
             gc.collect()
             torch.cuda.empty_cache()
 
-        assert np.abs(images[0] - images[1]).max() < 1e-3
+        max_diff = numpy_cosine_similarity_distance(images[0].flatten(), images[1].flatten())
+        assert max_diff < 1e-3