diff --git a/scripts/conversion_ldm_uncond.py b/scripts/conversion_ldm_uncond.py index d2ebb3934b66..8c22cc1ce8f2 100644 --- a/scripts/conversion_ldm_uncond.py +++ b/scripts/conversion_ldm_uncond.py @@ -1,13 +1,14 @@ import argparse -import OmegaConf import torch +import yaml from diffusers import DDIMScheduler, LDMPipeline, UNetLDMModel, VQModel def convert_ldm_original(checkpoint_path, config_path, output_path): - config = OmegaConf.load(config_path) + with open(config_path, "r") as f: + config = yaml.safe_load(f) state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] keys = list(state_dict.keys()) @@ -25,8 +25,8 @@ def convert_ldm_original(checkpoint_path, config_path, output_path): if key.startswith(unet_key): unet_state_dict[key.replace(unet_key, "")] = state_dict[key] - vqvae_init_args = config.model.params.first_stage_config.params - unet_init_args = config.model.params.unet_config.params + vqvae_init_args = config["model"]["params"]["first_stage_config"]["params"] + unet_init_args = config["model"]["params"]["unet_config"]["params"] vqvae = VQModel(**vqvae_init_args).eval() vqvae.load_state_dict(first_stage_dict) @@ -35,10 +35,10 @@ def convert_ldm_original(checkpoint_path, config_path, output_path): unet.load_state_dict(unet_state_dict) noise_scheduler = DDIMScheduler( - timesteps=config.model.params.timesteps, + timesteps=config["model"]["params"]["timesteps"], beta_schedule="scaled_linear", - beta_start=config.model.params.linear_start, - beta_end=config.model.params.linear_end, + beta_start=config["model"]["params"]["linear_start"], + beta_end=config["model"]["params"]["linear_end"], clip_sample=False, ) diff --git a/scripts/convert_gligen_to_diffusers.py b/scripts/convert_gligen_to_diffusers.py index 816e4c112e6f..30d789b60634 100644 --- a/scripts/convert_gligen_to_diffusers.py +++ b/scripts/convert_gligen_to_diffusers.py @@ -2,6 +2,7 @@ import re import torch +import yaml from transformers import ( CLIPProcessor, CLIPTextModel, @@ -28,8 +29,6 @@ textenc_conversion_map, textenc_pattern, ) -from diffusers.utils import is_omegaconf_available -from diffusers.utils.import_utils import BACKENDS_MAPPING def convert_open_clip_checkpoint(checkpoint): @@ -370,52 +369,52 @@ def convert_gligen_unet_checkpoint(checkpoint, config, path=None, extract_ema=Fa def create_vae_config(original_config, image_size: int): - vae_params = original_config.autoencoder.params.ddconfig - _ = original_config.autoencoder.params.embed_dim + vae_params = original_config["autoencoder"]["params"]["ddconfig"] + _ = original_config["autoencoder"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], } return config def create_unet_config(original_config, image_size: int, attention_type): - unet_params = original_config.model.params - vae_params = 
original_config.autoencoder.params.ddconfig + unet_params = original_config["model"]["params"] + vae_params = original_config["autoencoder"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - head_dim = unet_params.num_heads if "num_heads" in unet_params else None + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False ) if use_linear_projection: if head_dim is None: @@ -423,11 +422,11 @@ def create_unet_config(original_config, image_size: int, attention_type): config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, + "in_channels": unet_params["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, - "cross_attention_dim": unet_params.context_dim, + "layers_per_block": unet_params["num_res_blocks"], + "cross_attention_dim": unet_params["context_dim"], "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "attention_type": attention_type, @@ -445,11 +444,6 @@ def convert_gligen_to_diffusers( num_in_channels: int = None, device: str = None, ): - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" checkpoint = torch.load(checkpoint_path, map_location=device) @@ -461,14 +455,14 @@ def convert_gligen_to_diffusers( else: print("global_step key not found in model") - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["in_channels"] = num_in_channels - num_train_timesteps = original_config.diffusion.params.timesteps - beta_start = original_config.diffusion.params.linear_start - beta_end = original_config.diffusion.params.linear_end + num_train_timesteps = original_config["diffusion"]["params"]["timesteps"] + beta_start = original_config["diffusion"]["params"]["linear_start"] + beta_end = original_config["diffusion"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, diff --git a/scripts/convert_if.py b/scripts/convert_if.py index 66d7f694c8e1..c4588f4b25ef 100644 --- a/scripts/convert_if.py +++ b/scripts/convert_if.py @@ 
-4,6 +4,7 @@ import numpy as np import torch +import yaml from torch.nn import functional as F from transformers import CLIPConfig, CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, T5Tokenizer @@ -11,14 +12,6 @@ from diffusers.pipelines.deepfloyd_if.safety_checker import IFSafetyChecker -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the IF checkpoints. Please install it with `pip install" " OmegaConf`." - ) - - def parse_args(): parser = argparse.ArgumentParser() @@ -143,8 +136,8 @@ def convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safet def get_stage_1_unet(unet_config, unet_checkpoint_path): - original_unet_config = OmegaConf.load(unet_config) - original_unet_config = original_unet_config.params + original_unet_config = yaml.safe_load(unet_config) + original_unet_config = original_unet_config["params"] unet_diffusers_config = create_unet_diffusers_config(original_unet_config) @@ -215,11 +208,11 @@ def convert_safety_checker(p_head_path, w_head_path): def create_unet_diffusers_config(original_unet_config, class_embed_type=None): - attention_resolutions = parse_list(original_unet_config.attention_resolutions) - attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + attention_resolutions = parse_list(original_unet_config["attention_resolutions"]) + attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions] - channel_mult = parse_list(original_unet_config.channel_mult) - block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + channel_mult = parse_list(original_unet_config["channel_mult"]) + block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult] down_block_types = [] resolution = 1 @@ -227,7 +220,7 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnDownBlock2D" - elif original_unet_config.resblock_updown: + elif original_unet_config["resblock_updown"]: block_type = "ResnetDownsampleBlock2D" else: block_type = "DownBlock2D" @@ -241,17 +234,17 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnUpBlock2D" - elif original_unet_config.resblock_updown: + elif original_unet_config["resblock_updown"]: block_type = "ResnetUpsampleBlock2D" else: block_type = "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - head_dim = original_unet_config.num_head_channels + head_dim = original_unet_config["num_head_channels"] use_linear_projection = ( - original_unet_config.use_linear_in_transformer + original_unet_config["use_linear_in_transformer"] if "use_linear_in_transformer" in original_unet_config else False ) @@ -264,27 +257,27 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): if class_embed_type is None: if "num_classes" in original_unet_config: - if original_unet_config.num_classes == "sequential": + if original_unet_config["num_classes"] == "sequential": class_embed_type = "projection" assert "adm_in_channels" in original_unet_config - projection_class_embeddings_input_dim = original_unet_config.adm_in_channels + projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"] else: raise 
NotImplementedError( - f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}" ) config = { - "sample_size": original_unet_config.image_size, - "in_channels": original_unet_config.in_channels, + "sample_size": original_unet_config["image_size"], + "in_channels": original_unet_config["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": original_unet_config.num_res_blocks, - "cross_attention_dim": original_unet_config.encoder_channels, + "layers_per_block": original_unet_config["num_res_blocks"], + "cross_attention_dim": original_unet_config["encoder_channels"], "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, - "out_channels": original_unet_config.out_channels, + "out_channels": original_unet_config["out_channels"], "up_block_types": tuple(up_block_types), "upcast_attention": False, # TODO: guessing "cross_attention_norm": "group_norm", @@ -293,11 +286,11 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): "act_fn": "gelu", } - if original_unet_config.use_scale_shift_norm: + if original_unet_config["use_scale_shift_norm"]: config["resnet_time_scale_shift"] = "scale_shift" if "encoder_dim" in original_unet_config: - config["encoder_hid_dim"] = original_unet_config.encoder_dim + config["encoder_hid_dim"] = original_unet_config["encoder_dim"] return config @@ -725,15 +718,15 @@ def parse_list(value): def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_size=None): orig_path = unet_checkpoint_path - original_unet_config = OmegaConf.load(os.path.join(orig_path, "config.yml")) - original_unet_config = original_unet_config.params + original_unet_config = yaml.safe_load(os.path.join(orig_path, "config.yml")) + original_unet_config = original_unet_config["params"] unet_diffusers_config = superres_create_unet_diffusers_config(original_unet_config) - unet_diffusers_config["time_embedding_dim"] = original_unet_config.model_channels * int( - original_unet_config.channel_mult.split(",")[-1] + unet_diffusers_config["time_embedding_dim"] = original_unet_config["model_channels"] * int( + original_unet_config["channel_mult"].split(",")[-1] ) - if original_unet_config.encoder_dim != original_unet_config.encoder_channels: - unet_diffusers_config["encoder_hid_dim"] = original_unet_config.encoder_dim + if original_unet_config["encoder_dim"] != original_unet_config["encoder_channels"]: + unet_diffusers_config["encoder_hid_dim"] = original_unet_config["encoder_dim"] unet_diffusers_config["class_embed_type"] = "timestep" unet_diffusers_config["addition_embed_type"] = "text" @@ -742,16 +735,16 @@ def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_siz unet_diffusers_config["resnet_out_scale_factor"] = 1 / 0.7071 unet_diffusers_config["mid_block_scale_factor"] = 1 / 0.7071 unet_diffusers_config["only_cross_attention"] = ( - bool(original_unet_config.disable_self_attentions) + bool(original_unet_config["disable_self_attentions"]) if ( "disable_self_attentions" in original_unet_config - and isinstance(original_unet_config.disable_self_attentions, int) + and isinstance(original_unet_config["disable_self_attentions"], int) ) else True ) if sample_size is None: - unet_diffusers_config["sample_size"] = 
original_unet_config.image_size + unet_diffusers_config["sample_size"] = original_unet_config["image_size"] else: # The second upscaler unet's sample size is incorrectly specified # in the config and is instead hardcoded in source @@ -783,11 +776,11 @@ def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_siz def superres_create_unet_diffusers_config(original_unet_config): - attention_resolutions = parse_list(original_unet_config.attention_resolutions) - attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + attention_resolutions = parse_list(original_unet_config["attention_resolutions"]) + attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions] - channel_mult = parse_list(original_unet_config.channel_mult) - block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + channel_mult = parse_list(original_unet_config["channel_mult"]) + block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult] down_block_types = [] resolution = 1 @@ -795,7 +788,7 @@ def superres_create_unet_diffusers_config(original_unet_config): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnDownBlock2D" - elif original_unet_config.resblock_updown: + elif original_unet_config["resblock_updown"]: block_type = "ResnetDownsampleBlock2D" else: block_type = "DownBlock2D" @@ -809,16 +802,16 @@ def superres_create_unet_diffusers_config(original_unet_config): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnUpBlock2D" - elif original_unet_config.resblock_updown: + elif original_unet_config["resblock_updown"]: block_type = "ResnetUpsampleBlock2D" else: block_type = "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - head_dim = original_unet_config.num_head_channels + head_dim = original_unet_config["num_head_channels"] use_linear_projection = ( - original_unet_config.use_linear_in_transformer + original_unet_config["use_linear_in_transformer"] if "use_linear_in_transformer" in original_unet_config else False ) @@ -831,26 +824,26 @@ def superres_create_unet_diffusers_config(original_unet_config): projection_class_embeddings_input_dim = None if "num_classes" in original_unet_config: - if original_unet_config.num_classes == "sequential": + if original_unet_config["num_classes"] == "sequential": class_embed_type = "projection" assert "adm_in_channels" in original_unet_config - projection_class_embeddings_input_dim = original_unet_config.adm_in_channels + projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"] else: raise NotImplementedError( - f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}" ) config = { - "in_channels": original_unet_config.in_channels, + "in_channels": original_unet_config["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": tuple(original_unet_config.num_res_blocks), - "cross_attention_dim": original_unet_config.encoder_channels, + "layers_per_block": tuple(original_unet_config["num_res_blocks"]), + "cross_attention_dim": original_unet_config["encoder_channels"], "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "class_embed_type": 
class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, - "out_channels": original_unet_config.out_channels, + "out_channels": original_unet_config["out_channels"], "up_block_types": tuple(up_block_types), "upcast_attention": False, # TODO: guessing "cross_attention_norm": "group_norm", @@ -858,7 +851,7 @@ def superres_create_unet_diffusers_config(original_unet_config): "act_fn": "gelu", } - if original_unet_config.use_scale_shift_norm: + if original_unet_config["use_scale_shift_norm"]: config["resnet_time_scale_shift"] = "scale_shift" return config diff --git a/scripts/convert_original_audioldm2_to_diffusers.py b/scripts/convert_original_audioldm2_to_diffusers.py index f0b22cb4b4c7..8c9878526ac0 100644 --- a/scripts/convert_original_audioldm2_to_diffusers.py +++ b/scripts/convert_original_audioldm2_to_diffusers.py @@ -19,6 +19,7 @@ from typing import List, Union import torch +import yaml from transformers import ( AutoFeatureExtractor, AutoTokenizer, @@ -45,7 +46,7 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from diffusers.utils import is_omegaconf_available, is_safetensors_available +from diffusers.utils import is_safetensors_available from diffusers.utils.import_utils import BACKENDS_MAPPING @@ -212,41 +213,41 @@ def create_unet_diffusers_config(original_config, image_size: int): """ Creates a UNet config for diffusers based on the config of the original AudioLDM2 model. """ - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig + unet_params = original_config["model"]["params"]["unet_config"]["params"] + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - cross_attention_dim = list(unet_params.context_dim) if "context_dim" in unet_params else block_out_channels + cross_attention_dim = list(unet_params["context_dim"]) if "context_dim" in unet_params else block_out_channels if len(cross_attention_dim) > 1: # require two or more cross-attention layers per-block, each of different dimension cross_attention_dim = [cross_attention_dim for _ in range(len(block_out_channels))] config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, - "out_channels": unet_params.out_channels, + "in_channels": unet_params["in_channels"], + "out_channels": unet_params["out_channels"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": 
tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, - "transformer_layers_per_block": unet_params.transformer_depth, + "layers_per_block": unet_params["num_res_blocks"], + "transformer_layers_per_block": unet_params["transformer_depth"], "cross_attention_dim": tuple(cross_attention_dim), } @@ -259,24 +260,24 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): Creates a VAE config for diffusers based on the config of the original AudioLDM2 model. Compared to the original Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215 config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], "scaling_factor": float(scaling_factor), } return config @@ -285,9 +286,9 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular @@ -692,17 +693,17 @@ def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. 
""" - vocoder_params = original_config.model.params.vocoder_config.params + vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"] config = { - "model_in_dim": vocoder_params.num_mels, - "sampling_rate": vocoder_params.sampling_rate, - "upsample_initial_channel": vocoder_params.upsample_initial_channel, - "upsample_rates": list(vocoder_params.upsample_rates), - "upsample_kernel_sizes": list(vocoder_params.upsample_kernel_sizes), - "resblock_kernel_sizes": list(vocoder_params.resblock_kernel_sizes), + "model_in_dim": vocoder_params["num_mels"], + "sampling_rate": vocoder_params["sampling_rate"], + "upsample_initial_channel": vocoder_params["upsample_initial_channel"], + "upsample_rates": list(vocoder_params["upsample_rates"]), + "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]), + "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]), "resblock_dilation_sizes": [ - list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"] ], "normalize_before": False, } @@ -876,11 +877,6 @@ def load_pipeline_from_original_AudioLDM2_ckpt( return: An AudioLDM2Pipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: if not is_safetensors_available(): raise ValueError(BACKENDS_MAPPING["safetensors"][1]) @@ -903,9 +899,8 @@ def load_pipeline_from_original_AudioLDM2_ckpt( if original_config_file is None: original_config = DEFAULT_CONFIG - original_config = OmegaConf.create(original_config) else: - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if image_size is not None: original_config["model"]["params"]["unet_config"]["params"]["image_size"] = image_size @@ -926,9 +921,9 @@ def load_pipeline_from_original_AudioLDM2_ckpt( if prediction_type is None: prediction_type = "epsilon" - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end + num_train_timesteps = original_config["model"]["params"]["timesteps"] + beta_start = original_config["model"]["params"]["linear_start"] + beta_end = original_config["model"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, @@ -1026,9 +1021,9 @@ def load_pipeline_from_original_AudioLDM2_ckpt( # Convert the GPT2 encoder model: AudioLDM2 uses the same configuration as the original GPT2 base model gpt2_config = GPT2Config.from_pretrained("gpt2") gpt2_model = GPT2Model(gpt2_config) - gpt2_model.config.max_new_tokens = ( - original_config.model.params.cond_stage_config.crossattn_audiomae_generated.params.sequence_gen_length - ) + gpt2_model.config.max_new_tokens = original_config["model"]["params"]["cond_stage_config"][ + "crossattn_audiomae_generated" + ]["params"]["sequence_gen_length"] converted_gpt2_checkpoint = extract_sub_model(checkpoint, key_prefix="cond_stage_models.0.model.") gpt2_model.load_state_dict(converted_gpt2_checkpoint) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 940c74e9cd65..79e8fcc1afa7 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -18,6 +18,7 @@ import re import torch 
+import yaml from transformers import ( AutoTokenizer, ClapTextConfig, @@ -38,8 +39,6 @@ PNDMScheduler, UNet2DConditionModel, ) -from diffusers.utils import is_omegaconf_available -from diffusers.utils.import_utils import BACKENDS_MAPPING # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments @@ -215,45 +214,45 @@ def create_unet_diffusers_config(original_config, image_size: int): """ Creates a UNet config for diffusers based on the config of the original AudioLDM model. """ - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig + unet_params = original_config["model"]["params"]["unet_config"]["params"] + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) cross_attention_dim = ( - unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels + unet_params["cross_attention_dim"] if "cross_attention_dim" in unet_params else block_out_channels ) class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None projection_class_embeddings_input_dim = ( - unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None + unet_params["extra_film_condition_dim"] if "extra_film_condition_dim" in unet_params else None ) - class_embeddings_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None + class_embeddings_concat = unet_params["extra_film_use_concat"] if "extra_film_use_concat" in unet_params else None config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, - "out_channels": unet_params.out_channels, + "in_channels": unet_params["in_channels"], + "out_channels": unet_params["out_channels"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": cross_attention_dim, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, @@ -269,24 +268,24 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): Creates a VAE config for diffusers based on the config of the original AudioLDM model. 
Compared to the original Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215 config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], "scaling_factor": float(scaling_factor), } return config @@ -295,9 +294,9 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular @@ -668,17 +667,17 @@ def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. 
""" - vocoder_params = original_config.model.params.vocoder_config.params + vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"] config = { - "model_in_dim": vocoder_params.num_mels, - "sampling_rate": vocoder_params.sampling_rate, - "upsample_initial_channel": vocoder_params.upsample_initial_channel, - "upsample_rates": list(vocoder_params.upsample_rates), - "upsample_kernel_sizes": list(vocoder_params.upsample_kernel_sizes), - "resblock_kernel_sizes": list(vocoder_params.resblock_kernel_sizes), + "model_in_dim": vocoder_params["num_mels"], + "sampling_rate": vocoder_params["sampling_rate"], + "upsample_initial_channel": vocoder_params["upsample_initial_channel"], + "upsample_rates": list(vocoder_params["upsample_rates"]), + "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]), + "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]), "resblock_dilation_sizes": [ - list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"] ], "normalize_before": False, } @@ -818,11 +817,6 @@ def load_pipeline_from_original_audioldm_ckpt( return: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: from safetensors import safe_open @@ -842,9 +836,8 @@ def load_pipeline_from_original_audioldm_ckpt( if original_config_file is None: original_config = DEFAULT_CONFIG - original_config = OmegaConf.create(original_config) else: - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels @@ -868,9 +861,9 @@ def load_pipeline_from_original_audioldm_ckpt( if image_size is None: image_size = 512 - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end + num_train_timesteps = original_config["model"]["params"]["timesteps"] + beta_start = original_config["model"]["params"]["linear_start"] + beta_end = original_config["model"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, diff --git a/scripts/convert_original_musicldm_to_diffusers.py b/scripts/convert_original_musicldm_to_diffusers.py index bbc2fc96f89f..b7da888a068b 100644 --- a/scripts/convert_original_musicldm_to_diffusers.py +++ b/scripts/convert_original_musicldm_to_diffusers.py @@ -18,6 +18,7 @@ import re import torch +import yaml from transformers import ( AutoFeatureExtractor, AutoTokenizer, @@ -39,8 +40,6 @@ PNDMScheduler, UNet2DConditionModel, ) -from diffusers.utils import is_omegaconf_available -from diffusers.utils.import_utils import BACKENDS_MAPPING # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments @@ -212,45 +211,45 @@ def create_unet_diffusers_config(original_config, image_size: int): """ Creates a UNet config for diffusers based on the config of the original MusicLDM model. 
""" - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig + unet_params = original_config["model"]["params"]["unet_config"]["params"] + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) cross_attention_dim = ( - unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels + unet_params["cross_attention_dim"] if "cross_attention_dim" in unet_params else block_out_channels ) class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None projection_class_embeddings_input_dim = ( - unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None + unet_params["extra_film_condition_dim"] if "extra_film_condition_dim" in unet_params else None ) - class_embeddings_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None + class_embeddings_concat = unet_params["extra_film_use_concat"] if "extra_film_use_concat" in unet_params else None config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, - "out_channels": unet_params.out_channels, + "in_channels": unet_params["in_channels"], + "out_channels": unet_params["out_channels"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": cross_attention_dim, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, @@ -266,24 +265,24 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): Creates a VAE config for diffusers based on the config of the original MusicLDM model. Compared to the original Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. 
""" - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215 config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], "scaling_factor": float(scaling_factor), } return config @@ -292,9 +291,9 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular @@ -674,17 +673,17 @@ def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. 
""" - vocoder_params = original_config.model.params.vocoder_config.params + vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"] config = { - "model_in_dim": vocoder_params.num_mels, - "sampling_rate": vocoder_params.sampling_rate, - "upsample_initial_channel": vocoder_params.upsample_initial_channel, - "upsample_rates": list(vocoder_params.upsample_rates), - "upsample_kernel_sizes": list(vocoder_params.upsample_kernel_sizes), - "resblock_kernel_sizes": list(vocoder_params.resblock_kernel_sizes), + "model_in_dim": vocoder_params["num_mels"], + "sampling_rate": vocoder_params["sampling_rate"], + "upsample_initial_channel": vocoder_params["upsample_initial_channel"], + "upsample_rates": list(vocoder_params["upsample_rates"]), + "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]), + "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]), "resblock_dilation_sizes": [ - list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"] ], "normalize_before": False, } @@ -823,12 +822,6 @@ def load_pipeline_from_original_MusicLDM_ckpt( If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. return: An MusicLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ - - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: from safetensors import safe_open @@ -848,9 +841,8 @@ def load_pipeline_from_original_MusicLDM_ckpt( if original_config_file is None: original_config = DEFAULT_CONFIG - original_config = OmegaConf.create(original_config) else: - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels @@ -874,9 +866,9 @@ def load_pipeline_from_original_MusicLDM_ckpt( if image_size is None: image_size = 512 - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end + num_train_timesteps = original_config["model"]["params"]["timesteps"] + beta_start = original_config["model"]["params"]["linear_start"] + beta_end = original_config["model"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, diff --git a/scripts/convert_vae_pt_to_diffusers.py b/scripts/convert_vae_pt_to_diffusers.py index a8ba48bc001c..a4f967c94fa6 100644 --- a/scripts/convert_vae_pt_to_diffusers.py +++ b/scripts/convert_vae_pt_to_diffusers.py @@ -3,7 +3,7 @@ import requests import torch -from omegaconf import OmegaConf +import yaml from diffusers import AutoencoderKL from diffusers.pipelines.stable_diffusion.convert_from_ckpt import ( @@ -126,7 +126,7 @@ def vae_pt_to_vae_diffuser( ) io_obj = io.BytesIO(r.content) - original_config = OmegaConf.load(io_obj) + original_config = yaml.safe_load(io_obj) image_size = 512 device = "cuda" if torch.cuda.is_available() else "cpu" if checkpoint_path.endswith("safetensors"): diff --git a/scripts/convert_vq_diffusion_to_diffusers.py b/scripts/convert_vq_diffusion_to_diffusers.py index 58ed2d93d5df..7da6b4094986 100644 --- a/scripts/convert_vq_diffusion_to_diffusers.py +++ b/scripts/convert_vq_diffusion_to_diffusers.py @@ -45,51 +45,45 @@ from 
diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the VQ Diffusion checkpoints. Please install it with `pip install" - " OmegaConf`." - ) - # vqvae model PORTED_VQVAES = ["image_synthesis.modeling.codecs.image_codec.patch_vqgan.PatchVQGAN"] def vqvae_model_from_original_config(original_config): - assert original_config.target in PORTED_VQVAES, f"{original_config.target} has not yet been ported to diffusers." + assert ( + original_config["target"] in PORTED_VQVAES + ), f"{original_config['target']} has not yet been ported to diffusers." - original_config = original_config.params + original_config = original_config["params"] - original_encoder_config = original_config.encoder_config.params - original_decoder_config = original_config.decoder_config.params + original_encoder_config = original_config["encoder_config"]["params"] + original_decoder_config = original_config["decoder_config"]["params"] - in_channels = original_encoder_config.in_channels - out_channels = original_decoder_config.out_ch + in_channels = original_encoder_config["in_channels"] + out_channels = original_decoder_config["out_ch"] down_block_types = get_down_block_types(original_encoder_config) up_block_types = get_up_block_types(original_decoder_config) - assert original_encoder_config.ch == original_decoder_config.ch - assert original_encoder_config.ch_mult == original_decoder_config.ch_mult + assert original_encoder_config["ch"] == original_decoder_config["ch"] + assert original_encoder_config["ch_mult"] == original_decoder_config["ch_mult"] block_out_channels = tuple( - [original_encoder_config.ch * a_ch_mult for a_ch_mult in original_encoder_config.ch_mult] + [original_encoder_config["ch"] * a_ch_mult for a_ch_mult in original_encoder_config["ch_mult"]] ) - assert original_encoder_config.num_res_blocks == original_decoder_config.num_res_blocks - layers_per_block = original_encoder_config.num_res_blocks + assert original_encoder_config["num_res_blocks"] == original_decoder_config["num_res_blocks"] + layers_per_block = original_encoder_config["num_res_blocks"] - assert original_encoder_config.z_channels == original_decoder_config.z_channels - latent_channels = original_encoder_config.z_channels + assert original_encoder_config["z_channels"] == original_decoder_config["z_channels"] + latent_channels = original_encoder_config["z_channels"] - num_vq_embeddings = original_config.n_embed + num_vq_embeddings = original_config["n_embed"] # Hard coded value for ResnetBlock.GoupNorm(num_groups) in VQ-diffusion norm_num_groups = 32 - e_dim = original_config.embed_dim + e_dim = original_config["embed_dim"] model = VQModel( in_channels=in_channels, @@ -108,9 +102,9 @@ def vqvae_model_from_original_config(original_config): def get_down_block_types(original_encoder_config): - attn_resolutions = coerce_attn_resolutions(original_encoder_config.attn_resolutions) - num_resolutions = len(original_encoder_config.ch_mult) - resolution = coerce_resolution(original_encoder_config.resolution) + attn_resolutions = coerce_attn_resolutions(original_encoder_config["attn_resolutions"]) + num_resolutions = len(original_encoder_config["ch_mult"]) + resolution = coerce_resolution(original_encoder_config["resolution"]) curr_res = resolution down_block_types = [] @@ -129,9 +123,9 @@ def get_down_block_types(original_encoder_config): def get_up_block_types(original_decoder_config): - 
attn_resolutions = coerce_attn_resolutions(original_decoder_config.attn_resolutions) - num_resolutions = len(original_decoder_config.ch_mult) - resolution = coerce_resolution(original_decoder_config.resolution) + attn_resolutions = coerce_attn_resolutions(original_decoder_config["attn_resolutions"]) + num_resolutions = len(original_decoder_config["ch_mult"]) + resolution = coerce_resolution(original_decoder_config["resolution"]) curr_res = [r // 2 ** (num_resolutions - 1) for r in resolution] up_block_types = [] @@ -150,7 +144,7 @@ def get_up_block_types(original_decoder_config): def coerce_attn_resolutions(attn_resolutions): - attn_resolutions = OmegaConf.to_object(attn_resolutions) + attn_resolutions = list(attn_resolutions) attn_resolutions_ = [] for ar in attn_resolutions: if isinstance(ar, (list, tuple)): @@ -161,7 +155,6 @@ def coerce_attn_resolutions(attn_resolutions): def coerce_resolution(resolution): - resolution = OmegaConf.to_object(resolution) if isinstance(resolution, int): resolution = [resolution, resolution] # H, W elif isinstance(resolution, (tuple, list)): @@ -472,18 +465,18 @@ def transformer_model_from_original_config( original_diffusion_config, original_transformer_config, original_content_embedding_config ): assert ( - original_diffusion_config.target in PORTED_DIFFUSIONS - ), f"{original_diffusion_config.target} has not yet been ported to diffusers." + original_diffusion_config["target"] in PORTED_DIFFUSIONS + ), f"{original_diffusion_config['target']} has not yet been ported to diffusers." assert ( - original_transformer_config.target in PORTED_TRANSFORMERS - ), f"{original_transformer_config.target} has not yet been ported to diffusers." + original_transformer_config["target"] in PORTED_TRANSFORMERS + ), f"{original_transformer_config['target']} has not yet been ported to diffusers." assert ( - original_content_embedding_config.target in PORTED_CONTENT_EMBEDDINGS - ), f"{original_content_embedding_config.target} has not yet been ported to diffusers." + original_content_embedding_config["target"] in PORTED_CONTENT_EMBEDDINGS + ), f"{original_content_embedding_config['target']} has not yet been ported to diffusers." - original_diffusion_config = original_diffusion_config.params - original_transformer_config = original_transformer_config.params - original_content_embedding_config = original_content_embedding_config.params + original_diffusion_config = original_diffusion_config["params"] + original_transformer_config = original_transformer_config["params"] + original_content_embedding_config = original_content_embedding_config["params"] inner_dim = original_transformer_config["n_embd"] @@ -689,13 +682,11 @@ def transformer_feedforward_to_diffusers_checkpoint(checkpoint, *, diffusers_fee def read_config_file(filename): # The yaml file contains annotations that certain values should - # loaded as tuples. By default, OmegaConf will panic when reading - # these. Instead, we can manually read the yaml with the FullLoader and then - # construct the OmegaConf object. + # loaded as tuples. 
with open(filename) as f: original_config = yaml.load(f, FullLoader) - return OmegaConf.create(original_config) + return original_config # We take separate arguments for the vqvae because the ITHQ vqvae config file @@ -792,9 +783,9 @@ def read_config_file(filename): - original_config = read_config_file(args.original_config_file).model + original_config = read_config_file(args.original_config_file)["model"] - diffusion_config = original_config.params.diffusion_config - transformer_config = original_config.params.diffusion_config.params.transformer_config - content_embedding_config = original_config.params.diffusion_config.params.content_emb_config + diffusion_config = original_config["params"]["diffusion_config"] + transformer_config = original_config["params"]["diffusion_config"]["params"]["transformer_config"] + content_embedding_config = original_config["params"]["diffusion_config"]["params"]["content_emb_config"] pre_checkpoint = torch.load(args.checkpoint_path, map_location=checkpoint_map_location) @@ -831,7 +822,7 @@ def read_config_file(filename): # The learned embeddings are stored on the transformer in the original VQ-diffusion. We store them on a separate # model, so we pull them off the checkpoint before the checkpoint is deleted. - learnable_classifier_free_sampling_embeddings = diffusion_config.params.learnable_cf + learnable_classifier_free_sampling_embeddings = diffusion_config["params"]["learnable_cf"] if learnable_classifier_free_sampling_embeddings: learned_classifier_free_sampling_embeddings_embeddings = checkpoint["transformer.empty_text_embed"] diff --git a/scripts/convert_zero123_to_diffusers.py b/scripts/convert_zero123_to_diffusers.py index bdcb2cd2e113..f016312b8bb6 100644 --- a/scripts/convert_zero123_to_diffusers.py +++ b/scripts/convert_zero123_to_diffusers.py @@ -14,6 +14,7 @@ import argparse import torch +import yaml from accelerate import init_empty_weights from accelerate.utils import set_module_tensor_to_device from pipeline_zero1to3 import CCProjection, Zero1to3StableDiffusionPipeline @@ -38,51 +39,54 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa Creates a config for the diffusers based on the config of the LDM model. 
""" if controlnet: - unet_params = original_config.model.params.control_stage_config.params + unet_params = original_config["model"]["params"]["control_stage_config"]["params"] else: - if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None: - unet_params = original_config.model.params.unet_config.params + if ( + "unet_config" in original_config["model"]["params"] + and original_config["model"]["params"]["unet_config"] is not None + ): + unet_params = original_config["model"]["params"]["unet_config"]["params"] else: - unet_params = original_config.model.params.network_config.params + unet_params = original_config["model"]["params"]["network_config"]["params"] - vae_params = original_config.model.params.first_stage_config.params.ddconfig + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - if unet_params.transformer_depth is not None: + if unet_params["transformer_depth"] is not None: transformer_layers_per_block = ( - unet_params.transformer_depth - if isinstance(unet_params.transformer_depth, int) - else list(unet_params.transformer_depth) + unet_params["transformer_depth"] + if isinstance(unet_params["transformer_depth"], int) + else list(unet_params["transformer_depth"]) ) else: transformer_layers_per_block = 1 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - head_dim = unet_params.num_heads if "num_heads" in unet_params else None + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: - head_dim_mult = unet_params.model_channels // unet_params.num_head_channels - head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)] + head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"] + head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])] class_embed_type = None addition_embed_type = None @@ -90,13 +94,15 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa projection_class_embeddings_input_dim = None context_dim = None - if unet_params.context_dim is not None: + if unet_params["context_dim"] is not None: context_dim = ( - unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0] + 
unet_params["context_dim"] + if isinstance(unet_params["context_dim"], int) + else unet_params["context_dim"][0] ) if "num_classes" in unet_params: - if unet_params.num_classes == "sequential": + if unet_params["num_classes"] == "sequential": if context_dim in [2048, 1280]: # SDXL addition_embed_type = "text_time" @@ -104,16 +110,16 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa else: class_embed_type = "projection" assert "adm_in_channels" in unet_params - projection_class_embeddings_input_dim = unet_params.adm_in_channels + projection_class_embeddings_input_dim = unet_params["adm_in_channels"] else: - raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params["num_classes"]}") config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, + "in_channels": unet_params["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": context_dim, "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, @@ -125,9 +131,9 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa } if controlnet: - config["conditioning_channels"] = unet_params.hint_channels + config["conditioning_channels"] = unet_params["hint_channels"] else: - config["out_channels"] = unet_params.out_channels + config["out_channels"] = unet_params["out_channels"] config["up_block_types"] = tuple(up_block_types) return config @@ -487,22 +493,22 @@ def create_vae_diffusers_config(original_config, image_size: int): """ Creates a config for the diffusers based on the config of the LDM model. 
""" - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], } return config @@ -679,18 +685,16 @@ def convert_from_original_zero123_ckpt(checkpoint_path, original_config_file, ex del ckpt torch.cuda.empty_cache() - from omegaconf import OmegaConf - - original_config = OmegaConf.load(original_config_file) - original_config.model.params.cond_stage_config.target.split(".")[-1] + original_config = yaml.safe_load(original_config_file) + original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1] num_in_channels = 8 original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels prediction_type = "epsilon" image_size = 256 - num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000 + num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", None) or 1000 - beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02 - beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085 + beta_start = getattr(original_config["model"]["params"], "linear_start", None) or 0.02 + beta_end = getattr(original_config["model"]["params"], "linear_end", None) or 0.085 scheduler = DDIMScheduler( beta_end=beta_end, beta_schedule="scaled_linear", @@ -721,10 +725,10 @@ def convert_from_original_zero123_ckpt(checkpoint_path, original_config_file, ex if ( "model" in original_config - and "params" in original_config.model - and "scale_factor" in original_config.model.params + and "params" in original_config["model"] + and "scale_factor" in original_config["model"]["params"] ): - vae_scaling_factor = original_config.model.params.scale_factor + vae_scaling_factor = original_config["model"]["params"]["scale_factor"] else: vae_scaling_factor = 0.18215 # default SD scaling factor diff --git a/setup.py b/setup.py index 177c918d38ca..bb0f37817097 100644 --- a/setup.py +++ b/setup.py @@ -110,7 +110,6 @@ "note_seq", "librosa", "numpy", - "omegaconf", "parameterized", "peft>=0.6.0", "protobuf>=3.20.3,<4", @@ -213,7 +212,6 @@ def run(self): "invisible-watermark", "k-diffusion", "librosa", - "omegaconf", "parameterized", "pytest", "pytest-timeout", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 03e8fe7a0a00..e92a486bffc1 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,7 +22,6 @@ "note_seq": "note_seq", 
"librosa": "librosa", "numpy": "numpy", - "omegaconf": "omegaconf", "parameterized": "parameterized", "peft": "peft>=0.6.0", "protobuf": "protobuf>=3.20.3,<4", diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index 742984449e4f..4086b1a2a8e8 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -17,17 +17,11 @@ import requests import torch +import yaml from huggingface_hub import hf_hub_download from huggingface_hub.utils import validate_hf_hub_args -from ..utils import ( - deprecate, - is_accelerate_available, - is_omegaconf_available, - is_transformers_available, - logging, -) -from ..utils.import_utils import BACKENDS_MAPPING +from ..utils import deprecate, is_accelerate_available, is_transformers_available, logging if is_transformers_available(): @@ -370,11 +364,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): model = AutoencoderKL.from_single_file(url) ``` """ - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - from ..models import AutoencoderKL # import here to avoid circular dependency @@ -452,7 +441,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" config_file = BytesIO(requests.get(config_url).content) - original_config = OmegaConf.load(config_file) + original_config = yaml.safe_load(config_file) # default to sd-v1-5 image_size = image_size or 512 @@ -463,10 +452,10 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): if scaling_factor is None: if ( "model" in original_config - and "params" in original_config.model - and "scale_factor" in original_config.model.params + and "params" in original_config["model"] + and "scale_factor" in original_config["model"]["params"] ): - vae_scaling_factor = original_config.model.params.scale_factor + vae_scaling_factor = original_config["model"]["params"]["scale_factor"] else: vae_scaling_factor = 0.18215 # default SD scaling factor diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 5aa23252b86a..b871f7bbe537 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -21,6 +21,7 @@ import requests import torch +import yaml from transformers import ( AutoFeatureExtractor, BertTokenizerFast, @@ -50,8 +51,7 @@ PNDMScheduler, UnCLIPScheduler, ) -from ...utils import is_accelerate_available, is_omegaconf_available, logging -from ...utils.import_utils import BACKENDS_MAPPING +from ...utils import is_accelerate_available, logging from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from ..paint_by_example import PaintByExampleImageEncoder from ..pipeline_utils import DiffusionPipeline @@ -237,51 +237,54 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa Creates a config for the diffusers based on the config of the LDM model. 
""" if controlnet: - unet_params = original_config.model.params.control_stage_config.params + unet_params = original_config["model"]["params"]["control_stage_config"]["params"] else: - if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None: - unet_params = original_config.model.params.unet_config.params + if ( + "unet_config" in original_config["model"]["params"] + and original_config["model"]["params"]["unet_config"] is not None + ): + unet_params = original_config["model"]["params"]["unet_config"]["params"] else: - unet_params = original_config.model.params.network_config.params + unet_params = original_config["model"]["params"]["network_config"]["params"] - vae_params = original_config.model.params.first_stage_config.params.ddconfig + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - if unet_params.transformer_depth is not None: + if unet_params["transformer_depth"] is not None: transformer_layers_per_block = ( - unet_params.transformer_depth - if isinstance(unet_params.transformer_depth, int) - else list(unet_params.transformer_depth) + unet_params["transformer_depth"] + if isinstance(unet_params["transformer_depth"], int) + else list(unet_params["transformer_depth"]) ) else: transformer_layers_per_block = 1 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - head_dim = unet_params.num_heads if "num_heads" in unet_params else None + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: - head_dim_mult = unet_params.model_channels // unet_params.num_head_channels - head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)] + head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"] + head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])] class_embed_type = None addition_embed_type = None @@ -289,13 +292,15 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa projection_class_embeddings_input_dim = None context_dim = None - if unet_params.context_dim is not None: + if unet_params["context_dim"] is not None: context_dim = ( - unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0] + 
unet_params["context_dim"] + if isinstance(unet_params["context_dim"], int) + else unet_params["context_dim"][0] ) if "num_classes" in unet_params: - if unet_params.num_classes == "sequential": + if unet_params["num_classes"] == "sequential": if context_dim in [2048, 1280]: # SDXL addition_embed_type = "text_time" @@ -303,14 +308,14 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa else: class_embed_type = "projection" assert "adm_in_channels" in unet_params - projection_class_embeddings_input_dim = unet_params.adm_in_channels + projection_class_embeddings_input_dim = unet_params["adm_in_channels"] config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, + "in_channels": unet_params["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": context_dim, "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, @@ -322,15 +327,15 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa } if "disable_self_attentions" in unet_params: - config["only_cross_attention"] = unet_params.disable_self_attentions + config["only_cross_attention"] = unet_params["disable_self_attentions"] - if "num_classes" in unet_params and isinstance(unet_params.num_classes, int): - config["num_class_embeds"] = unet_params.num_classes + if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int): + config["num_class_embeds"] = unet_params["num_classes"] if controlnet: - config["conditioning_channels"] = unet_params.hint_channels + config["conditioning_channels"] = unet_params["hint_channels"] else: - config["out_channels"] = unet_params.out_channels + config["out_channels"] = unet_params["out_channels"] config["up_block_types"] = tuple(up_block_types) return config @@ -340,38 +345,38 @@ def create_vae_diffusers_config(original_config, image_size: int): """ Creates a config for the diffusers based on the config of the LDM model. 
""" - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], } return config def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular def create_ldm_bert_config(original_config): - bert_params = original_config.model.params.cond_stage_config.params + bert_params = original_config["model"]["params"]["cond_stage_config"]["params"] config = LDMBertConfig( d_model=bert_params.n_embed, encoder_layers=bert_params.n_layer, @@ -1006,9 +1011,9 @@ def stable_unclip_image_encoder(original_config, local_files_only=False): encoders. """ - image_embedder_config = original_config.model.params.embedder_config + image_embedder_config = original_config["model"]["params"]["embedder_config"] - sd_clip_image_embedder_class = image_embedder_config.target + sd_clip_image_embedder_class = image_embedder_config["target"] sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1] if sd_clip_image_embedder_class == "ClipImageEmbedder": @@ -1047,8 +1052,8 @@ def stable_unclip_image_noising_components( If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided. 
""" - noise_aug_config = original_config.model.params.noise_aug_config - noise_aug_class = noise_aug_config.target + noise_aug_config = original_config["model"]["params"]["noise_aug_config"] + noise_aug_class = noise_aug_config["target"] noise_aug_class = noise_aug_class.split(".")[-1] if noise_aug_class == "CLIPEmbeddingNoiseAugmentation": @@ -1245,11 +1250,6 @@ def download_from_original_stable_diffusion_ckpt( if prediction_type == "v-prediction": prediction_type = "v_prediction" - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if isinstance(checkpoint_path_or_dict, str): if from_safetensors: from safetensors.torch import load_file as safe_load @@ -1318,18 +1318,18 @@ def download_from_original_stable_diffusion_ckpt( if config_url is not None: original_config_file = BytesIO(requests.get(config_url).content) - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) # Convert the text model. if ( model_type is None - and "cond_stage_config" in original_config.model.params - and original_config.model.params.cond_stage_config is not None + and "cond_stage_config" in original_config["model"]["params"] + and original_config["model"]["params"]["cond_stage_config"] is not None ): - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1] logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") - elif model_type is None and original_config.model.params.network_config is not None: - if original_config.model.params.network_config.params.context_dim == 2048: + elif model_type is None and original_config["model"]["params"]["network_config"] is not None: + if original_config["model"]["params"]["network_config"]["params"]["context_dim"] == 2048: model_type = "SDXL" else: model_type = "SDXL-Refiner" @@ -1354,7 +1354,7 @@ def download_from_original_stable_diffusion_ckpt( elif num_in_channels is None: num_in_channels = 4 - if "unet_config" in original_config.model.params: + if "unet_config" in original_config["model"]["params"]: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels if ( @@ -1375,13 +1375,16 @@ def download_from_original_stable_diffusion_ckpt( if image_size is None: image_size = 512 - if controlnet is None and "control_stage_config" in original_config.model.params: + if controlnet is None and "control_stage_config" in original_config["model"]["params"]: path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else "" controlnet = convert_controlnet_checkpoint( checkpoint, original_config, path, image_size, upcast_attention, extract_ema ) - num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000 + if "timesteps" in original_config["model"]["params"]: + num_train_timesteps = original_config["model"]["params"]["timesteps"] + else: + num_train_timesteps = 1000 if model_type in ["SDXL", "SDXL-Refiner"]: scheduler_dict = { @@ -1400,8 +1403,15 @@ def download_from_original_stable_diffusion_ckpt( scheduler = EulerDiscreteScheduler.from_config(scheduler_dict) scheduler_type = "euler" else: - beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02 - beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085 + if "linear_start" in original_config["model"]["params"]: + beta_start = 
original_config["model"]["params"]["linear_start"] + else: + beta_start = 0.02 + + if "linear_end" in original_config["model"]["params"]: + beta_end = original_config["model"]["params"]["linear_end"] + else: + beta_end = 0.085 scheduler = DDIMScheduler( beta_end=beta_end, beta_schedule="scaled_linear", @@ -1435,7 +1445,7 @@ def download_from_original_stable_diffusion_ckpt( raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") if pipeline_class == StableDiffusionUpscalePipeline: - image_size = original_config.model.params.unet_config.params.image_size + image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"] # Convert the UNet2DConditionModel model. unet_config = create_unet_diffusers_config(original_config, image_size=image_size) @@ -1464,10 +1474,10 @@ def download_from_original_stable_diffusion_ckpt( if ( "model" in original_config - and "params" in original_config.model - and "scale_factor" in original_config.model.params + and "params" in original_config["model"] + and "scale_factor" in original_config["model"]["params"] ): - vae_scaling_factor = original_config.model.params.scale_factor + vae_scaling_factor = original_config["model"]["params"]["scale_factor"] else: vae_scaling_factor = 0.18215 # default SD scaling factor @@ -1803,11 +1813,6 @@ def download_controlnet_from_original_ckpt( use_linear_projection: Optional[bool] = None, cross_attention_dim: Optional[bool] = None, ) -> DiffusionPipeline: - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: from safetensors import safe_open @@ -1827,12 +1832,12 @@ def download_controlnet_from_original_ckpt( while "state_dict" in checkpoint: checkpoint = checkpoint["state_dict"] - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - if "control_stage_config" not in original_config.model.params: + if "control_stage_config" not in original_config["model"]["params"]: raise ValueError("`control_stage_config` not present in original config") controlnet = convert_controlnet_checkpoint( diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index adc1cb8a3dfb..468476e0c748 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -66,7 +66,6 @@ is_k_diffusion_version, is_librosa_available, is_note_seq_available, - is_omegaconf_available, is_onnx_available, is_peft_available, is_scipy_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index b3278af2f6a5..ac1565023b09 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -223,12 +223,6 @@ except importlib_metadata.PackageNotFoundError: _wandb_available = False -_omegaconf_available = importlib.util.find_spec("omegaconf") is not None -try: - _omegaconf_version = importlib_metadata.version("omegaconf") - logger.debug(f"Successfully imported omegaconf version {_omegaconf_version}") -except importlib_metadata.PackageNotFoundError: - _omegaconf_available = False _tensorboard_available = importlib.util.find_spec("tensorboard") try: @@ -345,10 +339,6 @@ def is_wandb_available(): return _wandb_available -def is_omegaconf_available(): - return _omegaconf_available - - def is_tensorboard_available(): return _tensorboard_available @@ -449,12 +439,6 @@ 
def is_peft_available(): install wandb` """ -# docstyle-ignore -OMEGACONF_IMPORT_ERROR = """ -{0} requires the omegaconf library but it was not found in your environment. You can install it with pip: `pip -install omegaconf` -""" - # docstyle-ignore TENSORBOARD_IMPORT_ERROR = """ {0} requires the tensorboard library but it was not found in your environment. You can install it with pip: `pip @@ -506,7 +490,6 @@ def is_peft_available(): ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), - ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
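Editorial note (not part of the patch): every hunk above applies the same mechanical rewrite, swapping OmegaConf's attribute-style access for plain-dict indexing on the result of yaml.safe_load. A minimal sketch of the pattern, assuming a hypothetical config.yaml laid out like the LDM configs referenced throughout (model.params.unet_config.params, and so on):

    import yaml

    # Load the original LDM-style config as plain Python dicts and lists.
    with open("config.yaml") as f:  # hypothetical path
        original_config = yaml.safe_load(f)

    # OmegaConf (before): original_config.model.params.unet_config.params
    # plain dict (after):
    unet_params = original_config["model"]["params"]["unet_config"]["params"]

    # Optional keys: getattr(obj, "key", default) only works on attribute-style
    # configs; on a plain dict it always returns the default, so use .get instead.
    num_train_timesteps = original_config["model"]["params"].get("timesteps", None) or 1000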
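One caveat worth flagging as a possible follow-up rather than a change made here: OmegaConf.load accepts a filesystem path, while yaml.safe_load expects YAML text or an open stream. Call sites that may still receive a local path (for example the original_config_file arguments in convert_zero123_to_diffusers.py and download_controlnet_from_original_ckpt) would then parse the path string itself as a YAML scalar instead of reading the file. A defensive sketch, assuming the argument may be either a path or a file-like object such as the BytesIO built from requests.get(config_url).content; load_original_config is a hypothetical helper, not part of this patch:

    import yaml

    def load_original_config(original_config_file):
        # Accept a filesystem path or an already-open stream / bytes buffer.
        if isinstance(original_config_file, str):
            with open(original_config_file) as f:
                return yaml.safe_load(f)
        return yaml.safe_load(original_config_file)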