55 changes: 53 additions & 2 deletions docs/source/en/using-diffusers/loading_adapters.md
@@ -485,7 +485,7 @@ image.save("sdxl_t2i.png")
</div>
</div>

-You can use the IP-Adapter face model to apply specific faces to your images. It is an effective way to maintain consistent characters in your image generations.
+You can use the IP-Adapter face models to apply specific faces to your images. It is an effective way to maintain consistent characters in your image generations.
Weights are loaded with the same method used for the other IP-Adapters.

```python
@@ -495,7 +495,7 @@ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-a

<Tip>

-It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face model.
+It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face models.


</Tip>
@@ -549,6 +549,57 @@ image = pipeline(
</div>
</div>

IP-Adapter FaceID is an experimental IP-Adapter model that uses face embeddings generated by `insightface` instead of CLIP image embeddings, so no image encoder needs to be loaded.
You must pass the image embedding tensor as `image_embeds` to the `StableDiffusionPipeline` instead of `ip_adapter_image`.

```python
import cv2
from insightface.app import FaceAnalysis
import numpy as np
from PIL import Image
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image

noise_scheduler = DDIMScheduler(
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
steps_offset=1
)

pipeline = StableDiffusionPipeline.from_pretrained(
"SG161222/Realistic_Vision_V4.0_noVAE",
torch_dtype=torch.float16,
scheduler=noise_scheduler
).to("cuda")

generator = torch.Generator(device="cpu").manual_seed(42)
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")

# Extract image embeddings
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

# insightface expects a BGR image (OpenCV convention), while PIL loads RGB
face_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
faces = app.get(face_image)
image_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)

# Load IP Adapter weights and run inference
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", weight_name="ip-adapter-faceid_sd15.bin")
pipeline.set_ip_adapter_scale(0.7)
image = pipeline(
    prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
    image_embeds=image_embeds,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=20, num_images_per_prompt=1, width=512, height=704,
    generator=generator
).images[0]
```

### LCM-LoRA

You can use IP-Adapter with LCM-LoRA to achieve "instant fine-tuning" with custom images. Note that you need to load the IP-Adapter weights before loading the LCM-LoRA weights, as sketched below.
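The required order, shown as a minimal sketch (checkpoint names are illustrative):

```python
# Sketch only: IP-Adapter weights must be loaded before the LCM-LoRA weights.
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipeline.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
```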
31 changes: 21 additions & 10 deletions src/diffusers/loaders/ip_adapter.py
@@ -34,6 +34,8 @@
from ..models.attention_processor import (
IPAdapterAttnProcessor,
IPAdapterAttnProcessor2_0,
+    LoRAIPAdapterAttnProcessor,
+    LoRAIPAdapterAttnProcessor2_0,
)

logger = logging.get_logger(__name__)
@@ -46,7 +48,6 @@ class IPAdapterMixin:
def load_ip_adapter(
self,
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
-        subfolder: str,
Review comment (Contributor): This is a breaking change, can we leave as it was here?
Reply (Author): done as you suggested!

weight_name: str,
**kwargs,
):
@@ -95,6 +96,7 @@ def load_ip_adapter(
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)

user_agent = {
"file_type": "attn_procs_weights",
@@ -135,14 +137,15 @@
        # load CLIP image encoder here if it has not been registered to the pipeline yet
if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
if not isinstance(pretrained_model_name_or_path_or_dict, dict):
logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
pretrained_model_name_or_path_or_dict,
subfolder=os.path.join(subfolder, "image_encoder"),
).to(self.device, dtype=self.dtype)
self.image_encoder = image_encoder
else:
raise ValueError("`image_encoder` cannot be None when using IP Adapters.")
try:
logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
pretrained_model_name_or_path_or_dict,
subfolder=os.path.join(subfolder, "image_encoder"),
).to(self.device, dtype=self.dtype)
self.image_encoder = image_encoder
except TypeError:
print("IPAdapter: `subfolder` not found, `image_encoder` is None, use image_embeds.")
Review comment (Contributor): Let's try to not use try...except here please
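One way to avoid the `try...except`, as a hypothetical sketch (not code from this PR): load the encoder only when `subfolder` is set, since `subfolder` defaults to `None` for FaceID checkpoints and `os.path.join(None, ...)` is what raises the `TypeError`:

```python
# Hypothetical alternative without try/except: FaceID repos ship no image
# encoder, so skip the load explicitly when no subfolder was provided.
if subfolder is not None:
    logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        pretrained_model_name_or_path_or_dict,
        subfolder=os.path.join(subfolder, "image_encoder"),
    ).to(self.device, dtype=self.dtype)
    self.image_encoder = image_encoder
else:
    logger.info("`subfolder` not set; `image_encoder` stays None, pass `image_embeds` at inference.")
```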


# create feature extractor if it has not been registered to the pipeline yet
if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
@@ -155,5 +158,13 @@ def load_ip_adapter(
def set_ip_adapter_scale(self, scale):
unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
for attn_processor in unet.attn_processors.values():
-            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+            if isinstance(
+                attn_processor,
+                (
+                    IPAdapterAttnProcessor,
+                    IPAdapterAttnProcessor2_0,
+                    LoRAIPAdapterAttnProcessor,
+                    LoRAIPAdapterAttnProcessor2_0,
+                ),
+            ):
attn_processor.scale = scale
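For context, a brief usage sketch of the path this hunk extends (checkpoint name and scale taken from the docs example above):

```python
# FaceID installs LoRA-backed IP-Adapter attention processors; with the
# widened isinstance check, the same call now adjusts their scale too.
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", weight_name="ip-adapter-faceid_sd15.bin")
pipeline.set_ip_adapter_scale(0.7)
```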
80 changes: 62 additions & 18 deletions src/diffusers/loaders/unet.py
@@ -707,13 +707,20 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict):
diffusers_name = key.replace("proj", "image_embeds")
updated_state_dict[diffusers_name] = value

elif "proj.3.weight" in state_dict:
elif "proj.0.weight" in state_dict:
Review comment (Member): Is using this key a better option? Can we use a more resilient condition here to avoid side-effects?
Reply (Author): I changed to `proj.0.weight` because both IPAdapter Full and FaceID state dicts have it, while the IPAdapter Full `proj.3.weight` key is named `norm.weight` in the FaceID model.
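The layout difference the author describes, sketched for reference (key lists are illustrative, based only on the discussion above):

```python
# Both projection variants share "proj.0.weight"; only the norm layer differs.
FULL_PROJ_KEYS   = ["proj.0.weight", "proj.2.weight", "proj.3.weight"]  # IP-Adapter Full
FACEID_PROJ_KEYS = ["proj.0.weight", "proj.2.weight", "norm.weight"]    # IP-Adapter FaceID
```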

# IP-Adapter Full
-            clip_embeddings_dim = state_dict["proj.0.weight"].shape[0]
-            cross_attention_dim = state_dict["proj.3.weight"].shape[0]
+            clip_embeddings_dim_in = state_dict["proj.0.weight"].shape[1]
+            clip_embeddings_dim_out = state_dict["proj.0.weight"].shape[0]
+            multiplier = clip_embeddings_dim_out // clip_embeddings_dim_in
+            norm_layer = "proj.3.weight" if "proj.3.weight" in state_dict else "norm.weight"
+            cross_attention_dim = state_dict[norm_layer].shape[0]
+            num_tokens = state_dict["proj.2.weight"].shape[0] // cross_attention_dim

image_projection = IPAdapterFullImageProjection(
-                cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim
+                cross_attention_dim=cross_attention_dim,
+                image_embed_dim=clip_embeddings_dim_in,
+                mult=multiplier,
+                num_tokens=num_tokens,
)

for key, value in state_dict.items():
@@ -767,14 +774,24 @@ def _load_ip_adapter_weights(self, state_dict):
AttnProcessor2_0,
IPAdapterAttnProcessor,
IPAdapterAttnProcessor2_0,
+        LoRAAttnProcessor,
+        LoRAAttnProcessor2_0,
+        LoRAIPAdapterAttnProcessor,
+        LoRAIPAdapterAttnProcessor2_0,
)

+        use_lora = False
if "proj.weight" in state_dict["image_proj"]:
# IP-Adapter
num_image_text_embeds = 4
elif "proj.3.weight" in state_dict["image_proj"]:
elif "proj.0.weight" in state_dict["image_proj"]:
# IP-Adapter Full Face
num_image_text_embeds = 257 # 256 CLIP tokens + 1 CLS token
for k in state_dict["ip_adapter"].keys():
if "lora" in k:
num_image_text_embeds = 4
use_lora = True
break
else:
# IP-Adapter Plus
num_image_text_embeds = state_dict["image_proj"]["latents"].shape[1]
@@ -797,20 +814,47 @@
block_id = int(name[len("down_blocks.")])
hidden_size = self.config.block_out_channels[block_id]
if cross_attention_dim is None or "motion_modules" in name:
-                attn_processor_class = (
-                    AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
-                )
-                attn_procs[name] = attn_processor_class()
+                if use_lora:
+                    attn_processor_class = (
+                        LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        rank=128,
+                    ).to(self.device, dtype=self.dtype)
+                else:
+                    attn_processor_class = (
+                        AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class()
else:
-                attn_processor_class = (
-                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
-                )
-                attn_procs[name] = attn_processor_class(
-                    hidden_size=hidden_size,
-                    cross_attention_dim=cross_attention_dim,
-                    scale=1.0,
-                    num_tokens=num_image_text_embeds,
-                ).to(dtype=self.dtype, device=self.device)
+                if use_lora:
+                    attn_processor_class = (
+                        LoRAIPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else LoRAIPAdapterAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        rank=128,
Review comment (Member): Do we need to make `rank` an argument?
Reply (Author): Not for now, but perhaps a new IP Adapter will be released in the future, using different LoRA ranks. Do you think it is better to remove it for now?

+                        num_tokens=num_image_text_embeds,
+                    ).to(dtype=self.dtype, device=self.device)

+                else:
+                    attn_processor_class = (
+                        IPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else IPAdapterAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        num_tokens=num_image_text_embeds,
+                    ).to(dtype=self.dtype, device=self.device)

value_dict = {}
for k, w in attn_procs[name].state_dict().items():