55 changes: 53 additions & 2 deletions docs/source/en/using-diffusers/loading_adapters.md
@@ -485,7 +485,7 @@ image.save("sdxl_t2i.png")
</div>
</div>

-You can use the IP-Adapter face model to apply specific faces to your images. It is an effective way to maintain consistent characters in your image generations.
+You can use the IP-Adapter face models to apply specific faces to your images. It is an effective way to maintain consistent characters in your image generations.
Weights are loaded with the same method used for the other IP-Adapters.

```python
@@ -495,7 +495,7 @@ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-a

<Tip>

-It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face model.
+It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face models.


</Tip>
@@ -549,6 +549,57 @@ image = pipeline(
</div>
</div>

IP-Adapter FaceID is an experimental IP-Adapter model that uses face embeddings generated by `insightface` instead of CLIP image embeddings, so no image encoder needs to be loaded.
You must pass the image embedding tensor as `image_embeds` to the `StableDiffusionPipeline` instead of `ip_adapter_image`.

```python
import cv2
from insightface.app import FaceAnalysis
import numpy as np
from PIL import Image
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image

noise_scheduler = DDIMScheduler(
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
steps_offset=1
)

pipeline = StableDiffusionPipeline.from_pretrained(
"SG161222/Realistic_Vision_V4.0_noVAE",
torch_dtype=torch.float16,
scheduler=noise_scheduler
).to("cuda")

generator = torch.Generator(device="cpu").manual_seed(42)
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")

# Extract image embeddings
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

# insightface expects a BGR image (OpenCV convention), while PIL loads RGB
face_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
faces = app.get(face_image)
image_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)

# Load IP Adapter weights and run inference
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", weight_name="ip-adapter-faceid_sd15.bin")
pipeline.set_ip_adapter_scale(0.7)
image = pipeline(
    prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
    image_embeds=image_embeds,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=20, num_images_per_prompt=1, width=512, height=704,
    generator=generator
).images[0]
```

### LCM-LoRA

You can use IP-Adapter with LCM-LoRA to achieve "instant fine-tuning" with custom images. Note that you need to load the IP-Adapter weights before loading the LCM-LoRA weights, as sketched below.
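The required order, shown as a minimal sketch (checkpoint names are illustrative):

```python
# Sketch only: IP-Adapter weights must be loaded before the LCM-LoRA weights.
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipeline.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
```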
31 changes: 21 additions & 10 deletions src/diffusers/loaders/ip_adapter.py
@@ -34,6 +34,8 @@
from ..models.attention_processor import (
IPAdapterAttnProcessor,
IPAdapterAttnProcessor2_0,
+    LoRAIPAdapterAttnProcessor,
+    LoRAIPAdapterAttnProcessor2_0,
)

logger = logging.get_logger(__name__)
@@ -46,7 +48,6 @@ class IPAdapterMixin:
def load_ip_adapter(
self,
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
-        subfolder: str,
Review comment (Contributor): This is a breaking change, can we leave as it was here?
Reply (Author): done as you suggested!

weight_name: str,
**kwargs,
):
@@ -95,6 +96,7 @@ def load_ip_adapter(
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)

user_agent = {
"file_type": "attn_procs_weights",
@@ -135,14 +137,15 @@
        # load CLIP image encoder here if it has not been registered to the pipeline yet
if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
if not isinstance(pretrained_model_name_or_path_or_dict, dict):
logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
pretrained_model_name_or_path_or_dict,
subfolder=os.path.join(subfolder, "image_encoder"),
).to(self.device, dtype=self.dtype)
self.image_encoder = image_encoder
else:
raise ValueError("`image_encoder` cannot be None when using IP Adapters.")
try:
logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
pretrained_model_name_or_path_or_dict,
subfolder=os.path.join(subfolder, "image_encoder"),
).to(self.device, dtype=self.dtype)
self.image_encoder = image_encoder
except TypeError:
print("IPAdapter: `subfolder` not found, `image_encoder` is None, use image_embeds.")
Review comment (Contributor): Let's try to not use try...except here please
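One way to avoid the `try...except`, as a hypothetical sketch (not code from this PR): load the encoder only when `subfolder` is set, since `subfolder` defaults to `None` for FaceID checkpoints and `os.path.join(None, ...)` is what raises the `TypeError`:

```python
# Hypothetical alternative without try/except: FaceID repos ship no image
# encoder, so skip the load explicitly when no subfolder was provided.
if subfolder is not None:
    logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        pretrained_model_name_or_path_or_dict,
        subfolder=os.path.join(subfolder, "image_encoder"),
    ).to(self.device, dtype=self.dtype)
    self.image_encoder = image_encoder
else:
    logger.info("`subfolder` not set; `image_encoder` stays None, pass `image_embeds` at inference.")
```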


# create feature extractor if it has not been registered to the pipeline yet
if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
@@ -155,5 +158,13 @@ def load_ip_adapter(
def set_ip_adapter_scale(self, scale):
unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
for attn_processor in unet.attn_processors.values():
-            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+            if isinstance(
+                attn_processor,
+                (
+                    IPAdapterAttnProcessor,
+                    IPAdapterAttnProcessor2_0,
+                    LoRAIPAdapterAttnProcessor,
+                    LoRAIPAdapterAttnProcessor2_0,
+                ),
+            ):
attn_processor.scale = scale
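For context, a brief usage sketch of the path this hunk extends (checkpoint name and scale taken from the docs example above):

```python
# FaceID installs LoRA-backed IP-Adapter attention processors; with the
# widened isinstance check, the same call now adjusts their scale too.
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", weight_name="ip-adapter-faceid_sd15.bin")
pipeline.set_ip_adapter_scale(0.7)
```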
80 changes: 62 additions & 18 deletions src/diffusers/loaders/unet.py
@@ -707,13 +707,20 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict):
diffusers_name = key.replace("proj", "image_embeds")
updated_state_dict[diffusers_name] = value

elif "proj.3.weight" in state_dict:
elif "proj.0.weight" in state_dict:
Review comment (Member): Is using this key a better option? Can we use a more resilient condition here to avoid side-effects?
Reply (Author): I changed to `proj.0.weight` because both IPAdapter Full and FaceID state dicts have it, while the IPAdapter Full `proj.3.weight` key is named `norm.weight` in the FaceID model.
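The layout difference the author describes, sketched for reference (key lists are illustrative, based only on the discussion above):

```python
# Both projection variants share "proj.0.weight"; only the norm layer differs.
FULL_PROJ_KEYS   = ["proj.0.weight", "proj.2.weight", "proj.3.weight"]  # IP-Adapter Full
FACEID_PROJ_KEYS = ["proj.0.weight", "proj.2.weight", "norm.weight"]    # IP-Adapter FaceID
```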

# IP-Adapter Full
-            clip_embeddings_dim = state_dict["proj.0.weight"].shape[0]
-            cross_attention_dim = state_dict["proj.3.weight"].shape[0]
+            clip_embeddings_dim_in = state_dict["proj.0.weight"].shape[1]
+            clip_embeddings_dim_out = state_dict["proj.0.weight"].shape[0]
+            multiplier = clip_embeddings_dim_out // clip_embeddings_dim_in
+            norm_layer = "proj.3.weight" if "proj.3.weight" in state_dict else "norm.weight"
+            cross_attention_dim = state_dict[norm_layer].shape[0]
+            num_tokens = state_dict["proj.2.weight"].shape[0] // cross_attention_dim

image_projection = IPAdapterFullImageProjection(
-                cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim
+                cross_attention_dim=cross_attention_dim,
+                image_embed_dim=clip_embeddings_dim_in,
+                mult=multiplier,
+                num_tokens=num_tokens,
)

for key, value in state_dict.items():
@@ -767,14 +774,24 @@ def _load_ip_adapter_weights(self, state_dict):
AttnProcessor2_0,
IPAdapterAttnProcessor,
IPAdapterAttnProcessor2_0,
+        LoRAAttnProcessor,
+        LoRAAttnProcessor2_0,
+        LoRAIPAdapterAttnProcessor,
+        LoRAIPAdapterAttnProcessor2_0,
)

+        use_lora = False
if "proj.weight" in state_dict["image_proj"]:
# IP-Adapter
num_image_text_embeds = 4
elif "proj.3.weight" in state_dict["image_proj"]:
elif "proj.0.weight" in state_dict["image_proj"]:
# IP-Adapter Full Face
num_image_text_embeds = 257 # 256 CLIP tokens + 1 CLS token
for k in state_dict["ip_adapter"].keys():
if "lora" in k:
num_image_text_embeds = 4
use_lora = True
break
else:
# IP-Adapter Plus
num_image_text_embeds = state_dict["image_proj"]["latents"].shape[1]
@@ -797,20 +814,47 @@
block_id = int(name[len("down_blocks.")])
hidden_size = self.config.block_out_channels[block_id]
if cross_attention_dim is None or "motion_modules" in name:
-                attn_processor_class = (
-                    AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
-                )
-                attn_procs[name] = attn_processor_class()
+                if use_lora:
+                    attn_processor_class = (
+                        LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        rank=128,
+                    ).to(self.device, dtype=self.dtype)
+                else:
+                    attn_processor_class = (
+                        AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class()
else:
-                attn_processor_class = (
-                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
-                )
-                attn_procs[name] = attn_processor_class(
-                    hidden_size=hidden_size,
-                    cross_attention_dim=cross_attention_dim,
-                    scale=1.0,
-                    num_tokens=num_image_text_embeds,
-                ).to(dtype=self.dtype, device=self.device)
+                if use_lora:
+                    attn_processor_class = (
+                        LoRAIPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else LoRAIPAdapterAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        rank=128,
Review comment (Member): Do we need to make `rank` an argument?
Reply (Author): Not for now, but perhaps a new IP Adapter will be released in the future, using different LoRA ranks. Do you think it is better to remove it for now?

+                        num_tokens=num_image_text_embeds,
+                    ).to(dtype=self.dtype, device=self.device)

+                else:
+                    attn_processor_class = (
+                        IPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else IPAdapterAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        num_tokens=num_image_text_embeds,
+                    ).to(dtype=self.dtype, device=self.device)

value_dict = {}
for k, w in attn_procs[name].state_dict().items():