Implementation of Stable Diffusion with Aesthetic Gradients #2585

Merged · 21 commits · Oct 21, 2022
Changes from 2 commits
1 change: 1 addition & 0 deletions README.md
@@ -70,6 +70,7 @@ Check the [custom scripts](https://github.com/AUTOMATIC1111/stable-diffusion-web
- No token limit for prompts (original stable diffusion lets you use up to 75 tokens)
- DeepDanbooru integration, creates danbooru style tags for anime prompts (add --deepdanbooru to commandline args)
- [xformers](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Xformers), major speed increase for select cards: (add --xformers to commandline args)
- Aesthetic Gradients, a way to generate images with a specific aesthetic by using CLIP image embeddings (implementation of https://github.com/vicgalle/stable-diffusion-aesthetic-gradients)

## Installation and Running
Make sure the required [dependencies](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Dependencies) are met and follow the instructions available for both [NVidia](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Install-and-Run-on-NVidia-GPUs) (recommended) and [AMD](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Install-and-Run-on-AMD-GPUs) GPUs.
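For orientation, the technique from the linked repository personalizes the CLIP text encoder at inference time: it takes a few gradient steps that pull the prompt's pooled text embedding toward a precomputed "aesthetic" CLIP image embedding. A minimal sketch of that loop, with illustrative names (`text_encoder`, `tokens`, `aesthetic_emb` are placeholders, not this PR's API):

```python
import torch
import torch.nn.functional as F

def personalize(text_encoder, tokens, aesthetic_emb, lr=1e-4, steps=5):
    # Fine-tune a (copied) text encoder so its pooled output aligns with the
    # aesthetic embedding; a handful of steps is typically enough.
    optimizer = torch.optim.Adam(text_encoder.parameters(), lr=lr)
    for _ in range(steps):
        text_emb = text_encoder(tokens).mean(dim=1)                   # (batch, dim) pooled prompt embedding
        loss = -F.cosine_similarity(text_emb, aesthetic_emb).mean()   # maximize similarity
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return text_encoder
```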
78 changes: 78 additions & 0 deletions modules/aesthetic_clip.py
@@ -0,0 +1,78 @@
import itertools
import os
from pathlib import Path
import html
import gc

import gradio as gr
import torch
from PIL import Image
from modules import shared
from modules.shared import device
from transformers import CLIPModel, CLIPProcessor

from tqdm.auto import tqdm


def get_all_images_in_folder(folder):
    return [os.path.join(folder, f) for f in os.listdir(folder) if
            os.path.isfile(os.path.join(folder, f)) and check_is_valid_image_file(f)]


def check_is_valid_image_file(filename):
    return filename.lower().endswith(('.png', '.jpg', '.jpeg'))


def batched(dataset, total, n=1):
    # Yield lists of up to n dataset items at a time.
    for ndx in range(0, total, n):
        yield [dataset[i] for i in range(ndx, min(ndx + n, total))]


def iter_to_batched(iterable, n=1):
    # Yield successive n-sized tuples from any iterable; the last tuple may be shorter.
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk


def generate_imgs_embd(name, folder, batch_size):
    model = CLIPModel.from_pretrained(shared.sd_model.cond_stage_model.clipModel.name_or_path).to(device)
    processor = CLIPProcessor.from_pretrained(shared.sd_model.cond_stage_model.clipModel.name_or_path)

    with torch.no_grad():
        embs = []
        for paths in tqdm(iter_to_batched(get_all_images_in_folder(folder), batch_size),
                          desc=f"Generating embeddings for {name}"):
            if shared.state.interrupted:
                break
            inputs = processor(images=[Image.open(path) for path in paths], return_tensors="pt").to(device)
            outputs = model.get_image_features(**inputs).cpu()
            embs.append(torch.clone(outputs))
            inputs.to("cpu")
            del inputs, outputs

        # Average all image features into a single aesthetic embedding.
        embs = torch.cat(embs, dim=0).mean(dim=0, keepdim=True)

    # The generated embedding will be located here
    path = str(Path(shared.cmd_opts.aesthetic_embeddings_dir) / f"{name}.pt")
    torch.save(embs, path)

    model = model.cpu()
    del model
    del processor
    del embs
    gc.collect()
    torch.cuda.empty_cache()
    res = f"""
Done generating embedding for {name}!
Aesthetic embedding saved to {html.escape(path)}
"""
    shared.update_aesthetic_embeddings()
    # Read shared.aesthetic_embeddings (not an import-time copy) so the
    # freshly saved embedding appears in the dropdown choices.
    return gr.Dropdown(sorted(shared.aesthetic_embeddings.keys()), label="Imgs embedding",
                       value=sorted(shared.aesthetic_embeddings.keys())[0] if len(
                           shared.aesthetic_embeddings) > 0 else None), res, ""
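A quick sanity check of the batching helper, assuming `iter_to_batched` from the file above is in scope (the file names are hypothetical):

```python
paths = ["a.png", "b.png", "c.png", "d.png", "e.png"]
for chunk in iter_to_batched(paths, n=2):
    print(chunk)
# ('a.png', 'b.png')
# ('c.png', 'd.png')
# ('e.png',)
```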
159 changes: 110 additions & 49 deletions modules/processing.py

Large diffs are not rendered by default.

169 changes: 140 additions & 29 deletions modules/sd_hijack.py

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions modules/shared.py
@@ -30,6 +30,8 @@
parser.add_argument("--no-progressbar-hiding", action='store_true', help="do not hide progressbar in gradio UI (we hide it because it slows down ML if you have hardware acceleration in browser)")
parser.add_argument("--max-batch-count", type=int, default=16, help="maximum batch count value for the UI")
parser.add_argument("--embeddings-dir", type=str, default=os.path.join(script_path, 'embeddings'), help="embeddings directory for textual inversion (default: embeddings)")
parser.add_argument("--aesthetic_embeddings-dir", type=str, default=os.path.join(script_path, 'aesthetic_embeddings'),
help="aesthetic_embeddings directory(default: aesthetic_embeddings)")
parser.add_argument("--hypernetwork-dir", type=str, default=os.path.join(models_path, 'hypernetworks'), help="hypernetwork directory")
parser.add_argument("--allow-code", action='store_true', help="allow custom script execution from webui")
parser.add_argument("--medvram", action='store_true', help="enable stable diffusion model optimizations for sacrificing a little speed for low VRM usage")
@@ -90,6 +92,13 @@
hypernetworks = hypernetwork.list_hypernetworks(cmd_opts.hypernetwork_dir)
loaded_hypernetwork = None

def update_aesthetic_embeddings():
    # Map embedding name -> path for every .pt file in the aesthetic embeddings directory.
    global aesthetic_embeddings
    aesthetic_embeddings = {os.path.splitext(f)[0]: os.path.join(cmd_opts.aesthetic_embeddings_dir, f) for f in
                            os.listdir(cmd_opts.aesthetic_embeddings_dir) if f.endswith(".pt")}


update_aesthetic_embeddings()

def reload_hypernetworks():
    global hypernetworks
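One caveat with the listing code above: `os.listdir` raises `FileNotFoundError` if the aesthetic_embeddings directory has not been created yet. A defensive variant might look like this (hypothetical helper, not part of the PR):

```python
import os

def list_pt_embeddings(dirpath):
    # Return {name: path} for every .pt file, or an empty dict if the folder is missing.
    if not os.path.isdir(dirpath):
        return {}
    return {os.path.splitext(f)[0]: os.path.join(dirpath, f)
            for f in os.listdir(dirpath) if f.endswith(".pt")}
```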
2 changes: 1 addition & 1 deletion modules/textual_inversion/dataset.py
@@ -48,7 +48,7 @@ def __init__(self, data_root, width, height, repeats, flip_p=0.5, placeholder_to
print("Preparing dataset...")
for path in tqdm.tqdm(self.image_paths):
try:
image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.BICUBIC)
image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.Resampling.BICUBIC)
except Exception:
continue

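The change above tracks Pillow's API move: since Pillow 9.1 the resampling filters live in the `Image.Resampling` enum, and the bare module-level constants are deprecated aliases. A minimal sketch (`sample.png` is a placeholder):

```python
from PIL import Image

img = Image.open("sample.png").convert("RGB")
# Image.Resampling.BICUBIC (Pillow >= 9.1) replaces the deprecated Image.BICUBIC.
img = img.resize((512, 512), Image.Resampling.BICUBIC)
```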
35 changes: 25 additions & 10 deletions modules/textual_inversion/textual_inversion.py
@@ -172,7 +172,15 @@ def create_embedding(name, num_vectors_per_token, init_text='*'):
    return fn


-def train_embedding(embedding_name, learn_rate, data_root, log_directory, training_width, training_height, steps, create_image_every, save_embedding_every, template_file, save_image_with_stored_embedding, preview_image_prompt):
+def batched(dataset, total, n=1):
+    for ndx in range(0, total, n):
+        yield [dataset[i] for i in range(ndx, min(ndx + n, total))]
+
+
+def train_embedding(embedding_name, learn_rate, data_root, log_directory, training_width, training_height, steps,
+                    create_image_every, save_embedding_every, template_file, save_image_with_stored_embedding,
+                    preview_image_prompt, batch_size=1,
+                    gradient_accumulation=1):
    assert embedding_name, 'embedding not selected'

    shared.state.textinfo = "Initializing textual inversion training..."
@@ -204,7 +212,11 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, traini

    shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
    with torch.autocast("cuda"):
-        ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=embedding_name, model=shared.sd_model, device=devices.device, template_file=template_file)
+        ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width,
+                                                                height=training_height,
+                                                                repeats=shared.opts.training_image_repeats_per_epoch,
+                                                                placeholder_token=embedding_name, model=shared.sd_model,
+                                                                device=devices.device, template_file=template_file)

    hijack = sd_hijack.model_hijack

@@ -223,7 +235,7 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, traini
    scheduler = LearnRateScheduler(learn_rate, steps, ititial_step)
    optimizer = torch.optim.AdamW([embedding.vec], lr=scheduler.learn_rate)

-    pbar = tqdm.tqdm(enumerate(ds), total=steps-ititial_step)
+    pbar = tqdm.tqdm(enumerate(batched(ds, steps - ititial_step, batch_size)), total=steps - ititial_step)
    for i, entry in pbar:
        embedding.step = i + ititial_step

@@ -235,17 +247,20 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, traini
            break

        with torch.autocast("cuda"):
-            c = cond_model([entry.cond_text])
+            c = cond_model([e.cond_text for e in entry])

-            x = entry.latent.to(devices.device)
-            loss = shared.sd_model(x.unsqueeze(0), c)[0]
+            x = torch.stack([e.latent for e in entry]).to(devices.device)
+            loss = shared.sd_model(x, c)[0]
            del x

        losses[embedding.step % losses.shape[0]] = loss.item()

-        optimizer.zero_grad()
        loss.backward()
-        optimizer.step()
+        if ((i + 1) % gradient_accumulation == 0) or (i + 1 == steps - ititial_step):
+            optimizer.step()
+            optimizer.zero_grad()

        epoch_num = embedding.step // len(ds)
        epoch_step = embedding.step - (epoch_num * len(ds)) + 1
@@ -259,7 +274,7 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, traini
        if embedding.step > 0 and images_dir is not None and embedding.step % create_image_every == 0:
            last_saved_image = os.path.join(images_dir, f'{embedding_name}-{embedding.step}.png')

-            preview_text = entry.cond_text if preview_image_prompt == "" else preview_image_prompt
+            preview_text = entry[0].cond_text if preview_image_prompt == "" else preview_image_prompt

            p = processing.StableDiffusionProcessingTxt2Img(
                sd_model=shared.sd_model,
@@ -305,7 +320,7 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, traini
<p>
Loss: {losses.mean():.7f}<br/>
Step: {embedding.step}<br/>
-Last prompt: {html.escape(entry.cond_text)}<br/>
+Last prompt: {html.escape(entry[-1].cond_text)}<br/>
Last saved embedding: {html.escape(last_saved_file)}<br/>
Last saved image: {html.escape(last_saved_image)}<br/>
</p>
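The new `batch_size`/`gradient_accumulation` parameters give an effective batch of batch_size × gradient_accumulation samples: gradients from several micro-batches are summed before each optimizer step. Note the loop above does not divide the loss by the accumulation count, so gradients are summed rather than averaged; a common variant scales the loss, as in this generic PyTorch sketch (not this repo's trainer):

```python
import torch

def train_with_accumulation(model, batches, optimizer, accum_steps=4):
    # Accumulate gradients over accum_steps micro-batches before stepping.
    optimizer.zero_grad()
    for i, (x, y) in enumerate(batches):
        loss = torch.nn.functional.mse_loss(model(x), y)
        (loss / accum_steps).backward()  # scale so summed grads match one large batch
        if (i + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
```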
17 changes: 15 additions & 2 deletions modules/txt2img.py
@@ -6,7 +6,18 @@
from modules.ui import plaintext_to_html


-def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, enable_hr: bool, scale_latent: bool, denoising_strength: float, *args):
+def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int,
+            restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int,
+            subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool,
+            height: int, width: int, enable_hr: bool, scale_latent: bool, denoising_strength: float,
+            aesthetic_lr=0,
+            aesthetic_weight=0, aesthetic_steps=0,
+            aesthetic_imgs=None,
+            aesthetic_slerp=False,
+            aesthetic_imgs_text="",
+            aesthetic_slerp_angle=0.15,
+            aesthetic_text_negative=False,
+            *args):
    p = StableDiffusionProcessingTxt2Img(
        sd_model=shared.sd_model,
        outpath_samples=opts.outdir_samples or opts.outdir_txt2img_samples,
@@ -40,7 +51,9 @@ def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2:
    processed = modules.scripts.scripts_txt2img.run(p, *args)

    if processed is None:
-        processed = process_images(p)
+        processed = process_images(p, aesthetic_lr, aesthetic_weight, aesthetic_steps, aesthetic_imgs, aesthetic_slerp,
+                                   aesthetic_imgs_text, aesthetic_slerp_angle, aesthetic_text_negative)

    shared.total_tqdm.clear()

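On the `aesthetic_slerp` flag: slerp (spherical linear interpolation) is the usual way to blend two embeddings while staying on the hypersphere, which tends to behave better than a straight lerp for normalized CLIP vectors. A standard PyTorch sketch, illustrative only and not necessarily the exact helper this PR adds in sd_hijack.py (that diff is not rendered above):

```python
import torch

def slerp(low, high, val):
    # Spherical interpolation between batched vectors low and high, val in [0, 1].
    low_norm = low / torch.norm(low, dim=1, keepdim=True)
    high_norm = high / torch.norm(high, dim=1, keepdim=True)
    omega = torch.acos((low_norm * high_norm).sum(1).clamp(-1.0, 1.0))
    so = torch.sin(omega)
    res = (torch.sin((1.0 - val) * omega) / so).unsqueeze(1) * low + \
          (torch.sin(val * omega) / so).unsqueeze(1) * high
    # Fall back to plain lerp where the vectors are (nearly) parallel and so ~ 0.
    return torch.where(so.unsqueeze(1) > 1e-6, res, low + val * (high - low))
```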