AUTOMATIC1111 · AUTOMATIC1111 · Oct 21, 2022 · Oct 14, 2022 · Oct 15, 2022 · Oct 15, 2022
@@ -70,6 +70,7 @@ Check the [custom scripts](https://github.com/AUTOMATIC1111/stable-diffusion-web
 - No token limit for prompts (original stable diffusion lets you use up to 75 tokens)
 - DeepDanbooru integration, creates danbooru style tags for anime prompts (add --deepdanbooru to commandline args)
 - [xformers](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Xformers), major speed increase for select cards: (add --xformers to commandline args)
+- Aesthetic, a way to generate images with a specific aesthetic by using clip images embds (implementation of https://github.com/vicgalle/stable-diffusion-aesthetic-gradients)
 
 ## Installation and Running
 Make sure the required [dependencies](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Dependencies) are met and follow the instructions available for both [NVidia](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Install-and-Run-on-NVidia-GPUs) (recommended) and [AMD](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Install-and-Run-on-AMD-GPUs) GPUs.

@@ -0,0 +1,78 @@
+import itertools
+import os
+from pathlib import Path
+import html
+import gc
+
+import gradio as gr
+import torch
+from PIL import Image
+from modules import shared
+from modules.shared import device, aesthetic_embeddings
+from transformers import CLIPModel, CLIPProcessor
+
+from tqdm.auto import tqdm
+
+
+def get_all_images_in_folder(folder):
+    return [os.path.join(folder, f) for f in os.listdir(folder) if
+            os.path.isfile(os.path.join(folder, f)) and check_is_valid_image_file(f)]
+
+
+def check_is_valid_image_file(filename):
+    return filename.lower().endswith(('.png', '.jpg', '.jpeg'))
+
+
+def batched(dataset, total, n=1):
+    for ndx in range(0, total, n):
+        yield [dataset.__getitem__(i) for i in range(ndx, min(ndx + n, total))]
+
+
+def iter_to_batched(iterable, n=1):
+    it = iter(iterable)
+    while True:
+        chunk = tuple(itertools.islice(it, n))
+        if not chunk:
+            return
+        yield chunk
+
+
+def generate_imgs_embd(name, folder, batch_size):
+    # clipModel = CLIPModel.from_pretrained(
+    #     shared.sd_model.cond_stage_model.clipModel.name_or_path
+    # )
+    model = CLIPModel.from_pretrained(shared.sd_model.cond_stage_model.clipModel.name_or_path).to(device)
+    processor = CLIPProcessor.from_pretrained(shared.sd_model.cond_stage_model.clipModel.name_or_path)
+
+    with torch.no_grad():
+        embs = []
+        for paths in tqdm(iter_to_batched(get_all_images_in_folder(folder), batch_size),
+                          desc=f"Generating embeddings for {name}"):
+            if shared.state.interrupted:
+                break
+            inputs = processor(images=[Image.open(path) for path in paths], return_tensors="pt").to(device)
+            outputs = model.get_image_features(**inputs).cpu()
+            embs.append(torch.clone(outputs))
+            inputs.to("cpu")
+            del inputs, outputs
+
+        embs = torch.cat(embs, dim=0).mean(dim=0, keepdim=True)
+
+        # The generated embedding will be located here
+        path = str(Path(shared.cmd_opts.aesthetic_embeddings_dir) / f"{name}.pt")
+        torch.save(embs, path)
+
+        model = model.cpu()
+        del model
+        del processor
+        del embs
+        gc.collect()
+        torch.cuda.empty_cache()
+        res = f"""
+        Done generating embedding for {name}!
+        Hypernetwork saved to {html.escape(path)}
+        """
+        shared.update_aesthetic_embeddings()
+        return gr.Dropdown(sorted(aesthetic_embeddings.keys()), label="Imgs embedding",
+                           value=sorted(aesthetic_embeddings.keys())[0] if len(
+                               aesthetic_embeddings) > 0 else None), res, ""
@@ -30,6 +30,8 @@
 parser.add_argument("--no-progressbar-hiding", action='store_true', help="do not hide progressbar in gradio UI (we hide it because it slows down ML if you have hardware acceleration in browser)")
 parser.add_argument("--max-batch-count", type=int, default=16, help="maximum batch count value for the UI")
 parser.add_argument("--embeddings-dir", type=str, default=os.path.join(script_path, 'embeddings'), help="embeddings directory for textual inversion (default: embeddings)")
+parser.add_argument("--aesthetic_embeddings-dir", type=str, default=os.path.join(script_path, 'aesthetic_embeddings'),
+                    help="aesthetic_embeddings directory(default: aesthetic_embeddings)")
 parser.add_argument("--hypernetwork-dir", type=str, default=os.path.join(models_path, 'hypernetworks'), help="hypernetwork directory")
 parser.add_argument("--allow-code", action='store_true', help="allow custom script execution from webui")
 parser.add_argument("--medvram", action='store_true', help="enable stable diffusion model optimizations for sacrificing a little speed for low VRM usage")
@@ -92,6 +94,13 @@
 hypernetworks = hypernetwork.list_hypernetworks(cmd_opts.hypernetwork_dir)
 loaded_hypernetwork = None
 
+aesthetic_embeddings = {f.replace(".pt",""): os.path.join(cmd_opts.aesthetic_embeddings_dir, f) for f in
+                        os.listdir(cmd_opts.aesthetic_embeddings_dir) if f.endswith(".pt")}
+
+def update_aesthetic_embeddings():
+    global aesthetic_embeddings
+    aesthetic_embeddings = {f.replace(".pt",""): os.path.join(cmd_opts.aesthetic_embeddings_dir, f) for f in
+                            os.listdir(cmd_opts.aesthetic_embeddings_dir) if f.endswith(".pt")}
 
 def reload_hypernetworks():
     global hypernetworks

diff --git a/modules/textual_inversion/dataset.py b/modules/textual_inversion/dataset.py
@@ -49,7 +49,7 @@ def __init__(self, data_root, width, height, repeats, flip_p=0.5, placeholder_to
         print("Preparing dataset...")
         for path in tqdm.tqdm(self.image_paths):
             try:
-                image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.BICUBIC)
+                image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.Resampling.BICUBIC)
             except Exception:
                 continue
 

@@ -269,9 +269,11 @@ def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_direc
 
             losses[embedding.step % losses.shape[0]] = loss.item()
 
-            optimizer.zero_grad()
             loss.backward()
-            optimizer.step()
+            if ((i + 1) % gradient_accumulation == 0) or (i + 1 == steps - ititial_step):
+                optimizer.step()
+                optimizer.zero_grad()
+
 
         epoch_num = embedding.step // len(ds)
         epoch_step = embedding.step - (epoch_num * len(ds)) + 1

@@ -6,7 +6,13 @@
 from modules.ui import plaintext_to_html
 
 
-def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, enable_hr: bool, denoising_strength: float, firstphase_width: int, firstphase_height: int, *args):
+def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, enable_hr: bool, denoising_strength: float, firstphase_width: int, firstphase_height: int,aesthetic_lr=0,
+            aesthetic_weight=0, aesthetic_steps=0,
+            aesthetic_imgs=None,
+            aesthetic_slerp=False,
+            aesthetic_imgs_text="",
+            aesthetic_slerp_angle=0.15,
+            aesthetic_text_negative=False, *args):
     p = StableDiffusionProcessingTxt2Img(
         sd_model=shared.sd_model,
         outpath_samples=opts.outdir_samples or opts.outdir_txt2img_samples,
@@ -41,7 +47,9 @@ def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2:
     processed = modules.scripts.scripts_txt2img.run(p, *args)
 
     if processed is None:
-        processed = process_images(p)
+        processed = process_images(p, aesthetic_lr, aesthetic_weight, aesthetic_steps, aesthetic_imgs, aesthetic_slerp,aesthetic_imgs_text,
+                           aesthetic_slerp_angle,
+                           aesthetic_text_negative)
 
     shared.total_tqdm.clear()
 

@@ -25,7 +25,8 @@
 
 from modules import sd_hijack, sd_models
 from modules.paths import script_path
-from modules.shared import opts, cmd_opts
+from modules.shared import opts, cmd_opts,aesthetic_embeddings
+
 if cmd_opts.deepdanbooru:
     from modules.deepbooru import get_deepbooru_tags
 import modules.shared as shared
@@ -41,8 +42,11 @@
 from modules.images import save_image
 import modules.textual_inversion.ui
 import modules.hypernetworks.ui
+
+import modules.aesthetic_clip
 import modules.images_history as img_his
 
+
 # this is a fix for Windows users. Without it, javascript files will be served with text/html content-type and the browser will not show any UI
 mimetypes.init()
 mimetypes.add_type('application/javascript', '.js')
@@ -592,6 +596,18 @@ def create_ui(wrap_gradio_gpu_call):
                     width = gr.Slider(minimum=64, maximum=2048, step=64, label="Width", value=512)
                     height = gr.Slider(minimum=64, maximum=2048, step=64, label="Height", value=512)
 
+                with gr.Group():
+                    aesthetic_lr = gr.Textbox(label='Learning rate', placeholder="Learning rate", value="0.0001")
+                    aesthetic_weight = gr.Slider(minimum=0, maximum=1, step=0.01, label="Aesthetic weight", value=0.9)
+                    aesthetic_steps = gr.Slider(minimum=0, maximum=256, step=1, label="Aesthetic steps", value=5)
+                    with gr.Row():
+                        aesthetic_imgs_text = gr.Textbox(label='Aesthetic text for imgs', placeholder="This text is used to rotate the feature space of the imgs embs", value="")
+                        aesthetic_slerp_angle = gr.Slider(label='Slerp angle',minimum=0, maximum=1, step=0.01, value=0.1)
+                        aesthetic_text_negative = gr.Checkbox(label="Is negative text", value=False)
+
+                    aesthetic_imgs = gr.Dropdown(sorted(aesthetic_embeddings.keys()), label="Imgs embedding", value=sorted(aesthetic_embeddings.keys())[0] if len(aesthetic_embeddings) > 0 else None)
+                    aesthetic_slerp = gr.Checkbox(label="Slerp interpolation", value=False)
+
                 with gr.Row():
                     restore_faces = gr.Checkbox(label='Restore faces', value=False, visible=len(shared.face_restorers) > 1)
                     tiling = gr.Checkbox(label='Tiling', value=False)
@@ -664,7 +680,16 @@ def create_ui(wrap_gradio_gpu_call):
                     denoising_strength,
                     firstphase_width,
                     firstphase_height,
+                    aesthetic_lr,
+                    aesthetic_weight,
+                    aesthetic_steps,
+                    aesthetic_imgs,
+                    aesthetic_slerp,
+                    aesthetic_imgs_text,
+                    aesthetic_slerp_angle,
+                    aesthetic_text_negative
                 ] + custom_inputs,
+
                 outputs=[
                     txt2img_gallery,
                     generation_info,
@@ -793,7 +818,7 @@ def create_ui(wrap_gradio_gpu_call):
 
                         with gr.Row():
                             inpaint_full_res = gr.Checkbox(label='Inpaint at full resolution', value=False)
-                            inpaint_full_res_padding = gr.Slider(label='Inpaint at full resolution padding, pixels', minimum=0, maximum=256, step=4, value=32)
+                            inpaint_full_res_padding = gr.Slider(label='Inpaint at full resolution padding, pixels', minimum=0, maximum=1024, step=4, value=32)
 
                     with gr.TabItem('Batch img2img', id='batch'):
                         hidden = '<br>Disabled when launched with --hide-ui-dir-config.' if shared.cmd_opts.hide_ui_dir_config else ''
@@ -1162,6 +1187,17 @@ def create_ui(wrap_gradio_gpu_call):
                         with gr.Column():
                             create_embedding = gr.Button(value="Create embedding", variant='primary')
 
+                with gr.Tab(label="Create images embedding"):
+                    new_embedding_name_ae = gr.Textbox(label="Name")
+                    process_src_ae = gr.Textbox(label='Source directory')
+                    batch_ae = gr.Slider(minimum=1, maximum=1024, step=1, label="Batch size", value=256)
+                    with gr.Row():
+                        with gr.Column(scale=3):
+                            gr.HTML(value="")
+
+                        with gr.Column():
+                            create_embedding_ae = gr.Button(value="Create images embedding", variant='primary')
+
                 with gr.Tab(label="Create hypernetwork"):
                     new_hypernetwork_name = gr.Textbox(label="Name")
                     new_hypernetwork_sizes = gr.CheckboxGroup(label="Modules", value=["768", "320", "640", "1280"], choices=["768", "320", "640", "1280"])
@@ -1203,6 +1239,9 @@ def create_ui(wrap_gradio_gpu_call):
                     template_file = gr.Textbox(label='Prompt template file', value=os.path.join(script_path, "textual_inversion_templates", "style_filewords.txt"))
                     training_width = gr.Slider(minimum=64, maximum=2048, step=64, label="Width", value=512)
                     training_height = gr.Slider(minimum=64, maximum=2048, step=64, label="Height", value=512)
+                    batch_size = gr.Slider(minimum=1, maximum=64, step=1, label="Batch Size", value=4)
+                    gradient_accumulation = gr.Slider(minimum=1, maximum=256, step=1, label="Gradient accumulation",
+                                                      value=1)
                     steps = gr.Number(label='Max steps', value=100000, precision=0)
                     create_image_every = gr.Number(label='Save an image to log directory every N steps, 0 to disable', value=500, precision=0)
                     save_embedding_every = gr.Number(label='Save a copy of embedding to log directory every N steps, 0 to disable', value=500, precision=0)
@@ -1228,7 +1267,7 @@ def create_ui(wrap_gradio_gpu_call):
             fn=modules.textual_inversion.ui.create_embedding,
             inputs=[
                 new_embedding_name,
-                initialization_text,
+                process_src,
                 nvpt,
             ],
             outputs=[
@@ -1238,6 +1277,20 @@ def create_ui(wrap_gradio_gpu_call):
             ]
         )
 
+        create_embedding_ae.click(
+            fn=modules.aesthetic_clip.generate_imgs_embd,
+            inputs=[
+                new_embedding_name_ae,
+                process_src_ae,
+                batch_ae
+            ],
+            outputs=[
+                aesthetic_imgs,
+                ti_output,
+                ti_outcome,
+            ]
+        )
+
         create_hypernetwork.click(
             fn=modules.hypernetworks.ui.create_hypernetwork,
             inputs=[