
Commit 9527553
Author: Shu Zhang
Commit message: update training code and data
Parent: 54c8353

4 files changed (+190, −70 lines)

Diff for: configs/generate.yaml (+1, −1)

@@ -84,7 +84,7 @@ data:
     validation:
       target: edit_dataset.EditDataset
       params:
-        path: data/clip-filtered-dataset
+        path: ./data/training/instructpix2pix
         cache_dir: data/
         cache_name: data_10k
         split: val
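
Note on how these YAML blocks are consumed: each target/params pair is instantiated by importing the dotted path in target and calling it with params, in the style of ldm.util.instantiate_from_config from the stable-diffusion codebase this repo derives from. A minimal sketch of that convention (the helper below is an illustration written for this note, not code from the commit):

    from importlib import import_module

    def instantiate_from_config(config):
        # Import the dotted path in `target` and construct it with `params`;
        # mirrors the ldm.util helper these configs are written against.
        module, cls = config["target"].rsplit(".", 1)
        return getattr(import_module(module), cls)(**config.get("params", {}))

    # Hypothetical usage, building the validation dataset straight from the config:
    # cfg = OmegaConf.load("configs/generate.yaml")
    # val_dataset = instantiate_from_config(cfg["data"]["params"]["validation"])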

Diff for: configs/train_v21_base.yaml (new file, +125)

@@ -0,0 +1,125 @@
+# File modified by authors of InstructPix2Pix from original (https://github.com/CompVis/stable-diffusion).
+# See more details in LICENSE.
+
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm_edit_v21.LatentDiffusion
+  params:
+    ckpt_path: ./checkpoints/v2-1_512-ema-pruned.ckpt
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: edited
+    cond_stage_key: edit
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: hybrid
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: true
+    load_ema: false
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 0 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel_v21.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
+
+
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 32
+    num_workers: 2
+    train:
+      target: edit_dataset.EditDataset
+      params:
+        path_instructpix2pix: ./data/training/instructpix2pix
+        path_hive_0: ./data/training
+        path_hive_1: ./data/training/part_0_blip_prompt_new
+        path_hive_2: ./data/training/part_1_blip_prompt_new
+        split: train
+        min_resize_res: 256
+        max_resize_res: 256
+        crop_res: 256
+        flip_prob: 0.5
+    validation:
+      target: edit_dataset.EditDataset
+      params:
+        path_instructpix2pix: ./data/training/instructpix2pix
+        path_hive_0: ./data/training
+        path_hive_1: ./data/training/part_0_blip_prompt_new
+        path_hive_2: ./data/training/part_1_blip_prompt_new
+        split: val
+        min_resize_res: 256
+        max_resize_res: 256
+        crop_res: 256
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 2000
+        max_images: 2
+        increase_log_steps: False
+
+  trainer:
+    max_epochs: 3000
+    benchmark: True
+    accumulate_grad_batches: 4
+    check_val_every_n_epoch: 4
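
Two details of this config are worth spelling out. First, the effective batch size is batch_size 32 × accumulate_grad_batches 4 = 128 samples per optimizer step. Second, with conditioning_key: hybrid, the InstructPix2Pix-style UNet receives the noisy latent of the edited image concatenated channel-wise with the VAE latent of the source image, which is why in_channels is 8 while channels and out_channels stay 4. A minimal shape sketch of that concatenation (an illustration of the convention, not code from this commit):

    import torch

    # Shapes follow image_size: 32 and channels: 4 from the config above.
    z_noisy = torch.randn(1, 4, 32, 32)    # noisy latent of the edited image
    c_concat = torch.randn(1, 4, 32, 32)   # VAE latent of the source image
    unet_in = torch.cat([z_noisy, c_concat], dim=1)
    assert unet_in.shape == (1, 8, 32, 32)  # matches in_channels: 8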

Diff for: edit_dataset.py (+62, −67)

@@ -21,13 +21,16 @@
 from PIL import Image
 from torch.utils.data import Dataset
 import jsonlines
+from collections import deque


 class EditDataset(Dataset):
     def __init__(
         self,
-        path_official: str,
-        path_ours: str,
+        path_instructpix2pix: str,
+        path_hive_0: str,
+        path_hive_1: str,
+        path_hive_2: str,
         split: str = "train",
         splits: tuple[float, float, float] = (0.9, 0.05, 0.05),
         min_resize_res: int = 256,

@@ -37,51 +40,91 @@ def __init__(
     ):
         assert split in ("train", "val", "test")
         assert sum(splits) == 1
-        self.path_official = path_official
-        self.path_ours = path_ours
+        self.path_instructpix2pix = path_instructpix2pix
+        self.path_hive_0 = path_hive_0
+        self.path_hive_1 = path_hive_1
+        self.path_hive_2 = path_hive_2
         self.min_resize_res = min_resize_res
         self.max_resize_res = max_resize_res
         self.crop_res = crop_res
         self.flip_prob = flip_prob
-        # load official dataset
-        with open(Path(self.path_official, "seeds.json")) as f:
-            self.seeds = json.load(f)
+        self.seeds = []
+        self.instructions = []
+        self.source_imgs = []
+        self.edited_imgs = []
+        # load instructpix2pix dataset
+        with open(Path(self.path_instructpix2pix, "seeds.json")) as f:
+            seeds = json.load(f)
         split_0, split_1 = {
             "train": (0.0, splits[0]),
             "val": (splits[0], splits[0] + splits[1]),
             "test": (splits[0] + splits[1], 1.0),
         }[split]

-        idx_0 = math.floor(split_0 * len(self.seeds))
-        idx_1 = math.floor(split_1 * len(self.seeds))
-        self.seeds = self.seeds[idx_0:idx_1]
+        idx_0 = math.floor(split_0 * len(seeds))
+        idx_1 = math.floor(split_1 * len(seeds))
+        seeds = seeds[idx_0:idx_1]
+
+        for seed in seeds:
+            seed = deque(seed)
+            seed.appendleft('')
+            seed.appendleft('instructpix2pix')
+            self.seeds.append(list(seed))
+
+
+        # load HIVE dataset first part

-        # load in-house dataset
-        self.instructions = []
-        self.source_imgs = []
-        self.edited_imgs = []
         cnt = 0
-        with jsonlines.open(Path(self.path_ours, "training_1M.jsonl")) as reader:
+        with jsonlines.open(Path(self.path_hive_0, "training_cycle.jsonl")) as reader:
             for ll in reader:
                 self.instructions.append(ll['instruction'])
                 self.source_imgs.append(ll['source_img'])
                 self.edited_imgs.append(ll['edited_img'])
-                self.seeds.append(['in_house', [cnt]])
+                self.seeds.append(['hive_0', '', '', [cnt]])
                 cnt += 1

+        # load HIVE dataset second part
+        with open(Path(self.path_hive_1, "seeds.json")) as f:
+            seeds = json.load(f)
+        for seed in seeds:
+            seed = deque(seed)
+            seed.appendleft('hive_1')
+            self.seeds.append(list(seed))
+        # load HIVE dataset third part
+        with open(Path(self.path_hive_2, "seeds.json")) as f:
+            seeds = json.load(f)
+        for seed in seeds:
+            seed = deque(seed)
+            seed.appendleft('hive_2')
+            self.seeds.append(list(seed))
+
     def __len__(self) -> int:
         return len(self.seeds)

     def __getitem__(self, i: int) -> dict[str, Any]:

-        name, seeds = self.seeds[i]
-        if name != 'in_house':
-            propt_dir = Path(self.path_official, name)
+        name_0, name_1, name_2, seeds = self.seeds[i]
+        if name_0 == 'instructpix2pix':
+            propt_dir = Path(self.path_instructpix2pix, name_2)
             seed = seeds[torch.randint(0, len(seeds), ()).item()]
             with open(propt_dir.joinpath("prompt.json")) as fp:
                 prompt = json.load(fp)["edit"]
             image_0 = Image.open(propt_dir.joinpath(f"{seed}_0.jpg"))
             image_1 = Image.open(propt_dir.joinpath(f"{seed}_1.jpg"))
+        elif name_0 == 'hive_1':
+            propt_dir = Path(self.path_hive_1, name_1, name_2)
+            seed = seeds[torch.randint(0, len(seeds), ()).item()]
+            with open(propt_dir.joinpath("prompt.json")) as fp:
+                prompt = json.load(fp)["instruction"]
+            image_0 = Image.open(propt_dir.joinpath(f"{seed}_0.jpg"))
+            image_1 = Image.open(propt_dir.joinpath(f"{seed}_1.jpg"))
+        elif name_0 == 'hive_2':
+            propt_dir = Path(self.path_hive_2, name_1, name_2)
+            seed = seeds[torch.randint(0, len(seeds), ()).item()]
+            with open(propt_dir.joinpath("prompt.json")) as fp:
+                prompt = json.load(fp)["instruction"]
+            image_0 = Image.open(propt_dir.joinpath(f"{seed}_0.jpg"))
+            image_1 = Image.open(propt_dir.joinpath(f"{seed}_1.jpg"))
         else:
             j = seeds[0]
             image_0 = Image.open(self.source_imgs[j])

@@ -101,51 +144,3 @@ def __getitem__(self, i: int) -> dict[str, Any]:

         return dict(edited=image_1, edit=dict(c_concat=image_0, c_crossattn=prompt))

-
-class EditDatasetEval(Dataset):
-    def __init__(
-        self,
-        path: str,
-        split: str = "train",
-        splits: tuple[float, float, float] = (0.9, 0.05, 0.05),
-        res: int = 256,
-    ):
-        assert split in ("train", "val", "test")
-        assert sum(splits) == 1
-        self.path = path
-        self.res = res
-
-        with open(Path(self.path, "seeds.json")) as f:
-            self.seeds = json.load(f)
-
-        split_0, split_1 = {
-            "train": (0.0, splits[0]),
-            "val": (splits[0], splits[0] + splits[1]),
-            "test": (splits[0] + splits[1], 1.0),
-        }[split]
-
-        idx_0 = math.floor(split_0 * len(self.seeds))
-        idx_1 = math.floor(split_1 * len(self.seeds))
-        self.seeds = self.seeds[idx_0:idx_1]
-
-    def __len__(self) -> int:
-        return len(self.seeds)
-
-    def __getitem__(self, i: int) -> dict[str, Any]:
-        name, seeds = self.seeds[i]
-        propt_dir = Path(self.path, name)
-        seed = seeds[torch.randint(0, len(seeds), ()).item()]
-        with open(propt_dir.joinpath("prompt.json")) as fp:
-            prompt = json.load(fp)
-        edit = prompt["edit"]
-        input_prompt = prompt["input"]
-        output_prompt = prompt["output"]
-
-        image_0 = Image.open(propt_dir.joinpath(f"{seed}_0.jpg"))
-
-        reize_res = torch.randint(self.res, self.res + 1, ()).item()
-        image_0 = image_0.resize((reize_res, reize_res), Image.Resampling.LANCZOS)
-
-        image_0 = rearrange(2 * torch.tensor(np.array(image_0)).float() / 255 - 1, "h w c -> c h w")
-
-        return dict(image_0=image_0, input_prompt=input_prompt, edit=edit, output_prompt=output_prompt)
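
To trace the new indexing scheme: __getitem__ now unpacks every entry of self.seeds as (name_0, name_1, name_2, seeds), so the constructor left-pads the shorter entries from each source to four fields via the deque.appendleft calls. A hypothetical illustration of the normalized entries (the field layout follows the diff; directory names and numbers are placeholders):

    # instructpix2pix seeds.json entries are [prompt_dir, [seed, ...]] and get
    # '' plus the source tag prepended; HIVE part 1/2 entries are
    # [part_dir, prompt_dir, [seed, ...]] and only get the tag prepended.
    normalized = [
        ["instructpix2pix", "", "prompt_dir_a", [1234, 5678]],
        ["hive_0", "", "", [0]],  # index into the jsonl-backed image lists
        ["hive_1", "part_dir", "prompt_dir_b", [42]],
    ]
    name_0, name_1, name_2, seed_list = normalized[0]

collections.deque is used purely for its appendleft; list.insert(0, ...) would behave identically here.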

Diff for: main.py (+2, −2)

@@ -111,7 +111,7 @@ def str2bool(v):
         "-s",
         "--seed",
         type=int,
-        default=23,
+        default=100,
         help="seed for seed_everything",
     )
     parser.add_argument(

@@ -125,7 +125,7 @@ def str2bool(v):
         "-l",
         "--logdir",
         type=str,
-        default="/export/laion-aesthetics-v2/instruct_pix2pix/logs",
+        default="./logs",
         help="directory for logging dat shit",
     )
     parser.add_argument(
