Commit 0401ecf
Author: saehoonkim
clean-up unnecessary code blocks and update docs for better readability
1 parent 0b5dbb0 · commit 0401ecf

10 files changed (+61 -50 lines)


assets/improved_sr_arch.jpg (1.1 MB, binary image)

assets/improved_sr_arch.png (-4.13 MB, binary file not shown)

configs/decoder_900M_vit_l.yaml (-2)

@@ -19,10 +19,8 @@ model:
     xf_layers: 0
     xf_heads: 0
     xf_final_ln: false
-    xf_padding: false
     resblock_updown: true
     learn_sigma: true
-    cache_text_emb: false
     text_drop: 0.3
     clip_emb_type: image
     clip_emb_drop: 0.1

configs/prior_1B_vit_l.yaml (-2)

@@ -7,10 +7,8 @@ model:
     xf_layers: 20
     xf_heads: 32
     xf_final_ln: true
-    xf_padding: false
     text_drop: 0.2
     clip_dim: 768
-    clip_xf_width: 768

 diffusion:
   steps: 1000
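
The keys dropped from both config files mirror the constructor clean-up further down (prior_model.py and xf.py): the code no longer reads them, so keeping them around would only leave dead configuration. A minimal sketch of how such a yaml block typically becomes the attribute-style `self._model_conf` seen in prior_model.py, assuming OmegaConf and the `model.hparams` key layout (the repo's actual loader and layout may differ):

from omegaconf import OmegaConf

# Hypothetical illustration: load the prior config and read hparams as attributes,
# mirroring the `self._model_conf.xf_layers` style used in prior_model.py below.
conf = OmegaConf.load("configs/prior_1B_vit_l.yaml")
hparams = conf.model.hparams  # assumed key layout

print(hparams.xf_layers, hparams.xf_heads)  # 20, 32 per the diff above
# `xf_padding` and `clip_xf_width` are intentionally absent: the constructors below
# stopped reading them in this commit.
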

karlo/models/decoder_model.py (+7)

@@ -11,6 +11,13 @@


 class Text2ImProgressiveModel(torch.nn.Module):
+    """
+    A decoder that generates 64x64px images based on the text prompt.
+
+    :param config: yaml config to define the decoder.
+    :param tokenizer: tokenizer used in CLIP.
+    """
+
     def __init__(
         self,
         config,

karlo/models/prior_model.py (+9 -2)

@@ -11,6 +11,15 @@


 class PriorDiffusionModel(torch.nn.Module):
+    """
+    A prior that generates a CLIP image feature based on the text prompt.
+
+    :param config: yaml config to define the prior.
+    :param tokenizer: tokenizer used in CLIP.
+    :param clip_mean: mean to normalize the CLIP image feature (zero-mean, unit variance).
+    :param clip_std: std to normalize the CLIP image feature (zero-mean, unit variance).
+    """
+
     def __init__(self, config, tokenizer, clip_mean, clip_std):
         super().__init__()

@@ -40,9 +49,7 @@ def __init__(self, config, tokenizer, clip_mean, clip_std):
             xf_layers=self._model_conf.xf_layers,
             xf_heads=self._model_conf.xf_heads,
             xf_final_ln=self._model_conf.xf_final_ln,
-            xf_padding=self._model_conf.xf_padding,
             clip_dim=self._model_conf.clip_dim,
-            clip_xf_width=self._model_conf.clip_xf_width,
        )

         cf_token, cf_mask = self.set_cf_text_tensor()
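
For reference, `clip_mean` and `clip_std` exist only to whiten the CLIP image features the prior operates on, as the new docstring states. A minimal sketch of that normalization and its inverse, assuming plain tensor arithmetic (the helper names are illustrative, not the repo's API):

import torch

def normalize_clip_feat(feat: torch.Tensor, clip_mean: torch.Tensor, clip_std: torch.Tensor) -> torch.Tensor:
    # Whiten a CLIP image feature to roughly zero mean / unit variance.
    return (feat - clip_mean) / clip_std

def denormalize_clip_feat(feat: torch.Tensor, clip_mean: torch.Tensor, clip_std: torch.Tensor) -> torch.Tensor:
    # Map a sample from the prior back to the original CLIP feature scale
    # before it is handed to the decoder.
    return feat * clip_std + clip_mean
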

karlo/modules/unet.py (+28 -27)

@@ -421,6 +421,7 @@ class UNetModel(nn.Module):
     :param conv_resample: if True, use learned convolutions for upsampling and
         downsampling.
     :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param clip_dim: dimension of the CLIP feature.
     :param num_classes: if specified (as an int), then this model will be
         class-conditional with `num_classes` classes.
     :param use_checkpoint: use gradient checkpointing to reduce memory usage.

@@ -431,6 +432,8 @@
         of heads for upsampling. Deprecated.
     :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
     :param resblock_updown: use residual blocks for up/downsampling.
+    :param encoder_channels: used to make the dimensions of query and key/value match in AttentionBlock.
+    :param use_time_embedding: use time embedding for conditioning.
     """

     def __init__(

@@ -672,6 +675,7 @@ class SuperResUNetModel(UNetModel):
     A UNetModel that performs super-resolution.

     Expects an extra kwarg `low_res` to condition on a low-resolution image.
+    Assumes that the low-resolution image and the input have the same shape.
     """

     def __init__(self, *args, **kwargs):

@@ -686,22 +690,21 @@ def __init__(self, *args, **kwargs):

     def forward(self, x, timesteps, low_res=None, **kwargs):
         _, _, new_height, new_width = x.shape
-        upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear")
-        x = th.cat([x, upsampled], dim=1)
+        assert new_height == low_res.shape[2] and new_width == low_res.shape[3]
+
+        x = th.cat([x, low_res], dim=1)
         return super().forward(x, timesteps, **kwargs)


 class PLMImUNet(UNetModel):
     """
-    A UNetModel that conditions on text with an encoding transformer.
-
-    Expects an extra kwarg `tokens` of text.
+    A UNetModel that conditions on text with a pretrained text encoder in CLIP.

     :param text_ctx: number of text tokens to expect.
     :param xf_width: width of the transformer.
-    :param xf_layers: depth of the transformer.
-    :param xf_heads: heads in the transformer.
-    :param xf_final_ln: use a LayerNorm after the output layer.
+    :param clip_emb_mult: number of extra tokens obtained by projecting the CLIP text feature.
+    :param clip_emb_type: type of condition (here, fixed to the CLIP image feature).
+    :param clip_emb_drop: dropout ratio of the CLIP image feature for classifier-free guidance.
     """

     def __init__(

@@ -725,21 +728,21 @@ def __init__(
         else:
             super().__init__(*args, **kwargs, encoder_channels=xf_width)

-        # Project text encoded feat seq from pre-trained LM
+        # Project text encoded feat seq from pre-trained text encoder in CLIP
         self.text_seq_proj = nn.Sequential(
             nn.Linear(self.clip_dim, xf_width),
             LayerNorm(xf_width),
         )
         # Project CLIP text feat
         self.text_feat_proj = nn.Linear(self.clip_dim, self.model_channels * 4)

-        if self.clip_emb_mult is not None:
-            assert (
-                self.clip_dim is not None
-            ), "CLIP representation dim should be specified"
-            self.clip_tok_proj = nn.Linear(
-                self.clip_dim, self.xf_width * self.clip_emb_mult
-            )
+        assert clip_emb_mult is not None
+        assert clip_emb_type == "image"
+        assert self.clip_dim is not None, "CLIP representation dim should be specified"
+
+        self.clip_tok_proj = nn.Linear(
+            self.clip_dim, self.xf_width * self.clip_emb_mult
+        )
         if self.clip_emb_drop > 0:
             self.cf_param = nn.Parameter(th.empty(self.clip_dim, dtype=th.float32))

@@ -761,21 +764,19 @@ def forward(
         bsz = x.shape[0]
         hs = []
         emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
-        if self.clip_dim is not None:
-            emb = emb + self.clip_emb(y)
+        emb = emb + self.clip_emb(y)

         xf_out = self.text_seq_proj(txt_feat_seq)
         xf_out = xf_out.permute(0, 2, 1)
         emb = emb + self.text_feat_proj(txt_feat)
-        if self.clip_emb_mult is not None:
-            xf_out = th.cat(
-                [
-                    self.clip_tok_proj(y).reshape(bsz, -1, self.clip_emb_mult),
-                    xf_out,
-                ],
-                dim=2,
-            )
-            mask = F.pad(mask, (self.clip_emb_mult, 0), value=True)
+        xf_out = th.cat(
+            [
+                self.clip_tok_proj(y).reshape(bsz, -1, self.clip_emb_mult),
+                xf_out,
+            ],
+            dim=2,
+        )
+        mask = F.pad(mask, (self.clip_emb_mult, 0), value=True)
         mask = th.where(mask, 0.0, float("-inf"))

         h = x
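
Note the behavioral change in SuperResUNetModel.forward: the bilinear upsampling of `low_res` moved out of the model, which now only checks shapes and concatenates. A minimal sketch of the caller-side step this implies, with illustrative names (the sampler's actual code may differ):

import torch.nn.functional as F

def prepare_low_res(low_res, target_height, target_width):
    # Resize the low-resolution conditioning image to the target resolution;
    # previously this interpolation happened inside SuperResUNetModel.forward.
    return F.interpolate(low_res, (target_height, target_width), mode="bilinear")

# Illustrative call site:
# low_res_up = prepare_low_res(low_res, x.shape[2], x.shape[3])
# out = sr_model(x, timesteps, low_res=low_res_up)
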

karlo/modules/xf.py (+2 -17)

@@ -138,13 +138,12 @@ class PriorTransformer(nn.Module):
     """
     A Causal Transformer that conditions on CLIP text embedding, text.

-    Expects an extra kwarg `tokens` of text.
-
     :param text_ctx: number of text tokens to expect.
     :param xf_width: width of the transformer.
     :param xf_layers: depth of the transformer.
     :param xf_heads: heads in the transformer.
     :param xf_final_ln: use a LayerNorm after the output layer.
+    :param clip_dim: dimension of the CLIP feature.
     """

     def __init__(

@@ -154,27 +153,23 @@ def __init__(
         xf_layers,
         xf_heads,
         xf_final_ln,
-        xf_padding,
         clip_dim,
-        clip_xf_width,
     ):
         super().__init__()

         self.text_ctx = text_ctx
         self.xf_width = xf_width
         self.xf_layers = xf_layers
         self.xf_heads = xf_heads
-        self.xf_padding = xf_padding
         self.clip_dim = clip_dim
-        self.clip_xf_width = clip_xf_width
         self.ext_len = 4

         self.time_embed = nn.Sequential(
             nn.Linear(xf_width, xf_width),
             nn.SiLU(),
             nn.Linear(xf_width, xf_width),
         )
-        self.text_enc_proj = nn.Linear(clip_xf_width, xf_width)
+        self.text_enc_proj = nn.Linear(clip_dim, xf_width)
         self.text_emb_proj = nn.Linear(clip_dim, xf_width)
         self.clip_img_proj = nn.Linear(clip_dim, xf_width)
         self.out_proj = nn.Linear(xf_width, clip_dim)

@@ -194,12 +189,6 @@ def __init__(
         )
         self.prd_emb = nn.Parameter(th.randn((1, 1, xf_width)))

-        if self.xf_padding:
-            self.padding_embedding = nn.Parameter(
-                th.empty(text_ctx + self.ext_len, xf_width)
-            )
-            nn.init.normal_(self.padding_embedding, std=0.01)
-
         nn.init.normal_(self.prd_emb, std=0.01)
         nn.init.normal_(self.positional_embedding, std=0.01)

@@ -229,10 +218,6 @@ def forward(
         ]
         input = th.cat(input_seq, dim=1)
         input = input + self.positional_embedding.to(input.dtype)
-        if self.xf_padding:
-            input = th.where(
-                mask[..., None], input, self.padding_embedding[None].to(input.dtype)
-            )

         mask = th.where(mask, 0.0, float("-inf"))
         mask = (mask[:, None, :] + causal_mask).to(input.dtype)
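
With `xf_padding` gone, padded token positions no longer receive a learned padding embedding; they are excluded solely through the additive attention mask built right after the deleted block. A minimal sketch of that conversion, following the diff's convention of True = real token, False = padding (function name is illustrative):

import torch as th

def build_attention_bias(pad_mask: th.Tensor, causal_mask: th.Tensor) -> th.Tensor:
    # Boolean padding mask -> additive bias: kept positions add 0.0, padded
    # positions add -inf so softmax gives them zero attention weight.
    bias = th.where(pad_mask, 0.0, float("-inf"))
    # Broadcast over query positions and combine with the causal mask.
    return bias[:, None, :] + causal_mask
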

karlo/sampler/i2i.py (+8)

@@ -13,6 +13,14 @@


 class I2ISampler(BaseSampler):
+    """
+    A sampler for image variation. In the original unCLIP paper, image variation transforms the noise obtained by DDIM inversion into a sample in RGB space.
+    Here, we simply transform white noise into an image, conditioned on the CLIP image feature.
+
+    :param root_dir: directory for model checkpoints.
+    :param sampling_type: ["default", "fast"]
+    """
+
     def __init__(
         self,
         root_dir: str,

karlo/sampler/t2i.py (+7)

@@ -13,6 +13,13 @@


 class T2ISampler(BaseSampler):
+    """
+    A sampler for text-to-image generation.
+
+    :param root_dir: directory for model checkpoints.
+    :param sampling_type: ["default", "fast"]
+    """
+
     def __init__(
         self,
         root_dir: str,
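
The new sampler docstrings pin down only the two constructor arguments. A minimal, hypothetical construction sketch based on those parameters alone (the repo may expose a dedicated loading helper, and further arguments may be required):

from karlo.sampler.t2i import T2ISampler
from karlo.sampler.i2i import I2ISampler

# Hypothetical: both samplers take a checkpoint directory and a sampling schedule.
t2i = T2ISampler(root_dir="/path/to/checkpoints", sampling_type="fast")     # text-to-image
i2i = I2ISampler(root_dir="/path/to/checkpoints", sampling_type="default")  # image variation
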
