fix: diffusion when input and output have different channel counts

beniz · beniz · commit cd264de330c1 · 2024-09-11T20:19:07.000+02:00
diff --git a/models/diffusion_networks.py b/models/diffusion_networks.py
@@ -90,7 +90,7 @@ def define_G(
     norm_layer = get_norm_layer(norm_type=G_norm)
 
     if model_type == "palette":
-        in_channel = model_input_nc * 2
+        in_channel = model_input_nc + model_output_nc
     else:  # CM
         in_channel = model_input_nc
         if (
diff --git a/models/modules/diffusion_generator.py b/models/modules/diffusion_generator.py
@@ -138,7 +138,15 @@ def restoration_ddpm(
         ), "num_timesteps must greater than sample_num"
         sample_inter = self.denoise_fn.model.num_timesteps_test // sample_num
 
-        y_t = self.default(y_t, lambda: torch.randn_like(y_cond))
+        # y_t must have the model's output channel count; with no y_0 (ground truth) available here, the count is read from the model
+        y_t_shape = list(y_cond.shape)
+        y_t_shape[1] = (
+            self.denoise_fn.model.out_channel
+        )  # set to number of model output channels
+        y_t = self.default(
+            y_t,
+            lambda: torch.randn(y_t_shape, device=y_cond.device, dtype=y_cond.dtype),
+        )
         ret_arr = y_t
 
         for i in tqdm(
@@ -439,6 +447,8 @@ def ddim_p_mean_variance(
 
     def forward(self, y_0, y_cond, mask, noise, cls, ref, dropout_prob=0.0):
         sequence_length = 0
+
+        # video input only: 5-D tensors are rearranged to 4-D
         if len(y_0.shape) == 5:
             sequence_length = y_0.shape[1]
             y_0, y_cond, mask = rearrange_5dto4d(y_0, y_cond, mask)