KohakuBlueleaf · KohakuBlueleaf · Jun 30, 2024 · Jun 30, 2024 · Jun 30, 2024 · Jun 30, 2024
diff --git a/hunyuan_train.py b/hunyuan_train.py
@@ -29,8 +29,14 @@
 
 logger = logging.getLogger(__name__)
 
-import library.config_util as config_util
-import library.sdxl_train_util as sdxl_train_util
+from library import (
+    hunyuan_models,
+    hunyuan_utils,
+    sdxl_model_util,
+    sdxl_train_util,
+    train_util,
+    config_util,
+)
 from library.config_util import (
     ConfigSanitizer,
     BlueprintGenerator,
@@ -44,7 +50,6 @@
     apply_debiased_estimation,
     apply_masked_loss,
 )
-import library.hunyuan_utils as hunyuan_utils
 
 UNET_NUM_BLOCKS_FOR_BLOCK_LR = 23
 
@@ -158,6 +163,7 @@ def train(args):
     weight_dtype, save_dtype = train_util.prepare_dtype(args)
     vae_dtype = torch.float32 if args.no_half_vae else weight_dtype
 
+    hydit_version = hunyuan_models.MODEL_VERSION_HUNYUAN_V1_1 if args.use_extra_cond else hunyuan_models.MODEL_VERSION_HUNYUAN_V1_2
     # Load models
     (
         load_stable_diffusion_format,
@@ -167,11 +173,7 @@ def train(args):
         hydit,
         logit_scale,
         ckpt_info,
-    ) = hunyuan_utils.load_target_model(args, accelerator, "hydit", weight_dtype, args.use_extra_cond)
-    if args.use_extra_cond:
-        hydit_version = 'v1.1'
-    else:
-        hydit_version = 'v1.2'
+    ) = hunyuan_utils.load_target_model(args, accelerator, hydit_version, weight_dtype)
 
     # verify load/save model formats
     if load_stable_diffusion_format:
@@ -733,10 +735,10 @@ def optimizer_hook(parameter: torch.Tensor):
             current_loss = loss.detach().item()  # 平均なのでbatch sizeは関係ないはず
             if args.logging_dir is not None:
                 logs = {"loss": current_loss}
-                if block_lrs is None:
-                    train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=train_hydit)
-                else:
-                    append_block_lr_to_logs(block_lrs, logs, lr_scheduler, args.optimizer_type)  # U-Net is included in block_lrs
+                # if block_lrs is None:
+                train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=train_hydit)
+                # else:
+                #     train_util.append_block_lr_to_logs(block_lrs, logs, lr_scheduler, args.optimizer_type)  # U-Net is included in block_lrs
 
                 accelerator.log(logs, step=global_step)
 

diff --git a/hunyuan_train_network.py b/hunyuan_train_network.py
@@ -59,7 +59,7 @@ def load_target_model(self, args, weight_dtype, accelerator):
         ) = hunyuan_utils.load_target_model(
             args,
             accelerator,
-            hunyuan_models.MODEL_VERSION_HUNYUAN_V1_1,
+            hunyuan_models.MODEL_VERSION_HUNYUAN_V1_1 if args.use_extra_cond else hunyuan_models.MODEL_VERSION_HUNYUAN_V1_2,
             weight_dtype,
         )
 
@@ -68,7 +68,7 @@ def load_target_model(self, args, weight_dtype, accelerator):
         self.ckpt_info = ckpt_info
 
         return (
-            hunyuan_models.MODEL_VERSION_HUNYUAN_V1_1,
+            hunyuan_models.MODEL_VERSION_HUNYUAN_V1_1 if args.use_extra_cond else hunyuan_models.MODEL_VERSION_HUNYUAN_V1_2,
             [text_encoder1, text_encoder2],
             vae,
             unet,
@@ -156,23 +156,20 @@ def call_unet(
         noisy_latents = noisy_latents.to(
             weight_dtype
         )  # TODO check why noisy_latents is not weight_dtype
-
-        # get size embeddings
-        orig_size = batch["original_sizes_hw"]  # B, 2
-        crop_size = batch["crop_top_lefts"]  # B, 2
-        target_size = batch["target_sizes_hw"]  # B, 2
         B, C, H, W = noisy_latents.shape
 
-        style = torch.as_tensor([0] * B, device=accelerator.device)
-        image_meta_size = torch.concat(
-            [
-                orig_size,
-                target_size,
-                # Not following SDXL but following HunYuan's Implementation
-                # TODO examine if this is correct
-                torch.zeros_like(target_size),
-            ]
-        )
+        if args.use_extra_cond:
+            # get size embeddings
+            orig_size = batch["original_sizes_hw"]  # B, 2
+            crop_size = batch["crop_top_lefts"]  # B, 2
+            target_size = batch["target_sizes_hw"]  # B, 2
+
+            style = torch.as_tensor([0] * B, device=accelerator.device)
+            image_meta_size = torch.concat([orig_size, target_size, crop_size])
+        else:
+            style = None
+            image_meta_size = None
+
         freqs_cis_img = hunyuan_utils.calc_rope(H * 8, W * 8, 2, 88)
 
         # concat embeddings
@@ -226,6 +223,7 @@ def sample_images(
 def setup_parser() -> argparse.ArgumentParser:
     parser = train_network.setup_parser()
     sdxl_train_util.add_sdxl_training_arguments(parser)
+    hunyuan_utils.add_hydit_arguments(parser)
     return parser
 
 

diff --git a/library/hunyuan_models.py b/library/hunyuan_models.py
@@ -32,6 +32,7 @@
 
 VAE_SCALE_FACTOR = 0.13025
 MODEL_VERSION_HUNYUAN_V1_1 = "HunyuanDiT-v1.1"
+MODEL_VERSION_HUNYUAN_V1_2 = "HunyuanDiT-v1.2"
 
 
 class MT5Embedder(nn.Module):

diff --git a/library/hunyuan_utils.py b/library/hunyuan_utils.py
@@ -288,7 +288,7 @@ def match_mixed_precision(args, weight_dtype):
         return None
 
 
-def load_target_model(args, accelerator, model_version: str, weight_dtype, use_extra_cond=False):
+def load_target_model(args, accelerator, model_version: str, weight_dtype):
     _ = model_version   # unused
     model_dtype = match_mixed_precision(args, weight_dtype)  # prepare fp16/bf16
     for pi in range(accelerator.state.num_processes):
@@ -309,7 +309,7 @@ def load_target_model(args, accelerator, model_version: str, weight_dtype, use_e
                 args.pretrained_model_name_or_path,
                 model_dtype,
                 accelerator.device if args.lowram else "cpu",
-                use_extra_cond,
+                args.use_extra_cond,
             )
 
             # work on low-ram device

diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py
@@ -130,9 +130,9 @@ def build_metadata(
     # metadata["modelspec.hash_sha256"] = hash
 
     if hydit:
-        if hydit == 'v1.1':
+        if hydit == 'HunyuanDiT-v1.1':
             arch = ARCH_HYDIT_V1_1
-        elif hydit == 'v1.2':
+        elif hydit == 'HunyuanDiT-v1.2':
             arch = ARCH_HYDIT_V1_2
         else:
             raise ValueError(f"Invalid hydit version: {hydit}")