From 5dd097d341a9cb2649733285d57e1efe6f35c0bd Mon Sep 17 00:00:00 2001 From: Junyu Chen Date: Mon, 11 Nov 2024 20:29:10 -0800 Subject: [PATCH] support uvit-2b --- applications/dc_ae/README.md | 16 ++++++++-------- efficientvit/diffusion_model_zoo.py | 23 ++++++++++++++++++++++- efficientvit/diffusioncore/models/uvit.py | 11 +++++++++++ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/applications/dc_ae/README.md b/applications/dc_ae/README.md index 7c097be..f2f37d4 100644 --- a/applications/dc_ae/README.md +++ b/applications/dc_ae/README.md @@ -122,7 +122,7 @@ save_image(image_samples * 0.5 + 0.5, "demo_dc_ae_diffusion.png", nrow=int(np.sq - Generate reference for FID computation: ```bash -torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.generate_reference \ +torchrun --nnodes=1 --nproc_per_node=8 -m applications.dc_ae.generate_reference \ dataset=imagenet imagenet.resolution=512 imagenet.image_mean=[0.,0.,0.] imagenet.image_std=[1.,1.,1.] split=test \ fid.save_path=assets/data/fid/imagenet_512_val.npz ``` @@ -131,7 +131,7 @@ torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.generate_reference ```bash # full DC-AE model list: https://huggingface.co/collections/mit-han-lab/dc-ae-670085b9400ad7197bb1009b -torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_model dataset=imagenet_512 model=dc-ae-f64c128-in-1.0 run_dir=tmp +torchrun --nnodes=1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_model dataset=imagenet_512 model=dc-ae-f64c128-in-1.0 run_dir=tmp # Expected results: # fid: 0.2167766520628902 @@ -145,7 +145,7 @@ torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_model da ``` bash # full DC-AE-Diffusion model list: https://huggingface.co/collections/mit-han-lab/dc-ae-diffusion-670dbb8d6b6914cf24c1a49d -torchrun --nnodes 1 --nproc_per_node=1 -m applications.dc_ae.demo_dc_ae_diffusion_model model=dc-ae-f64c128-in-1.0-uvit-h-in-512px-train2000k run_dir=.demo/diffusion/dc-ae-f64c128-in-1.0-uvit-h-in-512px-train2000k +torchrun --nnodes=1 --nproc_per_node=1 -m applications.dc_ae.demo_dc_ae_diffusion_model model=dc-ae-f64c128-in-1.0-uvit-h-in-512px-train2000k run_dir=.demo/diffusion/dc-ae-f64c128-in-1.0-uvit-h-in-512px-train2000k ``` Expected results: @@ -159,7 +159,7 @@ Expected results: ```bash # generate reference for FID computation -torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.generate_reference \ +torchrun --nnodes=1 --nproc_per_node=8 -m applications.dc_ae.generate_reference \ dataset=imagenet imagenet.resolution=512 imagenet.image_mean=[0.,0.,0.] imagenet.image_std=[1.,1.,1.] split=train \ fid.save_path=assets/data/fid/imagenet_512_train.npz ``` @@ -169,7 +169,7 @@ torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.generate_reference ```bash # full DC-AE-Diffusion model list: https://huggingface.co/collections/mit-han-lab/dc-ae-diffusion-670dbb8d6b6914cf24c1a49d -torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_diffusion_model dataset=imagenet_512 model=dc-ae-f64c128-in-1.0-uvit-h-in-512px cfg_scale=1.0 run_dir=tmp +torchrun --nnodes=1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_diffusion_model dataset=imagenet_512 model=dc-ae-f64c128-in-1.0-uvit-h-in-512px cfg_scale=1.0 run_dir=tmp # Expected results: # fid: 13.754458694549271 ``` @@ -180,7 +180,7 @@ torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_diffusio # full DC-AE-Diffusion model list: https://huggingface.co/collections/mit-han-lab/dc-ae-diffusion-670dbb8d6b6914cf24c1a49d # cfg=1.3 for mit-han-lab/dc-ae-f32c32-in-1.0-dit-xl-in-512px # and cfg=1.5 for all other models -torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_diffusion_model dataset=imagenet_512 model=dc-ae-f64c128-in-1.0-uvit-h-in-512px cfg_scale=1.5 run_dir=tmp +torchrun --nnodes=1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_diffusion_model dataset=imagenet_512 model=dc-ae-f64c128-in-1.0-uvit-h-in-512px cfg_scale=1.5 run_dir=tmp # Expected results: # fid: 2.963459255529642 ``` @@ -190,7 +190,7 @@ torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.eval_dc_ae_diffusio - Generate and save latent: ```bash -torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.dc_ae_generate_latent resolution=512 \ +torchrun --nnodes=1 --nproc_per_node=8 -m applications.dc_ae.dc_ae_generate_latent resolution=512 \ image_root_path=~/dataset/imagenet/train batch_size=64 \ model_name=dc-ae-f64c128-in-1.0 scaling_factor=0.2889 \ latent_root_path=assets/data/latent/dc_ae_f64c128_in_1.0/imagenet_512 @@ -200,7 +200,7 @@ torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.dc_ae_generate_late ``` bash # Example: DC-AE-f64 + UViT-H on ImageNet 512x512 -torchrun --nnodes 1 --nproc_per_node=8 -m applications.dc_ae.train_dc_ae_diffusion_model resolution=512 \ +torchrun --nnodes=1 --nproc_per_node=8 -m applications.dc_ae.train_dc_ae_diffusion_model resolution=512 \ train_dataset=latent_imagenet latent_imagenet.batch_size=128 latent_imagenet.data_dir=assets/data/latent/dc_ae_f64c128_in_1.0/imagenet_512 \ evaluate_dataset=sample_class sample_class.num_samples=50000 \ autoencoder=dc-ae-f64c128-in-1.0 scaling_factor=0.2889 \ diff --git a/efficientvit/diffusion_model_zoo.py b/efficientvit/diffusion_model_zoo.py index 722cca7..f0aa5ac 100644 --- a/efficientvit/diffusion_model_zoo.py +++ b/efficientvit/diffusion_model_zoo.py @@ -6,7 +6,7 @@ from efficientvit.diffusioncore.evaluator import Evaluator, EvaluatorConfig from efficientvit.diffusioncore.models.dit import dc_ae_dit_xl_in_512px -from efficientvit.diffusioncore.models.uvit import dc_ae_uvit_h_in_512px, dc_ae_uvit_s_in_512px +from efficientvit.diffusioncore.models.uvit import dc_ae_uvit_2b_in_512px, dc_ae_uvit_h_in_512px, dc_ae_uvit_s_in_512px __all__ = ["create_dc_ae_diffusion_model", "DCAE_Diffusion_HF"] @@ -34,6 +34,13 @@ 32, None, ), + "dc-ae-f32c32-in-1.0-uvit-2b-in-512px": ( + dc_ae_uvit_2b_in_512px, + "dc-ae-f32c32-in-1.0", + 0.3189, + 32, + None, + ), ################################################################################ "dc-ae-f64c128-in-1.0-uvit-h-in-512px": ( dc_ae_uvit_h_in_512px, @@ -49,6 +56,20 @@ 128, None, ), + "dc-ae-f64c128-in-1.0-uvit-2b-in-512px": ( + dc_ae_uvit_2b_in_512px, + "dc-ae-f64c128-in-1.0", + 0.2889, + 128, + None, + ), + "dc-ae-f64c128-in-1.0-uvit-2b-in-512px-train2000k": ( + dc_ae_uvit_2b_in_512px, + "dc-ae-f64c128-in-1.0", + 0.2889, + 128, + None, + ), } diff --git a/efficientvit/diffusioncore/models/uvit.py b/efficientvit/diffusioncore/models/uvit.py index 04000af..97a4519 100644 --- a/efficientvit/diffusioncore/models/uvit.py +++ b/efficientvit/diffusioncore/models/uvit.py @@ -511,3 +511,14 @@ def dc_ae_uvit_h_in_512px(ae_name: str, scaling_factor: float, in_channels: int, f"uvit.pretrained_path={'null' if pretrained_path is None else pretrained_path} " "fid.ref_path=assets/data/fid/imagenet_512_train.npz" ) + + +def dc_ae_uvit_2b_in_512px( + ae_name: str, scaling_factor: float, in_channels: int, pretrained_path: Optional[str] +) -> str: + return ( + f"autoencoder={ae_name} scaling_factor={scaling_factor} " + f"model=uvit uvit.depth=28 uvit.hidden_size=2048 uvit.num_heads=32 uvit.in_channels={in_channels} uvit.patch_size=1 " + f"uvit.pretrained_path={'null' if pretrained_path is None else pretrained_path} " + "fid.ref_path=assets/data/fid/imagenet_512_train.npz" + )