trainer:
  devices: 2
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: True
  max_epochs: 3 # with max_steps set to -1 (no step limit), training stops after 3 epochs
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
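  # Worked example for the formula above, using this config's values and assuming
  # no tensor/pipeline parallelism (so data_parallel_size = devices * num_nodes = 2):
  #   consumed_samples = global_step * 4 * 2 * 1 = global_step * 8
  # i.e. each global step consumes one global batch of 8 samples.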
  log_every_n_steps: 10
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: 1.0
  benchmark: False
  enable_model_summary: True
  limit_val_batches: 0

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: controlnet
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: stable-diffusion
    group: controlnet
    name: controlnet-v1.5
    resume: True
  create_checkpoint_callback: True
  create_tensorboard_logger: True
  checkpoint_callback_params:
    save_top_k: -1
    every_n_train_steps: 5000
    every_n_epochs: 0
    monitor: reduced_train_loss
    filename: 'controlnet--{reduced_train_loss:.2f}-{step}-{consumed_samples}'
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  resume_from_checkpoint: ${model.resume_from_checkpoint}
  ema:
    enable: False
    decay: 0.9999
    validate_original_weights: False
    every_n_steps: 1
    cpu_offload: False
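  # For reference (the standard EMA rule, not NeMo-specific code): when enabled,
  # shadow weights are updated every `every_n_steps` optimizer steps as roughly
  #   ema_w = decay * ema_w + (1 - decay) * w
  # so decay 0.9999 averages over on the order of 1 / (1 - decay) = 10,000 steps.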

model:
  precision: ${trainer.precision}
  # specify micro_batch_size, global_batch_size, and model parallelism
  # gradient accumulation will be done automatically based on data_parallel_size
  micro_batch_size: 4 # limited by GPU memory
  global_batch_size: 8
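  # Sanity check of the automatic gradient accumulation (assuming
  # data_parallel_size = trainer.devices * trainer.num_nodes = 2, no model parallelism):
  #   accumulation steps = global_batch_size / (micro_batch_size * data_parallel_size)
  #                      = 8 / (4 * 2) = 1
  # e.g. raising global_batch_size to 16 would give 2 automatic accumulation steps.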

  linear_start: 0.00085
  linear_end: 0.0120
  num_timesteps_cond: 1
  log_every_t: 200
  timesteps: 1000
  first_stage_key: images
  cond_stage_key: captions
  control_key: hint
  image_size: 64
  channels: 4
  cond_stage_trainable: False
  conditioning_key: crossattn
  monitor: val/loss_simple_ema
  scale_factor: 0.18215
  use_ema: False
  scale_by_std: False
  ckpt_path:
  ignore_keys: []
  parameterization: eps
  clip_denoised: True
  load_only_unet: False
  cosine_s: 8e-3
  given_betas:
  original_elbo_weight: 0
  v_posterior: 0
  l_simple_weight: 1
  use_positional_encodings: False
  learn_logvar: False
  logvar_init: 0
  beta_schedule: linear
  loss_type: l2
  learning_rate: 1.0e-04
  concat_mode: True
  cond_stage_forward:
  text_embedding_dropout_rate: 0.0
  fused_opt: True
  inductor: False
  inductor_cudagraphs: False
  capture_cudagraph_iters: -1 # -1 to disable
  channels_last: True
  only_mid_control: False
  sd_locked: True
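  # Note (standard ControlNet semantics, mirroring the original implementation):
  # sd_locked: True keeps the pretrained SD UNet frozen so only the ControlNet
  # branch and its zero-initialized projection layers train; only_mid_control: False
  # injects control features into the decoder blocks as well as the middle block.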

  control_stage_config:
    _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlNet
    params:
      from_pretrained_unet: /ckpts/v1-5-pruned.ckpt
      from_NeMo: True
      image_size: 32 # unused
      in_channels: 4
      hint_channels: 3
      model_channels: 320
      attention_resolutions: [4, 2, 1]
      num_res_blocks: 2
      channel_mult: [1, 2, 4, 4]
      num_heads: 8
      use_spatial_transformer: True
      use_linear_in_transformer: False
      transformer_depth: 1
      context_dim: 768
      use_checkpoint: False
      legacy: False
      use_flash_attention: False

  unet_config:
    _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel
    from_pretrained: /ckpts/v1-5-pruned.ckpt
    from_NeMo: True
    image_size: 32 # unused
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions:
      - 4
      - 2
      - 1
    num_res_blocks: 2
    channel_mult:
      - 1
      - 2
      - 4
      - 4
    num_heads: 8
    use_spatial_transformer: True
    transformer_depth: 1
    context_dim: 768
    use_checkpoint: False
    legacy: False
    use_flash_attention: False

  first_stage_config:
    _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
    from_pretrained: /ckpts/vae.bin
    embed_dim: 4
    monitor: val/rec_loss
    ddconfig:
      double_z: True
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity
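      # torch.nn.Identity stands in for the autoencoder's training loss: the VAE
      # is used here only as a frozen encoder/decoder, so no reconstruction loss
      # is needed during ControlNet training.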

  cond_stage_config:
    _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder
    version: openai/clip-vit-large-patch14
    device: cuda
    max_length: 77

  data:
    num_workers: 16
    synthetic_data: False # dataset_path and local_root_path can be empty when using synthetic data
    synthetic_data_length: 10000
    train:
      dataset_path:
        #- /datasets/tarfiles/fill50k.pkl
        - /datasets/coco-stuff/coco-stuff-tarfiles/wdinfo-coco-stuff.pkl
      augmentations:
        resize_smallest_side: 512
        center_crop_h_w: 512, 512
        horizontal_flip: False
      filterings:

    webdataset:
      infinite_sampler: False
      local_root_path: /datasets/coco-stuff/coco-stuff-tarfiles

  optim:
    name: fused_adam
    lr: 2e-5
    weight_decay: 0.
    betas:
      - 0.9
      - 0.999
    sched:
      name: WarmupHoldPolicy
      warmup_steps: 0
      hold_steps: 10000000000000 # very large value so the LR is held constant

  # Nsys profiling options
  nsys_profile:
    enabled: False
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
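  # Illustrative launch command (assumed; adjust paths and flags to your setup):
  # profiling is typically run under Nsight Systems with the CUDA profiler
  # capture range so start_step/end_step are honored, e.g.
  #   nsys profile -s none -t cuda,nvtx -o /results/controlnet_profile \
  #     --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop \
  #     python controlnet_train.py
  # where /results/controlnet_profile and controlnet_train.py are hypothetical
  # placeholders for the output path and the actual training entry point.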

  image_logger:
    batch_frequency: 1000
    max_images: 4

  # miscellaneous
  seed: 1234
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  apex_transformer_log_level: 30 # Python logging level; only messages at or above this severity are shown
  gradient_as_bucket_view: True # PyTorch DDP argument: allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)