trainer:
  devices: 2
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: True
  max_epochs: 3 # with max_steps set to -1 (no step limit), training stops after 3 epochs
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
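  # Worked example for the formula above, using this config's values and assuming
  # no tensor/pipeline parallelism (so data_parallel_size = devices * num_nodes = 2):
  #   consumed_samples = global_step * 4 * 2 * 1 = global_step * 8
  # i.e. each global step consumes one global batch of 8 samples.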
  log_every_n_steps: 10
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: 1.0
  benchmark: False
  enable_model_summary: True
  limit_val_batches: 0

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: controlnet
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: stable-diffusion
    group: controlnet
    name: controlnet-v1.5
    resume: True
  create_checkpoint_callback: True
  create_tensorboard_logger: True
  checkpoint_callback_params:
    save_top_k: -1
    every_n_train_steps: 5000
    every_n_epochs: 0
    monitor: reduced_train_loss
    filename: 'controlnet--{reduced_train_loss:.2f}-{step}-{consumed_samples}'
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  resume_from_checkpoint: ${model.resume_from_checkpoint}
  ema:
    enable: False
    decay: 0.9999
    validate_original_weights: False
    every_n_steps: 1
    cpu_offload: False
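  # For reference (the standard EMA rule, not NeMo-specific code): when enabled,
  # shadow weights are updated every `every_n_steps` optimizer steps as roughly
  #   ema_w = decay * ema_w + (1 - decay) * w
  # so decay 0.9999 averages over on the order of 1 / (1 - decay) = 10,000 steps.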

model:
  precision: ${trainer.precision}
  # specify micro_batch_size, global_batch_size, and model parallelism
  # gradient accumulation will be done automatically based on data_parallel_size
  micro_batch_size: 4 # limited by GPU memory
  global_batch_size: 8
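  # Sanity check of the automatic gradient accumulation (assuming
  # data_parallel_size = trainer.devices * trainer.num_nodes = 2, no model parallelism):
  #   accumulation steps = global_batch_size / (micro_batch_size * data_parallel_size)
  #                      = 8 / (4 * 2) = 1
  # e.g. raising global_batch_size to 16 would give 2 automatic accumulation steps.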

  linear_start: 0.00085
  linear_end: 0.0120
  num_timesteps_cond: 1
  log_every_t: 200
  timesteps: 1000
  first_stage_key: images
  cond_stage_key: captions
  control_key: hint
  image_size: 64
  channels: 4
  cond_stage_trainable: False
  conditioning_key: crossattn
  monitor: val/loss_simple_ema
  scale_factor: 0.18215
  use_ema: False
  scale_by_std: False
  ckpt_path:
  ignore_keys: []
  parameterization: eps
  clip_denoised: True
  load_only_unet: False
  cosine_s: 8e-3
  given_betas:
  original_elbo_weight: 0
  v_posterior: 0
  l_simple_weight: 1
  use_positional_encodings: False
  learn_logvar: False
  logvar_init: 0
  beta_schedule: linear
  loss_type: l2
  learning_rate: 1.0e-04
  concat_mode: True
  cond_stage_forward:
  text_embedding_dropout_rate: 0.0
  fused_opt: True
  inductor: False
  inductor_cudagraphs: False
  capture_cudagraph_iters: -1 # -1 to disable
  channels_last: True
  only_mid_control: False
  sd_locked: True
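  # Note (standard ControlNet semantics, mirroring the original implementation):
  # sd_locked: True keeps the pretrained SD UNet frozen so only the ControlNet
  # branch and its zero-initialized projection layers train; only_mid_control: False
  # injects control features into the decoder blocks as well as the middle block.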

  control_stage_config:
    _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlNet
    params:
      from_pretrained_unet: /ckpts/v1-5-pruned.ckpt
      from_NeMo: True
      image_size: 32 # unused
      in_channels: 4
      hint_channels: 3
      model_channels: 320
      attention_resolutions: [4, 2, 1]
      num_res_blocks: 2
      channel_mult: [1, 2, 4, 4]
      num_heads: 8
      use_spatial_transformer: True
      use_linear_in_transformer: False
      transformer_depth: 1
      context_dim: 768
      use_checkpoint: False
      legacy: False
      use_flash_attention: False

  unet_config:
    _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel
    from_pretrained: /ckpts/v1-5-pruned.ckpt
    from_NeMo: True
    image_size: 32 # unused
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions:
      - 4
      - 2
      - 1
    num_res_blocks: 2
    channel_mult:
      - 1
      - 2
      - 4
      - 4
    num_heads: 8
    use_spatial_transformer: True
    transformer_depth: 1
    context_dim: 768
    use_checkpoint: False
    legacy: False
    use_flash_attention: False

  first_stage_config:
    _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
    from_pretrained: /ckpts/vae.bin
    embed_dim: 4
    monitor: val/rec_loss
    ddconfig:
      double_z: True
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity
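      # torch.nn.Identity stands in for the autoencoder's training loss: the VAE
      # is used here only as a frozen encoder/decoder, so no reconstruction loss
      # is needed during ControlNet training.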

  cond_stage_config:
    _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder
    version: openai/clip-vit-large-patch14
    device: cuda
    max_length: 77

  data:
    num_workers: 16
    synthetic_data: False # dataset_path and local_root_path can be empty when using synthetic data
    synthetic_data_length: 10000
    train:
      dataset_path:
        #- /datasets/tarfiles/fill50k.pkl
        - /datasets/coco-stuff/coco-stuff-tarfiles/wdinfo-coco-stuff.pkl
      augmentations:
        resize_smallest_side: 512
        center_crop_h_w: 512, 512
        horizontal_flip: False
      filterings:

    webdataset:
      infinite_sampler: False
      local_root_path: /datasets/coco-stuff/coco-stuff-tarfiles

  optim:
    name: fused_adam
    lr: 2e-5
    weight_decay: 0.
    betas:
      - 0.9
      - 0.999
    sched:
      name: WarmupHoldPolicy
      warmup_steps: 0
      hold_steps: 10000000000000 # very large value so the LR is held constant

  # Nsys profiling options
  nsys_profile:
    enabled: False
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
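  # Illustrative launch command (assumed; adjust paths and flags to your setup):
  # profiling is typically run under Nsight Systems with the CUDA profiler
  # capture range so start_step/end_step are honored, e.g.
  #   nsys profile -s none -t cuda,nvtx -o /results/controlnet_profile \
  #     --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop \
  #     python controlnet_train.py
  # where /results/controlnet_profile and controlnet_train.py are hypothetical
  # placeholders for the output path and the actual training entry point.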

  image_logger:
    batch_frequency: 1000
    max_images: 4

  # miscellaneous
  seed: 1234
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  apex_transformer_log_level: 30 # Python logging level; only messages at or above this severity are shown
  gradient_as_bucket_view: True # PyTorch DDP argument: allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)