Merge branch 'main' into vsarge/seqlen_logging

NVIDIA · Jan 11, 2024 · f240597 · f240597
2 parents fe2070e + 8d4218e
commit f240597
Show file tree

Hide file tree

Showing 219 changed files with 29,128 additions and 1,072 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -31,7 +31,7 @@ ARG REQUIRE_AIS_CLI=false
 
 # Ensure apt-get won't prompt for selecting options
 ENV DEBIAN_FRONTEND=noninteractive
-# libavdevice-dev rerquired for latest torchaudio
+# libavdevice-dev required for latest torchaudio
 RUN apt-get update && \
   apt-get upgrade -y && \
   apt-get install -y \
@@ -42,6 +42,25 @@ RUN apt-get update && \
   libavdevice-dev && \
   rm -rf /var/lib/apt/lists/*
 
+# libtool, ... , libgts-dev are required for graphviz
+# graphviz is required for k2 and pynini visualization
+RUN apt-get update && \
+  apt-get install -y \
+  libtool \
+  libltdl-dev \
+  automake \
+  autoconf \
+  bison \
+  flex \
+  tcl \
+  ghostscript \
+  libgd-dev \
+  fontconfig \
+  libcairo2-dev \
+  libpango1.0-dev \
+  libgts-dev && \
+  rm -rf /var/lib/apt/lists/*
+
 WORKDIR /workspace/
 # Install megatron core, this can be removed once 0.3 pip package is released
 # We leave it here in case we need to work off of a specific commit in main
@@ -90,6 +109,17 @@ RUN pip install flash-attn
 # install numba for latest containers
 RUN pip install numba>=0.57.1
 
+COPY scripts /tmp/nemo/scripts/
+# install correct graphviz version (k2 and pynini visualization tool); a failed installation is skipped unless REQUIRE_K2 is true
+RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_graphviz.sh --docker); INSTALL_CODE=$?; \
+  echo ${INSTALL_MSG}; \
+  if [ ${INSTALL_CODE} -ne 0 ]; then \
+  echo "graphviz installation failed";  \
+  if [ "${REQUIRE_K2}" = true ]; then \
+  exit ${INSTALL_CODE};  \
+  else echo "Skipping failed graphviz installation"; fi \
+  else echo "graphviz installed successfully"; fi
+
 # install k2, skip if installation fails
 COPY scripts /tmp/nemo/scripts/
 RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL_CODE=$?; \

diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py
@@ -175,6 +175,9 @@ class TranscriptionConfig:
     # Set to False to return text instead of hypotheses from the transcribe function, so as to save memory
     return_hypotheses: bool = True
 
+    # Manifest key that holds the ground-truth text
+    gt_text_attr_name: str = "text"
+
 
 @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
 def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis]]:
@@ -370,6 +373,7 @@ def autocast(dtype=None):
     if cfg.calculate_wer:
         output_manifest_w_wer, total_res, _ = cal_write_wer(
             pred_manifest=output_filename,
+            gt_text_attr_name=cfg.gt_text_attr_name,
             pred_text_attr_name=pred_text_attr_name,
             clean_groundtruth_text=cfg.clean_groundtruth_text,
             langid=cfg.langid,

diff --git a/examples/multimodal/convert_ckpt_to_nemo.py b/examples/multimodal/convert_ckpt_to_nemo.py
@@ -36,7 +36,7 @@
 from nemo.collections.multimodal.models.text_to_image.imagen import MegatronImagen
 from nemo.collections.multimodal.models.text_to_image.instruct_pix2pix.ldm.ddpm_edit import MegatronLatentDiffusionEdit
 from nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm import MegatronLatentDiffusion
-from nemo.collections.multimodal.models.vision_language_foundation.clip import MegatronCLIPModel
+from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel
 from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
 from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
 from nemo.utils import AppState, logging

diff --git a/examples/multimodal/text_to_image/controlnet/conf/controlnet_infer.yaml b/examples/multimodal/text_to_image/controlnet/conf/controlnet_infer.yaml
@@ -0,0 +1,36 @@
+name: stable-diffusion-train
+
+infer:
+  unconditional_guidance_scale: 3
+  num_images_per_prompt: 4
+  hint_image_size: 512
+  height: 512
+  width: 512
+  down_factor: 8
+  inference_steps: 50
+  sampler_type: 'DDIM'
+  eta: 0
+  output_type: 'pil'
+  save_to_file: True
+  out_path: 'controlnet'
+  seed: 355
+  prompts:
+    - high quality picture of a house in oil painting style
+  control:
+    - /datasets/coco-stuff/house.png #images/val2017/000000001584.jpg
+  # If the input control is already the conditioning image, leave this as null.
+  # If a reconstruction target is used as the control, specify the preprocessing function that turns it into a conditioning image.
+  control_image_preprocess:
+
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  precision: 16
+  logger: False # logger provided by exp_manager
+
+model:
+  restore_from_path: /ckpts/controlnet/30k.nemo
+  precision: ${trainer.precision}
+  strength: 2.0
+  guess_mode: False
diff --git a/examples/multimodal/text_to_image/controlnet/conf/controlnet_v1-5.yaml b/examples/multimodal/text_to_image/controlnet/conf/controlnet_v1-5.yaml
@@ -0,0 +1,222 @@
+trainer:
+  devices: 2
+  num_nodes: 1
+  accelerator: gpu
+  precision: 16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: True
+  max_epochs: 3 # PTL default. In practice, max_steps will be reached first.
+  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10
+  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
+  gradient_clip_val: 1.0
+  benchmark: False
+  enable_model_summary: True
+  limit_val_batches: 0
+
+
+exp_manager:
+  explicit_log_dir: null
+  exp_dir: null
+  name: controlnet
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: stable-diffusion
+    group: controlnet
+    name: controlnet-v1.5
+    resume: True
+  create_checkpoint_callback: True
+  create_tensorboard_logger: True
+  checkpoint_callback_params:
+    save_top_k: -1
+    every_n_train_steps: 5000
+    every_n_epochs: 0
+    monitor: reduced_train_loss
+    filename: 'controlnet--{reduced_train_loss:.2f}-{step}-{consumed_samples}'
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  resume_from_checkpoint: ${model.resume_from_checkpoint}
+  ema:
+    enable: False
+    decay: 0.9999
+    validate_original_weights: False
+    every_n_steps: 1
+    cpu_offload: False
+
+
+
+
+model:
+  precision: ${trainer.precision}
+  # specify micro_batch_size, global_batch_size, and model parallelism
+  # gradient accumulation will be done automatically based on data_parallel_size
+  micro_batch_size: 4 # limited by GPU memory
+  global_batch_size: 8
+
+  linear_start: 0.00085
+  linear_end: 0.0120
+  num_timesteps_cond: 1
+  log_every_t: 200
+  timesteps: 1000
+  first_stage_key: images
+  cond_stage_key: captions
+  control_key: hint
+  image_size: 64
+  channels: 4
+  cond_stage_trainable: false
+  conditioning_key: crossattn
+  monitor: val/loss_simple_ema
+  scale_factor: 0.18215
+  use_ema: False
+  scale_by_std: False
+  ckpt_path:
+  ignore_keys: [ ]
+  parameterization: eps
+  clip_denoised: True
+  load_only_unet: False
+  cosine_s: 8e-3
+  given_betas:
+  original_elbo_weight: 0
+  v_posterior: 0
+  l_simple_weight: 1
+  use_positional_encodings: False
+  learn_logvar: False
+  logvar_init: 0
+  beta_schedule: linear
+  loss_type: l2
+  learning_rate: 1.0e-04
+  concat_mode: True
+  cond_stage_forward:
+  text_embedding_dropout_rate: 0.0
+  fused_opt: True
+  inductor: False
+  inductor_cudagraphs: False
+  capture_cudagraph_iters: -1 # -1 to disable
+  channels_last: True
+  only_mid_control: False
+  sd_locked: True
+
+  control_stage_config:
+    _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlNet
+    params:
+      from_pretrained_unet: /ckpts/v1-5-pruned.ckpt
+      from_NeMo: True
+      image_size: 32 # unused
+      in_channels: 4
+      hint_channels: 3
+      model_channels: 320
+      attention_resolutions: [ 4, 2, 1 ]
+      num_res_blocks: 2
+      channel_mult: [ 1, 2, 4, 4 ]
+      num_heads: 8
+      use_spatial_transformer: True
+      use_linear_in_transformer: False
+      transformer_depth: 1
+      context_dim: 768
+      use_checkpoint: False
+      legacy: False
+      use_flash_attention: False
+
+  unet_config:
+    _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel
+    from_pretrained: /ckpts/v1-5-pruned.ckpt
+    from_NeMo: True
+    image_size: 32 # unused
+    in_channels: 4
+    out_channels: 4
+    model_channels: 320
+    attention_resolutions:
+    - 4
+    - 2
+    - 1
+    num_res_blocks: 2
+    channel_mult:
+    - 1
+    - 2
+    - 4
+    - 4
+    num_heads: 8
+    use_spatial_transformer: True
+    transformer_depth: 1
+    context_dim: 768
+    use_checkpoint: False
+    legacy: False
+    use_flash_attention: False
+
+  first_stage_config:
+    _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
+    from_pretrained: /ckpts/vae.bin
+    embed_dim: 4
+    monitor: val/rec_loss
+    ddconfig:
+      double_z: true
+      z_channels: 4
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult:
+      - 1
+      - 2
+      - 4
+      - 4
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+    lossconfig:
+      target: torch.nn.Identity
+
+  cond_stage_config:
+    _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder
+    version: openai/clip-vit-large-patch14
+    device: cuda
+    max_length: 77
+
+  data:
+    num_workers: 16
+    synthetic_data: False # dataset_path and local_root_path can be empty when using synthetic data
+    synthetic_data_length: 10000
+    train:
+      dataset_path:
+        #- /datasets/tarfiles/fill50k.pkl
+        - /datasets/coco-stuff/coco-stuff-tarfiles/wdinfo-coco-stuff.pkl
+      augmentations:
+        resize_smallest_side: 512
+        center_crop_h_w: 512, 512
+        horizontal_flip: False
+      filterings:
+
+    webdataset:
+      infinite_sampler: False
+      local_root_path: /datasets/coco-stuff/coco-stuff-tarfiles
+
+  optim:
+    name: fused_adam
+    lr: 2e-5
+    weight_decay: 0.
+    betas:
+      - 0.9
+      - 0.999
+    sched:
+      name: WarmupHoldPolicy
+      warmup_steps: 0
+      hold_steps: 10000000000000 # Incredibly large value to hold the lr as constant
+
+  # Nsys profiling options
+  nsys_profile:
+    enabled: False
+    start_step: 10  # Global batch to start profiling
+    end_step: 10 # Global batch to end profiling
+    ranks: [ 0 ] # Global rank IDs to profile
+    gen_shape: False # Generate model and kernel details including input shapes
+
+  image_logger:
+    batch_frequency: 1000
+    max_images: 4
+
+  # Miscellaneous
+  seed: 1234
+  resume_from_checkpoint: null # manually set the checkpoint file to load from
+  apex_transformer_log_level: 30 # Python logging level; logs with severity greater than or equal to this are displayed
+  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)