@@ -16,6 +16,7 @@
import math
import os
import re
import sys

import torch
import torchvision
@@ -28,10 +29,14 @@

# pylint: disable=line-too-long
from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer
from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy
from nemo.core.config import hydra_runner

if sys.version_info >= (3, 8):
from typing import TypedDict
else:
from typing_extensions import TypedDict

"""
This is the script to run multimodal autoregressive text generation.

@@ -89,6 +94,23 @@
"""


class LengthParam(TypedDict):
    max_length: int  # The maximum length of the sequence to be generated.
    min_length: int  # The minimum length of the sequence to be generated.


class SamplingParam(TypedDict):
    use_greedy: bool  # Whether to use greedy decoding instead of sampling
    temperature: float  # sampling temperature
    top_k: int  # The number of highest-probability vocabulary tokens to keep for top-k filtering.
    top_p: float  # If set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    repetition_penalty: float  # The parameter for repetition penalty. 1.0 means no penalty.
    add_BOS: bool  # add the BOS token at the beginning of the prompt
    all_probs: bool  # whether to return the log prob for all the tokens in the vocab
    compute_logprob: bool  # a flag used to compute the logprob of all the input text, a very special case of running inference, default False
    end_strings: List[str]  # generation will stop when one of these strings is generated


def to_img(tokens_string, image_tokenizer):
"""Converts visual tokens to images

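Since the scripts now define LengthParam and SamplingParam locally rather than importing them from nemo.collections.nlp.modules.common.transformer.text_generation, callers populate them as plain dicts. A minimal sketch with illustrative values only (not taken from the script), assuming List is imported from typing for the end_strings annotation:

length_params: LengthParam = {
    "max_length": 128,  # illustrative value: maximum number of tokens to generate
    "min_length": 1,    # illustrative value: minimum number of tokens to generate
}

sampling_params: SamplingParam = {
    "use_greedy": False,        # sample rather than greedy-decode
    "temperature": 0.9,
    "top_k": 0,
    "top_p": 0.9,
    "repetition_penalty": 1.0,  # 1.0 disables the penalty
    "add_BOS": True,
    "all_probs": False,
    "compute_logprob": False,
    "end_strings": ["<|endoftext|>"],  # assumed end string, model-dependent
}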
@@ -14,6 +14,7 @@


import datetime
import sys

import torch
import torchvision
@@ -30,10 +31,14 @@
from transformers import AutoModel, AutoTokenizer

# pylint: disable=line-too-long
from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy
from nemo.core.config import hydra_runner

if sys.version_info >= (3, 8):
from typing import TypedDict
else:
from typing_extensions import TypedDict

"""
This is the script to run multimodal autoregressive text generation.

@@ -94,6 +99,23 @@
VQ_HUB = "BAAI/Emu3-VisionTokenizer"


class LengthParam(TypedDict):
    max_length: int  # The maximum length of the sequence to be generated.
    min_length: int  # The minimum length of the sequence to be generated.


class SamplingParam(TypedDict):
    use_greedy: bool  # Whether to use greedy decoding instead of sampling
    temperature: float  # sampling temperature
    top_k: int  # The number of highest-probability vocabulary tokens to keep for top-k filtering.
    top_p: float  # If set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    repetition_penalty: float  # The parameter for repetition penalty. 1.0 means no penalty.
    add_BOS: bool  # add the BOS token at the beginning of the prompt
    all_probs: bool  # whether to return the log prob for all the tokens in the vocab
    compute_logprob: bool  # a flag used to compute the logprob of all the input text, a very special case of running inference, default False
    end_strings: List[str]  # generation will stop when one of these strings is generated


def to_imgstr(image_tokens, tokenizer):
"""Convert integer image tokens to visual tokens string"""
image_tokens = image_tokens.cpu().numpy().tolist()
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
88 changes: 49 additions & 39 deletions nemo/collections/asr/modules/wav2vec_modules.py
@@ -19,16 +19,16 @@

import math
import random
from typing import Dict, List, Tuple
from typing import Dict, List

import torch
from omegaconf import DictConfig
from omegaconf.dictconfig import DictConfig
from torch import nn
from torch.nn import functional as F

from nemo.collections.asr.modules.common.transformer.transformer_encoders_nlp import TransformerEncoder
from nemo.collections.common.parts import form_attention_mask, transformer_weights_init
from nemo.collections.nlp.modules.common.transformer import TransformerEncoder
from nemo.core.classes.module import NeuralModule
from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType

@@ -55,10 +55,10 @@ def forward(self, x):

class ConvFeatureEncoder(NeuralModule):
"""
Encoder used to isolate features in raw audio for Wav2Vec style training.
Treated as preprocessor module in NeMo ASR training. Defaults values are
for base model found in Baeski et al (https://arxiv.org/abs/2006.11477),
save for use of layer normalization as default schema. (Chosen for stability.)
Encoder used to isolate features in raw audio for Wav2Vec style training.
Treated as preprocessor module in NeMo ASR training. Defaults values are
for base model found in Baeski et al (https://arxiv.org/abs/2006.11477),
save for use of layer normalization as default schema. (Chosen for stability.)
"""

@property
@@ -78,7 +78,7 @@ def input_types(self):

@property
def output_types(self):
"""Returns definitions of module output ports.
"""Returns definitions of module output ports.
For compatibility, processed features are treated as Spectrogram types
processed_signal:
0: AxisType(BatchTag)
@@ -107,7 +107,13 @@ def __init__(
self.normalize_input = normalize_audio

def block(
n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False,
n_in,
n_out,
k,
stride,
is_layer_norm=False,
is_group_norm=False,
conv_bias=False,
):
def make_conv():
conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
@@ -123,7 +129,11 @@ def make_conv():
nn.GELU(),
)
elif is_group_norm:
return nn.Sequential(make_conv(), nn.GroupNorm(dim, dim, affine=True), nn.GELU(),)
return nn.Sequential(
make_conv(),
nn.GroupNorm(dim, dim, affine=True),
nn.GELU(),
)
else:
return nn.Sequential(make_conv(), nn.GELU())

@@ -213,34 +223,34 @@ def get_lengths(self, audio_lengths):

class Wav2VecTransformerEncoder(TransformerEncoder):
"""
Encoder module following Transformer encoder paradigm
as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). Used for Wav2Vec
style encoding of context vectors as described by in Baeski et al (https://arxiv.org/abs/2006.11477).
Takes convolutional encodings of all time steps and adds to features before applying series
of self-attention layers.
Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec

Args:
layer_drop: Floating point value specifying proportion of module for layer dropout (See Fan et al. https://arxiv.org/pdf/1909.11556.pdf).
If non-zero, each layer will draw from uniform probability to determine if applied in current forward call.
Occurs only during training step
pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding
to maintain number of time steps
Must contain following:
embedding_dim: Depth/number of channels of each time step from feature encoding
conv_pos: Kernel size for convolution
conv_pos_groups: Number of groups for convolution
transformer: Config for transformer encoder. Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer
Must contain followign:
num_layers: Number of attention layers
hidden_size: Expected input depth (embedding size between model layers)
inner_size: Depth of embeddings within feed-forward sections of encoder layers
num_attention_heads: Number of attention heads
attn_score_dropout: Probability of dropout applied to attention scores
attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization)
ffn_dropout: Probability of dropout applied to feed-forward modules
hidden_act: Activation function for hidden layers
Encoder module following Transformer encoder paradigm
as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). Used for Wav2Vec
style encoding of context vectors as described by in Baeski et al (https://arxiv.org/abs/2006.11477).
Takes convolutional encodings of all time steps and adds to features before applying series
of self-attention layers.

Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec

Args:
layer_drop: Floating point value specifying proportion of module for layer dropout (See Fan et al. https://arxiv.org/pdf/1909.11556.pdf).
If non-zero, each layer will draw from uniform probability to determine if applied in current forward call.
Occurs only during training step
pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding
to maintain number of time steps
Must contain following:
embedding_dim: Depth/number of channels of each time step from feature encoding
conv_pos: Kernel size for convolution
conv_pos_groups: Number of groups for convolution
transformer: Config for transformer encoder. Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer
Must contain followign:
num_layers: Number of attention layers
hidden_size: Expected input depth (embedding size between model layers)
inner_size: Depth of embeddings within feed-forward sections of encoder layers
num_attention_heads: Number of attention heads
attn_score_dropout: Probability of dropout applied to attention scores
attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization)
ffn_dropout: Probability of dropout applied to feed-forward modules
hidden_act: Activation function for hidden layers
"""

def __init__(self, pos_embed: DictConfig, transformer: DictConfig, layer_drop: float = 0.0):
@@ -271,7 +281,7 @@ def __init__(self, pos_embed: DictConfig, transformer: DictConfig, layer_drop: f

@property
def input_types(self):
"""Returns definitions of module output ports.
"""Returns definitions of module output ports.
We treat features as SpectrogramType for Nemo compatibility
audio_signal:
0: AxisType(BatchTag)
@@ -287,7 +297,7 @@ def input_types(self):

@property
def output_types(self):
"""Returns definitions of module output ports.
"""Returns definitions of module output ports.
We're using SpectrogramType for now to keep things Nemo safe
processed_signal:
0: AxisType(BatchTag)
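The Wav2VecTransformerEncoder docstring above lists the keys expected in the pos_embed and transformer configs. A minimal sketch of such a config follows; the values are illustrative only, and real settings should come from the example configs under examples/asr/conf/wav2vec:

from omegaconf import OmegaConf

from nemo.collections.asr.modules.wav2vec_modules import Wav2VecTransformerEncoder

cfg = OmegaConf.create(
    {
        "pos_embed": {
            "embedding_dim": 768,   # channels per time step from the feature encoder
            "conv_pos": 128,        # kernel size of the positional convolution
            "conv_pos_groups": 16,  # number of groups for the positional convolution
        },
        "transformer": {
            "num_layers": 12,
            "hidden_size": 768,
            "inner_size": 3072,
            "num_attention_heads": 12,
            "attn_score_dropout": 0.1,
            "attn_layer_dropout": 0.1,
            "ffn_dropout": 0.1,
            "hidden_act": "gelu",
        },
    }
)

encoder = Wav2VecTransformerEncoder(
    pos_embed=cfg.pos_embed, transformer=cfg.transformer, layer_drop=0.05
)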
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os.path
from dataclasses import MISSING, dataclass
from typing import Dict, List, Optional
2 changes: 1 addition & 1 deletion nemo/collections/llm/bert/data/mock.py
@@ -69,7 +69,7 @@ def __init__(
self.global_batch_size = global_batch_size
self.micro_batch_size = micro_batch_size
if tokenizer is None:
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = get_nmt_tokenizer("megatron", "BertWordPieceLowerCase")
else:
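The remaining data-module and recipe files below make the same swap, importing get_nmt_tokenizer from nemo.collections.common.tokenizers.tokenizer_utils instead of nemo.collections.nlp.modules.common.tokenizer_utils. The call itself is unchanged; a short sketch mirroring the usage shown in these diffs:

# New import location introduced by this change; the old path was
# nemo.collections.nlp.modules.common.tokenizer_utils.
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

# Same arguments as in nemo/collections/llm/bert/data/mock.py and pre_training.py.
tokenizer = get_nmt_tokenizer("megatron", "BertWordPieceLowerCase")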
2 changes: 1 addition & 1 deletion nemo/collections/llm/bert/data/pre_training.py
@@ -129,7 +129,7 @@ def __init__(
self.index_mapping_dir = index_mapping_dir
self.init_global_step = 0

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceLowerCase")

2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/data/mock.py
@@ -85,7 +85,7 @@ def __init__(
self.create_attention_mask = create_attention_mask or not HAVE_TE

if tokenizer is None:
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = get_nmt_tokenizer(
"megatron", "GPT2BPETokenizer", vocab_file=vocab_file, merges_file=merges_file
2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/data/pre_training.py
@@ -255,7 +255,7 @@ def __init__(
self.init_global_step = init_global_step
self.output_log = output_log

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer")
self.data_sampler = MegatronDataSampler(
2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/model/hyena.py
@@ -775,7 +775,7 @@ def tokenizer(self):
Returns:
Tokenizer instance
"""
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

tokenizer = get_nmt_tokenizer(
library=self.model_config.tokenizer_library,
2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/model/ssm.py
@@ -438,7 +438,7 @@ def tokenizer(self):
Returns:
TokenizerSpec: The tokenizer object.
"""
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

tokenizer = get_nmt_tokenizer(
library=self.model_config.tokenizer_library,
2 changes: 1 addition & 1 deletion nemo/collections/llm/modelopt/model_utils.py
@@ -195,7 +195,7 @@ def setup_trainer_and_restore_model_with_modelopt_spec(

tokenizer = None
if tokenizer_path:
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer

tokenizer = get_tokenizer(tokenizer_path)

2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/hyena_base.py
@@ -29,6 +29,7 @@
from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data import PreTrainingDataModule
from nemo.collections.llm.gpt.data.megatron.hyena import Evo2Dataset, parse_dataset_config
@@ -37,7 +38,6 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed, bf16_with_fp8_mixed
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.pytorch import callbacks as nl_callbacks
from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/mamba2_130m.py
@@ -23,12 +23,12 @@

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils.exp_manager import TimingCallback

NAME = "mamba2_130m"
2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/mamba2_1_3b.py
@@ -23,12 +23,12 @@

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils.exp_manager import TimingCallback

NAME = "mamba2_1_3b"
2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/mamba2_2_7b.py
@@ -23,12 +23,12 @@

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils.exp_manager import TimingCallback

NAME = "mamba2_2_7b"