@@ -16,6 +16,7 @@
import math
import os
import re
import sys

import torch
import torchvision
@@ -28,10 +29,14 @@

# pylint: disable=line-too-long
from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer
from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy
from nemo.core.config import hydra_runner

if sys.version_info >= (3, 8):
from typing import TypedDict
else:
from typing_extensions import TypedDict

"""
This is the script to run multimodal autoregressive text generation.

@@ -89,6 +94,23 @@
"""


class LengthParam(TypedDict):
    max_length: int  # The maximum length of the sequence to be generated.
    min_length: int  # The minimum length of the sequence to be generated.


class SamplingParam(TypedDict):
    use_greedy: bool  # Whether to use greedy decoding instead of sampling
    temperature: float  # sampling temperature
    top_k: int  # The number of highest-probability vocabulary tokens to keep for top-k filtering.
    top_p: float  # If set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    repetition_penalty: float  # The parameter for repetition penalty. 1.0 means no penalty.
    add_BOS: bool  # add the BOS token at the beginning of the prompt
    all_probs: bool  # whether to return the log prob for all the tokens in the vocab
    compute_logprob: bool  # a flag used to compute the logprob of all the input text, a very special case of running inference, default False
    end_strings: List[str]  # generation will stop when one of these strings is generated


def to_img(tokens_string, image_tokenizer):
"""Converts visual tokens to images

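Since the scripts now define LengthParam and SamplingParam locally rather than importing them from nemo.collections.nlp.modules.common.transformer.text_generation, callers populate them as plain dicts. A minimal sketch with illustrative values only (not taken from the script), assuming List is imported from typing for the end_strings annotation:

length_params: LengthParam = {
    "max_length": 128,  # illustrative value: maximum number of tokens to generate
    "min_length": 1,    # illustrative value: minimum number of tokens to generate
}

sampling_params: SamplingParam = {
    "use_greedy": False,        # sample rather than greedy-decode
    "temperature": 0.9,
    "top_k": 0,
    "top_p": 0.9,
    "repetition_penalty": 1.0,  # 1.0 disables the penalty
    "add_BOS": True,
    "all_probs": False,
    "compute_logprob": False,
    "end_strings": ["<|endoftext|>"],  # assumed end string, model-dependent
}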
@@ -14,6 +14,7 @@


import datetime
import sys

import torch
import torchvision
@@ -30,10 +31,14 @@
from transformers import AutoModel, AutoTokenizer

# pylint: disable=line-too-long
from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy
from nemo.core.config import hydra_runner

if sys.version_info >= (3, 8):
from typing import TypedDict
else:
from typing_extensions import TypedDict

"""
This is the script to run multimodal autoregressive text generation.

@@ -94,6 +99,23 @@
VQ_HUB = "BAAI/Emu3-VisionTokenizer"


class LengthParam(TypedDict):
    max_length: int  # The maximum length of the sequence to be generated.
    min_length: int  # The minimum length of the sequence to be generated.


class SamplingParam(TypedDict):
    use_greedy: bool  # Whether to use greedy decoding instead of sampling
    temperature: float  # sampling temperature
    top_k: int  # The number of highest-probability vocabulary tokens to keep for top-k filtering.
    top_p: float  # If set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    repetition_penalty: float  # The parameter for repetition penalty. 1.0 means no penalty.
    add_BOS: bool  # add the BOS token at the beginning of the prompt
    all_probs: bool  # whether to return the log prob for all the tokens in the vocab
    compute_logprob: bool  # a flag used to compute the logprob of all the input text, a very special case of running inference, default False
    end_strings: List[str]  # generation will stop when one of these strings is generated


def to_imgstr(image_tokens, tokenizer):
"""Convert integer image tokens to visual tokens string"""
image_tokens = image_tokens.cpu().numpy().tolist()
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
88 changes: 49 additions & 39 deletions nemo/collections/asr/modules/wav2vec_modules.py
@@ -19,16 +19,16 @@

import math
import random
from typing import Dict, List, Tuple
from typing import Dict, List

import torch
from omegaconf import DictConfig
from omegaconf.dictconfig import DictConfig
from torch import nn
from torch.nn import functional as F

from nemo.collections.asr.modules.common.transformer.transformer_encoders_nlp import TransformerEncoder
from nemo.collections.common.parts import form_attention_mask, transformer_weights_init
from nemo.collections.nlp.modules.common.transformer import TransformerEncoder
from nemo.core.classes.module import NeuralModule
from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType

@@ -55,10 +55,10 @@ def forward(self, x):

class ConvFeatureEncoder(NeuralModule):
"""
Encoder used to isolate features in raw audio for Wav2Vec style training.
Treated as preprocessor module in NeMo ASR training. Defaults values are
for base model found in Baeski et al (https://arxiv.org/abs/2006.11477),
save for use of layer normalization as default schema. (Chosen for stability.)
Encoder used to isolate features in raw audio for Wav2Vec style training.
Treated as preprocessor module in NeMo ASR training. Defaults values are
for base model found in Baeski et al (https://arxiv.org/abs/2006.11477),
save for use of layer normalization as default schema. (Chosen for stability.)
"""

@property
@@ -78,7 +78,7 @@ def input_types(self):

@property
def output_types(self):
"""Returns definitions of module output ports.
"""Returns definitions of module output ports.
For compatibility, processed features are treated as Spectrogram types
processed_signal:
0: AxisType(BatchTag)
@@ -107,7 +107,13 @@ def __init__(
self.normalize_input = normalize_audio

def block(
n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False,
n_in,
n_out,
k,
stride,
is_layer_norm=False,
is_group_norm=False,
conv_bias=False,
):
def make_conv():
conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
@@ -123,7 +129,11 @@ def make_conv():
nn.GELU(),
)
elif is_group_norm:
return nn.Sequential(make_conv(), nn.GroupNorm(dim, dim, affine=True), nn.GELU(),)
return nn.Sequential(
make_conv(),
nn.GroupNorm(dim, dim, affine=True),
nn.GELU(),
)
else:
return nn.Sequential(make_conv(), nn.GELU())

@@ -213,34 +223,34 @@ def get_lengths(self, audio_lengths):

class Wav2VecTransformerEncoder(TransformerEncoder):
"""
Encoder module following Transformer encoder paradigm
as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). Used for Wav2Vec
style encoding of context vectors as described by in Baeski et al (https://arxiv.org/abs/2006.11477).
Takes convolutional encodings of all time steps and adds to features before applying series
of self-attention layers.
Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec

Args:
layer_drop: Floating point value specifying proportion of module for layer dropout (See Fan et al. https://arxiv.org/pdf/1909.11556.pdf).
If non-zero, each layer will draw from uniform probability to determine if applied in current forward call.
Occurs only during training step
pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding
to maintain number of time steps
Must contain following:
embedding_dim: Depth/number of channels of each time step from feature encoding
conv_pos: Kernel size for convolution
conv_pos_groups: Number of groups for convolution
transformer: Config for transformer encoder. Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer
Must contain followign:
num_layers: Number of attention layers
hidden_size: Expected input depth (embedding size between model layers)
inner_size: Depth of embeddings within feed-forward sections of encoder layers
num_attention_heads: Number of attention heads
attn_score_dropout: Probability of dropout applied to attention scores
attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization)
ffn_dropout: Probability of dropout applied to feed-forward modules
hidden_act: Activation function for hidden layers
Encoder module following Transformer encoder paradigm
as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). Used for Wav2Vec
style encoding of context vectors as described by in Baeski et al (https://arxiv.org/abs/2006.11477).
Takes convolutional encodings of all time steps and adds to features before applying series
of self-attention layers.

Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec

Args:
layer_drop: Floating point value specifying proportion of module for layer dropout (See Fan et al. https://arxiv.org/pdf/1909.11556.pdf).
If non-zero, each layer will draw from uniform probability to determine if applied in current forward call.
Occurs only during training step
pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding
to maintain number of time steps
Must contain following:
embedding_dim: Depth/number of channels of each time step from feature encoding
conv_pos: Kernel size for convolution
conv_pos_groups: Number of groups for convolution
transformer: Config for transformer encoder. Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer
Must contain followign:
num_layers: Number of attention layers
hidden_size: Expected input depth (embedding size between model layers)
inner_size: Depth of embeddings within feed-forward sections of encoder layers
num_attention_heads: Number of attention heads
attn_score_dropout: Probability of dropout applied to attention scores
attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization)
ffn_dropout: Probability of dropout applied to feed-forward modules
hidden_act: Activation function for hidden layers
"""

def __init__(self, pos_embed: DictConfig, transformer: DictConfig, layer_drop: float = 0.0):
@@ -271,7 +281,7 @@ def __init__(self, pos_embed: DictConfig, transformer: DictConfig, layer_drop: f

@property
def input_types(self):
"""Returns definitions of module output ports.
"""Returns definitions of module output ports.
We treat features as SpectrogramType for Nemo compatibility
audio_signal:
0: AxisType(BatchTag)
@@ -287,7 +297,7 @@ def input_types(self):

@property
def output_types(self):
"""Returns definitions of module output ports.
"""Returns definitions of module output ports.
We're using SpectrogramType for now to keep things Nemo safe
processed_signal:
0: AxisType(BatchTag)
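The Wav2VecTransformerEncoder docstring above lists the keys expected in the pos_embed and transformer configs. A minimal sketch of such a config follows; the values are illustrative only, and real settings should come from the example configs under examples/asr/conf/wav2vec:

from omegaconf import OmegaConf

from nemo.collections.asr.modules.wav2vec_modules import Wav2VecTransformerEncoder

cfg = OmegaConf.create(
    {
        "pos_embed": {
            "embedding_dim": 768,   # channels per time step from the feature encoder
            "conv_pos": 128,        # kernel size of the positional convolution
            "conv_pos_groups": 16,  # number of groups for the positional convolution
        },
        "transformer": {
            "num_layers": 12,
            "hidden_size": 768,
            "inner_size": 3072,
            "num_attention_heads": 12,
            "attn_score_dropout": 0.1,
            "attn_layer_dropout": 0.1,
            "ffn_dropout": 0.1,
            "hidden_act": "gelu",
        },
    }
)

encoder = Wav2VecTransformerEncoder(
    pos_embed=cfg.pos_embed, transformer=cfg.transformer, layer_drop=0.05
)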
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os.path
from dataclasses import MISSING, dataclass
from typing import Dict, List, Optional
2 changes: 1 addition & 1 deletion nemo/collections/llm/bert/data/mock.py
@@ -69,7 +69,7 @@ def __init__(
self.global_batch_size = global_batch_size
self.micro_batch_size = micro_batch_size
if tokenizer is None:
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = get_nmt_tokenizer("megatron", "BertWordPieceLowerCase")
else:
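The remaining data-module and recipe files below make the same swap, importing get_nmt_tokenizer from nemo.collections.common.tokenizers.tokenizer_utils instead of nemo.collections.nlp.modules.common.tokenizer_utils. The call itself is unchanged; a short sketch mirroring the usage shown in these diffs:

# New import location introduced by this change; the old path was
# nemo.collections.nlp.modules.common.tokenizer_utils.
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

# Same arguments as in nemo/collections/llm/bert/data/mock.py and pre_training.py.
tokenizer = get_nmt_tokenizer("megatron", "BertWordPieceLowerCase")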
2 changes: 1 addition & 1 deletion nemo/collections/llm/bert/data/pre_training.py
@@ -129,7 +129,7 @@ def __init__(
self.index_mapping_dir = index_mapping_dir
self.init_global_step = 0

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceLowerCase")

2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/data/mock.py
@@ -85,7 +85,7 @@ def __init__(
self.create_attention_mask = create_attention_mask or not HAVE_TE

if tokenizer is None:
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = get_nmt_tokenizer(
"megatron", "GPT2BPETokenizer", vocab_file=vocab_file, merges_file=merges_file
2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/data/pre_training.py
@@ -255,7 +255,7 @@ def __init__(
self.init_global_step = init_global_step
self.output_log = output_log

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer")
self.data_sampler = MegatronDataSampler(
2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/model/hyena.py
@@ -775,7 +775,7 @@ def tokenizer(self):
Returns:
Tokenizer instance
"""
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

tokenizer = get_nmt_tokenizer(
library=self.model_config.tokenizer_library,
2 changes: 1 addition & 1 deletion nemo/collections/llm/gpt/model/ssm.py
@@ -438,7 +438,7 @@ def tokenizer(self):
Returns:
TokenizerSpec: The tokenizer object.
"""
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

tokenizer = get_nmt_tokenizer(
library=self.model_config.tokenizer_library,
2 changes: 1 addition & 1 deletion nemo/collections/llm/modelopt/model_utils.py
@@ -195,7 +195,7 @@ def setup_trainer_and_restore_model_with_modelopt_spec(

tokenizer = None
if tokenizer_path:
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer
from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer

tokenizer = get_tokenizer(tokenizer_path)

2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/hyena_base.py
@@ -29,6 +29,7 @@
from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data import PreTrainingDataModule
from nemo.collections.llm.gpt.data.megatron.hyena import Evo2Dataset, parse_dataset_config
@@ -37,7 +38,6 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed, bf16_with_fp8_mixed
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.pytorch import callbacks as nl_callbacks
from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/mamba2_130m.py
@@ -23,12 +23,12 @@

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils.exp_manager import TimingCallback

NAME = "mamba2_130m"
2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/mamba2_1_3b.py
@@ -23,12 +23,12 @@

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils.exp_manager import TimingCallback

NAME = "mamba2_1_3b"
2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/mamba2_2_7b.py
@@ -23,12 +23,12 @@

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils.exp_manager import TimingCallback

NAME = "mamba2_2_7b"