From 23cb1e386be8fbd9388c260c80214ff343838422 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 03:26:44 -0700 Subject: [PATCH 01/21] move tokenizer_utils Signed-off-by: dimapihtar --- .../common/tokenizers/tokenizer_utils.py | 286 ++++++++++++++++++ nemo/collections/llm/bert/data/mock.py | 2 +- .../collections/llm/bert/data/pre_training.py | 2 +- nemo/collections/llm/gpt/data/mock.py | 2 +- nemo/collections/llm/gpt/data/pre_training.py | 2 +- nemo/collections/llm/gpt/model/hyena.py | 2 +- nemo/collections/llm/gpt/model/ssm.py | 2 +- nemo/collections/llm/modelopt/model_utils.py | 2 +- nemo/collections/llm/recipes/hyena_base.py | 2 +- nemo/collections/llm/recipes/mamba2_130m.py | 2 +- nemo/collections/llm/recipes/mamba2_1_3b.py | 2 +- nemo/collections/llm/recipes/mamba2_2_7b.py | 2 +- nemo/collections/llm/recipes/mamba2_370m.py | 2 +- nemo/collections/llm/recipes/mamba2_780m.py | 2 +- nemo/collections/llm/recipes/mamba2_8b.py | 2 +- .../llm/recipes/mamba2_hybrid_8b.py | 2 +- .../llm/recipes/nemotron_nano_12b_v2.py | 2 +- .../llm/recipes/nemotron_nano_9b_v2.py | 2 +- nemo/collections/llm/recipes/nemotronh_47b.py | 2 +- nemo/collections/llm/recipes/nemotronh_4b.py | 2 +- nemo/collections/llm/recipes/nemotronh_56b.py | 2 +- nemo/collections/llm/recipes/nemotronh_8b.py | 2 +- nemo/collections/llm/t5/data/fine_tuning.py | 2 +- nemo/collections/llm/t5/data/mock.py | 2 +- nemo/collections/llm/t5/data/pre_training.py | 2 +- .../nlp/modules/common/__init__.py | 2 +- .../tts/models/language_modeling/nlp_model.py | 2 +- nemo/export/trt_llm/qnemo/tokenizer_utils.py | 2 +- .../create_tarred_transformer_lm_dataset.py | 2 +- .../neural_rescorer/eval_neural_rescorer.py | 2 +- scripts/llm/gpt_train.py | 2 +- .../performance/llm/finetune_deepseek_v3.py | 2 +- .../performance/llm/pretrain_deepseek_v3.py | 2 +- scripts/performance/llm/pretrain_gpt3_175b.py | 2 +- .../performance/llm/pretrain_llama31_405b.py | 2 +- .../performance/llm/pretrain_llama3_70b.py | 2 +- scripts/performance/llm/pretrain_llama3_8b.py | 2 +- .../performance/llm/pretrain_llama4_e128.py | 2 +- .../performance/llm/pretrain_llama4_e16.py | 2 +- .../performance/llm/pretrain_mixtral_8x22b.py | 2 +- .../performance/llm/pretrain_mixtral_8x7b.py | 2 +- .../performance/llm/pretrain_nemotron3_22b.py | 2 +- .../performance/llm/pretrain_nemotron3_8b.py | 2 +- .../performance/llm/pretrain_nemotron4_15b.py | 2 +- .../llm/pretrain_nemotron4_340b.py | 2 +- .../performance/llm/pretrain_nemotronh_47b.py | 2 +- .../performance/llm/pretrain_nemotronh_56b.py | 2 +- .../performance/llm/pretrain_nemotronh_8b.py | 2 +- scripts/performance/vlm/finetune_neva_8b.py | 2 +- .../performance/vlm/finetune_qwen25vl_32b.py | 2 +- .../performance/vlm/finetune_qwen25vl_7b.py | 2 +- .../common/test_apply_chat_template.py | 2 +- tests/collections/llm/bert_pretraining.py | 2 +- .../bitexact/mixtral/pretrain_mini_mixtral.py | 2 +- .../data/megatron/hyena/test_evo2_dataset.py | 2 +- .../llm/gpt/data/test_pre_training_data.py | 2 +- tests/collections/llm/gpt/model/test_hyena.py | 2 +- .../llm/gpt/model/test_hyena_accuracy.py | 2 +- .../llm/gpt/model/test_nemotronh.py | 2 +- tests/collections/llm/gpt_finetuning.py | 2 +- .../llm/megatron_gpt_pretraining.py | 2 +- .../llm/megatron_mixtral_pretraining.py | 2 +- .../collections/llm/megatron_t5_finetuning.py | 2 +- .../llm/megatron_t5_pretraining.py | 2 +- .../speechlm/speech_to_text_llm_train.py | 2 +- tests/lightning/test_ddp_parity_checker.py | 2 +- tests/lightning/test_nemo_resume_from_ckpt.py | 2 +- 
 tests/lightning/test_state_restoration.py    |   2 +-
 tutorials/llm/embedding/llama_embedding.ipynb |   2 +-
 69 files changed, 354 insertions(+), 68 deletions(-)
 create mode 100644 nemo/collections/common/tokenizers/tokenizer_utils.py

diff --git a/nemo/collections/common/tokenizers/tokenizer_utils.py b/nemo/collections/common/tokenizers/tokenizer_utils.py
new file mode 100644
index 000000000000..1f77b79b40c0
--- /dev/null
+++ b/nemo/collections/common/tokenizers/tokenizer_utils.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path
+from dataclasses import MISSING, dataclass
+from typing import Dict, List, Optional
+
+from nemo.utils import logging
+
+from .huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list
+
+__all__ = ["get_tokenizer", "get_tokenizer_list"]
+
+
+megatron_tokenizer_model_map = {
+    "BertWordPieceLowerCase": "megatron-bert-345m-uncased",
+    "BertWordPieceCase": "megatron-bert-345m-cased",
+    "GPT2BPETokenizer": "megatron-gpt-345m",
+}
+
+
+def get_tokenizer_list() -> List[str]:
+    """
+    Returns all supported tokenizer names.
+    """
+    s = set(get_huggingface_pretrained_lm_models_list(include_external=False))
+    s.update(set(get_huggingface_pretrained_lm_models_list(include_external=True)))
+    return ["sentencepiece", "char", "word"] + list(s)
+
+
+@dataclass
+class TokenizerConfig:
+    """
+    Tokenizer Configuration Dataclass.
+    """
+
+    library: str = MISSING
+    tokenizer_model: Optional[str] = None
+    vocab_size: Optional[int] = None
+    vocab_file: Optional[str] = None
+    special_tokens: Optional[Dict[str, str]] = None
+    bpe_dropout: Optional[float] = 0.0
+    coverage: Optional[float] = 0.999
+    training_sample_size: Optional[int] = None
+    r2l: Optional[bool] = False
+    sentencepiece_legacy: Optional[bool] = False
+
+
+def get_tokenizer(
+    tokenizer_name: str,
+    tokenizer_model: Optional[str] = None,
+    vocab_file: Optional[str] = None,
+    merges_file: Optional[str] = None,
+    special_tokens: Optional[Dict[str, str]] = None,
+    use_fast: Optional[bool] = False,
+    bpe_dropout: Optional[float] = 0.0,
+    chat_template: Optional[Dict] = None,
+):
+    """
+    Args:
+        tokenizer_name: sentencepiece or a pretrained model name from the Hugging Face list,
+            for example: bert-base-cased.
+            To see the list of all HuggingFace pretrained models, use:
+            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
+        tokenizer_model: tokenizer model file for sentencepiece
+        special_tokens: dict of special tokens.
+            For additional special tokens besides the standard ones (bos, eos, pad, etc.), such as sentinel
+            tokens for T5 (<extra_id_0>, <extra_id_1>, etc.), use the key 'additional_special_tokens'
+        vocab_file: path to vocab file
+        use_fast: (only for HuggingFace AutoTokenizer) set to True to use the fast HuggingFace tokenizer
+        bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation
+            procedure of BPE to help the model better learn word compositionality and become
+            robust to segmentation errors.
+            It has empirically been shown to improve inference time BLEU scores.
+    """
+    import omegaconf
+    from omegaconf import OmegaConf
+
+    if isinstance(
+        special_tokens,
+        (omegaconf.listconfig.ListConfig, omegaconf.dictconfig.DictConfig),
+    ):
+        special_tokens = OmegaConf.to_container(special_tokens)
+
+    if special_tokens is None:
+        special_tokens_dict = {}
+    else:
+        special_tokens_dict = special_tokens
+
+    if "megatron" in tokenizer_name:
+        try:
+            from nemo.collections.nlp.modules.common.megatron.megatron_utils import (
+                get_megatron_merges_file,
+                get_megatron_tokenizer,
+                get_megatron_vocab_file,
+            )
+        except (ImportError, ModuleNotFoundError):
+            raise ImportError(
+                "Megatron-core was not found. Please see the NeMo README for installation instructions: "
+                " https://github.com/NVIDIA/NeMo#megatron-gpt."
+            )
+        if vocab_file is None:
+            vocab_file = get_megatron_vocab_file(tokenizer_name)
+            merges_file = get_megatron_merges_file(tokenizer_name)
+        tokenizer_name = get_megatron_tokenizer(tokenizer_name)
+
+    if tokenizer_name == "sentencepiece":
+        from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
+
+        logging.info("tokenizer_model: " + str(tokenizer_model))
+        return SentencePieceTokenizer(
+            model_path=tokenizer_model,
+            special_tokens=special_tokens,
+            legacy=True,
+            chat_template=chat_template,
+        )
+    elif tokenizer_name == "tiktoken":
+        from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer
+
+        return TiktokenTokenizer(
+            vocab_file=vocab_file,
+            special_tokens=special_tokens["additional_special_tokens"],
+        )
+    elif tokenizer_name == "word":
+        from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer
+
+        return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
+    elif tokenizer_name == "char":
+        from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer
+
+        return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)
+    elif tokenizer_name == "regex":
+        from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
+
+        return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file)
+
+    logging.info(
+        f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, "
+        f" merges_files: {merges_file}, special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}"
+    )
+    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
+    tokenizer = AutoTokenizer(
+        pretrained_model_name=tokenizer_name,
+        vocab_file=vocab_file,
+        merges_file=merges_file,
+        **special_tokens_dict,
+        use_fast=use_fast,
+        chat_template=chat_template,
+    )
+    return tokenizer
+
+
+def get_nmt_tokenizer(
+    library: str = "sentencepiece",
+    model_name: Optional[str] = None,
+    tokenizer_model: Optional[str] = None,
+    vocab_file: Optional[str] = None,
+    merges_file: Optional[str] = None,
+    special_tokens: Optional[Dict[str, str]] = None,
+    use_fast: Optional[bool] = False,
+    bpe_dropout: Optional[float] = 0.0,
+    r2l: Optional[bool] = False,
+    legacy: Optional[bool] = False,
+    delimiter: Optional[str] = None,
+    trust_remote_code: Optional[bool] = False,
+    chat_template: Optional[Dict] = None,
+    vocab_size: Optional[int] = None,
+):
+    """
+    Args:
+        model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
+        tokenizer_model: tokenizer model file of sentencepiece
+        special_tokens: dict of special tokens
+        vocab_file: path to vocab file
+        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
+        bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation procedure
+            of BPE to help model better learn word compositionality and become robust to segmentation errors.
+            It has empirically been shown to improve inference time BLEU scores.
+        r2l: Whether to return subword IDs from right to left
+    """
+    import omegaconf
+    from omegaconf import OmegaConf
+
+    if isinstance(
+        special_tokens,
+        (omegaconf.listconfig.ListConfig, omegaconf.dictconfig.DictConfig),
+    ):
+        special_tokens = OmegaConf.to_container(special_tokens)
+    if special_tokens is None:
+        special_tokens_dict = {}
+    else:
+        special_tokens_dict = special_tokens
+
+    if (library != "byte-level") and (
+        model_name is None and (tokenizer_model is None or not os.path.isfile(tokenizer_model))
+    ):
+        raise ValueError("No Tokenizer path provided or file does not exist!")
+
+    if library == "huggingface":
+        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
+        logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}')
+        tokenizer = AutoTokenizer(
+            pretrained_model_name=model_name,
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            **special_tokens_dict,
+            use_fast=use_fast,
+            trust_remote_code=trust_remote_code,
+            chat_template=chat_template,
+        )
+        if chat_template:
+            tokenizer.tokenizer.chat_template = chat_template
+        return tokenizer
+    elif library == 'sentencepiece':
+        from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
+
+        logging.info(f"Getting SentencePiece with model: {tokenizer_model}")
+
+        return SentencePieceTokenizer(
+            model_path=tokenizer_model,
+            special_tokens=special_tokens,
+            legacy=legacy,
+            chat_template=chat_template,
+        )
+    elif library == "byte-level":
+        from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer
+
+        logging.info("Using byte-level tokenization")
+        return ByteLevelTokenizer(special_tokens_dict)
+    elif library == "regex":
+        from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
+
+        logging.info("Using regex tokenization")
+        return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file)
+    elif library == "megatron":
+        if model_name == "GPTSentencePieceTokenizer":
+            from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
+
+            logging.info("tokenizer_model: ")
+            logging.info(tokenizer_model)
+            return SentencePieceTokenizer(model_path=tokenizer_model, legacy=legacy)
+
+        if model_name in megatron_tokenizer_model_map:
+            model_name = megatron_tokenizer_model_map[model_name]
+        logging.info(
+            f"Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, "
+            f"and merges file: {merges_file}"
+        )
+        return get_tokenizer(
+            tokenizer_name=model_name,
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            special_tokens=special_tokens_dict,
+            chat_template=chat_template,
+        )
+    elif library == "tabular":
+        from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer
+
+        return TabularTokenizer(vocab_file, delimiter=delimiter)
+    elif library == "tiktoken":
+        from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer
+
+        return TiktokenTokenizer(vocab_file=vocab_file)
+    elif library == "null":
+        assert vocab_size is not None
+        from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer
+
+        return
NullTokenizer(vocab_size) + else: + raise NotImplementedError( + 'Currently we only support "huggingface", "sentencepiece", "megatron", "byte-level", "regex", "tabular",' + '"tiktoken", and "null" tokenizer libraries.' + ) diff --git a/nemo/collections/llm/bert/data/mock.py b/nemo/collections/llm/bert/data/mock.py index 8b92eb2e72ba..1742699ad7e2 100644 --- a/nemo/collections/llm/bert/data/mock.py +++ b/nemo/collections/llm/bert/data/mock.py @@ -69,7 +69,7 @@ def __init__( self.global_batch_size = global_batch_size self.micro_batch_size = micro_batch_size if tokenizer is None: - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer self.tokenizer = get_nmt_tokenizer("megatron", "BertWordPieceLowerCase") else: diff --git a/nemo/collections/llm/bert/data/pre_training.py b/nemo/collections/llm/bert/data/pre_training.py index 8bc2b3f0d069..4fc58f576f9a 100644 --- a/nemo/collections/llm/bert/data/pre_training.py +++ b/nemo/collections/llm/bert/data/pre_training.py @@ -129,7 +129,7 @@ def __init__( self.index_mapping_dir = index_mapping_dir self.init_global_step = 0 - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceLowerCase") diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index 4d7644d225b5..07dc7b934961 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -85,7 +85,7 @@ def __init__( self.create_attention_mask = create_attention_mask or not HAVE_TE if tokenizer is None: - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer self.tokenizer = get_nmt_tokenizer( "megatron", "GPT2BPETokenizer", vocab_file=vocab_file, merges_file=merges_file diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index a33e7958e251..23081a4c4897 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -255,7 +255,7 @@ def __init__( self.init_global_step = init_global_step self.output_log = output_log - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer") self.data_sampler = MegatronDataSampler( diff --git a/nemo/collections/llm/gpt/model/hyena.py b/nemo/collections/llm/gpt/model/hyena.py index 5f17d112c021..21667d506325 100644 --- a/nemo/collections/llm/gpt/model/hyena.py +++ b/nemo/collections/llm/gpt/model/hyena.py @@ -775,7 +775,7 @@ def tokenizer(self): Returns: Tokenizer instance """ - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer tokenizer = get_nmt_tokenizer( library=self.model_config.tokenizer_library, diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index 1f7a2dfaebd0..ea1104e9ed4c 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -438,7 +438,7 @@ def tokenizer(self): Returns: TokenizerSpec: The tokenizer object. 
""" - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer tokenizer = get_nmt_tokenizer( library=self.model_config.tokenizer_library, diff --git a/nemo/collections/llm/modelopt/model_utils.py b/nemo/collections/llm/modelopt/model_utils.py index ef8de2eaa9ea..bab753d431b2 100644 --- a/nemo/collections/llm/modelopt/model_utils.py +++ b/nemo/collections/llm/modelopt/model_utils.py @@ -195,7 +195,7 @@ def setup_trainer_and_restore_model_with_modelopt_spec( tokenizer = None if tokenizer_path: - from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer tokenizer = get_tokenizer(tokenizer_path) diff --git a/nemo/collections/llm/recipes/hyena_base.py b/nemo/collections/llm/recipes/hyena_base.py index 868455587313..77f89d039391 100644 --- a/nemo/collections/llm/recipes/hyena_base.py +++ b/nemo/collections/llm/recipes/hyena_base.py @@ -37,7 +37,7 @@ from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed, bf16_with_fp8_mixed from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py index b0efaaffec50..a833d65395d2 100644 --- a/nemo/collections/llm/recipes/mamba2_130m.py +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -28,7 +28,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_130m" diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py index 986a13156dab..d1ce2845bc69 100644 --- a/nemo/collections/llm/recipes/mamba2_1_3b.py +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -28,7 +28,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_1_3b" diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py index e2dd3dc626cb..11836b459cb7 100644 --- a/nemo/collections/llm/recipes/mamba2_2_7b.py +++ 
b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -28,7 +28,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_2_7b" diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py index f5c1eb2b043f..4267e05e47c9 100644 --- a/nemo/collections/llm/recipes/mamba2_370m.py +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -28,7 +28,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_370m" diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py index 732c0cc9725a..5b3a7ae5cad6 100644 --- a/nemo/collections/llm/recipes/mamba2_780m.py +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -28,7 +28,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_780m" diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py index c4deac2b17d5..f33a1f918bca 100644 --- a/nemo/collections/llm/recipes/mamba2_8b.py +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -28,7 +28,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_8b" diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py index 5c527785eec4..4b4182f279df 100644 --- a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -28,7 +28,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from 
nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_hybrid_8b" diff --git a/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py b/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py index 82ea0194ac5e..58028f21c4e2 100644 --- a/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py +++ b/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py @@ -29,7 +29,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import nanov2_bf16_with_fp8_current_scaling_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py b/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py index 812c3f2846a8..032519821e59 100644 --- a/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py +++ b/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py @@ -29,7 +29,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_47b.py b/nemo/collections/llm/recipes/nemotronh_47b.py index 24182eb240f6..32549bebcda2 100644 --- a/nemo/collections/llm/recipes/nemotronh_47b.py +++ b/nemo/collections/llm/recipes/nemotronh_47b.py @@ -29,7 +29,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import nemotron_h_bf16_with_fp8_current_scaling_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_4b.py b/nemo/collections/llm/recipes/nemotronh_4b.py index eb6c781482a3..29a1e44aa597 100644 --- a/nemo/collections/llm/recipes/nemotronh_4b.py +++ b/nemo/collections/llm/recipes/nemotronh_4b.py @@ -29,7 +29,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from 
nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_56b.py b/nemo/collections/llm/recipes/nemotronh_56b.py index c67f870be3ba..5177e688263d 100644 --- a/nemo/collections/llm/recipes/nemotronh_56b.py +++ b/nemo/collections/llm/recipes/nemotronh_56b.py @@ -29,7 +29,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import nemotron_h_bf16_with_fp8_current_scaling_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_8b.py b/nemo/collections/llm/recipes/nemotronh_8b.py index 06f54b6f1d24..98f5fe390805 100644 --- a/nemo/collections/llm/recipes/nemotronh_8b.py +++ b/nemo/collections/llm/recipes/nemotronh_8b.py @@ -29,7 +29,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/t5/data/fine_tuning.py b/nemo/collections/llm/t5/data/fine_tuning.py index fd4cd9a70253..c63a9b98c46b 100644 --- a/nemo/collections/llm/t5/data/fine_tuning.py +++ b/nemo/collections/llm/t5/data/fine_tuning.py @@ -73,7 +73,7 @@ def __init__( # create tokenizer if tokenizer is None if tokenizer is None: - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer special_tokens = {} special_tokens['additional_special_tokens'] = [f'' for i in range(100)] diff --git a/nemo/collections/llm/t5/data/mock.py b/nemo/collections/llm/t5/data/mock.py index 7a2007936aee..329eb320f2c8 100644 --- a/nemo/collections/llm/t5/data/mock.py +++ b/nemo/collections/llm/t5/data/mock.py @@ -66,7 +66,7 @@ def __init__( self.persistent_workers = persistent_workers self.create_attention_mask = create_attention_mask or not HAVE_TE - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") self.data_sampler = MegatronDataSampler( diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index c22965917943..902bf342330f 100644 --- 
a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -145,7 +145,7 @@ def __init__( # create tokenizer if tokenizer is None if tokenizer is None: - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer special_tokens = {} special_tokens['additional_special_tokens'] = [f'' for i in range(100)] diff --git a/nemo/collections/nlp/modules/common/__init__.py b/nemo/collections/nlp/modules/common/__init__.py index 82923cc72a15..791e60c9a1f7 100644 --- a/nemo/collections/nlp/modules/common/__init__.py +++ b/nemo/collections/nlp/modules/common/__init__.py @@ -33,4 +33,4 @@ from nemo.collections.nlp.modules.common.sequence_regression import SequenceRegression from nemo.collections.nlp.modules.common.sequence_token_classifier import SequenceTokenClassifier from nemo.collections.nlp.modules.common.token_classifier import BertPretrainingTokenClassifier, TokenClassifier -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer, get_tokenizer_list +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer, get_tokenizer_list diff --git a/nemo/collections/tts/models/language_modeling/nlp_model.py b/nemo/collections/tts/models/language_modeling/nlp_model.py index 04e87cd032f6..0fe97828cd14 100644 --- a/nemo/collections/tts/models/language_modeling/nlp_model.py +++ b/nemo/collections/tts/models/language_modeling/nlp_model.py @@ -36,7 +36,7 @@ MEGATRON_CONFIG_MAP, get_megatron_pretrained_bert_models, ) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.core.classes import ModelPT from nemo.core.classes.exportable import Exportable diff --git a/nemo/export/trt_llm/qnemo/tokenizer_utils.py b/nemo/export/trt_llm/qnemo/tokenizer_utils.py index b3cc88de7caf..1ff708efef54 100644 --- a/nemo/export/trt_llm/qnemo/tokenizer_utils.py +++ b/nemo/export/trt_llm/qnemo/tokenizer_utils.py @@ -22,7 +22,7 @@ from nemo.export.tiktoken_tokenizer import TiktokenTokenizer # TODO: use get_nmt_tokenizer helper below to instantiate tokenizer once environment / dependencies get stable -# from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +# from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer TOKENIZER_CONFIG_FILE = "tokenizer_config.yaml" TOKENIZER_DIR = "tokenizer" diff --git a/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py b/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py index 1ec2042a5d0b..3a041ea6e264 100644 --- a/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py +++ b/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py @@ -39,7 +39,7 @@ import numpy as np from tqdm import tqdm -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer parser = argparse.ArgumentParser(description='Tarred Tokenized dataset for text language modelling') diff --git a/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py b/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py index ede16b616827..5c28b604782a 100644 --- 
a/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py +++ b/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py @@ -58,7 +58,7 @@ except (ImportError, ModuleNotFoundError): TransformerLMModel = ABC -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer from nemo.utils import logging diff --git a/scripts/llm/gpt_train.py b/scripts/llm/gpt_train.py index a567693ce585..880e20e6696a 100644 --- a/scripts/llm/gpt_train.py +++ b/scripts/llm/gpt_train.py @@ -24,7 +24,7 @@ from nemo import lightning as nl from nemo.collections import llm from nemo.collections.llm.gpt.data import ChatDataModule, MockDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.optim import CosineAnnealingScheduler from nemo.utils import logging diff --git a/scripts/performance/llm/finetune_deepseek_v3.py b/scripts/performance/llm/finetune_deepseek_v3.py index 26aa92872f6b..d8ba675a72ce 100644 --- a/scripts/performance/llm/finetune_deepseek_v3.py +++ b/scripts/performance/llm/finetune_deepseek_v3.py @@ -19,7 +19,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes.deepseek_v3 import finetune_recipe, model -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin diff --git a/scripts/performance/llm/pretrain_deepseek_v3.py b/scripts/performance/llm/pretrain_deepseek_v3.py index 5a640a3be5b7..24038e51e7b5 100644 --- a/scripts/performance/llm/pretrain_deepseek_v3.py +++ b/scripts/performance/llm/pretrain_deepseek_v3.py @@ -18,7 +18,7 @@ import nemo_run as run from nemo.collections.llm.recipes.deepseek_v3 import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin diff --git a/scripts/performance/llm/pretrain_gpt3_175b.py b/scripts/performance/llm/pretrain_gpt3_175b.py index d8aeca3dcb7c..8dc56d29061d 100644 --- a/scripts/performance/llm/pretrain_gpt3_175b.py +++ b/scripts/performance/llm/pretrain_gpt3_175b.py @@ -25,7 +25,7 @@ userbuffers_fp8_b200_h12288_tp4_mbs1_seqlen2048, userbuffers_fp8_h100_h12288_tp4_mbs1_seqlen2048, ) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama31_405b.py 
b/scripts/performance/llm/pretrain_llama31_405b.py index 79a4d3b2eda4..93995e9b4b30 100644 --- a/scripts/performance/llm/pretrain_llama31_405b.py +++ b/scripts/performance/llm/pretrain_llama31_405b.py @@ -25,7 +25,7 @@ userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192, userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192, ) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama3_70b.py b/scripts/performance/llm/pretrain_llama3_70b.py index 39bea54b6299..4d8471add6b8 100644 --- a/scripts/performance/llm/pretrain_llama3_70b.py +++ b/scripts/performance/llm/pretrain_llama3_70b.py @@ -25,7 +25,7 @@ userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192, userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192, ) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama3_8b.py b/scripts/performance/llm/pretrain_llama3_8b.py index 46492d61d2be..3d4df90efb4d 100644 --- a/scripts/performance/llm/pretrain_llama3_8b.py +++ b/scripts/performance/llm/pretrain_llama3_8b.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.llama3_8b import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama4_e128.py b/scripts/performance/llm/pretrain_llama4_e128.py index 560d68e3ffd5..1dec173f8d08 100644 --- a/scripts/performance/llm/pretrain_llama4_e128.py +++ b/scripts/performance/llm/pretrain_llama4_e128.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.llama4_e128 import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama4_e16.py b/scripts/performance/llm/pretrain_llama4_e16.py index f90d8f02bf81..0db891f15829 100644 --- a/scripts/performance/llm/pretrain_llama4_e16.py +++ b/scripts/performance/llm/pretrain_llama4_e16.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.llama4_e16 import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_mixtral_8x22b.py b/scripts/performance/llm/pretrain_mixtral_8x22b.py index 294f8fe22fbf..b7abe01cef3c 100644 --- 
a/scripts/performance/llm/pretrain_mixtral_8x22b.py +++ b/scripts/performance/llm/pretrain_mixtral_8x22b.py @@ -18,7 +18,7 @@ import nemo_run as run from nemo.collections.llm.recipes.mixtral_8x22b_64k import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_mixtral_8x7b.py b/scripts/performance/llm/pretrain_mixtral_8x7b.py index e6653b5d95a6..b490609587f0 100644 --- a/scripts/performance/llm/pretrain_mixtral_8x7b.py +++ b/scripts/performance/llm/pretrain_mixtral_8x7b.py @@ -18,7 +18,7 @@ import nemo_run as run from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron3_22b.py b/scripts/performance/llm/pretrain_nemotron3_22b.py index 128b98403047..48b27231bf51 100644 --- a/scripts/performance/llm/pretrain_nemotron3_22b.py +++ b/scripts/performance/llm/pretrain_nemotron3_22b.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron3_8b.py b/scripts/performance/llm/pretrain_nemotron3_8b.py index c60acf3a9fe6..7c4515bd4309 100644 --- a/scripts/performance/llm/pretrain_nemotron3_8b.py +++ b/scripts/performance/llm/pretrain_nemotron3_8b.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.nemotron3_8b import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron4_15b.py b/scripts/performance/llm/pretrain_nemotron4_15b.py index 92009b134f55..2ef0859049ad 100644 --- a/scripts/performance/llm/pretrain_nemotron4_15b.py +++ b/scripts/performance/llm/pretrain_nemotron4_15b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.recipes.nemotron4_15b import pretrain_recipe from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096 -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron4_340b.py b/scripts/performance/llm/pretrain_nemotron4_340b.py index ab97ab612255..8256bb709126 100644 --- 
a/scripts/performance/llm/pretrain_nemotron4_340b.py +++ b/scripts/performance/llm/pretrain_nemotron4_340b.py @@ -23,7 +23,7 @@ userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096, userbuffers_fp8_b200_h18432_tp8_mbs1_seqlen4096, ) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotronh_47b.py b/scripts/performance/llm/pretrain_nemotronh_47b.py index 2a7a3e4cfb31..2193d8e72bd1 100644 --- a/scripts/performance/llm/pretrain_nemotronh_47b.py +++ b/scripts/performance/llm/pretrain_nemotronh_47b.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.nemotronh_47b import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotronh_56b.py b/scripts/performance/llm/pretrain_nemotronh_56b.py index 5a057e959044..3f64f421f1fe 100644 --- a/scripts/performance/llm/pretrain_nemotronh_56b.py +++ b/scripts/performance/llm/pretrain_nemotronh_56b.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.nemotronh_56b import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotronh_8b.py b/scripts/performance/llm/pretrain_nemotronh_8b.py index f7d37b32a7b2..ca0c6ff9435c 100644 --- a/scripts/performance/llm/pretrain_nemotronh_8b.py +++ b/scripts/performance/llm/pretrain_nemotronh_8b.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.nemotronh_8b import pretrain_recipe -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/vlm/finetune_neva_8b.py b/scripts/performance/vlm/finetune_neva_8b.py index 7afdcebb5dd8..3f9b76a45bd6 100644 --- a/scripts/performance/vlm/finetune_neva_8b.py +++ b/scripts/performance/vlm/finetune_neva_8b.py @@ -16,7 +16,7 @@ import nemo_run as run -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.vlm.recipes.neva_llama3_8b import finetune_recipe from nemo.lightning.run.plugins import NsysPlugin diff --git a/scripts/performance/vlm/finetune_qwen25vl_32b.py b/scripts/performance/vlm/finetune_qwen25vl_32b.py index 3b8c061e007d..0bf6f5bb790c 100644 --- a/scripts/performance/vlm/finetune_qwen25vl_32b.py +++ b/scripts/performance/vlm/finetune_qwen25vl_32b.py @@ -16,7 +16,7 @@ import nemo_run as run -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from 
nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.vlm.recipes.qwen25vl_32b import finetune_recipe from nemo.lightning.run.plugins import NsysPlugin diff --git a/scripts/performance/vlm/finetune_qwen25vl_7b.py b/scripts/performance/vlm/finetune_qwen25vl_7b.py index fedcd6efcfdf..5e028ba8e481 100644 --- a/scripts/performance/vlm/finetune_qwen25vl_7b.py +++ b/scripts/performance/vlm/finetune_qwen25vl_7b.py @@ -17,7 +17,7 @@ import nemo_run as run from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.vlm.recipes.qwen25vl_7b import finetune_recipe from nemo.lightning.run.plugins import NsysPlugin diff --git a/tests/collections/common/test_apply_chat_template.py b/tests/collections/common/test_apply_chat_template.py index ce27c5c824f4..66b5a275bce5 100644 --- a/tests/collections/common/test_apply_chat_template.py +++ b/tests/collections/common/test_apply_chat_template.py @@ -13,7 +13,7 @@ # limitations under the License. import pytest -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer def test_chat_template(): diff --git a/tests/collections/llm/bert_pretraining.py b/tests/collections/llm/bert_pretraining.py index 4854e24ac8b0..995db093576c 100644 --- a/tests/collections/llm/bert_pretraining.py +++ b/tests/collections/llm/bert_pretraining.py @@ -20,7 +20,7 @@ from nemo import lightning as nl from nemo.collections import llm -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer ## NOTE: This script is present for github-actions testing only. 
diff --git a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py index ab13c02d6e08..05466243ff82 100644 --- a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py +++ b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py @@ -22,7 +22,7 @@ from nemo.collections.llm import MixtralConfig8x7B, MixtralModel, PreTrainingDataModule from nemo.collections.llm.api import train -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import MegatronStrategy, NeMoLogger, Trainer from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule as MegatronOptim from nemo.lightning.pytorch.optim.megatron import OptimizerConfig diff --git a/tests/collections/llm/gpt/data/megatron/hyena/test_evo2_dataset.py b/tests/collections/llm/gpt/data/megatron/hyena/test_evo2_dataset.py index 982258db5a03..f33092c6b003 100644 --- a/tests/collections/llm/gpt/data/megatron/hyena/test_evo2_dataset.py +++ b/tests/collections/llm/gpt/data/megatron/hyena/test_evo2_dataset.py @@ -1061,7 +1061,7 @@ def benchmark_phylo_tag_masking(num_iterations: int = 1000) -> tuple[float, floa def test_evo2_dataset_getitem(monkeypatch): """Test Evo2Dataset.__getitem__ method.""" import numpy as np - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer tokenizer = get_nmt_tokenizer("byte-level") eod_token_id = tokenizer.eod diff --git a/tests/collections/llm/gpt/data/test_pre_training_data.py b/tests/collections/llm/gpt/data/test_pre_training_data.py index c2cd04348e38..2e50ffeb5bba 100644 --- a/tests/collections/llm/gpt/data/test_pre_training_data.py +++ b/tests/collections/llm/gpt/data/test_pre_training_data.py @@ -16,7 +16,7 @@ import nemo.lightning as nl from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer DATA_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document" VOCAB_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json" diff --git a/tests/collections/llm/gpt/model/test_hyena.py b/tests/collections/llm/gpt/model/test_hyena.py index 8c41ea9b413d..d7257896d962 100644 --- a/tests/collections/llm/gpt/model/test_hyena.py +++ b/tests/collections/llm/gpt/model/test_hyena.py @@ -37,7 +37,7 @@ userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192, ) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import NeMoLogger from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.callbacks import ModelCheckpoint diff --git a/tests/collections/llm/gpt/model/test_hyena_accuracy.py b/tests/collections/llm/gpt/model/test_hyena_accuracy.py index 3e6d2bd2b0c9..9339796d1594 100644 --- a/tests/collections/llm/gpt/model/test_hyena_accuracy.py +++ b/tests/collections/llm/gpt/model/test_hyena_accuracy.py @@ -36,7 +36,7 @@ from megatron.core.transformer.module import Float16Module, MegatronModule from nemo.collections import llm -from nemo.collections.nlp.modules.common.tokenizer_utils 
import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.io.pl import MegatronCheckpointIO diff --git a/tests/collections/llm/gpt/model/test_nemotronh.py b/tests/collections/llm/gpt/model/test_nemotronh.py index 1d1703a926c8..6d51d642e248 100644 --- a/tests/collections/llm/gpt/model/test_nemotronh.py +++ b/tests/collections/llm/gpt/model/test_nemotronh.py @@ -25,7 +25,7 @@ from nemo import lightning as nl from nemo.collections import llm from nemo.collections.llm.gpt.data import MockDataModule, PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import NeMoLogger from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.callbacks import ModelCheckpoint diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 56c9320d51fa..f580eeca7d3d 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -22,7 +22,7 @@ from nemo.collections import llm from nemo.collections.llm.gpt.data.core import get_dataset_root from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from tests.collections.llm.common import Llama3ConfigCI ## NOTE: This script is present for github-actions testing only. diff --git a/tests/collections/llm/megatron_gpt_pretraining.py b/tests/collections/llm/megatron_gpt_pretraining.py index 1441022adff2..970eed4d201f 100644 --- a/tests/collections/llm/megatron_gpt_pretraining.py +++ b/tests/collections/llm/megatron_gpt_pretraining.py @@ -25,7 +25,7 @@ from nemo.collections import llm from nemo.collections.llm.api import train from nemo.collections.llm.gpt.data import PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import AutoResume, NeMoLogger from nemo.lightning.pytorch.callbacks import ModelCheckpoint, ModelTrainingStateCallback, ParameterDebugger from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index 533ef63b6628..c1341b51a298 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -23,7 +23,7 @@ from nemo.collections.llm import MixtralConfig8x3B, MixtralModel, PreTrainingDataModule from nemo.collections.llm.api import train -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import MegatronStrategy, NeMoLogger, Trainer from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule as MegatronOptim from nemo.lightning.pytorch.optim.megatron import OptimizerConfig diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index a71032718195..ea9e407681da 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -25,7 +25,7 @@ from 
nemo.collections import llm from nemo.collections.llm.api import finetune from nemo.collections.llm.t5.data import SquadDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import NeMoLogger from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py index ccea8431e655..519d38a70e53 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -25,7 +25,7 @@ from nemo.collections import llm from nemo.collections.llm.api import pretrain from nemo.collections.llm.t5.data import MockDataModule, PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import NeMoLogger from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler diff --git a/tests/collections/speechlm/speech_to_text_llm_train.py b/tests/collections/speechlm/speech_to_text_llm_train.py index 975b0cbf21c2..b054854439e5 100644 --- a/tests/collections/speechlm/speech_to_text_llm_train.py +++ b/tests/collections/speechlm/speech_to_text_llm_train.py @@ -18,7 +18,7 @@ from omegaconf import OmegaConf from nemo.collections import speechlm -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils import logging ## NOTE: This script is present for github-actions testing only. 
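All of the test-file rewrites above follow the same one-line pattern: the helper moves from nemo.collections.nlp.modules.common.tokenizer_utils to nemo.collections.common.tokenizers.tokenizer_utils, and the call sites themselves stay untouched. As a minimal sketch of the new import path (mirroring the byte-level call in the Evo2 dataset test above, and assuming a NeMo build that already contains the relocated module):

    from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

    # Byte-level tokenization needs no tokenizer model file, so it doubles as a quick smoke test.
    tokenizer = get_nmt_tokenizer("byte-level")
    eod_token_id = tokenizer.eod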
diff --git a/tests/lightning/test_ddp_parity_checker.py b/tests/lightning/test_ddp_parity_checker.py index 176b3d512fe3..5783e526e503 100644 --- a/tests/lightning/test_ddp_parity_checker.py +++ b/tests/lightning/test_ddp_parity_checker.py @@ -22,7 +22,7 @@ from nemo import lightning as nl from nemo.collections import llm from nemo.collections.llm.gpt.data import PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import DdpParityChecker diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index 0cf99b4eceee..0d6aedf84d14 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -32,7 +32,7 @@ def set_env(): import nemo.lightning as nl from nemo.collections import llm from nemo.collections.llm.gpt.data import PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.optim import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py index 35569f912c30..5a7d6b32d3c6 100644 --- a/tests/lightning/test_state_restoration.py +++ b/tests/lightning/test_state_restoration.py @@ -24,7 +24,7 @@ from nemo.collections import llm from nemo.collections.llm.api import train from nemo.collections.llm.gpt.data import PreTrainingDataModule -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import AutoResume, NeMoLogger from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule diff --git a/tutorials/llm/embedding/llama_embedding.ipynb b/tutorials/llm/embedding/llama_embedding.ipynb index fd7b3ad77a0d..366d3339f8e3 100644 --- a/tutorials/llm/embedding/llama_embedding.ipynb +++ b/tutorials/llm/embedding/llama_embedding.ipynb @@ -416,7 +416,7 @@ "\n", "import nemo.lightning as nl\n", "from nemo.collections.llm.inference.base import _setup_trainer_and_restore_model\n", - "from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer\n", + "from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer\n", "from nemo.lightning import io\n", "from nemo.utils.exp_manager import TimingCallback\n", "from nemo.lightning.ckpt_utils import ckpt_to_context_subdir\n", From 1213d1b33c29a3fa5490cc4f6c0764d691645b2f Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 03:27:29 -0700 Subject: [PATCH 02/21] remove tokenizer_utils Signed-off-by: dimapihtar --- .../nlp/modules/common/tokenizer_utils.py | 285 ------------------ 1 file changed, 285 deletions(-) delete mode 100644 nemo/collections/nlp/modules/common/tokenizer_utils.py diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py deleted file mode 100644 index 5f7c5ee17920..000000000000 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright (c) 2021, 
NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os.path -from dataclasses import MISSING, dataclass -from typing import Dict, List, Optional - -from nemo.utils import logging - -from .huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list - -__all__ = ["get_tokenizer", "get_tokenizer_list"] - - -megatron_tokenizer_model_map = { - "BertWordPieceLowerCase": "megatron-bert-345m-uncased", - "BertWordPieceCase": "megatron-bert-345m-cased", - "GPT2BPETokenizer": "megatron-gpt-345m", -} - - -def get_tokenizer_list() -> List[str]: - """ - Returns all all supported tokenizer names - """ - s = set(get_huggingface_pretrained_lm_models_list(include_external=False)) - s.update(set(get_huggingface_pretrained_lm_models_list(include_external=True))) - return ["sentencepiece", "char", "word"] + list(s) - - -@dataclass -class TokenizerConfig: - """ - Tokenizer Configuration Dataclass. - """ - - library: str = MISSING - tokenizer_model: Optional[str] = None - vocab_size: Optional[int] = None - vocab_file: Optional[str] = None - special_tokens: Optional[Dict[str, str]] = None - bpe_dropout: Optional[float] = 0.0 - coverage: Optional[float] = 0.999 - training_sample_size: Optional[int] = None - r2l: Optional[bool] = False - sentencepiece_legacy: Optional[bool] = False - - -def get_tokenizer( - tokenizer_name: str, - tokenizer_model: Optional[str] = None, - vocab_file: Optional[str] = None, - merges_file: Optional[str] = None, - special_tokens: Optional[Dict[str, str]] = None, - use_fast: Optional[bool] = False, - bpe_dropout: Optional[float] = 0.0, - chat_template: Optional[Dict] = None, -): - """ - Args: - tokenizer_name: sentencepiece or pretrained model from the hugging face list, - for example: bert-base-cased - To see the list of all HuggingFace pretrained models, use: - nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list() - tokenizer_model: tokenizer model file of sentencepiece - special_tokens: dict of special tokens. - For additional special tokens besides standard special tokens (bos, eos, pad, etc.), such as sentinel - tokens for T5 (, , etc.), use key 'additional_special_tokens' - vocab_file: path to vocab file - use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer - bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation - procedure of BPE to help - model better learn word compositionality and become robust to segmentation errors. - It has empirically been shown to improve inference time BLEU scores. 
- """ - import omegaconf - from omegaconf import OmegaConf - - if isinstance( - special_tokens, - (omegaconf.listconfig.ListConfig, omegaconf.dictconfig.DictConfig), - ): - special_tokens = OmegaConf.to_container(special_tokens) - - if special_tokens is None: - special_tokens_dict = {} - else: - special_tokens_dict = special_tokens - - if "megatron" in tokenizer_name: - try: - from nemo.collections.nlp.modules.common.megatron.megatron_utils import ( - get_megatron_merges_file, - get_megatron_tokenizer, - get_megatron_vocab_file, - ) - except (ImportError, ModuleNotFoundError): - raise ImportError( - "Megatron-core was not found. Please see the NeMo README for installation instructions: " - " https://github.com/NVIDIA/NeMo#megatron-gpt." - ) - if vocab_file is None: - vocab_file = get_megatron_vocab_file(tokenizer_name) - merges_file = get_megatron_merges_file(tokenizer_name) - tokenizer_name = get_megatron_tokenizer(tokenizer_name) - - if tokenizer_name == "sentencepiece": - from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer - - logging.info("tokenizer_model: " + str(tokenizer_model)) - return SentencePieceTokenizer( - model_path=tokenizer_model, - special_tokens=special_tokens, - legacy=True, - chat_template=chat_template, - ) - elif tokenizer_name == "tiktoken": - from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer - - return TiktokenTokenizer( - vocab_file=vocab_file, - special_tokens=special_tokens["additional_special_tokens"], - ) - elif tokenizer_name == "word": - from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer - - return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict) - elif tokenizer_name == "char": - from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer - - return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict) - elif tokenizer_name == "regex": - from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer - - return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) - - logging.info( - f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, " - f" merges_files: {merges_file}, special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}" - ) - from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - - tokenizer = AutoTokenizer( - pretrained_model_name=tokenizer_name, - vocab_file=vocab_file, - merges_file=merges_file, - **special_tokens_dict, - use_fast=use_fast, - chat_template=chat_template, - ) - return tokenizer - - -def get_nmt_tokenizer( - library: str = "sentencepiece", - model_name: Optional[str] = None, - tokenizer_model: Optional[str] = None, - vocab_file: Optional[str] = None, - merges_file: Optional[str] = None, - special_tokens: Optional[Dict[str, str]] = None, - use_fast: Optional[bool] = False, - bpe_dropout: Optional[float] = 0.0, - r2l: Optional[bool] = False, - legacy: Optional[bool] = False, - delimiter: Optional[str] = None, - trust_remote_code: Optional[bool] = False, - chat_template: Optional[Dict] = None, - vocab_size: Optional[int] = None, -): - """ - Args: - model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron - tokenizer_model: tokenizer model file of sentencepiece - special_tokens: dict of special tokens - vocab_file: path to vocab file - use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer - bpe_dropout: 
(experimental) BPE dropout tries to corrupt the standard segmentation procedure - of BPE to help model better learn word compositionality and become robust to segmentation errors. - It has empirically been shown to improve inference time BLEU scores. - r2l: Whether to return subword IDs from right to left - """ - import omegaconf - from omegaconf import OmegaConf - - if isinstance( - special_tokens, - (omegaconf.listconfig.ListConfig, omegaconf.dictconfig.DictConfig), - ): - special_tokens = OmegaConf.to_container(special_tokens) - if special_tokens is None: - special_tokens_dict = {} - else: - special_tokens_dict = special_tokens - - if (library != "byte-level") and ( - model_name is None and (tokenizer_model is None or not os.path.isfile(tokenizer_model)) - ): - raise ValueError("No Tokenizer path provided or file does not exist!") - - if library == "huggingface": - from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - - logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}') - tokenizer = AutoTokenizer( - pretrained_model_name=model_name, - vocab_file=vocab_file, - merges_file=merges_file, - **special_tokens_dict, - use_fast=use_fast, - trust_remote_code=trust_remote_code, - chat_template=chat_template, - ) - if chat_template: - tokenizer.tokenizer.chat_template = chat_template - return tokenizer - elif library == 'sentencepiece': - from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer - - logging.info(f"Getting SentencePiece with model: {tokenizer_model}") - - return SentencePieceTokenizer( - model_path=tokenizer_model, - special_tokens=special_tokens, - legacy=legacy, - chat_template=chat_template, - ) - elif library == "byte-level": - from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer - - logging.info("Using byte-level tokenization") - return ByteLevelTokenizer(special_tokens_dict) - elif library == "regex": - from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer - - logging.info("Using regex tokenization") - return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) - elif library == "megatron": - if model_name == "GPTSentencePieceTokenizer": - from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer - - logging.info("tokenizer_model: ") - logging.info(tokenizer_model) - return SentencePieceTokenizer(model_path=tokenizer_model, legacy=legacy) - - if model_name in megatron_tokenizer_model_map: - model_name = megatron_tokenizer_model_map[model_name] - logging.info( - f"Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, " - f"and merges file: {merges_file}" - ) - return get_tokenizer( - tokenizer_name=model_name, - vocab_file=vocab_file, - merges_file=merges_file, - special_tokens=special_tokens_dict, - chat_template=chat_template, - ) - elif library == "tabular": - from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer - - return TabularTokenizer(vocab_file, delimiter=delimiter) - elif library == "tiktoken": - from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer - - return TiktokenTokenizer(vocab_file=vocab_file) - elif library == "null": - assert vocab_size is not None - from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer - - return NullTokenizer(vocab_size) - else: - raise NotImplementedError( - 'Currently we only 
support "huggingface", "sentencepiece", "megatron", "byte-level", "regex", "tabular",' - '"tiktoken", and "null" tokenizer libraries.' - ) From 36f71a369899e3d6d8bb9004998f35e91d5f8359 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 10:28:51 +0000 Subject: [PATCH 03/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/llm/recipes/hyena_base.py | 2 +- nemo/collections/llm/recipes/mamba2_130m.py | 2 +- nemo/collections/llm/recipes/mamba2_1_3b.py | 2 +- nemo/collections/llm/recipes/mamba2_2_7b.py | 2 +- nemo/collections/llm/recipes/mamba2_370m.py | 2 +- nemo/collections/llm/recipes/mamba2_780m.py | 2 +- nemo/collections/llm/recipes/mamba2_8b.py | 2 +- nemo/collections/llm/recipes/mamba2_hybrid_8b.py | 2 +- nemo/collections/llm/recipes/nemotron_nano_12b_v2.py | 2 +- nemo/collections/llm/recipes/nemotron_nano_9b_v2.py | 2 +- nemo/collections/llm/recipes/nemotronh_47b.py | 2 +- nemo/collections/llm/recipes/nemotronh_4b.py | 2 +- nemo/collections/llm/recipes/nemotronh_56b.py | 2 +- nemo/collections/llm/recipes/nemotronh_8b.py | 2 +- nemo/collections/nlp/modules/common/__init__.py | 2 +- nemo/collections/tts/models/language_modeling/nlp_model.py | 2 +- .../neural_rescorer/create_tarred_transformer_lm_dataset.py | 3 +-- scripts/llm/gpt_train.py | 2 +- scripts/performance/llm/finetune_deepseek_v3.py | 2 +- scripts/performance/llm/pretrain_deepseek_v3.py | 2 +- scripts/performance/llm/pretrain_gpt3_175b.py | 2 +- scripts/performance/llm/pretrain_llama31_405b.py | 2 +- scripts/performance/llm/pretrain_llama3_70b.py | 2 +- scripts/performance/llm/pretrain_llama3_8b.py | 2 +- scripts/performance/llm/pretrain_llama4_e128.py | 2 +- scripts/performance/llm/pretrain_llama4_e16.py | 2 +- scripts/performance/llm/pretrain_mixtral_8x22b.py | 2 +- scripts/performance/llm/pretrain_mixtral_8x7b.py | 2 +- scripts/performance/llm/pretrain_nemotron3_22b.py | 2 +- scripts/performance/llm/pretrain_nemotron3_8b.py | 2 +- scripts/performance/llm/pretrain_nemotron4_15b.py | 2 +- scripts/performance/llm/pretrain_nemotron4_340b.py | 2 +- scripts/performance/llm/pretrain_nemotronh_47b.py | 2 +- scripts/performance/llm/pretrain_nemotronh_56b.py | 2 +- scripts/performance/llm/pretrain_nemotronh_8b.py | 2 +- scripts/performance/vlm/finetune_qwen25vl_7b.py | 2 +- .../collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py | 2 +- tests/collections/llm/gpt/data/test_pre_training_data.py | 2 +- tests/collections/llm/gpt/model/test_hyena.py | 2 +- tests/collections/llm/gpt/model/test_nemotronh.py | 2 +- tests/collections/llm/gpt_finetuning.py | 2 +- tests/collections/llm/megatron_gpt_pretraining.py | 2 +- tests/collections/llm/megatron_mixtral_pretraining.py | 2 +- tests/collections/llm/megatron_t5_finetuning.py | 2 +- tests/collections/llm/megatron_t5_pretraining.py | 2 +- tests/lightning/test_ddp_parity_checker.py | 2 +- tests/lightning/test_nemo_resume_from_ckpt.py | 2 +- tests/lightning/test_state_restoration.py | 2 +- 48 files changed, 48 insertions(+), 49 deletions(-) diff --git a/nemo/collections/llm/recipes/hyena_base.py b/nemo/collections/llm/recipes/hyena_base.py index 77f89d039391..ab03c32f8d6e 100644 --- a/nemo/collections/llm/recipes/hyena_base.py +++ b/nemo/collections/llm/recipes/hyena_base.py @@ -29,6 +29,7 @@ from nemo import lightning as nl from nemo.collections import llm from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import 
finetune, pretrain from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.collections.llm.gpt.data.megatron.hyena import Evo2Dataset, parse_dataset_config @@ -37,7 +38,6 @@ from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed, bf16_with_fp8_mixed from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py index a833d65395d2..f787dfdc7435 100644 --- a/nemo/collections/llm/recipes/mamba2_130m.py +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -23,12 +23,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_130m" diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py index d1ce2845bc69..7ebe05963be7 100644 --- a/nemo/collections/llm/recipes/mamba2_1_3b.py +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -23,12 +23,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_1_3b" diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py index 11836b459cb7..61f4f0714f41 100644 --- a/nemo/collections/llm/recipes/mamba2_2_7b.py +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -23,12 +23,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed 
-from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_2_7b" diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py index 4267e05e47c9..304082866ae6 100644 --- a/nemo/collections/llm/recipes/mamba2_370m.py +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -23,12 +23,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_370m" diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py index 5b3a7ae5cad6..90ceb5d5d9b3 100644 --- a/nemo/collections/llm/recipes/mamba2_780m.py +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -23,12 +23,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_780m" diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py index f33a1f918bca..c9d86ffd7656 100644 --- a/nemo/collections/llm/recipes/mamba2_8b.py +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -23,12 +23,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_8b" diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py index 4b4182f279df..57cc3926ad6a 100644 --- a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -23,12 +23,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import 
MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.utils.exp_manager import TimingCallback NAME = "mamba2_hybrid_8b" diff --git a/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py b/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py index 58028f21c4e2..8637973121bb 100644 --- a/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py +++ b/nemo/collections/llm/recipes/nemotron_nano_12b_v2.py @@ -24,12 +24,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import nanov2_bf16_with_fp8_current_scaling_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py b/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py index 032519821e59..ee3e745c21d4 100644 --- a/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py +++ b/nemo/collections/llm/recipes/nemotron_nano_9b_v2.py @@ -24,12 +24,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_47b.py b/nemo/collections/llm/recipes/nemotronh_47b.py index 32549bebcda2..bd5bf4b5f652 100644 --- a/nemo/collections/llm/recipes/nemotronh_47b.py +++ b/nemo/collections/llm/recipes/nemotronh_47b.py @@ -24,12 +24,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import nemotron_h_bf16_with_fp8_current_scaling_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import 
get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_4b.py b/nemo/collections/llm/recipes/nemotronh_4b.py index 29a1e44aa597..d8697d4a81c8 100644 --- a/nemo/collections/llm/recipes/nemotronh_4b.py +++ b/nemo/collections/llm/recipes/nemotronh_4b.py @@ -24,12 +24,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_56b.py b/nemo/collections/llm/recipes/nemotronh_56b.py index 5177e688263d..2ad0812db759 100644 --- a/nemo/collections/llm/recipes/nemotronh_56b.py +++ b/nemo/collections/llm/recipes/nemotronh_56b.py @@ -24,12 +24,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import nemotron_h_bf16_with_fp8_current_scaling_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/recipes/nemotronh_8b.py b/nemo/collections/llm/recipes/nemotronh_8b.py index 98f5fe390805..8a22cfec6cfd 100644 --- a/nemo/collections/llm/recipes/nemotronh_8b.py +++ b/nemo/collections/llm/recipes/nemotronh_8b.py @@ -24,12 +24,12 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback 
diff --git a/nemo/collections/nlp/modules/common/__init__.py b/nemo/collections/nlp/modules/common/__init__.py index 791e60c9a1f7..367ef5f1bd57 100644 --- a/nemo/collections/nlp/modules/common/__init__.py +++ b/nemo/collections/nlp/modules/common/__init__.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer, get_tokenizer_list from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.collections.nlp.modules.common.huggingface import ( AlbertEncoder, @@ -33,4 +34,3 @@ from nemo.collections.nlp.modules.common.sequence_regression import SequenceRegression from nemo.collections.nlp.modules.common.sequence_token_classifier import SequenceTokenClassifier from nemo.collections.nlp.modules.common.token_classifier import BertPretrainingTokenClassifier, TokenClassifier -from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer, get_tokenizer_list diff --git a/nemo/collections/tts/models/language_modeling/nlp_model.py b/nemo/collections/tts/models/language_modeling/nlp_model.py index 0fe97828cd14..df5add50b07b 100644 --- a/nemo/collections/tts/models/language_modeling/nlp_model.py +++ b/nemo/collections/tts/models/language_modeling/nlp_model.py @@ -29,6 +29,7 @@ from transformers import TRANSFORMERS_CACHE from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer from nemo.collections.nlp.modules import BertModule from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import VOCAB_FILE_NAME from nemo.collections.nlp.modules.common.lm_utils import get_lm_model @@ -36,7 +37,6 @@ MEGATRON_CONFIG_MAP, get_megatron_pretrained_bert_models, ) -from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.core.classes import ModelPT from nemo.core.classes.exportable import Exportable diff --git a/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py b/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py index 3a041ea6e264..571d4af740a8 100644 --- a/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py +++ b/scripts/asr_language_modeling/neural_rescorer/create_tarred_transformer_lm_dataset.py @@ -199,8 +199,7 @@ def __tokenize_text( def __create_chunk(data_root, chunk_path, shard_id, compute_metrics=False): - """Creates a tarball containing the tokenized text chunks. - """ + """Creates a tarball containing the tokenized text chunks.""" tar = tarfile.open(os.path.join(data_root, f'text_{shard_id}.tar'), mode='a', encoding='utf-8') # We squash the filename since we do not preserve directory structure of tokenized text in the tarball. 
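The relocated get_nmt_tokenizer keeps the signature documented in the deleted module above; only its import path changes. A hedged sketch of the two most common libraries (the tokenizer.model path and the "gpt2" model name are illustrative placeholders, not files or defaults shipped with this change):

    from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer

    # SentencePiece requires an on-disk model file (placeholder path below).
    sp_tokenizer = get_nmt_tokenizer(
        library="sentencepiece",
        tokenizer_model="/path/to/tokenizer.model",  # placeholder, must exist on disk
    )

    # HuggingFace AutoTokenizer is resolved by pretrained model name.
    hf_tokenizer = get_nmt_tokenizer(
        library="huggingface",
        model_name="gpt2",  # illustrative model name
        use_fast=True,
    )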
diff --git a/scripts/llm/gpt_train.py b/scripts/llm/gpt_train.py index 880e20e6696a..16bb9a0066ec 100644 --- a/scripts/llm/gpt_train.py +++ b/scripts/llm/gpt_train.py @@ -23,8 +23,8 @@ from nemo import lightning as nl from nemo.collections import llm -from nemo.collections.llm.gpt.data import ChatDataModule, MockDataModule from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer +from nemo.collections.llm.gpt.data import ChatDataModule, MockDataModule from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.optim import CosineAnnealingScheduler from nemo.utils import logging diff --git a/scripts/performance/llm/finetune_deepseek_v3.py b/scripts/performance/llm/finetune_deepseek_v3.py index d8ba675a72ce..bb3e53ea8ca1 100644 --- a/scripts/performance/llm/finetune_deepseek_v3.py +++ b/scripts/performance/llm/finetune_deepseek_v3.py @@ -16,10 +16,10 @@ import nemo_run as run +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes.deepseek_v3 import finetune_recipe, model -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin diff --git a/scripts/performance/llm/pretrain_deepseek_v3.py b/scripts/performance/llm/pretrain_deepseek_v3.py index 24038e51e7b5..709b47266edd 100644 --- a/scripts/performance/llm/pretrain_deepseek_v3.py +++ b/scripts/performance/llm/pretrain_deepseek_v3.py @@ -17,8 +17,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.deepseek_v3 import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.deepseek_v3 import pretrain_recipe from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin diff --git a/scripts/performance/llm/pretrain_gpt3_175b.py b/scripts/performance/llm/pretrain_gpt3_175b.py index 8dc56d29061d..ae3fe55a4217 100644 --- a/scripts/performance/llm/pretrain_gpt3_175b.py +++ b/scripts/performance/llm/pretrain_gpt3_175b.py @@ -18,6 +18,7 @@ import fiddle._src.experimental.dataclasses as fdl_dc import nemo_run as run +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.recipes.gpt3_175b import pretrain_recipe from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( userbuffers_bf16_b200_h12288_tp4_mbs1_seqlen2048, @@ -25,7 +26,6 @@ userbuffers_fp8_b200_h12288_tp4_mbs1_seqlen2048, userbuffers_fp8_h100_h12288_tp4_mbs1_seqlen2048, ) -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama31_405b.py b/scripts/performance/llm/pretrain_llama31_405b.py index 93995e9b4b30..1a40a189db37 100644 --- a/scripts/performance/llm/pretrain_llama31_405b.py +++ 
b/scripts/performance/llm/pretrain_llama31_405b.py @@ -18,6 +18,7 @@ import fiddle._src.experimental.dataclasses as fdl_dc import nemo_run as run +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.recipes.llama31_405b import pretrain_recipe from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( userbuffers_bf16_b200_h16384_tp4_cp2_mbs1_seqlen8192, @@ -25,7 +26,6 @@ userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192, userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192, ) -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama3_70b.py b/scripts/performance/llm/pretrain_llama3_70b.py index 4d8471add6b8..0f1458fb6f33 100644 --- a/scripts/performance/llm/pretrain_llama3_70b.py +++ b/scripts/performance/llm/pretrain_llama3_70b.py @@ -18,6 +18,7 @@ import fiddle._src.experimental.dataclasses as fdl_dc import nemo_run as run +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.recipes.llama3_70b import pretrain_recipe from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( userbuffers_bf16_b200_h8192_tp2_mbs1_seqlen8192, @@ -25,7 +26,6 @@ userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192, userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192, ) -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama3_8b.py b/scripts/performance/llm/pretrain_llama3_8b.py index 3d4df90efb4d..47a7f13824b2 100644 --- a/scripts/performance/llm/pretrain_llama3_8b.py +++ b/scripts/performance/llm/pretrain_llama3_8b.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.llama3_8b import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.llama3_8b import pretrain_recipe from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama4_e128.py b/scripts/performance/llm/pretrain_llama4_e128.py index 1dec173f8d08..fe5c525354fd 100644 --- a/scripts/performance/llm/pretrain_llama4_e128.py +++ b/scripts/performance/llm/pretrain_llama4_e128.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.llama4_e128 import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.llama4_e128 import pretrain_recipe from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_llama4_e16.py b/scripts/performance/llm/pretrain_llama4_e16.py index 0db891f15829..2140ce0f2cc4 100644 --- a/scripts/performance/llm/pretrain_llama4_e16.py +++ b/scripts/performance/llm/pretrain_llama4_e16.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.llama4_e16 import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from 
nemo.collections.llm.recipes.llama4_e16 import pretrain_recipe from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_mixtral_8x22b.py b/scripts/performance/llm/pretrain_mixtral_8x22b.py index b7abe01cef3c..64a9c20c876e 100644 --- a/scripts/performance/llm/pretrain_mixtral_8x22b.py +++ b/scripts/performance/llm/pretrain_mixtral_8x22b.py @@ -17,8 +17,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.mixtral_8x22b_64k import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.mixtral_8x22b_64k import pretrain_recipe from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_mixtral_8x7b.py b/scripts/performance/llm/pretrain_mixtral_8x7b.py index b490609587f0..58b4cce3582a 100644 --- a/scripts/performance/llm/pretrain_mixtral_8x7b.py +++ b/scripts/performance/llm/pretrain_mixtral_8x7b.py @@ -17,8 +17,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron3_22b.py b/scripts/performance/llm/pretrain_nemotron3_22b.py index 48b27231bf51..185f1ab808ec 100644 --- a/scripts/performance/llm/pretrain_nemotron3_22b.py +++ b/scripts/performance/llm/pretrain_nemotron3_22b.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron3_8b.py b/scripts/performance/llm/pretrain_nemotron3_8b.py index 7c4515bd4309..d7d3634821fd 100644 --- a/scripts/performance/llm/pretrain_nemotron3_8b.py +++ b/scripts/performance/llm/pretrain_nemotron3_8b.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.nemotron3_8b import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.nemotron3_8b import pretrain_recipe from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron4_15b.py b/scripts/performance/llm/pretrain_nemotron4_15b.py index 2ef0859049ad..4609a84ef6b1 100644 --- a/scripts/performance/llm/pretrain_nemotron4_15b.py +++ b/scripts/performance/llm/pretrain_nemotron4_15b.py @@ -18,9 +18,9 @@ import fiddle._src.experimental.dataclasses as fdl_dc import nemo_run as run +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.recipes.nemotron4_15b import pretrain_recipe from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import 
userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096 -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotron4_340b.py b/scripts/performance/llm/pretrain_nemotron4_340b.py index 8256bb709126..c1a074c6b745 100644 --- a/scripts/performance/llm/pretrain_nemotron4_340b.py +++ b/scripts/performance/llm/pretrain_nemotron4_340b.py @@ -18,12 +18,12 @@ import fiddle._src.experimental.dataclasses as fdl_dc import nemo_run as run +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.recipes.nemotron4_340b import pretrain_recipe from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096, userbuffers_fp8_b200_h18432_tp8_mbs1_seqlen4096, ) -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotronh_47b.py b/scripts/performance/llm/pretrain_nemotronh_47b.py index 2193d8e72bd1..a16ca2881eb7 100644 --- a/scripts/performance/llm/pretrain_nemotronh_47b.py +++ b/scripts/performance/llm/pretrain_nemotronh_47b.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.nemotronh_47b import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.nemotronh_47b import pretrain_recipe from nemo.lightning.run.plugins import NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotronh_56b.py b/scripts/performance/llm/pretrain_nemotronh_56b.py index 3f64f421f1fe..6a92a4b06f34 100644 --- a/scripts/performance/llm/pretrain_nemotronh_56b.py +++ b/scripts/performance/llm/pretrain_nemotronh_56b.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.nemotronh_56b import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.nemotronh_56b import pretrain_recipe from nemo.lightning.run.plugins import NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/llm/pretrain_nemotronh_8b.py b/scripts/performance/llm/pretrain_nemotronh_8b.py index ca0c6ff9435c..c2e7ca38289a 100644 --- a/scripts/performance/llm/pretrain_nemotronh_8b.py +++ b/scripts/performance/llm/pretrain_nemotronh_8b.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.nemotronh_8b import pretrain_recipe from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.nemotronh_8b import pretrain_recipe from nemo.lightning.run.plugins import NsysPlugin from ..argument_parser import parse_additional_slurm_params, parse_cli_args diff --git a/scripts/performance/vlm/finetune_qwen25vl_7b.py b/scripts/performance/vlm/finetune_qwen25vl_7b.py index 5e028ba8e481..3f69515d9e08 100644 --- a/scripts/performance/vlm/finetune_qwen25vl_7b.py +++ b/scripts/performance/vlm/finetune_qwen25vl_7b.py @@ -16,8 +16,8 @@ import nemo_run as run -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from 
nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.collections.vlm.recipes.qwen25vl_7b import finetune_recipe from nemo.lightning.run.plugins import NsysPlugin diff --git a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py index 05466243ff82..e248bc543181 100644 --- a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py +++ b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py @@ -20,9 +20,9 @@ from megatron.core.transformer.enums import AttnBackend from megatron.core.utils import init_method_normal, scaled_init_method_normal +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm import MixtralConfig8x7B, MixtralModel, PreTrainingDataModule from nemo.collections.llm.api import train -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import MegatronStrategy, NeMoLogger, Trainer from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule as MegatronOptim from nemo.lightning.pytorch.optim.megatron import OptimizerConfig diff --git a/tests/collections/llm/gpt/data/test_pre_training_data.py b/tests/collections/llm/gpt/data/test_pre_training_data.py index 2e50ffeb5bba..e033406d507f 100644 --- a/tests/collections/llm/gpt/data/test_pre_training_data.py +++ b/tests/collections/llm/gpt/data/test_pre_training_data.py @@ -15,8 +15,8 @@ import pytest import nemo.lightning as nl -from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule DATA_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document" VOCAB_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json" diff --git a/tests/collections/llm/gpt/model/test_hyena.py b/tests/collections/llm/gpt/model/test_hyena.py index d7257896d962..6764038f4db8 100644 --- a/tests/collections/llm/gpt/model/test_hyena.py +++ b/tests/collections/llm/gpt/model/test_hyena.py @@ -29,6 +29,7 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.gpt.data import MockDataModule, PreTrainingDataModule from nemo.collections.llm.gpt.data.megatron.hyena.config import parse_dataset_config from nemo.collections.llm.gpt.data.megatron.hyena.evo2_dataset import Evo2Dataset, Evo2DatasetPadEodLossMask @@ -37,7 +38,6 @@ userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192, ) -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import NeMoLogger from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.callbacks import ModelCheckpoint diff --git a/tests/collections/llm/gpt/model/test_nemotronh.py b/tests/collections/llm/gpt/model/test_nemotronh.py index 6d51d642e248..9cb7b288dd5f 100644 --- a/tests/collections/llm/gpt/model/test_nemotronh.py +++ b/tests/collections/llm/gpt/model/test_nemotronh.py @@ -24,8 +24,8 @@ from nemo import lightning as nl from nemo.collections import llm -from nemo.collections.llm.gpt.data import MockDataModule, PreTrainingDataModule from 
nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.gpt.data import MockDataModule, PreTrainingDataModule from nemo.lightning import NeMoLogger from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.callbacks import ModelCheckpoint diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index f580eeca7d3d..7a085da402fb 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -20,9 +20,9 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.gpt.data.core import get_dataset_root from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from tests.collections.llm.common import Llama3ConfigCI ## NOTE: This script is present for github-actions testing only. diff --git a/tests/collections/llm/megatron_gpt_pretraining.py b/tests/collections/llm/megatron_gpt_pretraining.py index 970eed4d201f..1d553beafbff 100644 --- a/tests/collections/llm/megatron_gpt_pretraining.py +++ b/tests/collections/llm/megatron_gpt_pretraining.py @@ -23,9 +23,9 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import train from nemo.collections.llm.gpt.data import PreTrainingDataModule -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import AutoResume, NeMoLogger from nemo.lightning.pytorch.callbacks import ModelCheckpoint, ModelTrainingStateCallback, ParameterDebugger from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index c1341b51a298..26dc10a96d68 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -21,9 +21,9 @@ from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig from megatron.core.transformer.enums import AttnBackend +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm import MixtralConfig8x3B, MixtralModel, PreTrainingDataModule from nemo.collections.llm.api import train -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import MegatronStrategy, NeMoLogger, Trainer from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule as MegatronOptim from nemo.lightning.pytorch.optim.megatron import OptimizerConfig diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index ea9e407681da..6e7b2f48f952 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -23,9 +23,9 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import finetune from nemo.collections.llm.t5.data import SquadDataModule -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import NeMoLogger from nemo.lightning.pytorch.callbacks 
import ModelCheckpoint from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py index 519d38a70e53..ef656d5f47a7 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -23,9 +23,9 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import pretrain from nemo.collections.llm.t5.data import MockDataModule, PreTrainingDataModule -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import NeMoLogger from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler diff --git a/tests/lightning/test_ddp_parity_checker.py b/tests/lightning/test_ddp_parity_checker.py index 5783e526e503..5143b10f43e4 100644 --- a/tests/lightning/test_ddp_parity_checker.py +++ b/tests/lightning/test_ddp_parity_checker.py @@ -21,8 +21,8 @@ from nemo import lightning as nl from nemo.collections import llm -from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.lightning.pytorch.callbacks import DdpParityChecker diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index 0d6aedf84d14..515dcac9f18b 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -31,8 +31,8 @@ def set_env(): import nemo.lightning as nl from nemo.collections import llm -from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.lightning.pytorch.optim import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py index 5a7d6b32d3c6..7feba02dce3d 100644 --- a/tests/lightning/test_state_restoration.py +++ b/tests/lightning/test_state_restoration.py @@ -22,9 +22,9 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.api import train from nemo.collections.llm.gpt.data import PreTrainingDataModule -from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.lightning import AutoResume, NeMoLogger from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule From 24003fed3353c8235e2a68987638c1581cd7c940 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 03:34:55 -0700 Subject: [PATCH 04/21] fix style Signed-off-by: dimapihtar --- nemo/collections/llm/t5/data/fine_tuning.py | 3 +++ nemo/collections/llm/t5/data/pre_training.py | 4 ++++ nemo/collections/nlp/modules/common/__init__.py | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/t5/data/fine_tuning.py 
b/nemo/collections/llm/t5/data/fine_tuning.py index c63a9b98c46b..367da5e4933d 100644 --- a/nemo/collections/llm/t5/data/fine_tuning.py +++ b/nemo/collections/llm/t5/data/fine_tuning.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# flake8: noqa +# pylint: skip-file + import math from functools import lru_cache from pathlib import Path diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 902bf342330f..5644d5edbc19 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# flake8: noqa +# pylint: skip-file + import logging import warnings from pathlib import Path @@ -19,6 +22,7 @@ import lightning.pytorch as pl from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from lightning.pytorch.utilities.exceptions import MisconfigurationException from torch.utils import data from nemo.lightning.data import WrappedDataLoader diff --git a/nemo/collections/nlp/modules/common/__init__.py b/nemo/collections/nlp/modules/common/__init__.py index 367ef5f1bd57..d075a6b3566b 100644 --- a/nemo/collections/nlp/modules/common/__init__.py +++ b/nemo/collections/nlp/modules/common/__init__.py @@ -14,7 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer, get_tokenizer_list +# flake8: noqa +# pylint: skip-file + from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.collections.nlp.modules.common.huggingface import ( AlbertEncoder, From 93c965904c26dd084b845b09c7213064a33ad68c Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 10:36:04 +0000 Subject: [PATCH 05/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/llm/t5/data/pre_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 5644d5edbc19..7db83a9d0034 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -21,8 +21,8 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional import lightning.pytorch as pl -from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from lightning.pytorch.utilities.exceptions import MisconfigurationException +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data from nemo.lightning.data import WrappedDataLoader From 16b4c800ceda568aaaa9a406e8092cf94e22dbef Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 03:33:59 -0700 Subject: [PATCH 06/21] fix style Signed-off-by: dimapihtar --- nemo/collections/llm/t5/data/pre_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 7db83a9d0034..3e923c4e86d7 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -23,6 +23,7 @@ import lightning.pytorch as pl from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from 
lightning.pytorch.utilities.exceptions import MisconfigurationException from torch.utils import data from nemo.lightning.data import WrappedDataLoader From c31440663f31c3cc799a7d6ac2dfb965bb85464f Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 03:37:41 -0700 Subject: [PATCH 07/21] remove extra import Signed-off-by: dimapihtar --- nemo/collections/llm/t5/data/pre_training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 3e923c4e86d7..7db83a9d0034 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -23,7 +23,6 @@ import lightning.pytorch as pl from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from lightning.pytorch.utilities.exceptions import MisconfigurationException from torch.utils import data from nemo.lightning.data import WrappedDataLoader From 025bd43645ad87b843fcde8ff0c4ea327c948584 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 04:14:59 -0700 Subject: [PATCH 08/21] move vocab file name Signed-off-by: dimapihtar --- .../modules/common/huggingface/__init__.py | 23 -- .../nlp/modules/common/huggingface/albert.py | 33 --- .../nlp/modules/common/huggingface/bert.py | 33 --- .../modules/common/huggingface/camembert.py | 33 --- .../modules/common/huggingface/distilbert.py | 34 --- .../nlp/modules/common/huggingface/gpt2.py | 58 ----- .../common/huggingface/huggingface_decoder.py | 79 ------- .../common/huggingface/huggingface_encoder.py | 99 -------- .../common/huggingface/huggingface_utils.py | 218 ------------------ .../nlp/modules/common/huggingface/roberta.py | 34 --- .../tts/models/language_modeling/nlp_model.py | 13 +- 11 files changed, 12 insertions(+), 645 deletions(-) delete mode 100644 nemo/collections/nlp/modules/common/huggingface/__init__.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/albert.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/bert.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/camembert.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/distilbert.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/gpt2.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/huggingface_decoder.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/huggingface_encoder.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py delete mode 100644 nemo/collections/nlp/modules/common/huggingface/roberta.py diff --git a/nemo/collections/nlp/modules/common/huggingface/__init__.py b/nemo/collections/nlp/modules/common/huggingface/__init__.py deleted file mode 100644 index 10052f643c0a..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder -from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder -from nemo.collections.nlp.modules.common.huggingface.camembert import CamembertEncoder -from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder -from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import ( - get_huggingface_lm_model, - get_huggingface_pretrained_lm_models_list, -) -from nemo.collections.nlp.modules.common.huggingface.roberta import RobertaEncoder diff --git a/nemo/collections/nlp/modules/common/huggingface/albert.py b/nemo/collections/nlp/modules/common/huggingface/albert.py deleted file mode 100644 index eff926f02957..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/albert.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers import AlbertModel - -from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.core.classes import typecheck - -__all__ = ['AlbertEncoder'] - - -class AlbertEncoder(AlbertModel, BertModule): - """ - Wraps around the Huggingface transformers implementation repository for easy use within NeMo. - """ - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - res = super().forward(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0] - return res diff --git a/nemo/collections/nlp/modules/common/huggingface/bert.py b/nemo/collections/nlp/modules/common/huggingface/bert.py deleted file mode 100644 index a9275efa982c..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/bert.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers import BertModel - -from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.core.classes import typecheck - -__all__ = ['BertEncoder'] - - -class BertEncoder(BertModel, BertModule): - """ - Wraps around the Huggingface transformers implementation repository for easy use within NeMo. 
- """ - - @typecheck() - def forward(self, input_ids, attention_mask=None, token_type_ids=None): - res = super().forward(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0] - return res diff --git a/nemo/collections/nlp/modules/common/huggingface/camembert.py b/nemo/collections/nlp/modules/common/huggingface/camembert.py deleted file mode 100644 index 34c917200285..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/camembert.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2020 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers import CamembertModel - -from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.core.classes import typecheck - -__all__ = ['CamembertEncoder'] - - -class CamembertEncoder(CamembertModel, BertModule): - """ - Wraps around the Huggingface transformers implementation repository for easy use within NeMo. - """ - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - res = super().forward(input_ids=input_ids, attention_mask=attention_mask)[0] - return res diff --git a/nemo/collections/nlp/modules/common/huggingface/distilbert.py b/nemo/collections/nlp/modules/common/huggingface/distilbert.py deleted file mode 100644 index 21c3469b38aa..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/distilbert.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2020 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers import DistilBertModel - -from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.core.classes import typecheck - -__all__ = ['DistilBertEncoder'] - - -class DistilBertEncoder(DistilBertModel, BertModule): - """ - Wraps around the Huggingface transformers implementation repository for easy use within NeMo. 
- """ - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids=None): - # distilBert does not use token_type_ids as the most of the other Bert models - res = super().forward(input_ids=input_ids, attention_mask=attention_mask)[0] - return res diff --git a/nemo/collections/nlp/modules/common/huggingface/gpt2.py b/nemo/collections/nlp/modules/common/huggingface/gpt2.py deleted file mode 100644 index a4457916f737..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/gpt2.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers import GPT2LMHeadModel - -from nemo.collections.nlp.modules.common.gpt_module import GPTModule -from nemo.core.classes import typecheck - -__all__ = ['GPT2Encoder'] - - -class GPT2Encoder(GPT2LMHeadModel, GPTModule): - """ - Wraps around the Huggingface transformers implementation repository for easy use within NeMo. - """ - - @typecheck() - def forward( - self, - input_ids, - attention_mask=None, - token_type_ids=None, - labels=None, - return_dict=False, - output_attentions=False, - output_hidden_states=False, - past_key_values=None, - use_cache=False, - position_ids=None, - max_length=128, - ): - res = super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - return_dict=return_dict, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - past_key_values=past_key_values, - position_ids=position_ids, - use_cache=use_cache, - ) - - return res if not return_dict else res diff --git a/nemo/collections/nlp/modules/common/huggingface/huggingface_decoder.py b/nemo/collections/nlp/modules/common/huggingface/huggingface_decoder.py deleted file mode 100644 index 20049ae6d7c9..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/huggingface_decoder.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Optional - -from hydra.utils import instantiate -from transformers import AutoConfig, AutoModel - -from nemo.collections.nlp.modules.common.decoder_module import DecoderModule -from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list -from nemo.utils import logging - - -class HuggingFaceDecoderModule(DecoderModule): - """Gets HuggingFace based model to be used as an Decoder in NeMo NLP. - Use the model_name arg to get a named model architecture. - Available model names can be found with get_huggingface_pretrained_lm_models_list() or - by going to https://huggingface.co/models. - Use the pretrained arg to get the named model architecture with or without pretrained weights. - - If model_name is None, then we can pass in a custom configuration via the config_dict. - For example, to instantiate a HuggingFace BERT model with custom configuration we would do: - config_dict={ - '_target_': 'transformers.BertConfig', - 'hidden_size': 1536 - } - - - Args: - model_name (Optional[str]): Named model architecture from HuggingFace. Defaults to None. - pretrained (bool): Use True to get pretrained weights. - False will use the same architecture but with randomly initialized weights. - Defaults to False. - config_dict (Optional[dict], optional): Use for custom configuration of the HuggingFace model. Defaults to None. - checkpoint_file (Optional[str], optional): Provide weights for the transformer from a local checkpoint. Defaults to None. - """ - - def __init__( - self, - model_name: Optional[str] = None, - pretrained: bool = False, - config_dict: Optional[dict] = None, - checkpoint_file: Optional[str] = None, - ): - super().__init__() - model = None - if model_name is not None: - if model_name in get_huggingface_pretrained_lm_models_list(): - if pretrained: - model = AutoModel.from_pretrained(model_name) - else: - cfg = AutoConfig.from_pretrained(model_name) - model = AutoModel.from_config(cfg) - else: - logging.error(f'{model_name} not found in list of HuggingFace pretrained models') - else: - cfg = instantiate(config_dict) - model = AutoModel.from_config(cfg) - self._hidden_size = model.config.hidden_size - self._vocab_size = model.config.vocab_size - - @property - def hidden_size(self) -> Optional[int]: - return self._hidden_size - - @property - def vocab_size(self) -> Optional[int]: - return self._vocab_size diff --git a/nemo/collections/nlp/modules/common/huggingface/huggingface_encoder.py b/nemo/collections/nlp/modules/common/huggingface/huggingface_encoder.py deleted file mode 100644 index 2c5e17e1be02..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/huggingface_encoder.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Optional - -from hydra.utils import instantiate -from transformers import AutoConfig, AutoModel - -from nemo.collections.nlp.modules.common.encoder_module import EncoderModule -from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list -from nemo.core.classes.common import typecheck -from nemo.utils import logging - - -class HuggingFaceEncoderModule(EncoderModule): - """ Class for using HuggingFace encoders in NeMo NLP.""" - - def __init__( - self, - model_name: Optional[str] = None, - pretrained: bool = False, - config_dict: Optional[dict] = None, - checkpoint_file: Optional[str] = None, - ): - """Gets HuggingFace based model to be used as an Encoder in NeMo NLP. - Use the model_name arg to get a named model architecture. - Available model names can be found with get_huggingface_pretrained_lm_models_list() or - by going to https://huggingface.co/models. - Use the pretrained arg to get the named model architecture with or without pretrained weights. - - If model_name is None, then we can pass in a custom configuration via the config_dict. - For example, to instantiate a HuggingFace BERT model with custom configuration we would do: - config_dict={ - '_target_': 'transformers.BertConfig', - 'hidden_size': 1536 - } - - - Args: - model_name (Optional[str]): Named model architecture from HuggingFace. Defaults to None. - pretrained (bool): Use True to get pretrained weights. - False will use the same architecture but with randomly initialized weights. - Defaults to False. - config_dict (Optional[dict], optional): Use for custom configuration of the HuggingFace model. Defaults to None. - checkpoint_file (Optional[str], optional): Provide weights for the transformer from a local checkpoint. Defaults to None. - """ - super().__init__() - - if checkpoint_file: - raise NotImplementedError('Restoring from checkpoint file not implemented yet.') - - model = None - if model_name is not None: - if model_name in get_huggingface_pretrained_lm_models_list(include_external=False): - if pretrained: - config_dict.pop('vocab_size') - if config_dict: - raise ValueError( - f'When using pretrained model, config_dict should be None or empty. Got: {config_dict}' - ) - model = AutoModel.from_pretrained(model_name) - else: - cfg = AutoConfig.from_pretrained(model_name) - model = AutoModel.from_config(cfg) - else: - logging.error(f'{model_name} not found in list of HuggingFace pretrained models') - else: - if pretrained: - raise ValueError(f'If not using model_name, then pretrained should be False. Got: {pretrained}.') - cfg = instantiate(config_dict) - model = AutoModel.from_config(cfg) - self._hidden_size = model.config.hidden_size - self._vocab_size = model.config.vocab_size - - self._encoder = model - - @typecheck() - def forward(self, input_ids, encoder_mask): - encoder_hidden_states = self._encoder.forward(input_ids=input_ids, attention_mask=encoder_mask)[0] - return encoder_hidden_states - - @property - def hidden_size(self) -> Optional[int]: - return self._hidden_size - - @property - def vocab_size(self) -> Optional[int]: - return self._vocab_size diff --git a/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py b/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py deleted file mode 100644 index d8f6936f7126..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional - -from transformers import ( - AlbertConfig, - AutoModel, - BertConfig, - CamembertConfig, - DistilBertConfig, - GPT2Config, - RobertaConfig, -) - -from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder -from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder -from nemo.collections.nlp.modules.common.huggingface.camembert import CamembertEncoder -from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder -from nemo.collections.nlp.modules.common.huggingface.gpt2 import GPT2Encoder -from nemo.collections.nlp.modules.common.huggingface.roberta import RobertaEncoder -from nemo.utils import logging - -__all__ = ["get_huggingface_lm_model", "get_huggingface_pretrained_lm_models_list", "VOCAB_FILE_NAME"] - -# Manually specify the model archive lists since these are now removed in HF -# https://github.com/huggingface/transformers/blob/v4.40-release/src/transformers/models/deprecated/_archive_maps.py -ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "albert/albert-base-v1", - "albert/albert-large-v1", - "albert/albert-xlarge-v1", - "albert/albert-xxlarge-v1", - "albert/albert-base-v2", - "albert/albert-large-v2", - "albert/albert-xlarge-v2", - "albert/albert-xxlarge-v2", -] - -BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "google-bert/bert-base-uncased", - "google-bert/bert-large-uncased", - "google-bert/bert-base-cased", - "google-bert/bert-large-cased", - "google-bert/bert-base-multilingual-uncased", - "google-bert/bert-base-multilingual-cased", - "google-bert/bert-base-chinese", - "google-bert/bert-base-german-cased", - "google-bert/bert-large-uncased-whole-word-masking", - "google-bert/bert-large-cased-whole-word-masking", - "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", - "google-bert/bert-large-cased-whole-word-masking-finetuned-squad", - "google-bert/bert-base-cased-finetuned-mrpc", - "google-bert/bert-base-german-dbmdz-cased", - "google-bert/bert-base-german-dbmdz-uncased", - "cl-tohoku/bert-base-japanese", - "cl-tohoku/bert-base-japanese-whole-word-masking", - "cl-tohoku/bert-base-japanese-char", - "cl-tohoku/bert-base-japanese-char-whole-word-masking", - "TurkuNLP/bert-base-finnish-cased-v1", - "TurkuNLP/bert-base-finnish-uncased-v1", - "wietsedv/bert-base-dutch-cased", -] -CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "almanach/camembert-base", - "Musixmatch/umberto-commoncrawl-cased-v1", - "Musixmatch/umberto-wikipedia-uncased-v1", -] - -DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "distilbert-base-uncased", - "distilbert-base-uncased-distilled-squad", - "distilbert-base-cased", - "distilbert-base-cased-distilled-squad", - "distilbert-base-german-cased", - "distilbert-base-multilingual-cased", - "distilbert-base-uncased-finetuned-sst-2-english", -] -GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "openai-community/gpt2", - "openai-community/gpt2-medium", - "openai-community/gpt2-large", - 
"openai-community/gpt2-xl", - "distilbert/distilgpt2", -] -ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "FacebookAI/roberta-base", - "FacebookAI/roberta-large", - "FacebookAI/roberta-large-mnli", - "distilbert/distilroberta-base", - "openai-community/roberta-base-openai-detector", - "openai-community/roberta-large-openai-detector", -] - - -HUGGINGFACE_MODELS = { - "BertModel": { - "default": "bert-base-uncased", - "class": BertEncoder, - "config": BertConfig, - "pretrained_model_list": BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - }, - "DistilBertModel": { - "default": "distilbert-base-uncased", - "class": DistilBertEncoder, - "config": DistilBertConfig, - "pretrained_model_list": DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - }, - "CamembertModel": { - "default": "camembert-base-uncased", - "class": CamembertEncoder, - "config": CamembertConfig, - "pretrained_model_list": CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - }, - "RobertaModel": { - "default": "roberta-base", - "class": RobertaEncoder, - "config": RobertaConfig, - "pretrained_model_list": ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, - }, - "AlbertModel": { - "default": "albert-base-v2", - "class": AlbertEncoder, - "config": AlbertConfig, - "pretrained_model_list": ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - }, - "GPT2Model": { - "default": "gpt2", - "class": GPT2Encoder, - "config": GPT2Config, - "pretrained_model_list": GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, - }, -} - -VOCAB_FILE_NAME = { - 'AlbertTokenizer': "spiece.model", - 'RobertaTokenizer': "vocab.json", - 'BertTokenizer': "vocab.txt", - 'DistilBertTokenizer': "vocab.txt", - 'CamembertTokenizer': "sentencepiece.bpe.model", - 'GPT2Tokenizer': "vocab.json", - 'T5Tokenizer': "spiece.model", - "BartTokenizer": "vocab.json", -} - - -def get_huggingface_lm_model( - pretrained_model_name: str, - config_dict: Optional[dict] = None, - config_file: Optional[str] = None, -): - """ - Returns lm model instantiated with Huggingface - - Args: - pretrained_mode_name: specify this to instantiate pretrained model from Huggingface, - e.g. bert-base-cased. For entire list, see get_huggingface_pretrained_lm_models_list(). - config_dict: model configuration dictionary used to instantiate Huggingface model from scratch - config_file: path to model configuration file used to instantiate Huggingface model from scratch - - Returns: - BertModule - """ - - try: - automodel = AutoModel.from_pretrained(pretrained_model_name) - except Exception as e: - raise ValueError(f"{pretrained_model_name} is not supported by HuggingFace. {e}") - - model_type = type(automodel).__name__ - - if model_type in HUGGINGFACE_MODELS: - model_class = HUGGINGFACE_MODELS[model_type]["class"] - if config_file: - if not os.path.exists(config_file): - logging.warning( - f"Config file was not found at {config_file}. Will attempt to use config_dict or pretrained_model_name." 
- ) - else: - config_class = HUGGINGFACE_MODELS[model_type]["config"] - return model_class(config_class.from_json_file(config_file)) - if config_dict: - config_class = HUGGINGFACE_MODELS[model_type]["config"] - return model_class(config=config_class(**config_dict)) - else: - return model_class.from_pretrained(pretrained_model_name) - else: - raise ValueError(f"Use HuggingFace API directly in NeMo for {pretrained_model_name}") - - -def get_huggingface_pretrained_lm_models_list( - include_external: bool = False, -) -> List[str]: - """ - Returns the list of pretrained HuggingFace language models - - Args: - include_external if true includes all HuggingFace model names, not only those supported language models in NeMo. - - Returns the list of HuggingFace models - """ - - huggingface_models = [] - for model in HUGGINGFACE_MODELS: - model_names = HUGGINGFACE_MODELS[model]["pretrained_model_list"] - huggingface_models.extend(model_names) - return huggingface_models diff --git a/nemo/collections/nlp/modules/common/huggingface/roberta.py b/nemo/collections/nlp/modules/common/huggingface/roberta.py deleted file mode 100644 index f21d1e7ba41e..000000000000 --- a/nemo/collections/nlp/modules/common/huggingface/roberta.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from transformers import RobertaModel - -from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.core.classes import typecheck - -__all__ = ['RobertaEncoder'] - - -class RobertaEncoder(RobertaModel, BertModule): - """ - Wraps around the Huggingface transformers implementation repository for easy use within NeMo. 
- """ - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - res = super().forward(input_ids=input_ids, attention_mask=attention_mask)[0] - return res diff --git a/nemo/collections/tts/models/language_modeling/nlp_model.py b/nemo/collections/tts/models/language_modeling/nlp_model.py index df5add50b07b..72ee023bcd43 100644 --- a/nemo/collections/tts/models/language_modeling/nlp_model.py +++ b/nemo/collections/tts/models/language_modeling/nlp_model.py @@ -31,7 +31,6 @@ from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer from nemo.collections.nlp.modules import BertModule -from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import VOCAB_FILE_NAME from nemo.collections.nlp.modules.common.lm_utils import get_lm_model from nemo.collections.nlp.modules.common.megatron.megatron_utils import ( MEGATRON_CONFIG_MAP, @@ -58,6 +57,18 @@ os.makedirs(NEMO_NLP_TMP, exist_ok=True) +VOCAB_FILE_NAME = { + "AlbertTokenizer": "spiece.model", + "RobertaTokenizer": "vocab.json", + "BertTokenizer": "vocab.txt", + "DistilBertTokenizer": "vocab.txt", + "CamembertTokenizer": "sentencepiece.bpe.model", + "GPT2Tokenizer": "vocab.json", + "T5Tokenizer": "spiece.model", + "BartTokenizer": "vocab.json", +} + + class NLPModel(ModelPT, Exportable): """Base class for NLP Models.""" From 515f348ccc699573ce2395c93ac8e06ee62fcef7 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 11:15:57 +0000 Subject: [PATCH 09/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/tts/models/language_modeling/nlp_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/tts/models/language_modeling/nlp_model.py b/nemo/collections/tts/models/language_modeling/nlp_model.py index 72ee023bcd43..38b8eec469d9 100644 --- a/nemo/collections/tts/models/language_modeling/nlp_model.py +++ b/nemo/collections/tts/models/language_modeling/nlp_model.py @@ -69,7 +69,6 @@ } - class NLPModel(ModelPT, Exportable): """Base class for NLP Models.""" From 8d6545e2f924b707f50241622dddf8216089d259 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 04:17:50 -0700 Subject: [PATCH 10/21] remove HF imports Signed-off-by: dimapihtar --- nemo/collections/nlp/modules/__init__.py | 7 ------- nemo/collections/nlp/modules/common/__init__.py | 7 ------- 2 files changed, 14 deletions(-) diff --git a/nemo/collections/nlp/modules/__init__.py b/nemo/collections/nlp/modules/__init__.py index 82f0ee3bbcd1..e45960eb0422 100644 --- a/nemo/collections/nlp/modules/__init__.py +++ b/nemo/collections/nlp/modules/__init__.py @@ -13,17 +13,10 @@ # limitations under the License. 
-from nemo.collections.nlp.modules.common import AlbertEncoder # noqa: F401 -from nemo.collections.nlp.modules.common import BertEncoder # noqa: F401 from nemo.collections.nlp.modules.common import BertModule # noqa: F401 -from nemo.collections.nlp.modules.common import CamembertEncoder # noqa: F401 -from nemo.collections.nlp.modules.common import DistilBertEncoder # noqa: F401 from nemo.collections.nlp.modules.common import PromptEncoder # noqa: F401 -from nemo.collections.nlp.modules.common import RobertaEncoder # noqa: F401 from nemo.collections.nlp.modules.common import SequenceClassifier # noqa: F401 from nemo.collections.nlp.modules.common import SequenceRegression # noqa: F401 from nemo.collections.nlp.modules.common import SequenceTokenClassifier # noqa: F401 from nemo.collections.nlp.modules.common import get_lm_model # noqa: F401 from nemo.collections.nlp.modules.common import get_pretrained_lm_models_list # noqa: F401 -from nemo.collections.nlp.modules.common import get_tokenizer # noqa: F401 -from nemo.collections.nlp.modules.common import get_tokenizer_list # noqa: F401 diff --git a/nemo/collections/nlp/modules/common/__init__.py b/nemo/collections/nlp/modules/common/__init__.py index d075a6b3566b..8c68a7c09883 100644 --- a/nemo/collections/nlp/modules/common/__init__.py +++ b/nemo/collections/nlp/modules/common/__init__.py @@ -18,13 +18,6 @@ # pylint: skip-file from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.collections.nlp.modules.common.huggingface import ( - AlbertEncoder, - BertEncoder, - CamembertEncoder, - DistilBertEncoder, - RobertaEncoder, -) from nemo.collections.nlp.modules.common.lm_utils import get_lm_model, get_pretrained_lm_models_list from nemo.collections.nlp.modules.common.prompt_encoder import PromptEncoder, PromptEncoderType from nemo.collections.nlp.modules.common.prompt_table import ( From 794112cfbd418e6eb2ac62b647c3fcde77384b85 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 04:20:58 -0700 Subject: [PATCH 11/21] remove hyena submodule Signed-off-by: dimapihtar --- .../nlp/modules/common/hyena/README.md | 26 -- .../nlp/modules/common/hyena/__init__.py | 1 - .../modules/common/hyena/fftconv_wrapper.py | 143 ------- .../nlp/modules/common/hyena/hyena.py | 405 ------------------ .../nlp/modules/common/hyena/hyena_filter.py | 187 -------- .../nlp/modules/common/hyena/hyena_spec.py | 74 ---- 6 files changed, 836 deletions(-) delete mode 100644 nemo/collections/nlp/modules/common/hyena/README.md delete mode 100644 nemo/collections/nlp/modules/common/hyena/__init__.py delete mode 100644 nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py delete mode 100644 nemo/collections/nlp/modules/common/hyena/hyena.py delete mode 100644 nemo/collections/nlp/modules/common/hyena/hyena_filter.py delete mode 100644 nemo/collections/nlp/modules/common/hyena/hyena_spec.py diff --git a/nemo/collections/nlp/modules/common/hyena/README.md b/nemo/collections/nlp/modules/common/hyena/README.md deleted file mode 100644 index a5e7b32cc590..000000000000 --- a/nemo/collections/nlp/modules/common/hyena/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## Required Dependencies for Hyena - -We depend on 3rd-party libraries for FFT convolutions implementation. 
Each library supports different use-cases:
-
-| Library | Supported Sequence Length | Single/Multi-Head Support |
-|:----------------:|:-------------------------:|:-------------------------:|
-| Safari `fftconv` | Up to 8192 | 1 or 8 heads |
-| FlashFFTConv | Up to 4M | Single-head only |
-
-Note the overlapping support for single-head with sequence length up to 8192. By default, in this case we default to Safari `fftconv` as it is faster (and fallback to FlashFFTConv). The user may force the FFT convolution implementation used by setting the configuration key `model.hyena.fftconv_type` to either `safari` or `flash`.
-
-### Installation
-
-#### Safari `fftconv`
-
-Install from the [Safari repository](https://github.com/HazyResearch/safari/tree/main/csrc/fftconv). Run the following in a terminal:
-
-```bash
-git clone https://github.com/HazyResearch/safari.git
-cd safari/csrc/fftconv
-pip install .
-```
-
-#### FlashFFTConv
-
-Follow the [installation instructions](https://github.com/HazyResearch/flash-fft-conv?tab=readme-ov-file#installation) in the FlashFFTConv repository.
diff --git a/nemo/collections/nlp/modules/common/hyena/__init__.py b/nemo/collections/nlp/modules/common/hyena/__init__.py
deleted file mode 100644
index f976e8f9d9c6..000000000000
--- a/nemo/collections/nlp/modules/common/hyena/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from nemo.collections.nlp.modules.common.hyena.hyena import HyenaOperator
diff --git a/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py b/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py
deleted file mode 100644
index c17b01a1acba..000000000000
--- a/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import math - -import torch -from einops import rearrange -from fftconv import fftconv_bwd, fftconv_fwd - -# Code taken from: -# https://github.com/HazyResearch/safari/blob/main/src/ops/fftconv.py - - -class FFTConvFunc(torch.autograd.Function): - - @staticmethod - def forward( - ctx, - u, - k, - D, - dropout_mask=None, - gelu=True, - force_fp16_output=False, - output_hbl_layout=False, - v=None, - head_dim=1, - q=None, - fftfp16=False, - k_rev=None, - ): - seqlen = u.shape[-1] - fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) - k_f = torch.fft.rfft(k, n=fft_size) - if k_rev is not None: - k_f = k_f + torch.fft.rfft(k_rev, n=fft_size).conj() - if u.stride(-1) != 1: - u = u.contiguous() - k_f = k_f.contiguous() - D = D.contiguous() - if v is not None and v.stride(-1) != 1: - v = v.contiguous() - if q is not None and q.stride(-1) != 1: - q = q.contiguous() - if dropout_mask is not None: - dropout_mask = dropout_mask.contiguous() - ctx.save_for_backward(u, k_f, D, dropout_mask, v, q) - ctx.output_hbl_layout = output_hbl_layout - ctx.head_dim = head_dim - ctx.gelu = gelu - ctx.fftfp16 = fftfp16 - ctx.has_k_rev = k_rev is not None - out = fftconv_fwd( - u, - k_f, - D, - v, - head_dim, - q, - dropout_mask, - gelu, - False, - False, - fft_size, - force_fp16_output, - output_hbl_layout, - fftfp16, - ) - return out - - @staticmethod - def backward(ctx, dout): - if ctx.output_hbl_layout: - dout = rearrange(rearrange(dout, 'b h l -> h b l').contiguous(), 'h b l -> b h l') - else: - dout = dout.contiguous() - u, k_f, D, dropout_mask, v, q = ctx.saved_tensors - seqlen = u.shape[-1] - fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) - du, dk_f, dD, dv, dq = fftconv_bwd( - dout, - u, - k_f, - D, - v, - ctx.head_dim, - q, - dropout_mask, - ctx.gelu, - False, - False, - fft_size, - ctx.output_hbl_layout, - ctx.fftfp16, - ) - dk = torch.fft.irfft(dk_f, n=fft_size, norm='forward')[..., :seqlen] - dk_rev = None if not ctx.has_k_rev else torch.fft.irfft(dk_f.conj(), n=fft_size, norm='forward')[..., :seqlen] - if v is not None: - dv = dv.to(dtype=v.dtype) # We do atomicAdd in fp32 so might need to convert to fp16 - return ( - du, - dk, - dD, - None, - None, - None, - None, - dv, - None, - dq, - None, - dk_rev, - ) - - -def fftconv_func( - u, - k, - D, - dropout_mask=None, - gelu=True, - force_fp16_output=False, - output_hbl_layout=False, - v=None, - head_dim=1, - q=None, - fftfp16=False, - k_rev=None, -): - return FFTConvFunc.apply( - u, k, D, dropout_mask, gelu, force_fp16_output, output_hbl_layout, v, head_dim, q, fftfp16, k_rev - ) diff --git a/nemo/collections/nlp/modules/common/hyena/hyena.py b/nemo/collections/nlp/modules/common/hyena/hyena.py deleted file mode 100644 index b33f357e8741..000000000000 --- a/nemo/collections/nlp/modules/common/hyena/hyena.py +++ /dev/null @@ -1,405 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Implementation of Hyena operator -# -# Michael Poli and Stefano Massaroli and Eric Nguyen and Daniel Y Fu and Tri Dao and Stephen Baccus and -# Yoshua Bengio and Stefano Ermon and Christopher Re, -# Hyena Hierarchy: Towards Larger Convolutional Language Models -# 2023, https://arxiv.org/abs/2302.10866 -# -# Multi-head variant introduced in: -# -# Stefano Massaroli and Michael Poli and Daniel Y Fu and Hermann Kumbong and Rom Nishijima Parnichkun and -# David W. Romero and Aman Timalsina and Quinn McIntyre and Beidi Chen and Atri Rudra and Ce Zhang and -# Christopher Re and Stefano Ermon and Yoshua Bengio, -# Laughing Hyena Distillery: Extracting Compact Recurrences From Convolutions -# NeurIPS 2023, https://arxiv.org/abs/2310.18780 -# -# Code is heavily based on the reference implementations from: -# https://github.com/HazyResearch/safari/blob/flashfftconv/src/models/sequence/hyena.py -# https://github.com/athms/mad-lab/blob/main/mad/model/layers/hyena.py - -from dataclasses import dataclass -from typing import Union - -import torch -import torch.nn as nn -from einops import rearrange - -try: - from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear, TERowParallelLinear - from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp - from megatron.core.transformer.spec_utils import ModuleSpec, build_module - from megatron.core.transformer.transformer_config import TransformerConfig - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - - ModuleSpec = ApexGuardDefaults - IdentityFuncOp = ApexGuardDefaults - IdentityOp = ApexGuardDefaults - TransformerConfig = ApexGuardDefaults - HAVE_MEGATRON_CORE = False - -from nemo.collections.common.parts.utils import activation_registry -from nemo.collections.nlp.modules.common.hyena.hyena_filter import HyenaFilter, HyenaFilterSubmodules -from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision -from nemo.utils.metaclasses import Singleton - -try: - from nemo.collections.nlp.modules.common.hyena.fftconv_wrapper import fftconv_func as safari_fftconv_fn - - HAVE_SAFARI_FFTCONV = True -except ImportError: - HAVE_SAFARI_FFTCONV = False - -try: - from flashfftconv import FlashFFTConv as FlashFFTConvImpl - - HAVE_FLASHFFTCONV = True - - class FlashFFTConv(metaclass=Singleton): - # Recommendation is to create single instance per model - # https://github.com/HazyResearch/flash-fft-conv?tab=readme-ov-file#example-model - def __init__(self, seqlen, dtype): - self.flashfftconv = FlashFFTConvImpl(seqlen, dtype) - -except ImportError: - HAVE_FLASHFFTCONV = False - -try: - from causal_conv1d import causal_conv1d_fn - - HAVE_CAUSAL_CONV1D = True -except ImportError: - HAVE_CAUSAL_CONV1D = False - - -@dataclass -class HyenaOperatorSubmodules: - in_proj: Union[ModuleSpec, type] = IdentityOp - short_filter: Union[ModuleSpec, type] = IdentityFuncOp - implicit_filter: Union[ModuleSpec, type] = IdentityOp - out_proj: Union[ModuleSpec, type] = IdentityOp - - -def auto_assign_attrs(cls, **kwargs): - for k, v in kwargs.items(): - setattr(cls, k, v) - - -class CausalDepthWiseConv1d(nn.Module): - def __init__(self, channels, width, bias=True): - if not HAVE_CAUSAL_CONV1D: - raise ImportError("Missing causal-conv1d library, please run 'pip install causal-conv1d'") - - super().__init__() - self.channels = channels - self.width = width - self._conv_1d = nn.Conv1d( - in_channels=channels, - 
out_channels=channels, - kernel_size=width, - padding=width - 1, - groups=channels, - bias=bias, - ) - - def forward(self, x): - return causal_conv1d_fn(x, self._conv_1d.weight.squeeze(1), self._conv_1d.bias) - - -class HyenaConv(nn.Module): - def __init__( - self, - d_model: int, - max_seq_length: int, - order: int, - bias: bool = True, - filter_cls: Union[ModuleSpec, type] = HyenaFilter, - filter_submodules: HyenaFilterSubmodules = None, - **filter_kwargs, - ): - super().__init__() - self.d_model = d_model - self.order = order - self.max_seq_length = max_seq_length - self.use_bias = bias - bias_shape = self.d_model * (self.order - 1) - if self.use_bias: - self.bias = nn.Parameter(torch.randn(bias_shape)) - else: - self.bias = torch.zeros(bias_shape) - - self.filter = build_module( - filter_cls, - self.d_model * (self.order - 1), - submodules=filter_submodules, - seq_len=max_seq_length, - **filter_kwargs, - ) - - -class SingleHeadHyenaConv(HyenaConv): - def __init__( - self, - d_model: int, - max_seq_length: int, - order: int, - bias: bool = True, - filter_cls: Union[ModuleSpec, type] = HyenaFilter, - filter_submodules: HyenaFilterSubmodules = None, - fftconv_type: str = None, - precision: str = 'bf16', - **filter_kwargs, - ): - super().__init__( - d_model, - max_seq_length, - order, - bias=bias, - filter_cls=filter_cls, - filter_submodules=filter_submodules, - **filter_kwargs, - ) - - if fftconv_type is None: - if max_seq_length <= 8192 and HAVE_SAFARI_FFTCONV: - # safari-fftconv supports seq-len <= 8192 and is a bit faster vs. flashfftconv - fftconv_type = 'safari' - else: - fftconv_type = 'flash' - - if fftconv_type not in ['safari', 'flash']: - raise ValueError("fftconv_type must be one of ['safari', 'flash']") - if fftconv_type == 'safari' and max_seq_length > 8192: - raise ValueError('Safari-fftconv only supports sequence length up to 8192') - if fftconv_type == 'safari' and not HAVE_SAFARI_FFTCONV: - raise ImportError('Safari-fftconv library not found. Please see README at for instructions.') - if fftconv_type == 'flash' and not HAVE_FLASHFFTCONV: - raise ImportError('flashfftconv library not found. Please see README at for instructions.') - - if fftconv_type == 'safari': - self.fftconv_fn = self._safari_fft - else: # fftconv_type == 'flash' - self.flashfftconv = FlashFFTConv( - 2 * self.max_seq_length, torch_dtype_from_precision(precision) - ).flashfftconv - self.fftconv_fn = self._flash_fft - - def _safari_fft(self, x, k, bias): - bias = bias.to(dtype=torch.float32) - return safari_fftconv_fn(x, k, bias, gelu=False) - - def _flash_fft(self, x, k, bias): - x = x.contiguous() - y = self.flashfftconv(x, k) + x * bias.unsqueeze(dim=1) - return y - - def forward(self, x, k, recurrence_idx): - bias = rearrange(self.bias, '(v o) -> o v', v=self.d_model, o=self.order - 1)[recurrence_idx] - y = self.fftconv_fn(x, k, bias) - return y - - -class MultiHeadHyenaConv(HyenaConv): - def __init__( - self, - d_model: int, - max_seq_length: int, - order: int, - num_heads: int, - bias: bool = True, - filter_cls: Union[ModuleSpec, type] = HyenaFilter, - filter_submodules: HyenaFilterSubmodules = None, - fftconv_type: str = None, - precision: str = 'bf16', - **filter_kwargs, - ): - if num_heads == 1: - raise ValueError('Expecting num_heads > 1') - if order != 2: - raise ValueError(f'Multi-head supported only with order == 2 (got order {self.order})') - if not HAVE_SAFARI_FFTCONV: - raise ImportError('Safari-fftconv library not found. 
Please see README at for instructions.') - - super().__init__( - d_model, - max_seq_length, - order, - bias=bias, - filter_cls=filter_cls, - filter_submodules=filter_submodules, - **filter_kwargs, - ) - self.num_heads = num_heads - - def forward(self, v, k, x1, x2): - bias = self.bias.to(dtype=torch.float32) - y = safari_fftconv_fn(v, k, bias, gelu=False, output_hbl_layout=True, v=x2, head_dim=self.num_heads, q=x1) - return y - - -class HyenaOperator(nn.Module): - def __init__( - self, - config: TransformerConfig, - max_seq_length: int, - order: int = 2, - num_heads: int = 1, - dropout: float = 0.0, - short_filter_order: int = 3, - activation: str = "identity", - submodules: HyenaOperatorSubmodules = None, - layer_number=None, - **long_conv_kwargs, - ): - r""" - Hyena operator described in the paper https://arxiv.org/pdf/2302.10866.pdf - - Args: - max_seq_length: (int): Maximum input sequence length. - order: (int): Depth of the Hyena recurrence. Defaults to 2 - num_heads: (int): Number of heads. Defaults to 1 - dropout: (float): Dropout probability. Defaults to 0.0 - short_filter_order: (int): Length of the explicit input convolutional filter. Defaults to 3 - activation: (str): type of act between kernel output and output projection (default identity) - """ - super().__init__() - - if submodules is None: - submodules = HyenaOperatorSubmodules( - in_proj=TELayerNormColumnParallelLinear, - short_filter=CausalDepthWiseConv1d, - implicit_filter=HyenaFilter, - out_proj=TERowParallelLinear, - ) - - if order < 2: - raise ValueError(f'Order must be at least 2, (got {self.order})') - - d_model = config.hidden_size - if d_model % num_heads != 0: - raise ValueError(f'Model dimension {d_model} must be divisible by num heads {num_heads}') - head_dim = d_model // num_heads - - auto_assign_attrs( - self, - d_model=d_model, - order=order, - max_seq_length=max_seq_length, - num_heads=num_heads, - head_dim=head_dim, - short_filter_order=short_filter_order, - activation=activation, - mcore_config=config, - ) - self.activation = activation_registry[activation]() - self.dropout = nn.Dropout(dropout) - - # Setup input and output projections (over the width dimension) - self.in_proj = build_module( - submodules.in_proj, - self.d_model, - (self.order + 1) * self.d_model, - config=self.mcore_config, - init_method=self.mcore_config.init_method, - gather_output=False, - bias=True, - skip_bias_add=False, - is_expert=False, - tp_comm_buffer_name='in_proj', - ) - - self.out_proj = build_module( - submodules.out_proj, - self.d_model, - self.d_model, - config=self.mcore_config, - init_method=self.mcore_config.output_layer_init_method, - bias=True, - input_is_parallel=True, - skip_bias_add=True, - is_expert=False, - tp_comm_buffer_name='out_proj', - ) - - # Setup short filter - total_width = self.d_model * (self.order + 1) - self.short_filter = build_module(submodules.short_filter, total_width, self.short_filter_order) - - # Setup long convolution with implicit filter - long_conv_args = [self.head_dim, self.max_seq_length, self.order] - long_conv_kwargs['filter_cls'] = submodules.implicit_filter - long_conv_kwargs['filter_submodules'] = submodules.implicit_filter.submodules - if self.num_heads == 1: - self.long_conv = SingleHeadHyenaConv(*long_conv_args, **long_conv_kwargs) - self.conv_fwd_fn = self.conv_single_head - else: - long_conv_args.append(self.num_heads) - self.long_conv = MultiHeadHyenaConv(*long_conv_args, **long_conv_kwargs) - self.conv_fwd_fn = self.conv_multi_head - - def forward(self, u, *args, **kwargs): 
- l = u.size(0) - l_filter = min(l, self.max_seq_length) - u = self.in_proj(u) - u = u[0] if isinstance(u, tuple) else u - u = rearrange(u, 'l b d -> b d l') # In MCore the leading dimension is the sequence dimension - - k = self.long_conv.filter(l_filter) - # `c` is always 1 by default - k = rearrange(k, 'c l v -> c v l', v=self.head_dim)[0] - - uc = self.short_filter(u)[..., :l_filter] - - k = k.to(dtype=torch.float32) - y = self.conv_fwd_fn(uc, k) - - y = rearrange(y, 'b d l -> b l d') - y = self.activation(y) - y = self.out_proj(y) - if isinstance(y, tuple): - y, bias = y - else: - bias = None - - # Convert back to sequence-first for MCore - y = rearrange(y, 'b l d -> l b d') - - # MCore TransformerLayer expects tuple where 2nd element represents the bias, it can be None - return y, bias - - def conv_single_head(self, uc, k): - k = rearrange(k, '(o v) l -> o v l', v=self.head_dim, o=self.order - 1) - - *x, v = uc.split(self.d_model, dim=1) - for o, x_i in enumerate(reversed(x[1:])): - v = self.dropout(v * x_i) - v = self.long_conv(v, k=k[o], recurrence_idx=o) - - y = v * x[0] - return y - - def conv_multi_head(self, uc, k): - x1, x2, v = uc.split(self.d_model, dim=1) - x1 = x1.contiguous() - x2 = x2.contiguous() - v = v.contiguous() - - y = self.long_conv(v, k, x1, x2) - return y diff --git a/nemo/collections/nlp/modules/common/hyena/hyena_filter.py b/nemo/collections/nlp/modules/common/hyena/hyena_filter.py deleted file mode 100644 index e70b024e067c..000000000000 --- a/nemo/collections/nlp/modules/common/hyena/hyena_filter.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
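For reference, the recurrence that HyenaOperator.conv_single_head implements above reduces to a short piece of plain PyTorch: an FFT-based causal long convolution applied between element-wise gates, with the final gate x_0 applied last. This is an illustrative sketch only; fft_causal_conv and hyena_recurrence are made-up names, and the filter bias, dropout, and the short explicit filter are omitted.

import torch

def fft_causal_conv(v: torch.Tensor, k: torch.Tensor) -> torch.Tensor:
    """Causal (linear) convolution of v [B, D, L] with a filter k [D, L] via FFT."""
    L = v.size(-1)
    n = 2 * L  # zero-pad so the circular convolution equals the linear one
    y = torch.fft.irfft(torch.fft.rfft(v, n=n) * torch.fft.rfft(k, n=n), n=n)
    return y[..., :L]

def hyena_recurrence(gates, v, filters):
    """gates: `order` tensors of shape [B, D, L]; filters: `order - 1` filters [D, L]."""
    # mirrors conv_single_head: gate and convolve with x_{order-1}, ..., x_1, then gate by x_0
    for x_i, k_i in zip(reversed(gates[1:]), filters):
        v = fft_causal_conv(v * x_i, k_i)
    return v * gates[0]

# order-2 toy check: one long convolution sandwiched between two element-wise gates
B, D, L = 2, 4, 16
x0, x1, v = torch.randn(3, B, D, L).unbind(0)
k0 = torch.randn(D, L)
y = hyena_recurrence([x0, x1], v, [k0])
print(y.shape)  # torch.Size([2, 4, 16])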
- -import math -from dataclasses import dataclass -from typing import Union - -import torch -import torch.nn as nn - -from megatron.core.transformer.identity_op import IdentityOp -from megatron.core.transformer.spec_utils import ModuleSpec, build_module - -# Code mostly taken from: -# https://github.com/HazyResearch/safari/blob/flashfftconv/src/models/sequence/hyena.py - - -@dataclass -class HyenaFilterSubmodules: - positional_embedding: Union[ModuleSpec, type] = IdentityOp - linear: Union[ModuleSpec, type] = IdentityOp - activation: Union[ModuleSpec, type] = IdentityOp - modulation: Union[ModuleSpec, type] = IdentityOp - - -def register(module: nn.Module, name: str, tensor: torch.Tensor, learnable: bool): - if learnable: - module.register_parameter(name, nn.Parameter(tensor)) - else: - module.register_buffer(name, tensor) - - -class Sin(nn.Module): - def __init__(self, dim: int, freq: float = 10, train_freq: bool = True): - """ - Sinusoidal activation function with (optionally learned) per-channel frequency - """ - super().__init__() - self.freq = nn.Parameter(freq * torch.ones(1, dim)) if train_freq else freq * torch.ones(1, dim) - - def forward(self, x): - return torch.sin(self.freq * x) - - -class PositionalEmbedding(nn.Module): - def __init__( - self, - emb_dim: int, - seq_len: int, - learn_pos_emb_z: bool = True, - ): - """Complex exponential positional embeddings for Hyena filters.""" - super().__init__() - - self.seq_len = seq_len - # The time embedding fed to the filters is normalized so that t_f = 1 - t = torch.linspace(0, 1, self.seq_len)[None, :, None] # 1, L, 1 - - if emb_dim > 1: - bands = (emb_dim - 1) // 2 - # To compute the right embeddings we use the "proper" linspace - t_rescaled = torch.linspace(0, seq_len - 1, seq_len)[None, :, None] - w = 2 * math.pi * t_rescaled / seq_len # 1, L, 1 - - f = torch.linspace(1e-4, bands - 1, bands)[None, None] - z = torch.exp(-1j * f * w) - z = torch.cat([t, z.real, z.imag], dim=-1) - register(self, "z", z, learnable=learn_pos_emb_z) - register(self, "t", t, learnable=False) - - def forward(self, L): - return self.z[:, :L], self.t[:, :L] - - -class ExponentialModulation(nn.Module): - def __init__( - self, - d_model: int, - modulate: bool = True, - learn_modulation: bool = False, - fast_decay_pct: float = 0.3, - slow_decay_pct: float = 1.5, - target: float = 1e-2, - shift: float = 0.0, - ): - """ - Exponential decay modulation with (optionally learned) per-channel decay rate - """ - super().__init__() - self.modulate = modulate - self.shift = shift - max_decay = math.log(target) / fast_decay_pct - min_decay = math.log(target) / slow_decay_pct - deltas = torch.linspace(min_decay, max_decay, d_model)[None, None] - register(self, "deltas", deltas, learnable=learn_modulation) - - def forward(self, t, x): - if self.modulate: - decay = torch.exp(-t * self.deltas.abs()) - x = x * (decay + self.shift) - return x - - -class HyenaFilter(nn.Module): - def __init__( - self, - d_model: int, - seq_len: int = 1024, - emb_dim: int = 3, - learn_pos_emb_z: bool = True, - mlp_width: int = 64, - sine_freq: int = 1, - num_inner_mlps: int = 2, - normalized: bool = False, - submodules: HyenaFilterSubmodules = None, - **modulation_kwargs, - ): - """ - Implicit long filter with modulation. - - Args: - d_model (int): number of channels in the input - emb_dim (int): dimension of the positional encoding (`emb_dim` - 1) // 2 is the number of bands - mlp_width (int): Width of the MLP parametrizing the implicit filter. 
Defaults to 64 - seq_len (int): length of input sequence - learn_pos_emb_z (bool): whether the positional embeddings are learned - sine_freq (int): frequency of periodic activations - num_inner_mlps (int): number of inner linear layers inside filter MLP - normalized (bool): whether to apply normalization after modulation - """ - super().__init__() - - if submodules is None: - submodules = HyenaFilterSubmodules( - positional_embedding=PositionalEmbedding, - linear=nn.Linear, - activation=Sin, - modulation=ExponentialModulation, - ) - - self.d_model = d_model - self.mlp_width = mlp_width - - act = build_module(submodules.activation, dim=mlp_width, freq=sine_freq) - self.emb_dim = emb_dim - if emb_dim % 2 == 0 or emb_dim < 3: - raise ValueError("emb_dim must be odd and greater or equal to 3 (time, sine and cosine)") - self.seq_len = seq_len - - self.pos_emb = build_module(submodules.positional_embedding, emb_dim, seq_len, learn_pos_emb_z) - - # uses a variable number of inner linear layers - self.mlp = nn.Sequential( - build_module(submodules.linear, emb_dim, mlp_width), - act, - ) - for i in range(num_inner_mlps): - self.mlp.append(build_module(submodules.linear, mlp_width, mlp_width)) - self.mlp.append(act) - # final linear layer - self.mlp.append(build_module(submodules.linear, mlp_width, d_model, bias=False)) - - self.modulation = build_module(submodules.modulation, d_model, **modulation_kwargs) - - self.normalized = normalized - - def forward(self, L): - z, t = self.pos_emb(L) - h = self.mlp(z) - - h = self.modulation(t, h) - - if self.normalized: - h = h / torch.norm(h, dim=-1, p=1, keepdim=True) - - return h diff --git a/nemo/collections/nlp/modules/common/hyena/hyena_spec.py b/nemo/collections/nlp/modules/common/hyena/hyena_spec.py deleted file mode 100644 index c35aebad342e..000000000000 --- a/nemo/collections/nlp/modules/common/hyena/hyena_spec.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.nn as nn - -try: - from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear, TERowParallelLinear - from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec - from megatron.core.transformer.spec_utils import ModuleSpec - - HAVE_MEGATRON_CORE = True - -except (AttributeError, ImportError, ModuleNotFoundError): - from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - - ModuleSpec = ApexGuardDefaults - HAVE_MEGATRON_CORE = False - -from nemo.collections.nlp.modules.common.hyena.hyena import ( - CausalDepthWiseConv1d, - HyenaOperator, - HyenaOperatorSubmodules, -) -from nemo.collections.nlp.modules.common.hyena.hyena_filter import ( - ExponentialModulation, - HyenaFilter, - HyenaFilterSubmodules, - PositionalEmbedding, - Sin, -) - - -def get_hyena_layer_with_transformer_engine_spec(hyena_cfg): - if not HAVE_MEGATRON_CORE: - raise ImportError( - "megatron-core was not found. 
" - "Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." - ) - - return ModuleSpec( - module=HyenaOperator, - params=hyena_cfg, - submodules=HyenaOperatorSubmodules( - in_proj=TELayerNormColumnParallelLinear, - short_filter=CausalDepthWiseConv1d, - implicit_filter=ModuleSpec( - module=HyenaFilter, - submodules=HyenaFilterSubmodules( - positional_embedding=PositionalEmbedding, - linear=nn.Linear, - activation=Sin, - modulation=ExponentialModulation, - ), - ), - out_proj=TERowParallelLinear, - ), - ) - - -def get_gpt_layer_with_te_and_hyena_spec(hyena_cfg): - spec = get_gpt_layer_with_transformer_engine_spec() - spec.submodules.self_attention = get_hyena_layer_with_transformer_engine_spec(hyena_cfg) - return spec From ab01630e29923bb64e5eb8af8455e89f1acd1c18 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 05:20:08 -0700 Subject: [PATCH 12/21] remove transformer submodule Signed-off-by: dimapihtar --- ...mm_autoregressive_eval_image_generation.py | 24 +- ...utoregressive_eval_vision_understanding.py | 24 +- .../transformer/transformer_encoders_nlp.py} | 3 +- .../asr/modules/wav2vec_modules.py | 2 +- .../speech_llm/models/modular_models.py | 2 +- .../common/audio_text_generation_utils.py | 18 +- .../modules/common/text_generation_utils.py | 147 ++ .../speech_llm/modules/perception_modules.py | 2 +- .../modules}/transformer_decoders.py | 3 +- .../modules/common/text_generation_utils.py | 1271 ----------------- .../modules/common/transformer/__init__.py | 21 - .../common/transformer/bridge_encoders.py | 141 -- .../common/transformer/perceiver_encoders.py | 174 --- .../common/transformer/reduction_encoders.py | 148 -- .../common/transformer/text_generation.py | 114 -- .../modules/common/transformer/transformer.py | 287 ---- .../transformer/transformer_bottleneck.py | 338 ----- .../transformer/transformer_generators.py | 951 ------------ .../common/transformer/transformer_modules.py | 296 ---- .../common/transformer/transformer_utils.py | 180 --- .../audio_text_generation_utils.py | 18 +- 21 files changed, 232 insertions(+), 3932 deletions(-) rename nemo/collections/{nlp/modules/common/transformer/transformer_encoders.py => asr/modules/transformer/transformer_encoders_nlp.py} (99%) create mode 100644 nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py rename nemo/collections/{nlp/modules/common/transformer => multimodal/speech_llm/modules}/transformer_decoders.py (99%) delete mode 100644 nemo/collections/nlp/modules/common/text_generation_utils.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/__init__.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/bridge_encoders.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/perceiver_encoders.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/reduction_encoders.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/text_generation.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/transformer.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/transformer_bottleneck.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/transformer_generators.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/transformer_modules.py delete mode 100644 nemo/collections/nlp/modules/common/transformer/transformer_utils.py diff --git a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py 
b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py index ae8dddb29553..d2f7cdc41db5 100644 --- a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py +++ b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py @@ -14,6 +14,7 @@ import datetime import math +import sys import os import re @@ -28,10 +29,14 @@ # pylint: disable=line-too-long from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy from nemo.core.config import hydra_runner +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + """ This is the script to run multimodal autoregresssive text generation. @@ -89,6 +94,23 @@ """ +class LengthParam(TypedDict): + max_length: int # The maximum length of the sequence to be generated. + min_length: int # The minimum length of the sequence to be generated. + + +class SamplingParam(TypedDict): + use_greedy: bool # Whether or not to use sampling ; use greedy decoding otherwise + temperature: float # sampling temperature + top_k: int # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: float # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + repetition_penalty: float # The parameter for repetition penalty. 1.0 means no penalty. + add_BOS: bool # add the bos token at the begining of the prompt + all_probs: bool # whether return the log prob for all the tokens in vocab + compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: List[str] # generation will stop when one of these tokens is generated + + def to_img(tokens_string, image_tokenizer): """Converts visual tokens to images diff --git a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py index 4aea4d9898ae..556637f012f6 100644 --- a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py +++ b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_vision_understanding.py @@ -14,6 +14,7 @@ import datetime +import sys import torch import torchvision @@ -30,10 +31,14 @@ from transformers import AutoModel, AutoTokenizer # pylint: disable=line-too-long -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy from nemo.core.config import hydra_runner +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + """ This is the script to run multimodal autoregresssive text generation. @@ -94,6 +99,23 @@ VQ_HUB = "BAAI/Emu3-VisionTokenizer" +class LengthParam(TypedDict): + max_length: int # The maximum length of the sequence to be generated. + min_length: int # The minimum length of the sequence to be generated. 
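The TypedDicts duplicated into these example scripts mirror the ones previously exposed by the removed nlp transformer submodule. Purely for illustration, a caller would fill them as below; the values are the defaults used elsewhere in this series, and the sketch assumes the classes defined above are in scope.

length_params: LengthParam = {"max_length": 30, "min_length": 0}
sampling_params: SamplingParam = {
    "use_greedy": True,
    "temperature": 1.0,
    "top_k": 0,
    "top_p": 1.0,
    "repetition_penalty": 1.0,
    "add_BOS": True,
    "all_probs": False,
    "compute_logprob": False,
    "end_strings": ["<|endoftext|>"],
}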
+ + +class SamplingParam(TypedDict): + use_greedy: bool # Whether or not to use sampling ; use greedy decoding otherwise + temperature: float # sampling temperature + top_k: int # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: float # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + repetition_penalty: float # The parameter for repetition penalty. 1.0 means no penalty. + add_BOS: bool # add the bos token at the begining of the prompt + all_probs: bool # whether return the log prob for all the tokens in vocab + compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: List[str] # generation will stop when one of these tokens is generated + + def to_imgstr(image_tokens, tokenizer): """Convert integer image tokens to visual tokens string""" image_tokens = image_tokens.cpu().numpy().tolist() diff --git a/nemo/collections/nlp/modules/common/transformer/transformer_encoders.py b/nemo/collections/asr/modules/transformer/transformer_encoders_nlp.py similarity index 99% rename from nemo/collections/nlp/modules/common/transformer/transformer_encoders.py rename to nemo/collections/asr/modules/transformer/transformer_encoders_nlp.py index 6755a86ba40f..fbbf7fe079e7 100644 --- a/nemo/collections/nlp/modules/common/transformer/transformer_encoders.py +++ b/nemo/collections/asr/modules/transformer/transformer_encoders_nlp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -172,3 +172,4 @@ def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, return_m return cached_mems_list else: return cached_mems_list[-1] + diff --git a/nemo/collections/asr/modules/wav2vec_modules.py b/nemo/collections/asr/modules/wav2vec_modules.py index d1f5b090d4e1..4ca474bee8e0 100644 --- a/nemo/collections/asr/modules/wav2vec_modules.py +++ b/nemo/collections/asr/modules/wav2vec_modules.py @@ -28,7 +28,7 @@ from torch.nn import functional as F from nemo.collections.common.parts import form_attention_mask, transformer_weights_init -from nemo.collections.nlp.modules.common.transformer import TransformerEncoder +from nemo.collections.asr.modules.common.transformer.transformer_encoders_nlp import TransformerEncoder from nemo.core.classes.module import NeuralModule from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType diff --git a/nemo/collections/multimodal/speech_llm/models/modular_models.py b/nemo/collections/multimodal/speech_llm/models/modular_models.py index 3aae6990d50a..29c5c98337a6 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_models.py @@ -60,7 +60,7 @@ build_position_ids, get_iterator_k_split, ) -from nemo.collections.nlp.modules.common.text_generation_utils import get_computeprob_response +from nemo.collections.multimodal.speech_llm.modules.common.text_generation_utils import get_computeprob_response from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import ModelPT diff --git a/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py b/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py index 16a29eb35443..1f270247469e 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py @@ -17,18 +17,18 @@ """Utilities for generating text.""" import pickle +import sys from collections.abc import Iterable from typing import List, Optional, Tuple, Union import numpy as np import torch import torch.nn.functional as F -import nemo.collections.nlp.modules.common.text_generation_utils as text_generation_utils +import nemo.collections.multimodal.speech_llm.modules.common.text_generation_utils as text_generation_utils from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer from nemo.collections.multimodal.speech_llm.modules.common.audio_text_generation_strategy import ( model_inference_strategy_dispatcher, ) -from nemo.collections.nlp.modules.common.transformer.text_generation import OutputType from nemo.utils import AppState, logging try: @@ -49,12 +49,26 @@ _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, ) +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + __all__ = [ "get_computeprob_response", "generate", ] +class OutputType(TypedDict): + sentences: List[str] # output sentences + tokens: List[List[str]] # output sentences borken into tokens + logprob: List[List[float]] # log prob of generated tokens + full_logprob: List[List[float]] # log prob of all the tokens in the vocab + token_ids: List[List[int]] # output sentence token ids + offsets: List[List[int]] # list of tokens start positions in text + + def 
get_computeprob_response(tokenizer, response, inputs): return text_generation_utils.get_computeprob_response(tokenizer, response, inputs) diff --git a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py new file mode 100644 index 000000000000..5e204697d902 --- /dev/null +++ b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py @@ -0,0 +1,147 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import numpy as np + +try: + from megatron.core import parallel_state, tensor_parallel + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + + +def get_model_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the model parallel group.""" + world_size = torch.distributed.get_world_size() + all_ranks = np.arange(world_size) + tp_size = parallel_state.get_tensor_model_parallel_world_size() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + dp_rank = parallel_state.get_data_parallel_rank() + if AppState().use_tp_pp_dp_mapping: + # [DP, PP, TP] + all_ranks = all_ranks.reshape(-1, pp_size, tp_size) + return all_ranks[dp_rank, :, :].min() + else: + # [PP, DP, TP] + all_ranks = all_ranks.reshape(pp_size, -1, tp_size) + return all_ranks[:, dp_rank, :].min() + + +def get_computeprob_response(tokenizer, response, inputs): + if parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage(): + # we only have a response on the first and last pipeline stages + compute_prob_response = {} + new_token_ids = [] + new_tokens = [] + new_texts = [] + log_probs = [] + full_logprobs = [] + offsets = [] + for batch_id in range(len(response['tokens'])): + if isinstance(inputs, (list, tuple)): + if isinstance(inputs[0], str): + new_token_id = tokenizer.text_to_ids(inputs[batch_id]) + new_text = inputs[batch_id] + token_len = len(new_token_id) + elif isinstance(inputs[0], torch.Tensor): + token_len = int(inputs[1][batch_id].item()) + new_token_id = inputs[0][batch_id][:token_len].tolist() + new_text = tokenizer.ids_to_text(new_token_id) + else: + raise TypeError( + f"Unsupported type of `inputs[0]`: {type(inputs[0])}. Supported types: `str`, `torch.Tensor`." + ) + else: + raise TypeError( + f"Unsupported type of parameter `inputs`: {type(inputs)}. 
Supported types: `list` and `tuple`" + ) + new_token_ids.append(new_token_id) + new_tokens.append(response['tokens'][batch_id][:token_len]) + new_texts.append(new_text) + log_probs.append(response['logprob'][batch_id][:token_len]) + full_logprobs.append(response['full_logprob'][batch_id][:token_len]) + offsets.append(response['offsets'][batch_id][:-1]) + compute_prob_response['sentences'] = new_texts + compute_prob_response['tokens'] = new_tokens + compute_prob_response['token_ids'] = new_token_ids + compute_prob_response['logprob'] = log_probs + compute_prob_response['full_logprob'] = full_logprobs + compute_prob_response['offsets'] = offsets + return compute_prob_response + else: + # intermediate stages + return None + + +def repetition_penalty(logits, repetition_penalty, used_tokens): + """Implement the repetition penalty, check paper + https://arxiv.org/pdf/1909.05858.pdf + """ + if used_tokens is not None and repetition_penalty != 1.0: + logits_update = torch.gather(logits, 1, used_tokens) + logits = torch.scatter(logits, 1, used_tokens, logits_update / repetition_penalty) + return logits + + +def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started=None): + """ + This function has been mostly taken from huggingface conversational + ai code at + https://medium.com/huggingface/how-to-build-a-state-of-the-art- + conversational-ai-with-transfer-learning-2d818ac26313 + + @param logits: logits tensor + @param top_k: keep only top k tokens with highest probability + @param top_p: keep the top tokens with cumulative probability + @filter_value: value to set filtered tokens to + @started: a tensor of bools indicating whether the text generation starts for the batch + returns the filtered logits + """ + if top_k > 0: + # Remove all tokens with a probability less than the + # last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + if started is not None: + for i in np.arange(indices_to_remove.size(0))[started.cpu().numpy()]: + logits[i, indices_to_remove[i]] = filter_value + else: + logits[indices_to_remove] = filter_value + + if 0.0 < top_p < 1.0: + # Cconvert to 1D + sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token + # above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + if started is not None: + for i in np.arange(sorted_indices.size(0))[started.cpu().numpy()]: + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i, indices_to_remove] = filter_value + else: + for i in range(sorted_indices.size(0)): + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i, indices_to_remove] = filter_value + + return logits + diff --git a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py index 4f631b093510..1f35b4fc6529 100644 --- a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py +++ b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py @@ -23,7 +23,7 @@ from nemo.collections.asr.models import EncDecSpeakerLabelModel from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder, 
ConformerMultiLayerFeatureExtractor from nemo.collections.multimodal.speech_llm.parts.utils.data_utils import align_feat_seq_list -from nemo.collections.nlp.modules.common.transformer.transformer_decoders import TransformerDecoder +from nemo.collections.multimodal.speech_llm.modules.common.transformer.transformer_decoders import TransformerDecoder from nemo.core.classes import Exportable, NeuralModule from nemo.core.classes.common import typecheck from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType diff --git a/nemo/collections/nlp/modules/common/transformer/transformer_decoders.py b/nemo/collections/multimodal/speech_llm/modules/transformer_decoders.py similarity index 99% rename from nemo/collections/nlp/modules/common/transformer/transformer_decoders.py rename to nemo/collections/multimodal/speech_llm/modules/transformer_decoders.py index 5124cd33d44c..aa1ad355d5ba 100644 --- a/nemo/collections/nlp/modules/common/transformer/transformer_decoders.py +++ b/nemo/collections/multimodal/speech_llm/modules/transformer_decoders.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -216,3 +216,4 @@ def input_example(self, max_batch=1, max_dim=256): input_ids = torch.randint(low=0, high=2048, size=(max_batch, max_dim, 1024), device=sample.device) encoder_mask = torch.randint(low=0, high=1, size=(max_batch, max_dim), device=sample.device) return tuple([input_ids, encoder_mask, input_ids, encoder_mask]) + diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py deleted file mode 100644 index 17afb782f8b9..000000000000 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ /dev/null @@ -1,1271 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
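The top-k / top-p (nucleus) filtering performed by top_k_logits in the new speech_llm text_generation_utils module above can be summarized with the simplified sketch below. The name filter_logits is illustrative, the per-sample `started` mask is omitted, and the batch loop mirrors the original code rather than a vectorized variant.

import torch
import torch.nn.functional as F

def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    """logits: [batch, vocab]. Keep the top-k tokens and/or the smallest nucleus of mass top_p."""
    logits = logits.clone()
    if top_k > 0:
        # drop every token scoring below the k-th best one
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value
    if 0.0 < top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        to_remove = cum_probs > top_p
        # shift right so the first token above the threshold (and the best token) is kept
        to_remove[..., 1:] = to_remove[..., :-1].clone()
        to_remove[..., 0] = False
        for i in range(logits.size(0)):
            logits[i, sorted_indices[i][to_remove[i]]] = filter_value
    return logits

# e.g. next-token distribution after temperature scaling and filtering:
# probs = F.softmax(filter_logits(logits / temperature, top_k=40, top_p=0.9), dim=-1)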
-# -# flake8: noqa -# pylint: skip-file - -"""Utilities for generating text.""" - -import os -import pickle -import re -from collections.abc import Iterable -from functools import partial -from typing import Callable, Tuple - -import numpy as np -import torch -import torch.nn.functional as F -from lightning.fabric.utilities.seed import seed_everything - -from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer -from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids -from nemo.collections.nlp.modules.common.text_generation_strategy import model_inference_strategy_dispatcher -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, OutputType, SamplingParam -from nemo.utils import AppState, logging - -try: - from megatron.core import parallel_state, tensor_parallel - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - -try: - from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator - -except (ImportError, ModuleNotFoundError): - logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") - from apex.transformer.pipeline_parallel.utils import ( - _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, - ) - -__all__ = [ - "get_default_sampling_params", - "get_default_length_params", - "megatron_gpt_generate", - "get_computeprob_response", - "generate", - "sample_token_greedy", - "sample_token_topk", -] - - -def get_default_sampling_params(): - # default do greedy sampling - sampling_params: SamplingParam = { - "use_greedy": True, - "temperature": 1.0, - "top_k": 0, - "top_p": 1.0, - "repetition_penalty": 1.0, - "add_BOS": True, - "all_probs": False, - "compute_logprob": False, - "end_strings": ["<|endoftext|>", ""], - } - - return sampling_params - - -def get_default_length_params(): - # default do greedy sampling - length_params: LengthParam = {"min_length": 0, "max_length": 30} - - return length_params - - -def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_params, **strategy_args): - # reproduce the old compute_prob method - # a very special case - if sampling_params['compute_logprob']: - # need to overwrite some configuration, make it immutable - sampling_params = sampling_params.copy() - length_params = length_params.copy() - length_params['max_length'] = 1 - sampling_params['all_probs'] = True - sampling_params["add_BOS"] = False - sampling_params['greedy'] = True - response = generate( - model, - inputs=inputs, - tokens_to_generate=length_params['max_length'], - all_probs=sampling_params['all_probs'], - compute_logprob=sampling_params['compute_logprob'], - temperature=sampling_params['temperature'], - add_BOS=sampling_params['add_BOS'], - top_k=sampling_params['top_k'], - top_p=sampling_params['top_p'], - greedy=sampling_params['use_greedy'], - repetition_penalty=sampling_params['repetition_penalty'], - end_strings=sampling_params['end_strings'], - min_tokens_to_generate=length_params['min_length'], - compute_attention_mask=sampling_params.get("compute_attention_mask", True), - **strategy_args, - ) - compute_prob_response = get_computeprob_response(tokenizer, response, inputs) - return compute_prob_response - - if not isinstance(inputs, (list, tuple)): - raise NotImplementedError(f"unknown type {type(inputs)} is not implemented") - - output = generate( - model, - inputs=inputs, - tokens_to_generate=length_params['max_length'], 
- all_probs=sampling_params['all_probs'], - compute_logprob=sampling_params['compute_logprob'], - temperature=sampling_params['temperature'], - add_BOS=sampling_params['add_BOS'], - top_k=sampling_params['top_k'], - top_p=sampling_params['top_p'], - greedy=sampling_params['use_greedy'], - repetition_penalty=sampling_params['repetition_penalty'], - end_strings=sampling_params['end_strings'], - min_tokens_to_generate=length_params['min_length'], - **strategy_args, - ) - return output - - -def decode_time_tokens(tokenizer, text: str, duration: float, time_tokens: list[str], time_token_ids: list[int]): - """Decode the time tokens .... in the text to the actual time in seconds. - TO DO: to do time decoding on output ids instead of text - - Args: - text (str): _description_ - duration (float): the total length of the video in seconds - time_tokens (list[str]): list of time tokens [, , , ..] - time_token_ids (list[str]): list of time token ids [32004, 32005, ....] - """ - output_ids = tokenizer.text_to_ids(text) - num_time_tokens = len(time_token_ids) - # the original code is len(output_ids) - 1 - indices = [j for j in range(len(output_ids)) if output_ids[j] in time_token_ids] - last_processed = -1 - new_output_ids = [] - for j in range(len(indices)): - pred_seq = [int(output_ids[k]) for k in range(last_processed + 1, indices[j])] - new_output_ids.extend(pred_seq) - max_offset = num_time_tokens - 1 - time_token = tokenizer.ids_to_tokens([output_ids[indices[j]]])[0] - time_idx = time_tokens.index(time_token) - time = float(time_idx) * duration / max_offset - time = min(max(time, 0), duration) - time = round(time, 2) - # time_str = '<' + str(time) + '>' - time_str = '<%s>' % str(time) - new_output_ids.extend(tokenizer.text_to_ids(time_str)) - - last_processed = indices[j] - pred_seq = [int(x) for x in output_ids[last_processed + 1 :]] - new_output_ids.extend(pred_seq) - output_ids = new_output_ids - decoded_text = tokenizer.ids_to_text(output_ids) - return decoded_text - - -def encode_time_str(text: str, duration: float, num_time_tokens: int = 100, time_token_template: str = ""): - """ - Encode the common time expression to its time token expression - """ - - def time_to_string(time): - # time is normalized in [0, 1] - max_offset = float(num_time_tokens - 1) - time = int(np.round(max_offset * time)) - return time_token_template.format(t=time) - - def repl(match): - value = float(match.group(1)) / duration - return time_to_string(value) + f"" - - text = re.sub(r"<([\d.]{1,20})s>", repl, text) - text = re.sub(r"\s([\d.]{1,20})s[\s|\.|,|>]", repl, text) - text = re.sub(r"\s([\d.]{1,20}) seconds", repl, text) - text = re.sub(r"\s([\d.]{1,20}) second", repl, text) - - # This is to remove the timestamps from the text - text = re.sub(r"", "", text) - return text.strip() - - -def get_computeprob_response(tokenizer, response, inputs): - if parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage(): - # we only have a response on the first and last pipeline stages - compute_prob_response = {} - new_token_ids = [] - new_tokens = [] - new_texts = [] - log_probs = [] - full_logprobs = [] - offsets = [] - for batch_id in range(len(response['tokens'])): - if isinstance(inputs, (list, tuple)): - if isinstance(inputs[0], str): - new_token_id = tokenizer.text_to_ids(inputs[batch_id]) - new_text = inputs[batch_id] - token_len = len(new_token_id) - elif isinstance(inputs[0], torch.Tensor): - token_len = int(inputs[1][batch_id].item()) - new_token_id = inputs[0][batch_id][:token_len].tolist() - 
new_text = tokenizer.ids_to_text(new_token_id) - else: - raise TypeError( - f"Unsupported type of `inputs[0]`: {type(inputs[0])}. Supported types: `str`, `torch.Tensor`." - ) - else: - raise TypeError( - f"Unsupported type of parameter `inputs`: {type(inputs)}. Supported types: `list` and `tuple`" - ) - new_token_ids.append(new_token_id) - new_tokens.append(response['tokens'][batch_id][:token_len]) - new_texts.append(new_text) - log_probs.append(response['logprob'][batch_id][:token_len]) - full_logprobs.append(response['full_logprob'][batch_id][:token_len]) - offsets.append(response['offsets'][batch_id][:-1]) - compute_prob_response['sentences'] = new_texts - compute_prob_response['tokens'] = new_tokens - compute_prob_response['token_ids'] = new_token_ids - compute_prob_response['logprob'] = log_probs - compute_prob_response['full_logprob'] = full_logprobs - compute_prob_response['offsets'] = offsets - return compute_prob_response - else: - # intermediate stages - return None - - -def get_batch(model, tokenizer, context_tokens): - """Generate batch from context tokens.""" - # Move to GPU. - tokens = context_tokens.contiguous().cuda() - # Get the attention mask and postition ids. - attention_mask, _, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eos_id, - model.cfg.get('reset_position_ids', False), - model.cfg.get('reset_attention_mask', False), - model.cfg.get('eod_mask_loss', False), - ) - - return tokens, attention_mask, position_ids - - -def tab_logits(logits, min_id, max_id, filter_value=-float('Inf')): - logits[:, :min_id] = filter_value - logits[:, max_id:] = filter_value - return logits - - -def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started=None): - """ - This function has been mostly taken from huggingface conversational - ai code at - https://medium.com/huggingface/how-to-build-a-state-of-the-art- - conversational-ai-with-transfer-learning-2d818ac26313 - - @param logits: logits tensor - @param top_k: keep only top k tokens with highest probability - @param top_p: keep the top tokens with cumulative probability - @filter_value: value to set filtered tokens to - @started: a tensor of bools indicating whether the text generation starts for the batch - returns the filtered logits - """ - if top_k > 0: - # Remove all tokens with a probability less than the - # last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - if started is not None: - for i in np.arange(indices_to_remove.size(0))[started.cpu().numpy()]: - logits[i, indices_to_remove[i]] = filter_value - else: - logits[indices_to_remove] = filter_value - - if 0.0 < top_p < 1.0: - # Cconvert to 1D - sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold - sorted_indices_to_remove = cumulative_probs > top_p - # Shift the indices to the right to keep also the first token - # above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - if started is not None: - for i in np.arange(sorted_indices.size(0))[started.cpu().numpy()]: - indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] - logits[i, indices_to_remove] = filter_value - else: - for i in range(sorted_indices.size(0)): - indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] - logits[i, indices_to_remove] = 
filter_value - - return logits - - -def repetition_penalty(logits, repetition_penalty, used_tokens): - """Implement the repetition penalty, check paper - https://arxiv.org/pdf/1909.05858.pdf - """ - if used_tokens is not None and repetition_penalty != 1.0: - logits_update = torch.gather(logits, 1, used_tokens) - logits = torch.scatter(logits, 1, used_tokens, logits_update / repetition_penalty) - return logits - - -def get_model_parallel_src_rank(): - """Calculate the global rank corresponding to the first local rank - in the model parallel group.""" - world_size = torch.distributed.get_world_size() - all_ranks = np.arange(world_size) - tp_size = parallel_state.get_tensor_model_parallel_world_size() - pp_size = parallel_state.get_pipeline_model_parallel_world_size() - dp_rank = parallel_state.get_data_parallel_rank() - if AppState().use_tp_pp_dp_mapping: - # [DP, PP, TP] - all_ranks = all_ranks.reshape(-1, pp_size, tp_size) - return all_ranks[dp_rank, :, :].min() - else: - # [PP, DP, TP] - all_ranks = all_ranks.reshape(pp_size, -1, tp_size) - return all_ranks[:, dp_rank, :].min() - - -def send_generate_info( - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - compute_logprob, - temperature, - top_k, - top_p, - greedy, - repetition_penalty, - min_tokens_to_generate, - end_strings, - random_seed, -): - """ - Needs to be synced up with receive_generate_info - """ - model_parallel_group = parallel_state.get_model_parallel_group() - src = get_model_parallel_src_rank() - if random_seed is None: - random_seed = -1 # to be able to convert to float - # Send the sizes of the tensors - input_info = [ - context_tokens_tensor.size(0), # batch_size - context_tokens_tensor.size(1), # seq_len - tokens_to_generate, - all_probs, - compute_logprob, # whether to compute log probabilities matrix - temperature, - top_k, - top_p, - greedy, - repetition_penalty, - min_tokens_to_generate, - random_seed, - ] - input_info_tensor = torch.cuda.FloatTensor(input_info) - torch.distributed.broadcast(input_info_tensor, src, model_parallel_group) - - # Send variables to all ranks - torch.distributed.broadcast(context_length_tensor, src, model_parallel_group) - torch.distributed.broadcast(context_tokens_tensor, src, model_parallel_group) - - # send end strings - string_tensor = torch.as_tensor( - np.frombuffer(pickle.dumps(end_strings), dtype=np.int8), device=torch.cuda.current_device() - ) - size = torch.as_tensor([string_tensor.size(0)], device=torch.cuda.current_device(), dtype=torch.int64) - torch.distributed.broadcast(size, src, model_parallel_group) - torch.distributed.broadcast(string_tensor, src, model_parallel_group) - - -def receive_generate_info(): - """ - Needs to be synced up with send_generate_info - """ - model_parallel_group = parallel_state.get_model_parallel_group() - src = get_model_parallel_src_rank() - input_info_tensor = torch.empty(12, dtype=torch.float32, device=torch.cuda.current_device()) - torch.distributed.broadcast(input_info_tensor, src, model_parallel_group) - batch_size = int(input_info_tensor[0].item()) - seq_len = int(input_info_tensor[1].item()) - tokens_to_generate = int(input_info_tensor[2].item()) - all_probs = bool(input_info_tensor[3].item()) - compute_logprob = bool(input_info_tensor[4].item()) # whether to compute log probabilities matrix - temperature = float(input_info_tensor[5].item()) - top_k = int(input_info_tensor[6].item()) - top_p = float(input_info_tensor[7].item()) - greedy = bool(input_info_tensor[8].item()) - repetition_penalty = 
float(input_info_tensor[9].item()) - min_tokens_to_generate = int(input_info_tensor[10].item()) - random_seed = int(input_info_tensor[11].item()) - if random_seed == -1: # was converted to -1 before broadcast - random_seed = None - - context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.cuda.current_device()) - context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.cuda.current_device()) - # Send variables to all ranks - torch.distributed.broadcast(context_length_tensor, src, model_parallel_group) - torch.distributed.broadcast(context_tokens_tensor, src, model_parallel_group) - - array_size = torch.empty(1, dtype=torch.int64, device=torch.cuda.current_device()) - torch.distributed.broadcast(array_size, src, model_parallel_group) - - string_tensor = torch.empty(array_size[0], dtype=torch.int8, device=torch.cuda.current_device()) - torch.distributed.broadcast(string_tensor, src, model_parallel_group) - bytes = string_tensor.cpu().numpy().tobytes() - end_strings = pickle.loads(bytes) - - return ( - context_length_tensor, - context_tokens_tensor, - tokens_to_generate, - all_probs, - compute_logprob, - temperature, - top_k, - top_p, - greedy, - repetition_penalty, - min_tokens_to_generate, - end_strings, - random_seed, - ) - - -def synced_generate( - model, - inference_strategy, - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - temperature, - top_k=0, - top_p=0.0, - greedy=False, - compute_attention_mask=True, - compute_logprob=False, - repetition_penalty=1.2, - end_strings=[], - min_tokens_to_generate=0, - image_list=None, - **strategy_args, -): - context_length = context_length_tensor.min().item() - tokenizer = model.tokenizer - if isinstance(tokenizer, TabularTokenizer): - batch_token_iterator = tab_sample_sequence_batch( - model, - inference_strategy, - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - compute_attention_mask=compute_attention_mask, - temperature=temperature, - ) - else: - - extra = { - "top_p": top_p, - "top_k": top_k, - "greedy": greedy, - "repetition_penalty": repetition_penalty, - "min_tokens_to_generate": min_tokens_to_generate, - } - - # if input containing neighbors (for Mcore retrieval RETRO model) - if "neighbors_tokens" in strategy_args: - extra['neighbors_tokens'] = strategy_args['neighbors_tokens'] - - batch_token_iterator = sample_sequence_batch( - model, - inference_strategy, - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - compute_attention_mask=compute_attention_mask, - compute_logprob=compute_logprob, - temperature=temperature, - end_strings=end_strings, - image_list=image_list, - extra=extra, - ) - - for tokens, lengths, output_logits, full_logits in batch_token_iterator: - context_length += 1 - - if parallel_state.is_pipeline_last_stage(): - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - if compute_logprob: - torch.distributed.broadcast(output_logits, src, group) - if all_probs: - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - torch.distributed.broadcast(full_logits, src, group) - - else: - if parallel_state.is_pipeline_first_stage(): - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - - if compute_logprob: - precision = model._trainer.precision - dtype = torch.float32 - - output_logits = torch.empty( - 
tokens.size(0), context_length - 1, dtype=dtype, device=torch.device("cuda") - ) - torch.distributed.broadcast(output_logits, src, group) - - if all_probs: - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - full_logits = torch.empty( - tokens.size(0), - context_length - 1, - model.padded_vocab_size, - dtype=dtype, - device=torch.device("cuda"), - ) - torch.distributed.broadcast(full_logits, src, group) - if tokens is not None: - return tokens[:, :context_length], output_logits, full_logits - - -def generate( - model, - inputs=None, - tokens_to_generate=0, - all_probs=False, - temperature=1.0, - add_BOS=False, - top_k=0, - top_p=0.0, - greedy=False, - compute_attention_mask=True, - compute_logprob=False, - repetition_penalty=1.0, - end_strings=['<|endoftext|>'], - image_list=None, - min_tokens_to_generate=0, - random_seed=None, - **strategy_args, -) -> OutputType: - """ - Args: - model (NLPModel): text generative model - inputs (Union[tuple, List[str]]): if it is a tuple, it is assumed to be (context_tokens_tensor, context_length_tensor). Otherwise it it a list of prompt text strings - tokens_to_generate (int): The maximum length of the tokens to be generated. - all_probs (bool): Return the log prob for all the tokens - temperature (float): sampling temperature - add_BOS (bool): add the bos token at the begining of the prompt - top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p (float): If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - greedy (bool): Whether or not to use sampling ; use greedy decoding otherwise - repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty - min_tokens_to_generate (int): The minimum length of the tokens to be generated - random_seed (int): can set to fix random seed for reproducibility. If None, we do not set random seed, so - the behavior of generation will depend on whether the seed was set earlier or not. - strategy_args, the extra arguments are treated as inference strategy arguments - end_strings, a list of strings to stop generation when they are encountered in the output. - - Returns: - OutputType: It generates the output in a dictionary type. 
It has the following keys: - - sentences: List[str], output sentences - tokens: List[List[str]], output sentences borken into tokens - logprob: List[Tensor], log prob of generated tokens - full_logprob: List[Tensor], log prob of all the tokens in the vocab - token_ids: List[Tensor], output sentence token ids - offsets: List[List[int]] # list of tokens start positions in text - """ - if 'strategy' in strategy_args: - inference_strategy = strategy_args['strategy'] - else: - inference_strategy = model_inference_strategy_dispatcher(model, **strategy_args) - tokenizer = model.tokenizer - if torch.distributed.get_rank() == get_model_parallel_src_rank(): - if isinstance(inputs, tuple): - context_tokens_tensor, context_length_tensor = inputs - else: - context_tokens_tensor, context_length_tensor = inference_strategy.tokenize_batch( - inputs, tokens_to_generate, add_BOS - ) - - send_generate_info( - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - compute_logprob, - temperature, - top_k, - top_p, - greedy, - repetition_penalty, - min_tokens_to_generate, - end_strings, - random_seed, - ) - - # tokenize neighbors and broadcast (for Mcore retrieval RETRO model) - if 'neighbors' in strategy_args: - # tokenize neighbors - neighbors_tokens_tensor, neighbors_tokens_tensor_shape = inference_strategy.tokenize_neighbors_batch( - strategy_args['neighbors'], strategy_args['retro_inference'] - ) - - # send neighbors tensors to all ranks - model_parallel_group = parallel_state.get_model_parallel_group() - src = get_model_parallel_src_rank() - torch.distributed.broadcast(neighbors_tokens_tensor_shape, src, model_parallel_group) - torch.distributed.broadcast(neighbors_tokens_tensor, src, model_parallel_group) - else: - neighbors_tokens_tensor = None - - else: - ( - context_length_tensor, - context_tokens_tensor, - tokens_to_generate, - all_probs, - compute_logprob, - temperature, - top_k, - top_p, - greedy, - repetition_penalty, - min_tokens_to_generate, - end_strings, - random_seed, - ) = receive_generate_info() - - # receive broadcast (for Mcore retrieval RETRO model) - if 'neighbors' in strategy_args: - # receive neighbors tensors to all ranks - model_parallel_group = parallel_state.get_model_parallel_group() - src = get_model_parallel_src_rank() - neighbors_tokens_tensor_shape = torch.empty(2, dtype=torch.float32, device=torch.cuda.current_device()) - torch.distributed.broadcast(neighbors_tokens_tensor_shape, src, model_parallel_group) - neighbors_tokens_tensor = torch.empty( - neighbors_tokens_tensor_shape[0], - neighbors_tokens_tensor_shape[1], - dtype=torch.int64, - device=torch.cuda.current_device(), - ) - torch.distributed.broadcast(neighbors_tokens_tensor, src, model_parallel_group) - else: - neighbors_tokens_tensor = None - - # add neighbors to strategy_args (for retrieval RETRO model) - if 'neighbors' in strategy_args: - strategy_args['neighbors_tokens'] = neighbors_tokens_tensor - - if random_seed is not None: - seed_everything(random_seed) - - if hasattr(model, 'get_attention_mask_from_fusion') and model.get_attention_mask_from_fusion: - compute_attention_mask = False - - output = synced_generate( - model, - inference_strategy, - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - temperature, - compute_attention_mask=compute_attention_mask, - compute_logprob=compute_logprob, - top_k=top_k, - top_p=top_p, - greedy=greedy, - repetition_penalty=repetition_penalty, - end_strings=end_strings, - 
min_tokens_to_generate=min_tokens_to_generate, - image_list=image_list, - **strategy_args, - ) - special_tokens = set() - if hasattr(tokenizer, 'pad_token') and tokenizer.pad_token is not None: - special_tokens.add(tokenizer.pad_token) - if hasattr(tokenizer, 'eos_token') and tokenizer.eos_token is not None: - special_tokens.add(tokenizer.eos_token) - if hasattr(tokenizer, 'bos_token') and tokenizer.bos_token is not None: - special_tokens.add(tokenizer.bos_token) - if hasattr(tokenizer, 'cls_token') and tokenizer.cls_token is not None: - special_tokens.add(tokenizer.cls_token) - if hasattr(tokenizer, 'unk_token') and tokenizer.unk_token is not None: - special_tokens.add(tokenizer.unk_token) - if hasattr(tokenizer, 'sep_token') and tokenizer.sep_token is not None: - special_tokens.add(tokenizer.sep_token) - if hasattr(tokenizer, 'mask_token') and tokenizer.mask_token is not None: - special_tokens.add(tokenizer.mask_token) - if output is not None: - decode_tokens, output_logits, full_logits = output - resp_sentences = [] - resp_sentences_seg = [] - - decode_tokens = decode_tokens.cpu().numpy().tolist() - for decode_token in decode_tokens: - sentence = tokenizer.ids_to_text(decode_token) - resp_sentences.append(sentence) - if not isinstance(tokenizer, TabularTokenizer): - words = [] - for token in decode_token: - if not isinstance(token, Iterable): - token = [token] - word = tokenizer.ids_to_tokens(token) - if isinstance(word, Iterable): - word = word[0] - if hasattr(tokenizer.tokenizer, 'byte_decoder'): - word = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( - 'utf-8', errors='replace' - ) - words.append(word) - resp_sentences_seg.append(words) - else: - words = tokenizer.text_to_tokens(sentence) - resp_sentences_seg.append(words) - - # offsets calculation - all_offsets = [] - for item in resp_sentences_seg: - offsets = [0] - for index, token in enumerate(item): - if index != len(item) - 1: - if token in special_tokens: - offsets.append(offsets[-1]) - else: - offsets.append(len(token) + offsets[-1]) - all_offsets.append(offsets) - - output = {} - output['sentences'] = resp_sentences - output['tokens'] = resp_sentences_seg - output['logprob'] = output_logits - output['full_logprob'] = full_logits - output['token_ids'] = decode_tokens - output['offsets'] = all_offsets - output = inference_strategy.post_generation_process(output) - return output - - -def switch(val1, val2, boolean): - boolean = boolean.type_as(val1) - return (1 - boolean) * val1 + boolean * val2 - - -def sample_sequence_batch( - model, - inference_strategy, - context_tokens, - context_lengths, - tokens_to_generate, - all_probs=False, - compute_attention_mask=True, - compute_logprob=False, - type_ids=None, - temperature=None, - end_strings=['<|endoftext|>'], - image_list=None, - extra={}, -): - # Importing here to avoid circular import errors - - app_state = AppState() - micro_batch_size = context_tokens.shape[0] - reconfigure_num_microbatches_calculator( - rank=app_state.global_rank, - rampup_batch_size=None, - global_batch_size=micro_batch_size, - micro_batch_size=micro_batch_size, - data_parallel_size=1, - ) - assert ( - model.cfg.get('activations_checkpoint_granularity', None) is None - ), 'activations_checkpoint_granularity should be None during inference. 
Disable it in the model config if restoring from nemo or in hparams.yaml if restoring from PTL checkpoint' - assert ( - model.cfg.get('activations_checkpoint_method', None) is None - ), 'activations_checkpoint_method should be None during inference. Disable it in the model config if restoring from nemo or in hparams.yaml if restoring from PTL checkpoint' - - tokenizer = model.tokenizer - # initialize the batch - with torch.no_grad(): - context_length = context_lengths.min().item() - if 'neighbors_tokens' in extra: # for Mcore retrieval RETRO model - - # For Mcore retrieval RETRO model, context_tokens tensors are updated after init_batch() (the length is doubled after processing) - context_tokens = inference_strategy.init_batch( - context_tokens, context_length, compute_attention_mask, **extra - ) - - else: - inference_strategy.init_batch(context_tokens, context_length, compute_attention_mask) - # added eos_id to support the function generate_samples_eval that passes - # eos_id as an argument and needs termination when that id id found. - eod_id = tokenizer.eos_id - counter = 0 - - batch_size = context_tokens.size(0) - is_done = torch.zeros([batch_size]).byte().cuda() - tokens = context_tokens - output_logits = None - all_generated_indices = None # used to track all generated indices - # Generate enough tokens for the longest sequence - maxlen = tokens_to_generate + context_lengths.max().item() - - maxlen = inference_strategy.clip_max_len(maxlen) - - lengths = torch.ones([batch_size]).long().cuda() * maxlen - - while context_length < maxlen: - if image_list is not None: - batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, micro_batch_size, counter, context_length, compute_attention_mask, image_list - ) - else: - batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, micro_batch_size, counter, context_length, compute_attention_mask - ) - output = inference_strategy.forward_step(batch, tensor_shape) - - if parallel_state.is_pipeline_last_stage(): - - if compute_logprob: - output = output[0]['logits'] - output = tensor_parallel.gather_from_tensor_model_parallel_region(output) - assert output is not None - logits = output[:, -1].view(batch_size, -1).contiguous() - - else: - if 'neighbors_tokens' in extra: # for Mcore retrieval RETRO model - # for Mcore RETRO inference, disimilar to GPT, we will get the logits of the (context_length - 1)th token, instead of the last token - logits = output[0]['logits'][:, context_length - 1].contiguous() - else: - logits = output[0]['logits'][:, -1].contiguous() - logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) - assert logits is not None - logits = logits.view(batch_size, -1) - - # make sure it will generate at least min_length - min_length = extra.get('min_tokens_to_generate', 0) - if min_length > 0: - within_min_length = (context_length - context_lengths) < min_length - logits[within_min_length, eod_id] = -float('Inf') - - # make sure it won't sample outside the vocab_size range - logits[:, tokenizer.vocab_size :] = -float('Inf') - - # started indicates whether the current token step passes the context_length, so we make sure not to overwrite the context tokens - - started = context_lengths <= context_length - if extra.get('greedy', False): - prev = torch.argmax(logits, dim=-1).view(-1) - else: - logits = logits.float() - logits /= temperature - # handle repetition penality - logits = repetition_penalty(logits, extra.get('repetition_penalty', 1.2), all_generated_indices) - 
logits = top_k_logits( - logits, top_k=extra.get('top_k', 0), top_p=extra.get('top_p', 0.9), started=started - ) - probs = F.softmax(logits, dim=-1) - prev = torch.multinomial(probs, num_samples=1).view(-1) - - # Clamp the predicted out of vocabulary tokens - prev = torch.clamp(prev, max=tokenizer.vocab_size - 1) - new_tokens = switch(tokens[:, context_length].view(-1), prev, started) - - # Replace sampled tokens w/ done token if EOD has already been sampled - new_tokens = switch(new_tokens, eod_id, is_done) - - # post process the inference tokens based on the strategy - inference_strategy.post_process(tokens, new_tokens, context_length) - - # Insert either new predicted or next prompt token - tokens[:, context_length] = new_tokens - - if compute_logprob: - if output_logits is None: - output = F.log_softmax(output[:, :context_length, :], 2) - - indices = torch.unsqueeze(tokens[:, 1 : context_length + 1], 2) - output_logits = torch.gather(output, 2, indices).squeeze(2) - all_generated_indices = indices[:, :, 0] - if all_probs: - full_logits = output - else: - output = F.log_softmax(output, 2) - indices = torch.unsqueeze(new_tokens, 1).unsqueeze(2) - new_output_logits = torch.gather(output, 2, indices).squeeze(2) - - # TODO(rprenger) we're copying output_logits every time. Should pre-allocate - output_logits = torch.cat([output_logits, new_output_logits], 1) - all_generated_indices = torch.cat([all_generated_indices, indices[:, :, 0]], 1) - if all_probs: - full_logits = torch.cat([full_logits, output], 1) - - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - torch.distributed.broadcast(new_tokens, src, group) - - # done_token = (prev == eod_id).byte() & started.byte() - done_token = inference_strategy.end_of_generation_condition( - tokens[:, : context_length + 1], prev, eod_id, end_strings - ) - done_token = done_token.byte() & started.byte() - - just_finished = (done_token & ~is_done).bool() - lengths[just_finished.view(-1)] = context_length - is_done = is_done | done_token - - done = torch.all(is_done) - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_pipeline_model_parallel_group() - torch.distributed.broadcast(done, src, group) - if compute_logprob: - if all_probs: - yield tokens, lengths, output_logits, full_logits - else: - yield tokens, lengths, output_logits, None - else: - yield tokens, lengths, None, None - - else: - if parallel_state.is_pipeline_first_stage(): - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - new_tokens = torch.empty_like(tokens[:, context_length]) - torch.distributed.broadcast(new_tokens, src, group) - tokens[:, context_length] = new_tokens - yield tokens, None, None, None - else: - yield None, None, None, None - - done = torch.cuda.ByteTensor([0]) - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_pipeline_model_parallel_group() - torch.distributed.broadcast(done, src, group) - - context_length += 1 - counter += 1 - if done: - break - - -def tab_sample_sequence_batch( - model, - inference_strategy, - context_tokens, - context_lengths, - tokens_to_generate, - all_probs=True, - compute_attention_mask=True, - type_ids=None, - temperature=None, -): - app_state = AppState() - micro_batch_size = context_tokens.shape[0] - reconfigure_num_microbatches_calculator( - rank=app_state.global_rank, - rampup_batch_size=None, - global_batch_size=micro_batch_size, - 
micro_batch_size=micro_batch_size, - data_parallel_size=1, - ) - tokenizer = model.tokenizer - sizes = tokenizer.code_column.sizes - tokens_per_row = sum(sizes) + 1 - columns = tokenizer.code_column.columns - num_columns = len(columns) - tokenid_range = [] - for i in range(num_columns): - tokenid_range.extend(tokenizer.code_column.get_range(i)) - # initialize the batch - with torch.no_grad(): - context_length = context_lengths.min().item() - inference_strategy.init_batch(context_tokens, context_length, compute_attention_mask) - context = context_tokens[:, :context_length] - # the context may start in the middle of the row, - # calculate the offset according to the position of '\n' or '<|endoftext|>' - positions = torch.where(context == tokenizer.eor)[1] - if len(positions) == 0: - positions = torch.where(context == tokenizer.eod)[1] - if len(positions) != 0: - max_position = positions.max().item() - # TODO, need to make sure context of different batch have the same offset lengths") - # otherwise, need to calculate offset per batch_id - offset = (context_length - max_position - 1) % tokens_per_row - else: - offset = 0 - - eod_id = tokenizer.eos_id - - counter = 0 - - batch_size = context_tokens.size(0) - is_done = torch.zeros([batch_size]).byte().cuda() - tokens = context_tokens - output_logits = None - - # Generate enough tokens for the longest sequence - maxlen = tokens_to_generate + context_lengths.max().item() - - if maxlen > model.cfg.encoder_seq_length: - maxlen = model.cfg.encoder_seq_length - - lengths = torch.ones([batch_size]).long().cuda() * maxlen - - while context_length < maxlen: - batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, micro_batch_size, counter, context_length, compute_attention_mask - ) - output = inference_strategy.forward_step(batch, tensor_shape) - - if parallel_state.is_pipeline_last_stage(): - output = output[0]['logits'].float() - output = tensor_parallel.gather_from_tensor_model_parallel_region(output) - assert output is not None - output = output.float() - logits = output[:, -1].view(batch_size, -1).contiguous() - token_in_row = (counter + offset) % tokens_per_row - logits = logits.float() - logits /= temperature - if token_in_row == tokens_per_row - 1: - # line break - eor_id = tokenizer.eor - eod_id = tokenizer.eos_id - min_id = min(eor_id, eod_id) - max_id = max(eor_id, eod_id) + 1 - logits = tab_logits(logits, min_id, max_id) - else: - # limit the range - min_id, max_id = tokenid_range[token_in_row] - logits = tab_logits(logits, min_id, max_id) - probs = F.softmax(logits, dim=-1) - prev = torch.multinomial(probs, num_samples=1).view(-1) - started = context_lengths <= context_length - # Clamp the out of vocabulary tokens. 
- prev = torch.clamp(prev, max=tokenizer.vocab_size - 1) - - new_tokens = switch(tokens[:, context_length].view(-1), prev, started) - - # post process the inference tokens based on the strategy - inference_strategy.post_process(tokens, new_tokens, context_length) - - tokens[:, context_length] = new_tokens - - if output_logits is None: - output_context = F.log_softmax(output[:, :context_length, :], 2) - indices = torch.unsqueeze(tokens[:, 1 : context_length + 1], 2) - output_logits = torch.gather(output_context, 2, indices).squeeze(2) - if all_probs: - full_logits = output_context - else: - output_context = F.log_softmax(output, 2) - indices = torch.unsqueeze(new_tokens, 1).unsqueeze(2) - new_output_logits = torch.gather(output_context, 2, indices).squeeze(2) - - # TODO(rprenger) we're copying output_logits every time. Should pre-allocate - output_logits = torch.cat([output_logits, new_output_logits], 1) - if all_probs: - full_logits = torch.cat([full_logits, output_context], 1) - - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - torch.distributed.broadcast(new_tokens, src, group) - - done_token = (prev == eod_id).byte() & started.byte() - just_finished = (done_token & ~is_done).bool() - lengths[just_finished.view(-1)] = context_length - is_done = is_done | done_token - - done = torch.all(is_done) - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_pipeline_model_parallel_group() - torch.distributed.broadcast(done, src, group) - if all_probs: - yield tokens, lengths, output_logits, full_logits - else: - yield tokens, lengths, output_logits, None - - else: - if parallel_state.is_pipeline_first_stage(): - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_embedding_group() - new_tokens = torch.empty_like(tokens[:, context_length]) - torch.distributed.broadcast(new_tokens, src, group) - tokens[:, context_length] = new_tokens - yield tokens, None, None, None - else: - yield None, None, None, None - - done = torch.cuda.ByteTensor([0]) - src = parallel_state.get_pipeline_model_parallel_last_rank() - group = parallel_state.get_pipeline_model_parallel_group() - torch.distributed.broadcast(done, src, group) - - context_length += 1 - counter += 1 - if done: - break - - -def sample_token_greedy(logits): - """ - Greedy sampling. Returns the token with the highest probability, and corresponding log_prob. - - Args: - logits: [batch_size, vocab_size] - unnormalized log probabilities of the next token - - Returns: - log_probs: [batch_size] - log probabilities of the sampled tokens - token_ids: [batch_size] - sampled token ids - """ - log_probs, token_ids = torch.max(torch.nn.functional.log_softmax(logits, dim=-1), dim=-1) - - return log_probs, token_ids - - -def sample_token_topk(logits, top_k=0, top_p=0.0, temperature=1.0, filter_value=-float('Inf')): - """ - Greedy sampling. Returns the token with the highest probability, and corresponding log_prob. 
- - Args: - logits: [batch_size, vocab_size] - unnormalized log probabilities of the next token - top_k: int - if > 0: only sample from top k tokens with highest probability - top_p: float - if > 0.0: only sample from a subset of candidates, where the cumulative probability - temperature: float - temperature for sampling - filter_value: float - value to set filtered tokens to - - Returns: - log_probs: [batch_size] - log probabilities of the sampled tokens - token_ids: [batch_size] - sampled token ids - """ - logits = logits.float() - logits /= temperature - logits = top_k_logits(logits, top_k=top_k, top_p=top_p, filter_value=filter_value) - log_probs = torch.nn.functional.log_softmax(logits, dim=-1) - - token_ids = torch.multinomial(log_probs.exp(), num_samples=1).view(-1) - log_probs = log_probs.gather(1, token_ids.unsqueeze(1)).squeeze(1) - - return log_probs, token_ids - - -def sample_token_topk_beam_search(logits: torch.Tensor, beam_size: int = 1, dim: int = -1, log_softmax: bool = True): - """ - Beam search selection of top K predictions per target (dim). Returns the beam_size tokens ids with the highest - probability and the corresponding log_prob per target - - Args: - logits: [batch_size, vocab_size] or [batch_size, vocab_size] - unnormalized log probabilities of the next token, - beam_size: int > 1 - number of tokens to return with the highest probability per target - dim: int - dim of log_softmax and topk selection - log_softmax: bool - if to calculate log softmax for log probabilities - - - Returns: - log_probs: [batch_size, beam_size] - log probabilities of the sampled tokens - token_ids: [batch_size, beam_size] - sampled token ids - """ - if log_softmax: - log_probs = torch.nn.functional.log_softmax(logits, dim=dim) - else: - log_probs = logits - # get top candidates for each item in batch - log_probs, token_ids = torch.topk(log_probs, beam_size, dim=dim) - - return log_probs, token_ids - - -def compute_beam_search_len_penalty(lengths: torch.Tensor, alpha: int) -> torch.Tensor: - """ - Length penalty used in the beam search - Args: - lengths: lengths of decoded sequences - alpha: params of the penalty - Returns: - tensor with the penalty value - """ - return ((5 + lengths) / 6).pow(alpha) - - -def get_sampling_token_fn(sampling_method: str, sampling_kwargs: dict) -> Tuple[Callable, dict]: - """ - Specifies the sampling function that takes in a tensor of logits [batch_size, vocab_size] and returns a tuple - (tensor of log_probs [batch_size], tensor of sampled from logits [batch_size]). - If the beam search is enabled, the sampling function returns tensors [batch_size, beam_size] - - Args: - sampling_method: the sampling method to use in the decode steps. Currently supported methods are - "beam-search"/"greedy"/"topkp" - sampling_kwargs: dict with arguments to be passed to the sampling function. 
- For sampling method 'beam-search', the following kwargs are supported: - beam_size - int, number of the best sequences at each decode iteration to be left per target - beam_alpha - int, the parameter of length penalty applied to predicted sequences - keep_only_best_tokens - used in the beam search, boolean flag if to output only best sequence - of predicted tokens (True) or beam_size predictions per target - return_scores - used in the beam search, boolean flag if to return scores at the top of - predictions and logits - - Returns: - sample_token_fn: the sampling function - default_sampling_kwargs: sampling_kwargs augmented with default sampling kwargs - """ - all_default_sampling_kwargs = { - 'greedy-search': {}, - 'topkp-sampling': {'top_k': 0, 'top_p': 0.0, 'temperature': 1.0}, - 'beam-search': {'beam_size': 1, 'beam_alpha': 0.0, 'keep_only_best_tokens': False, 'return_scores': False}, - } - - # update default sampling kwargs with user provided kwargs - default_sampling_kwargs = all_default_sampling_kwargs[sampling_method].copy() - default_sampling_kwargs.update(sampling_kwargs) - # sampling_kwargs = default_sampling_kwargs - - if sampling_method == 'greedy-search': - sampling_token_fn = sample_token_greedy - - elif sampling_method == "topkp-sampling": - top_k = default_sampling_kwargs['top_k'] - top_p = default_sampling_kwargs['top_p'] - temperature = default_sampling_kwargs['temperature'] - sampling_token_fn = partial(sample_token_topk, top_k=top_k, top_p=top_p, temperature=temperature) - - elif sampling_method == "beam-search": - beam_size = default_sampling_kwargs['beam_size'] - sampling_token_fn = partial(sample_token_topk_beam_search, beam_size=beam_size) - - else: - raise ValueError( - f'Invalid sampling method {sampling_method}. ' - f'Supported sampling methods are {all_default_sampling_kwargs.keys()}' - ) - - return sampling_token_fn, default_sampling_kwargs diff --git a/nemo/collections/nlp/modules/common/transformer/__init__.py b/nemo/collections/nlp/modules/common/transformer/__init__.py deleted file mode 100644 index 5b6ea68614c2..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
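Note for reviewers (not part of the patch): the sampling helpers removed above (sample_token_greedy, sample_token_topk, get_sampling_token_fn) all reduce to filtering the logits and sampling from what remains. The snippet below is a minimal, self-contained sketch of the top-k / top-p (nucleus) filtering step; the names filter_logits and sample_next_token are invented for illustration and this is not the NeMo top_k_logits implementation itself.

import torch
import torch.nn.functional as F


def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Mask logits outside the top-k set and/or outside the top-p probability mass."""
    if top_k > 0:
        # keep only the k largest logits in each row
        kth_largest = torch.topk(logits, top_k, dim=-1).values[..., -1, None]
        logits = logits.masked_fill(logits < kth_largest, filter_value)
    if top_p > 0.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        drop = cumulative > top_p
        # shift the mask right so the first token crossing the threshold is kept
        drop[..., 1:] = drop[..., :-1].clone()
        drop[..., 0] = False
        logits = logits.masked_fill(drop.scatter(-1, sorted_idx, drop), filter_value)
    return logits


def sample_next_token(logits, temperature=1.0, top_k=0, top_p=0.9):
    probs = F.softmax(filter_logits(logits / temperature, top_k, top_p), dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)


if __name__ == "__main__":
    fake_logits = torch.randn(2, 32)                 # [batch_size, vocab_size]
    print(sample_next_token(fake_logits, top_k=5))   # two sampled token ids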
- -from nemo.collections.nlp.modules.common.transformer.bridge_encoders import * -from nemo.collections.nlp.modules.common.transformer.perceiver_encoders import * -from nemo.collections.nlp.modules.common.transformer.transformer_bottleneck import * -from nemo.collections.nlp.modules.common.transformer.transformer_decoders import * -from nemo.collections.nlp.modules.common.transformer.transformer_encoders import * -from nemo.collections.nlp.modules.common.transformer.transformer_generators import * -from nemo.collections.nlp.modules.common.transformer.transformer_modules import * diff --git a/nemo/collections/nlp/modules/common/transformer/bridge_encoders.py b/nemo/collections/nlp/modules/common/transformer/bridge_encoders.py deleted file mode 100644 index 2ee4abcbd16f..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/bridge_encoders.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.collections.nlp.modules.common.transformer.transformer_encoders import TransformerEncoder -from nemo.collections.nlp.modules.common.transformer.transformer_modules import AttentionBridge - -__all__ = ["BridgeEncoder"] - - -class BridgeEncoder(torch.nn.Module): - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - mask_future: bool = False, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - hidden_steps: int = 32, - hidden_init_method: str = "default", - hidden_blocks: int = 0, - ): - super().__init__() - - self._hidden_steps = hidden_steps - self._hidden_init_method = hidden_init_method - self._hidden_blocks = hidden_blocks - - if self._hidden_init_method == "default": - self._hidden_init_method = "enc_shared" - - if self.hidden_init_method not in self.supported_init_methods: - raise ValueError( - "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format( - hidden_init_method=self.hidden_init_method, supported_init_methods=self.supported_init_methods, - ) - ) - - # attention bridge - self.att_bridge = AttentionBridge(hidden_size=hidden_size, k=hidden_steps, bridge_size=inner_size,) - - if self.hidden_init_method == "enc": - self.init_hidden_enc = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - # self attention - self.hidden_enc = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - 
attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - @property - def supported_init_methods(self): - return ["enc_shared", "identity", "enc"] - - @property - def hidden_steps(self): - return self._hidden_steps - - @property - def hidden_blocks(self): - return self._hidden_blocks - - @property - def hidden_init_method(self): - return self._hidden_init_method - - def forward(self, encoder_states, encoder_mask): - """ - Args: - encoder_states: output of the encoder (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - """ - # self-attention over input - if self.hidden_init_method == "enc_shared": - residual = encoder_states - hidden_states = self.hidden_enc(encoder_states=encoder_states, encoder_mask=encoder_mask) - # residual connection - hidden_states += residual - elif self.hidden_init_method == "identity": - hidden_states = encoder_states - elif self.hidden_init_method == "enc": - residual = encoder_states - hidden_states = self.init_hidden_enc(encoder_states=encoder_states, encoder_mask=encoder_mask) - # residual connection - hidden_states += residual - - # project encoder states to a fixed steps hidden using k attention heads - hidden_states = self.att_bridge(hidden=hidden_states, hidden_mask=encoder_mask) - - # all hidden values are active - hidden_mask = torch.ones( - encoder_states.shape[0], self._hidden_steps, dtype=encoder_mask.dtype, device=encoder_mask.device - ) - - # apply self-attention over fixed-size hidden_states - for block in range(self._hidden_blocks): - residual = hidden_states - hidden_states = self.hidden_enc(encoder_states=hidden_states, encoder_mask=hidden_mask) - # residual connection - hidden_states += residual - - return hidden_states, hidden_mask diff --git a/nemo/collections/nlp/modules/common/transformer/perceiver_encoders.py b/nemo/collections/nlp/modules/common/transformer/perceiver_encoders.py deleted file mode 100644 index e2a7ef8fc0a8..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/perceiver_encoders.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
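Note for reviewers (not part of the patch): the BridgeEncoder removed above squeezes a variable-length encoder output (B x L x H) into a fixed number of hidden_steps (B x K x H) before the usual self-attention blocks. As a rough mental model only, not the actual AttentionBridge, here is a minimal sketch that achieves the same fixed-size projection with K learned queries and torch.nn.MultiheadAttention; the class name FixedStepBridge is invented for this example.

import torch
import torch.nn as nn


class FixedStepBridge(nn.Module):
    """Project a padded B x L x H sequence down to exactly K steps via attention."""

    def __init__(self, hidden_size, hidden_steps, num_heads=4):
        super().__init__()
        # K learned query vectors that attend over the encoder states
        self.queries = nn.Parameter(torch.randn(hidden_steps, hidden_size) * 0.02)
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

    def forward(self, encoder_states, encoder_mask):
        # encoder_states: B x L x H, encoder_mask: B x L with 1 for real tokens
        queries = self.queries.unsqueeze(0).expand(encoder_states.shape[0], -1, -1)
        bridged, _ = self.attn(
            queries, encoder_states, encoder_states,
            key_padding_mask=~encoder_mask.bool(),  # True marks padding to ignore
        )
        return bridged  # B x K x H, K is fixed regardless of L


if __name__ == "__main__":
    bridge = FixedStepBridge(hidden_size=64, hidden_steps=8)
    out = bridge(torch.randn(2, 17, 64), torch.ones(2, 17, dtype=torch.long))
    print(out.shape)  # torch.Size([2, 8, 64])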
- -import copy - -import torch - -from nemo.collections.nlp.modules.common.transformer.transformer_decoders import TransformerDecoder -from nemo.collections.nlp.modules.common.transformer.transformer_encoders import TransformerEncoder -from nemo.collections.nlp.modules.common.transformer.transformer_modules import AttentionBridge - -__all__ = ["PerceiverEncoder"] - - -class PerceiverEncoder(torch.nn.Module): - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - mask_future: bool = False, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - hidden_steps: int = 32, - hidden_init_method: str = "default", - hidden_blocks: int = 2, - ): - super().__init__() - - self._hidden_steps = hidden_steps - self._hidden_init_method = hidden_init_method - self._hidden_blocks = hidden_blocks - - if self._hidden_init_method == "default": - self._hidden_init_method = "params" - - if self.hidden_init_method not in self.supported_init_methods: - raise ValueError( - "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format( - hidden_init_method=self.hidden_init_method, supported_init_methods=self.supported_init_methods, - ) - ) - - diagonal = 0 if mask_future else None - - if self.hidden_init_method == "params": - # learnable initial hidden values - self.init_hidden = torch.nn.Parameter(torch.nn.init.xavier_normal_(torch.empty(hidden_steps, hidden_size))) - self.init_cross_att = TransformerDecoder( - num_layers=1, - hidden_size=hidden_size, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - self.init_cross_att.diagonal = diagonal - elif self.hidden_init_method == "bridge": - # initialize latent with attention bridge - self.att_bridge = AttentionBridge(hidden_size=hidden_size, k=hidden_steps, bridge_size=inner_size,) - - # cross-attention encoder - layer = TransformerDecoder( - num_layers=1, - hidden_size=hidden_size, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - layer.diagonal = diagonal - self.cross_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)]) - - # self-attention encoder - layer = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - self.self_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)]) - - @property - def supported_init_methods(self): - return ["params", "bridge"] - - @property - def hidden_steps(self): - return self._hidden_steps - - @property - def hidden_blocks(self): - return self._hidden_blocks - - @property - def hidden_init_method(self): - return self._hidden_init_method - - def forward(self, encoder_states, 
encoder_mask): - """ - Args: - encoder_states: output of the encoder (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - """ - # all hidden values are active - hidden_mask = torch.ones( - encoder_states.shape[0], self._hidden_steps, dtype=encoder_mask.dtype, device=encoder_mask.device - ) - - # initialize hidden state - if self._hidden_init_method == "params": - # initialize latent with learned parameters - hidden_states = self.init_hidden.unsqueeze(0).expand(encoder_states.shape[0], -1, -1) - hidden_states = self.init_cross_att( - decoder_states=hidden_states, - decoder_mask=hidden_mask, - encoder_states=encoder_states, - encoder_mask=encoder_mask, - ) - elif self._hidden_init_method == "bridge": - # initialize latent with attention bridge - hidden_states = self.att_bridge(hidden=encoder_states, hidden_mask=encoder_mask,) - - # apply block (cross-attention, self-attention) multiple times - # for block in range(self._hidden_blocks): - for self_att, cross_att in zip(self.self_att_layers, self.cross_att_layers): - residual = hidden_states - - # cross attention of hidden over encoder states - hidden_states = cross_att( - decoder_states=hidden_states, - decoder_mask=hidden_mask, - encoder_states=encoder_states, - encoder_mask=encoder_mask, - ) - - # self-attention over hidden - hidden_states = self_att(encoder_states=hidden_states, encoder_mask=hidden_mask,) - - # residual connection - hidden_states += residual - - return hidden_states, hidden_mask diff --git a/nemo/collections/nlp/modules/common/transformer/reduction_encoders.py b/nemo/collections/nlp/modules/common/transformer/reduction_encoders.py deleted file mode 100644 index e552e8734a77..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/reduction_encoders.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
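Note for reviewers (not part of the patch): the PerceiverEncoder removed above repeats two moves over a small set of latent vectors: cross-attention from the latents into the full encoder output, then self-attention among the latents, with a residual around the block. Below is a compact single-block sketch of that pattern; it uses plain torch.nn.MultiheadAttention and an invented class name (TinyPerceiverBlock), so treat it as an illustration rather than the deleted implementation.

import torch
import torch.nn as nn


class TinyPerceiverBlock(nn.Module):
    def __init__(self, hidden_size, num_heads=4):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.self_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

    def forward(self, latents, encoder_states, encoder_mask):
        # latents: B x K x H, encoder_states: B x L x H, encoder_mask: B x L (1 = valid)
        residual = latents
        # the K latents read from the full, variable-length input sequence
        latents, _ = self.cross_attn(
            latents, encoder_states, encoder_states,
            key_padding_mask=~encoder_mask.bool(),
        )
        # the latents then exchange information among themselves
        latents, _ = self.self_attn(latents, latents, latents)
        return latents + residual  # residual connection around the whole block


if __name__ == "__main__":
    block = TinyPerceiverBlock(hidden_size=64)
    latents = torch.randn(2, 8, 64)  # initialized from learned parameters in the real encoder
    out = block(latents, torch.randn(2, 31, 64), torch.ones(2, 31, dtype=torch.long))
    print(out.shape)  # torch.Size([2, 8, 64])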
- -import copy - -import torch - -from nemo.collections.nlp.modules.common.transformer.transformer_encoders import TransformerEncoder - -__all__ = ["PoolingEncoder"] - - -class PoolingEncoder(torch.nn.Module): - - _SUPPORTED_ARCH = ["max", "avg"] - - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - mask_future: bool = False, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - hidden_steps: int = 4, - hidden_init_method: str = "default", - hidden_blocks: int = 2, - pooling_type: str = "max", - ): - super().__init__() - - # minimal steps to allow reduction - self._hidden_steps = hidden_steps - self._hidden_init_method = hidden_init_method - self._hidden_blocks = hidden_blocks - self._pooling_type = pooling_type - - if self._hidden_steps < 2: - raise ValueError("Expected hidden_steps >= 2 but received hidden_steps = {self._hidden_steps}") - - if self.hidden_init_method not in self.supported_init_methods: - raise ValueError( - "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format( - hidden_init_method=self.hidden_init_method, supported_init_methods=self.supported_init_methods, - ) - ) - - if self._pooling_type not in self.supported_arch: - raise ValueError(f"Unknown pooling_type = {pooling_type}. Available values = {self.supported_arch}") - - # self-attention encoder - layer = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - self.self_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)]) - - self.pooling = self._build_pooling_module() - - def _build_pooling_module(self): - """ - Returns pooling module. - Allows to override for child classes. 
- """ - if self._pooling_type == "max": - pooling = torch.nn.MaxPool1d(kernel_size=2, stride=2) - elif self._pooling_type == "avg": - pooling = torch.nn.AvgPool1d(kernel_size=2, stride=2) - - return pooling - - @property - def supported_arch(self): - return self._SUPPORTED_ARCH - - @property - def supported_init_methods(self): - return ["default"] - - @property - def hidden_steps(self): - return self._hidden_steps - - @property - def hidden_blocks(self): - return self._hidden_blocks - - @property - def hidden_init_method(self): - return self._hidden_init_method - - def forward(self, encoder_states, encoder_mask): - """ - Args: - encoder_states: output of the encoder (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - """ - # initialize hidden state - hidden_mask = encoder_mask - hidden_states = encoder_states - - # apply block (self-attention, max-pool) multiple times - for self_att in self.self_att_layers: - residual = hidden_states - - # self-attention over hidden - hidden_states = self_att(encoder_states=hidden_states, encoder_mask=hidden_mask) - - hidden_states += residual - - # max pool reduction if possible - if hidden_states.shape[1] >= self.hidden_steps: - # max pool hidden states - hidden_states = hidden_states.permute(0, 2, 1) - hidden_states = self.pooling(hidden_states) - hidden_states = hidden_states.permute(0, 2, 1) - - # max pool mask - hidden_mask = ( - self.pooling(hidden_mask.unsqueeze(0).type_as(hidden_states)).squeeze(0).type_as(hidden_mask) - ) - - return hidden_states, hidden_mask diff --git a/nemo/collections/nlp/modules/common/transformer/text_generation.py b/nemo/collections/nlp/modules/common/transformer/text_generation.py deleted file mode 100644 index 5f0275ff4553..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/text_generation.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -from typing import List, Tuple, Union - -from torch import Tensor - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - - -class LengthParam(TypedDict): - max_length: int # The maximum length of the sequence to be generated. - min_length: int # The minimum length of the sequence to be generated. - - -class SamplingParam(TypedDict): - use_greedy: bool # Whether or not to use sampling ; use greedy decoding otherwise - temperature: float # sampling temperature - top_k: int # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: float # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - repetition_penalty: float # The parameter for repetition penalty. 1.0 means no penalty. 
- add_BOS: bool # add the bos token at the begining of the prompt - all_probs: bool # whether return the log prob for all the tokens in vocab - compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False - end_strings: List[str] # generation will stop when one of these tokens is generated - - -class OutputType(TypedDict): - sentences: List[str] # output sentences - tokens: List[List[str]] # output sentences borken into tokens - logprob: List[List[float]] # log prob of generated tokens - full_logprob: List[List[float]] # log prob of all the tokens in the vocab - token_ids: List[List[int]] # output sentence token ids - offsets: List[List[int]] # list of tokens start positions in text - - -class TextGeneration: - """ - Interface for all text generation models. - """ - - def generate( - self, - inputs: Union[List[str], Tuple[Tensor, Tensor], List[dict]], - length_params: LengthParam, - sampling_params: SamplingParam = None, - ) -> OutputType: - """ - Public method to generate text. - - Args: - inputs (Union[List[str], Tensor, List[dict]]): - Can be one of the 3 types: - - 1. List of strings. Each element of the list provides input prompt. The model will apply tokenizer on it. - E.g [‘sentence’, ‘sentence2’ … ] - - 2. Tuple of Pytorch Tensors (context_tokens, context_lengths). The `context_tokens` has shape (batch_size, seq_length), it's the batched sequences of tokens used as a prompst for the generation or as model inputs to the encoder. - The generative model will skip the tokenization and padding step. The `context_lengths` has shape (batch_size,), it indicates the length of the context tokens for each of the input sequences. - E.g. ( torch.tensor([[23,5234,23,35,…], [223,323,23,23232,232,...] …]), torch.tensor([20, 30, …])) - - 3. List of python dict objects. Used for prompt/p-tuning inputs where a set of key-value pairs are converted into input token embeddings for the model. - E.g. [{"prompt-tag": "sentiment", "sentence": "this is a good movie"}, - {"prompt-tag": "qa", "context": "some context text", "question": "a simple question"} ... ] - where 'prompt-tag' is used to identify the type of NLP task to solve. - - length_params (LengthParam): - a dictionary type which controls the sampling length. - - * max_length: int, The maximum length of the sequence to be generated. - * min_length: int, The minimum length of the sequence to be generated. - - If None, max_length is set to 30, and min_length is set to None - - sampling_params (SamplingParam): - a dictionary type which contains the parameters for text sampling. It has the following keys - - * use_greedy: bool, Whether or not to use sampling ; use greedy decoding otherwise - * top_k: int, The number of highest probability vocabulary tokens to keep for top-k-filtering. - * top_p: float, If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - * repetition_penalty: float, The parameter for repetition penalty. 1.0 means no penalty. - * add_BOS: bool, Whether add the bos token at the begining of the prompt - * all_probs: bool # whether return the log prob for all the tokens in vocab - * compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False - * end_strings: List[str] # generation will stop when one of these tokens is generated - - Default None, If it is None, use_greedy will be "True". 
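Note for reviewers (not part of the patch): callers typically fill in the two parameter dictionaries described above along the lines of the sketch below before invoking generate(); the prompt text and the hypothetical `model` object (any TextGeneration implementation) are placeholders introduced only for this example.

length_params = {
    "max_length": 64,  # upper bound on newly generated tokens
    "min_length": 1,
}
sampling_params = {
    "use_greedy": False,      # sample instead of greedy decoding
    "temperature": 0.8,
    "top_k": 0,
    "top_p": 0.9,
    "repetition_penalty": 1.2,
    "add_BOS": True,
    "all_probs": False,
    "compute_logprob": False,
    "end_strings": ["<|endoftext|>"],
}

# With any model implementing this interface (hypothetical `model` object):
# output = model.generate(["Deep learning is"], length_params, sampling_params)
# print(output["sentences"][0])   # generated text
# print(output["offsets"][0])     # per-token start positions within that text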
- - Returns: - It generates the output in a dictionary type. It has the following keys, - - * sentences: List[str], output sentences - * tokens: List[List[str]], output sentences borken into tokens - * logprob: List[List[float]], log prob of generated tokens - * full_logprob: List[List[float]], log prob of all the tokens in the vocab - * token_ids: List[List[int]], output sentence token ids - * offsets: List[List[int]] # list of tokens start positions in text - """ - raise NotImplementedError("please implement this method") diff --git a/nemo/collections/nlp/modules/common/transformer/transformer.py b/nemo/collections/nlp/modules/common/transformer/transformer.py deleted file mode 100644 index 5870303be93e..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/transformer.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Dict, Optional - -import torch -from omegaconf.omegaconf import MISSING - -from nemo.collections.nlp.modules.common.decoder_module import DecoderModule -from nemo.collections.nlp.modules.common.encoder_module import EncoderModule -from nemo.collections.nlp.modules.common.transformer.transformer_decoders import TransformerDecoder -from nemo.collections.nlp.modules.common.transformer.transformer_encoders import TransformerEncoder -from nemo.collections.nlp.modules.common.transformer.transformer_modules import TransformerEmbedding -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.neural_types import ChannelType, NeuralType - -# @dataclass -# class TransformerConfig: -# # named model arguments -# library: str = 'nemo' -# model_name: Optional[str] = None -# pretrained: bool = False - - -@dataclass -class NeMoTransformerConfig: - # must be configured by the user - hidden_size: int = MISSING - num_layers: int = MISSING - inner_size: int = MISSING - num_attention_heads: int = MISSING - - # embedding - max_sequence_length: int = 512 - num_token_types: int = 2 - embedding_dropout: float = 0.0 - learn_positional_encodings: bool = False - - # transformer - ffn_dropout: float = 0.0 - attn_score_dropout: float = 0.0 - attn_layer_dropout: float = 0.0 - hidden_act: str = 'relu' - pre_ln: bool = False - pre_ln_final_layer_norm: bool = True - - # named model arguments - library: str = 'nemo' - model_name: Optional[str] = None - pretrained: bool = False - - -@dataclass -class NeMoTransformerEncoderConfig(NeMoTransformerConfig): - mask_future: bool = False - - -@dataclass -class NeMoTransformerDecoderConfig(NeMoTransformerConfig): - r2l: bool = False - - -class TransformerEncoderNM(EncoderModule, Exportable): - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = 
False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - mask_future: bool = False, - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - padding_idx: int = 0, - ): - super().__init__() - - self._vocab_size = vocab_size - self._hidden_size = hidden_size - self._max_sequence_length = max_sequence_length - - self._embedding = TransformerEmbedding( - vocab_size=self._vocab_size, - hidden_size=self._hidden_size, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - padding_idx=padding_idx, - ) - - self._encoder = TransformerEncoder( - hidden_size=self._hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - mask_future=mask_future, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - @typecheck() - def forward(self, input_ids, encoder_mask): - embeddings = self._embedding(input_ids=input_ids) - encoder_hidden_states = self._encoder(encoder_states=embeddings, encoder_mask=encoder_mask) - return encoder_hidden_states - - @property - def hidden_size(self): - return self._hidden_size - - @property - def vocab_size(self): - return self._vocab_size - - @property - def max_sequence_length(self): - return self._max_sequence_length - - @property - def embedding(self): - return self._embedding - - @property - def encoder(self): - return self._encoder - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. 
- """ - sample = next(self.parameters()) - sz = (max_batch, max_dim) - input_ids = torch.randint(low=0, high=2048, size=sz, device=sample.device) - encoder_mask = torch.randint(low=0, high=1, size=sz, device=sample.device) - return tuple([input_ids, encoder_mask]) - - -class TransformerDecoderNM(DecoderModule, Exportable): - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - padding_idx: int = 0, - ): - super().__init__() - - self._vocab_size = vocab_size - self._hidden_size = hidden_size - self._max_sequence_length = max_sequence_length - self.num_states = num_layers + 1 - self.return_mems = False - if pre_ln_final_layer_norm: - self.num_states += 1 - - self._embedding = TransformerEmbedding( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - padding_idx=padding_idx, - ) - - self._decoder = TransformerDecoder( - hidden_size=self.hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - @typecheck() - def forward( - self, input_ids, decoder_mask, encoder_embeddings, encoder_mask, decoder_mems=None, - ): - start_pos = 0 - if decoder_mems is not None: - start_pos = input_ids.shape[1] - 1 - input_ids = input_ids[:, -1:] - decoder_mask = decoder_mask[:, -1:] - decoder_mems = torch.transpose(decoder_mems, 0, 1) - decoder_embeddings = self._embedding(input_ids=input_ids, start_pos=start_pos) - decoder_hidden_states = self._decoder( - decoder_states=decoder_embeddings, - decoder_mask=decoder_mask, - encoder_states=encoder_embeddings, - encoder_mask=encoder_mask, - decoder_mems_list=decoder_mems, - return_mems=self.return_mems, - return_mems_as_list=False, - ) - if self.return_mems: - decoder_hidden_states = torch.transpose(decoder_hidden_states, 0, 1) - return decoder_hidden_states - - @property - def hidden_size(self): - return self._hidden_size - - @property - def vocab_size(self): - return self._vocab_size - - @property - def max_sequence_length(self): - return self._max_sequence_length - - @property - def embedding(self): - return self._embedding - - @property - def decoder(self): - return self._decoder - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. 
- """ - sample = next(self.parameters()) - sz = (max_batch, max_dim) - input_ids = torch.randint(low=0, high=2048, size=sz, device=sample.device) - encoder_mask = torch.randint(low=0, high=1, size=sz, device=sample.device) - mem_size = [max_batch, self.num_states, max_dim - 1, self._hidden_size] - decoder_mems = torch.rand(mem_size, device=sample.device) - return tuple([input_ids, encoder_mask, self._embedding(input_ids), encoder_mask, decoder_mems]) - - def _prepare_for_export(self, **kwargs): - self._decoder.diagonal = None - self.return_mems = True - super()._prepare_for_export(**kwargs) - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - if self.return_mems: - return {"last_hidden_states": NeuralType(('B', 'D', 'T', 'D'), ChannelType())} - else: - return {"last_hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} diff --git a/nemo/collections/nlp/modules/common/transformer/transformer_bottleneck.py b/nemo/collections/nlp/modules/common/transformer/transformer_bottleneck.py deleted file mode 100644 index b646ad8f382d..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/transformer_bottleneck.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass -from typing import Dict, Optional - -from nemo.collections.nlp.modules.common.transformer.bridge_encoders import BridgeEncoder -from nemo.collections.nlp.modules.common.transformer.perceiver_encoders import PerceiverEncoder -from nemo.collections.nlp.modules.common.transformer.reduction_encoders import PoolingEncoder -from nemo.collections.nlp.modules.common.transformer.transformer import ( - NeMoTransformerConfig, - TransformerDecoderNM, - TransformerEncoderNM, -) -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import MaskType, NeuralType -from nemo.core.neural_types.elements import BoolType - -__all__ = [ - "NeMoTransformerBottleneckConfig", - "NeMoTransformerBottleneckEncoderConfig", - "NeMoTransformerBottleneckDecoderConfig", - "TransformerBottleneckEncoderNM", -] - - -@dataclass -class NeMoTransformerBottleneckConfig(NeMoTransformerConfig): - # architecture details (default is no bottleneck) - arch: str = '' - hidden_steps: int = -1 - hidden_blocks: int = 1 - hidden_init_method: str = "params" - - -@dataclass -class NeMoTransformerBottleneckEncoderConfig(NeMoTransformerBottleneckConfig): - mask_future: bool = False - # change return_mask to False to return hidden states only (default for non-bottleneck encoder) - return_mask: bool = True - - -@dataclass -class NeMoTransformerBottleneckDecoderConfig(NeMoTransformerBottleneckConfig): - r2l: bool = False - - -class TransformerBottleneckEncoderNM(TransformerEncoderNM): - - _SUPPORTED_ARCH = ["seq2seq", "bridge", "perceiver", "max_pool", "avg_pool"] - - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - mask_future: bool = False, - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - arch: str = '', - hidden_steps: int = -1, - hidden_blocks: int = 1, - hidden_init_method: str = "default", - padding_idx: int = 0, - # default whether forward() method returns hidden or (hidden, mask) - return_mask=True, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - mask_future=mask_future, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - padding_idx=padding_idx, - ) - - self._arch = arch - self._return_mask = return_mask - - # replace encoder - self._encoder = self._build_encoder( - arch=arch, - hidden_steps=hidden_steps, - hidden_blocks=hidden_blocks, - hidden_init_method=hidden_init_method, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - mask_future=mask_future, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - def _build_encoder(self, arch, **kwargs): - """ - Returns 
a decoder based on architecture arch and kwargs - """ - # default non-bottleneck transformer encoder - if (not arch) or (arch == "seq2seq"): - encoder = self.encoder - elif arch == "bridge": - encoder = BridgeEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - ) - elif arch == "perceiver": - encoder = PerceiverEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - ) - elif arch == "max_pool": - encoder = PoolingEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - pooling_type="max", - ) - elif arch == "avg_pool": - encoder = PoolingEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - pooling_type="avg", - ) - else: - raise ValueError(f"Unknown arch = {self.arch}, supported arch = {self.supported_arch}") - - return encoder - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - input_types = super().input_types - input_types.update( - {"return_mask": NeuralType((), BoolType(), True),} - ) - - return input_types - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - output_types = super().output_types - output_types.update( - {"hidden_mask": NeuralType(('B', 'T'), MaskType(), True),} - ) - return output_types - - @property - def supported_arch(self): - return self._SUPPORTED_ARCH - - @property - def arch(self): - return self._arch - - @typecheck() - def forward(self, input_ids, encoder_mask, 
return_mask=None): - if return_mask is None: - return_mask = self._return_mask - - embeddings = self._embedding(input_ids=input_ids) - - if (not self.arch) or (self.arch == "seq2seq"): - encoder_hidden_states = self._encoder(encoder_states=embeddings, encoder_mask=encoder_mask) - encoder_hidden_mask = encoder_mask - else: - encoder_hidden_states, encoder_hidden_mask = self._encoder( - encoder_states=embeddings, encoder_mask=encoder_mask, - ) - - if return_mask: - return encoder_hidden_states, encoder_hidden_mask - else: - return encoder_hidden_states - - -class TransformerBottleneckDecoderNM(TransformerDecoderNM): - _SUPPORTED_ARCH = ["seq2seq"] - - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - arch='', - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - self._arch = arch - - # replace decoder - self._decoder = self._build_decoder( - arch=arch, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - def _build_decoder(self, arch, **kwargs): - """ - Returns a decoder based on architecture arch and kwargs - """ - # usual non-bottleneck transformer decoder - if (not arch) or (arch == "seq2seq"): - decoder = self.decoder - else: - raise ValueError(f"Unknown arch = {self.arch}, supported arch = {self.supported_arch}") - - return decoder - - @property - def supported_arch(self): - return self._SUPPORTED_ARCH - - @property - def arch(self): - return self._arch diff --git a/nemo/collections/nlp/modules/common/transformer/transformer_generators.py b/nemo/collections/nlp/modules/common/transformer/transformer_generators.py deleted file mode 100644 index 9bac89f61135..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/transformer_generators.py +++ /dev/null @@ -1,951 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from contextlib import contextmanager - -import torch - -from nemo.collections.common.parts import NEG_INF, mask_padded_tokens - -__all__ = [ - "GreedySequenceGenerator", - "TopKSequenceGenerator", - "BeamSearchSequenceGenerator", - "BeamSearchSequenceGeneratorWithLanguageModel", - "EnsembleBeamSearchSequenceGenerator", -] - - -class GreedySequenceGenerator: - """ - Greedy sequence generator based on the decoder followed by log_softmax. - - Args: - embedding: nn.Module, transforms input_ids into vector embeddings - decoder: nn.Module, takes embeddings and produces hidden_states - log_softmax: nn.Module, takes hidden_states and produces log_probs - which correspond to probability distribution of tokens (ids) - pad: index of padding token in the vocabulary - bos: index of beginning of sequence token in the vocabulary - eos: index of end of sequence token in the vocabulary - max_sequence_length: maximum allowed length for generated sequences - max_delta_length: in case of encoder-decoder generation (e.g. NMT), - forbids generated sequences to be longer than the length of - source sequences plus max_delta_length - batch_size: size of the batch of generated sequences if neither - source nor target starting sequences are provided - """ - - def __init__( - self, - embedding, - decoder, - log_softmax, - pad=0, - bos=1, - eos=2, - max_sequence_length=512, - max_delta_length=20, - batch_size=1, - ): - super().__init__() - self.embedding = embedding - self.decoder = decoder - self.log_softmax = log_softmax - self.pad, self.bos, self.eos = pad, bos, eos - self.max_seq_length = max_sequence_length - self.max_delta_len = max_delta_length - self.batch_size = batch_size - - def _one_step_forward( - self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0, - ): - """ - One step of autoregressive output generation. 
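Taken together, these one-step calls implement plain greedy decoding: embed the last token, run the decoder with cached memories, take the argmax of the final log-softmax, and stop once every sequence has emitted EOS. A minimal, self-contained sketch of that loop (the toy step_log_probs function and the 10-token vocabulary are illustrative assumptions, not NeMo APIs):

import torch

PAD, BOS, EOS = 0, 1, 2

def step_log_probs(tokens: torch.Tensor) -> torch.Tensor:
    # Stand-in for embedding -> decoder -> log_softmax over a toy 10-token vocab.
    logits = torch.randn(tokens.size(0), 10)
    logits[:, EOS] += tokens.size(1) * 0.5  # make EOS increasingly likely as length grows
    return torch.log_softmax(logits, dim=-1)

def greedy_generate(batch_size: int = 2, max_len: int = 16) -> torch.Tensor:
    tgt = torch.full((batch_size, 1), BOS, dtype=torch.long)
    finished = torch.zeros(batch_size, 1, dtype=torch.long)  # plays the role of pad_profile
    for _ in range(max_len):
        log_probs = step_log_probs(tgt)
        next_tokens = log_probs.argmax(dim=-1, keepdim=True)
        # once a sequence has produced EOS, keep emitting PAD for it
        next_tokens = PAD * finished + next_tokens * (1 - finished)
        finished = torch.max(finished, (next_tokens == EOS).long())
        tgt = torch.cat([tgt, next_tokens], dim=-1)
        if finished.sum() == batch_size:
            break
    return tgt

print(greedy_generate())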
- - Args: - decoder_input_ids: starting sequence of tokens to generate from; - if None, generation will start from a batch of tokens - encoder_hidden_states: output of the encoder for conditional - sequence generation; if None, generator will use unconditional - mode (e.g., language modeling) - encoder_input_mask: input mask used in the encoder - decoder_mems_list: list of size num_layers with cached activations - of sequence (x[1], ..., x[k-1]) for fast generation of x[k] - pos: starting position in positional encoding - """ - - decoder_hidden_states = self.embedding.forward(decoder_input_ids, start_pos=pos) - decoder_input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - - if encoder_hidden_states is not None: - decoder_mems_list = self.decoder.forward( - decoder_hidden_states, - decoder_input_mask, - encoder_hidden_states, - encoder_input_mask, - decoder_mems_list, - return_mems=True, - ) - else: - decoder_mems_list = self.decoder.forward( - decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True - ) - log_probs = self.log_softmax.forward(hidden_states=decoder_mems_list[-1][:, -1:]) - return log_probs, decoder_mems_list - - def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None): - """ - Helper function which defines starting sequence to begin generating - with and maximum allowed number of tokens to be generated. - """ - - decoder_parameter = next(self.decoder.parameters()) - batch_size = self.batch_size - - # for encoder-decoder generation, maximum length of generated sequence - # is min(max_sequence_length, src_len + max_delta_length) - if encoder_hidden_states is not None: - batch_size, src_len, _ = encoder_hidden_states.size() - if self.max_delta_len >= 0: - max_seq_length = min(self.max_seq_length, src_len + self.max_delta_len) - else: - max_seq_length = self.max_seq_length - else: - max_seq_length = self.max_seq_length - - # if no input is provided, start with the batch of tokens - if decoder_input_ids is not None: - tgt = decoder_input_ids - batch_size, tgt_len = decoder_input_ids.size() - else: - tgt = torch.zeros(batch_size, 1).long().fill_(self.bos).to(decoder_parameter.device) - tgt_len = 1 - max_generation_length = max_seq_length - tgt_len - - return tgt, batch_size, max_generation_length - - def _forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - assert not return_beam_scores - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) - - # pad profile tracks sequences ending with token to replace - # everything after with token - decoder_parameter = next(self.decoder.parameters()) - pad_profile = torch.zeros(batch_size, 1).long().to(decoder_parameter.device) - - decoder_mems_list = None - for i in range(max_generation_length): - - log_probs, decoder_mems_list = self._one_step_forward( - tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i - ) - - next_tokens = torch.argmax(log_probs[:, -1], dim=-1, keepdim=True) - next_tokens = self.pad * pad_profile + next_tokens * (1 - pad_profile) - pad_profile = torch.max(pad_profile, (next_tokens == self.eos).long()) - tgt = torch.cat((tgt, next_tokens), dim=-1) - - # abort generation if all sequences end with - if pad_profile.sum() == batch_size: - break - - return tgt - - def __call__( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - with 
self.as_frozen(): - return self._forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, return_beam_scores=return_beam_scores - ) - - def freeze(self) -> None: - """Freeze weights of embedding, decoder, and classification layers to prevent memory leak.""" - for param in self.embedding.parameters(): - param.requires_grad = False - self.embedding.eval() - for param in self.decoder.parameters(): - param.requires_grad = False - self.decoder.eval() - for param in self.log_softmax.parameters(): - param.requires_grad = False - self.log_softmax.eval() - - def unfreeze(self) -> None: - """Unfreeze weights of embedding, decoder, and classification layers.""" - for param in self.embedding.parameters(): - param.requires_grad = True - self.embedding.train() - for param in self.decoder.parameters(): - param.requires_grad = True - self.decoder.train() - for param in self.log_softmax.parameters(): - param.requires_grad = True - self.log_softmax.train() - - @contextmanager - def as_frozen(self): - """ - Context manager which temporarily freezes embedding, decoder, and log_softmax modules, - yields control and finally unfreezes the modules. - """ - self.freeze() - - try: - yield - finally: - self.unfreeze() - - -class TopKSequenceGenerator(GreedySequenceGenerator): - """ - Top-k sequence generator based on the decoder followed by log_softmax. - - Args: - *all args of GreedySequenceGenerator class - beam_size: size of the beam (parameter k in top-k) - temperature: temperature of top-k sampling, all logits are divided - by temperature before rescaling. High temperature leads to - uniform distribution, low leads to delta-like distribution. - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ - - def __init__(self, embedding, decoder, log_softmax, beam_size=1, temperature=1.0, **kwargs): - super().__init__(embedding, decoder, log_softmax, **kwargs) - self.beam_size = beam_size - self.temp = temperature - - # @torch.no_grad() - def _one_step_forward( - self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0, - ): - log_probs, decoder_mems_list = super()._one_step_forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos - ) - - batch_size, seq_len, vocab_size = log_probs.size() - scores, indices = torch.topk(log_probs, self.beam_size, dim=-1) - - rescaled_logexp = torch.zeros_like(log_probs).scatter(-1, indices, scores.div(self.temp).exp()) - probs = rescaled_logexp / rescaled_logexp.norm(1, -1, keepdim=True) - - # We randomly sample next tokens from rescaled probability distribution - # over top-k candidates and return a binary tensor which indicates - # candidates that have been selected. We call this object - # `pseudo_log_probs` as genuine log_probs should have -infs instead of - # 0s and 0s instead of 1s. - ids = torch.multinomial(probs.view(-1, vocab_size), 1).view(-1, seq_len, 1) - pseudo_log_probs = torch.zeros_like(log_probs).scatter(-1, ids, 1.0) - - return pseudo_log_probs, decoder_mems_list - - -class BeamSearchSequenceGenerator(GreedySequenceGenerator): - def __init__(self, embedding, decoder, log_softmax, beam_size=1, len_pen=0, **kwargs): - """ - Beam Search sequence generator based on the decoder followed by - log_softmax. 
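The top-k generator above keeps only the k highest-scoring tokens, rescales them with a temperature, renormalizes, and samples. A small stand-alone sketch of that rescale-and-sample step (batch size and vocabulary size are made up for illustration):

import torch

def sample_top_k(log_probs: torch.Tensor, k: int = 5, temperature: float = 1.0) -> torch.Tensor:
    # log_probs: [batch, vocab]; keep top-k, divide by temperature, renormalize, sample.
    scores, indices = torch.topk(log_probs, k, dim=-1)
    rescaled = torch.zeros_like(log_probs).scatter(-1, indices, (scores / temperature).exp())
    probs = rescaled / rescaled.sum(dim=-1, keepdim=True)
    return torch.multinomial(probs, num_samples=1)

log_probs = torch.log_softmax(torch.randn(2, 32), dim=-1)
print(sample_top_k(log_probs, k=5, temperature=0.7))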
- - Args: - *all args of GreedySequenceGenerator class - beam_size: size of the beam - len_pen: length penalty parameter - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ - - super().__init__(embedding, decoder, log_softmax, **kwargs) - self.beam_size = beam_size - self.len_pen = len_pen - - @staticmethod - def compute_len_penalty(lengths, alpha): - """Returns length penalty according to https://arxiv.org/pdf/1609.08144.pdf""" - return ((5 + lengths) / 6).pow(alpha) - - def _forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) - - # generate initial buffer of beam_size prefixes-hypotheses - log_probs, decoder_mems_list = self._one_step_forward(tgt, encoder_hidden_states, encoder_input_mask, None, 0) - scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1) - scores, prefixes = scores.view(-1, 1), prefixes.view(-1, 1) - - # repeat init target prefixes and cached memory states beam_size times - prefixes = torch.cat((tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = decoder_mems_list[j].repeat(self.beam_size, 1, 1) - - # repeat source sequence beam_size times for beam search - if encoder_hidden_states is not None: - _, src_length, hidden_size = encoder_hidden_states.size() - encoder_input_mask = encoder_input_mask.repeat(1, self.beam_size).view(-1, src_length) - encoder_hidden_states = encoder_hidden_states.repeat(1, self.beam_size, 1).view( - -1, src_length, hidden_size - ) - else: - hidden_size = decoder_mems_list[0].size(2) - - # pad_profile tracks finished hypotheses to generate only tokens - # if or has been generated - pad_profile = torch.zeros_like(scores).long() - - # prefixes_len tracks lengths of generated hypotheses to perform - # length penalty correction - prefixes_len = torch.zeros_like(scores).fill_(prefixes.size(1) + 1) - - for i in range(max_generation_length): - - # mask all finished hypotheses to exclude them from beam - pad_mask = pad_profile.repeat(1, self.beam_size) - - # generate and score candidates for prefixes continuation - log_probs, decoder_mems_list = self._one_step_forward( - prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + 1 - ) - scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) - - # for all prefixes ending with or replace generated - # continuations with - prefixes_i = self.pad * pad_mask + prefixes_i * (1 - pad_mask) - - # force all hypotheses but one generated from already finished - # hypotheses to have extremely low score, so they will not be - # considered during beam re-ranking - pad_mask[:, 1:] = pad_mask[:, 1:] * NEG_INF - scores = scores + scores_i * (1 - pad_mask).to(scores.dtype) - - # choose top-k hypotheses with length penalty applied - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - scores, indices_i = torch.topk(scores.view(-1, self.beam_size**2), self.beam_size, dim=1) - scores = scores.view(-1, 1) * len_penalties - - # select prefixes which correspond to the chosen hypotheses - prefixes = prefixes.unsqueeze(1).repeat(1, self.beam_size, 1) - prefixes = torch.cat((prefixes, prefixes_i.unsqueeze(2)), dim=2) - prefixes = prefixes.view(batch_size, self.beam_size**2, -1) - p_len = prefixes.size(2) - prefixes_ids = 
indices_i.unsqueeze(2).repeat(1, 1, p_len) - prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len) - - # reshuffle cached decoder memory states to restore the order - # of hypotheses broken after top-k selection - mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = ( - decoder_mems_list[j] - .view(-1, self.beam_size, p_len - 1, hidden_size) - .gather(1, mems_ids) - .view(-1, p_len - 1, hidden_size) - ) - - # update prefixes_len and pad_profile - not_eos_pad = prefixes.ne(self.eos) & prefixes.ne(self.pad) - prefixes_len = 1 + not_eos_pad.sum(dim=1, keepdim=True).to(scores.dtype) - pad_profile = (~not_eos_pad[:, -1:]).long() - - # if all hypotheses end with or , interrupt search - if pad_profile.sum() == batch_size * self.beam_size: - break - - # select best performing hypotheses in each element of the batch - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - best_guesses = ( - torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1) - ) - tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses).squeeze(1) - - if return_beam_scores: - return prefixes, scores * len_penalties, tgt - else: - return tgt - - -class EnsembleBeamSearchSequenceGenerator: - def __init__( - self, - encoders, - embeddings, - decoders, - log_softmaxes, - beam_size=1, - len_pen=0, - pad=0, - bos=1, - eos=2, - max_sequence_length=512, - max_delta_length=20, - batch_size=1, - language_model=None, - fusion_coef=None, - ): - """ - Ensemble Beam Search sequence generator based on the decoder followed by - log_softmax. Averages the probabilities of different models. - NOTE: All models must have been trained with the same BPE tokenizers. 
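The re-ranking above uses the GNMT length penalty lp(Y) = ((5 + |Y|) / 6) ** alpha and compares hypotheses on score / lp, so larger alpha favors longer hypotheses. A small worked example (the raw scores and lengths are invented for illustration):

import torch

def length_penalty(lengths: torch.Tensor, alpha: float) -> torch.Tensor:
    # GNMT length penalty: ((5 + |Y|) / 6) ** alpha
    return ((5.0 + lengths) / 6.0).pow(alpha)

raw_scores = torch.tensor([-4.0, -6.5])   # summed log-probs of two hypotheses
lengths = torch.tensor([5.0, 12.0])       # their token counts
for alpha in (0.0, 0.6, 1.0):
    normalized = raw_scores / length_penalty(lengths, alpha)
    print(alpha, normalized.tolist(), "best:", int(normalized.argmax()))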
- - Args: - encoders: A list of encoders - embeddings: A list of decoder embedding layers - decoders: A list of decoders - log_softmaxes: A list of decoder output layers - beam_size: Beam size - len_pen: Length penalty to adjust logprob scores to favor longer sequences - pad: pad id - bos: beginning of sequence id - eos: end of sequence id - max_sequence_length: maximum sequence length - max_delta_length: maximum length difference between input and output - batch_size: batch size if not inferrable from input sequence - """ - self.encoders = encoders - self.embeddings = embeddings - self.decoders = decoders - self.log_softmaxes = log_softmaxes - self.beam_size = beam_size - self.len_pen = len_pen - self.pad, self.bos, self.eos = pad, bos, eos - self.max_seq_length = max_sequence_length - self.max_delta_len = max_delta_length - self.batch_size = batch_size - assert len(embeddings) == len(decoders) == len(log_softmaxes) == len(encoders) - self.num_models = len(encoders) - self.language_model = language_model - self.fusion_coef = fusion_coef - - @staticmethod - def compute_len_penalty(lengths, alpha): - """Returns length penalty according to https://arxiv.org/pdf/1609.08144.pdf""" - return ((5 + lengths) / 6).pow(alpha) - - def _one_step_forward_lm(self, decoder_input_ids=None, lm_mems_list=None, pos=0): - input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - lm_hidden_states = self.language_model.encoder.embedding.forward(decoder_input_ids, start_pos=pos) - lm_mems_list = self.language_model.encoder.encoder.forward( - lm_hidden_states, - input_mask, - lm_mems_list, - return_mems=True, - ) - lm_log_probs = self.language_model.log_softmax.forward(hidden_states=lm_mems_list[-1][:, -1:]) - return lm_log_probs, lm_mems_list - - def _one_step_forward( - self, - ensemble_index, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0, - ): - """ - One step of autoregressive output generation for one particular model. - - Args: - decoder_input_ids: starting sequence of tokens to generate from; - if None, generation will start from a batch of tokens - encoder_hidden_states: output of the encoder for conditional - sequence generation; if None, generator will use unconditional - mode (e.g., language modeling) - encoder_input_mask: input mask used in the encoder - decoder_mems_list: list of size num_layers with cached activations - of sequence (x[1], ..., x[k-1]) for fast generation of x[k] - pos: starting position in positional encoding - """ - - decoder_hidden_states = self.embeddings[ensemble_index].forward(decoder_input_ids, start_pos=pos) - decoder_input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - - if encoder_hidden_states is not None: - decoder_mems_list = self.decoders[ensemble_index].forward( - decoder_hidden_states, - decoder_input_mask, - encoder_hidden_states, - encoder_input_mask, - decoder_mems_list, - return_mems=True, - ) - else: - decoder_mems_list = self.decoders[ensemble_index].forward( - decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True - ) - log_probs = self.log_softmaxes[ensemble_index].forward(hidden_states=decoder_mems_list[-1][:, -1:]) - return log_probs, decoder_mems_list - - def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None): - """ - Helper function which defines starting sequence to begin generating - with and maximum allowed number of tokens to be generated. 
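Ensembling here averages the member models' probabilities (not their log-probabilities) before candidates are ranked, i.e. log(mean_i exp(log_p_i)). A compact sketch of that averaging, written with logsumexp for numerical stability (the random tensors are placeholders):

import math

import torch

def average_probs(log_probs_list):
    # Equivalent to torch.log(torch.exp(stacked).mean(0)), but numerically stabler.
    stacked = torch.stack(log_probs_list)              # [num_models, batch, vocab]
    return torch.logsumexp(stacked, dim=0) - math.log(len(log_probs_list))

member_log_probs = [torch.log_softmax(torch.randn(2, 16), dim=-1) for _ in range(3)]
ensemble_log_probs = average_probs(member_log_probs)
print(ensemble_log_probs.shape)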
- """ - - decoder_parameter = next(self.decoders[0].parameters()) - batch_size = self.batch_size - - # for encoder-decoder generation, maximum length of generated sequence - # is min(max_sequence_length, src_len + max_delta_length) - if encoder_hidden_states is not None: - batch_size, src_len, _ = encoder_hidden_states.size() - if self.max_delta_len >= 0: - max_seq_length = min(self.max_seq_length, src_len + self.max_delta_len) - else: - max_seq_length = self.max_seq_length - else: - max_seq_length = self.max_seq_length - - # if no input is provided, start with the batch of tokens - if decoder_input_ids is not None: - tgt = decoder_input_ids - batch_size, tgt_len = decoder_input_ids.size() - else: - tgt = torch.zeros(batch_size, 1).long().fill_(self.bos).to(decoder_parameter.device) - tgt_len = 1 - max_generation_length = max_seq_length - tgt_len - - return tgt, batch_size, max_generation_length - - def _get_encoder_hidden_states(self, src_ids, encoder_input_mask, ensemble_index): - return self.encoders[ensemble_index](input_ids=src_ids, encoder_mask=encoder_input_mask) - - def _average_probs(self, probs_list): - probs_list = torch.stack(probs_list) - return torch.log(torch.exp(probs_list).mean(0)) - # probs = torch.stack(probs_list) # Ens x B x T x V - # return torch.log(probs.sum(0) / probs.sum(-1).sum(0).unsqueeze(-1)) - - def _forward(self, src_ids, encoder_input_mask, decoder_input_ids=None, return_beam_scores=False): - encoder_hidden_states = [ - self._get_encoder_hidden_states(src_ids, encoder_input_mask, i) for i in range(self.num_models) - ] - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states[0]) - - # generate initial buffer of beam_size prefixes-hypotheses - outputs = [ - self._one_step_forward(i, tgt, encoder_hidden_states[i], encoder_input_mask, None, 0) - for i in range(self.num_models) - ] - nmt_log_probs = self._average_probs([x[0] for x in outputs]) - decoder_mems_lists = [x[1] for x in outputs] - - if self.language_model is not None: - lm_log_probs, lm_mems_list = self._one_step_forward_lm(tgt, None, 0) - log_probs = nmt_log_probs + self.fusion_coef * lm_log_probs - else: - log_probs = nmt_log_probs - scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1) - scores, prefixes = scores.view(-1, 1), prefixes.view(-1, 1) - - # repeat init target prefixes and cached memory states beam_size times - prefixes = torch.cat((tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) - for i in range(self.num_models): - for j in range(len(decoder_mems_lists[i])): - decoder_mems_lists[i][j] = decoder_mems_lists[i][j].repeat(self.beam_size, 1, 1) - - if self.language_model is not None: - for j in range(len(lm_mems_list)): - lm_mems_list[j] = lm_mems_list[j].repeat(self.beam_size, 1, 1) - lm_hidden_size = lm_mems_list[0].size(2) - - encoder_input_mask = encoder_input_mask.repeat(1, self.beam_size).view(-1, encoder_input_mask.size(1)) - for i in range(self.num_models): - _, src_length, hidden_size = encoder_hidden_states[i].size() - encoder_hidden_states[i] = ( - encoder_hidden_states[i].repeat(1, self.beam_size, 1).view(-1, src_length, hidden_size) - ) - - # pad_profile tracks finished hypotheses to generate only tokens - # if or has been generated - pad_profile = torch.zeros_like(scores).long() - - # prefixes_len tracks lengths of generated hypotheses to perform - # length penalty correction - prefixes_len = torch.zeros_like(scores).fill_(prefixes.size(1) + 1) - - for i in range(max_generation_length): 
- - # mask all finished hypotheses to exclude them from beam - pad_mask = pad_profile.repeat(1, self.beam_size) - - # generate and score candidates for prefixes continuation - outputs = [ - self._one_step_forward( - model_num, - prefixes[:, -1:], - encoder_hidden_states[model_num], - encoder_input_mask, - decoder_mems_lists[model_num], - i + 1, - ) - for model_num in range(self.num_models) - ] - nmt_log_probs = self._average_probs([x[0] for x in outputs]) - decoder_mems_lists = [x[1] for x in outputs] - - if self.language_model is not None: - lm_log_probs, lm_mems_list = self._one_step_forward_lm(prefixes[:, -1:], lm_mems_list, i + 1) - log_probs = nmt_log_probs + self.fusion_coef * lm_log_probs - else: - log_probs = nmt_log_probs - scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) - - # for all prefixes ending with or replace generated - # continuations with - prefixes_i = self.pad * pad_mask + prefixes_i * (1 - pad_mask) - - # force all hypotheses but one generated from already finished - # hypotheses to have extremely low score, so they will not be - # considered during beam re-ranking - pad_mask[:, 1:] = pad_mask[:, 1:] * NEG_INF - scores = scores + scores_i * (1 - pad_mask).to(scores.dtype) - - # choose top-k hypotheses with length penalty applied - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - scores, indices_i = torch.topk(scores.view(-1, self.beam_size**2), self.beam_size, dim=1) - scores = scores.view(-1, 1) * len_penalties - - # select prefixes which correspond to the chosen hypotheses - prefixes = prefixes.unsqueeze(1).repeat(1, self.beam_size, 1) - prefixes = torch.cat((prefixes, prefixes_i.unsqueeze(2)), dim=2) - prefixes = prefixes.view(batch_size, self.beam_size**2, -1) - p_len = prefixes.size(2) - prefixes_ids = indices_i.unsqueeze(2).repeat(1, 1, p_len) - prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len) - - # reshuffle cached decoder memory states to restore the order - # of hypotheses broken after top-k selection - for model_num in range(self.num_models): - hidden_size = decoder_mems_lists[model_num][0].size(2) - mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size - for j in range(len(decoder_mems_lists[model_num])): - decoder_mems_lists[model_num][j] = ( - decoder_mems_lists[model_num][j] - .view(-1, self.beam_size, p_len - 1, hidden_size) - .gather(1, mems_ids) - .view(-1, p_len - 1, hidden_size) - ) - if self.language_model is not None: - lm_mems_ids = ( - indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, lm_hidden_size) // self.beam_size - ) - for j in range(len(lm_mems_list)): - lm_mems_list[j] = ( - lm_mems_list[j] - .view(-1, self.beam_size, p_len - 1, lm_hidden_size) - .gather(1, lm_mems_ids) - .view(-1, p_len - 1, lm_hidden_size) - ) - - # update prefixes_len and pad_profile - not_eos_pad = prefixes.ne(self.eos) & prefixes.ne(self.pad) - prefixes_len = 1 + not_eos_pad.sum(dim=1, keepdim=True).to(scores.dtype) - pad_profile = (~not_eos_pad[:, -1:]).long() - - # if all hypotheses end with or , interrupt search - if pad_profile.sum() == batch_size * self.beam_size: - break - - # select best performing hypotheses in each element of the batch - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - best_guesses = ( - torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1) - ) - tgt = prefixes.view(batch_size, 
self.beam_size, -1).gather(1, best_guesses).squeeze(1) - - if return_beam_scores: - return prefixes, scores * len_penalties, tgt - else: - return tgt - - def __call__(self, src_ids, encoder_input_mask, decoder_input_ids=None, return_beam_scores=False): - with self.as_frozen(): - return self._forward(src_ids, encoder_input_mask, decoder_input_ids, return_beam_scores) - - def freeze(self) -> None: - """Freeze weights of embedding, decoder, and classification layers to prevent memory leak.""" - for model_num in range(self.num_models): - for param in self.embeddings[model_num].parameters(): - param.requires_grad = False - self.embeddings[model_num].eval() - for param in self.decoders[model_num].parameters(): - param.requires_grad = False - self.decoders[model_num].eval() - for param in self.log_softmaxes[model_num].parameters(): - param.requires_grad = False - self.log_softmaxes[model_num].eval() - for param in self.encoders[model_num].parameters(): - param.requires_grad = False - self.encoders[model_num].eval() - - def unfreeze(self) -> None: - """Unfreeze weights of embedding, decoder, and classification layers.""" - for model_num in range(self.num_models): - for param in self.embeddings[model_num].parameters(): - param.requires_grad = True - self.embeddings[model_num].train() - for param in self.decoders[model_num].parameters(): - param.requires_grad = True - self.decoders[model_num].train() - for param in self.log_softmaxes[model_num].parameters(): - param.requires_grad = True - self.log_softmaxes[model_num].train() - for param in self.encoders[model_num].parameters(): - param.requires_grad = True - self.encoders[model_num].train() - - @contextmanager - def as_frozen(self): - """ - Context manager which temporarily freezes embedding, decoder, and log_softmax modules, - yields control and finally unfreezes the modules. 
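Every call into these generators is wrapped in as_frozen, which disables gradients and switches the wrapped modules to eval mode for the duration of decoding, then restores them. A generic sketch of that pattern (the toy embedding/decoder pair is only for illustration; the real class tracks per-module training flags in the same spirit):

from contextlib import contextmanager

import torch
from torch import nn

@contextmanager
def as_frozen(*modules: nn.Module):
    # Remember each module's training flag, freeze everything, restore on exit.
    was_training = [m.training for m in modules]
    for m in modules:
        m.eval()
        for p in m.parameters():
            p.requires_grad = False
    try:
        yield
    finally:
        for m, training in zip(modules, was_training):
            m.train(training)
            for p in m.parameters():
                p.requires_grad = True

embedding, decoder = nn.Embedding(10, 4), nn.Linear(4, 10)
with as_frozen(embedding, decoder):
    _ = decoder(embedding(torch.tensor([[1, 2, 3]])))
print(decoder.training, all(p.requires_grad for p in decoder.parameters()))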
- """ - grad_module_list = {'embeddings': {}, 'decoders': {}, 'log_softmaxes': {}, 'encoders': {}} - training_mode_module_list = {'embeddings': {}, 'decoders': {}, 'log_softmaxes': {}, 'encoders': {}} - - def gather_grad_values(module_name): - map_values = [{} for _ in range(self.num_models)] - for model_num in range(self.num_models): - for name, param in getattr(self, module_name)[model_num].named_parameters(): - map_values[model_num][name].append(param.requires_grad) - return map_values - - def reset_grad_values(module_name, map_values, require_grad_default: bool): - for model_num in range(self.num_models): - for name, param in getattr(self, module_name)[model_num].named_parameters(): - if name in map_values[model_num]: - param.requires_grad = map_values[model_num].pop() - else: - param.requires_grad = require_grad_default - - def gather_reset_training_mode_values(module_name, map_values: dict = None): - map_values = [{} for _ in range(self.num_models)] if not map_values else map_values - get_values = len(map_values) == 0 - - for model_num in range(self.num_models): - if get_values: - map_values[model_num] = getattr(self, module_name)[model_num].training - else: - getattr(self, module_name)[model_num].train(map_values[model_num]) - return map_values - - # Cache the param.require_grad state of each module - for module_name in grad_module_list.keys(): - grad_module_list[module_name] = gather_grad_values(module_name) - training_mode_module_list[module_name] = gather_reset_training_mode_values(module_name) - - self.freeze() - - try: - yield - finally: - self.unfreeze() - - # Reset the param.require_grad state of each module - for module_name in grad_module_list.keys(): - reset_grad_values(module_name, grad_module_list[module_name], require_grad_default=True) - gather_reset_training_mode_values(module_name, map_values=training_mode_module_list[module_name]) - - -class BeamSearchSequenceGeneratorWithLanguageModel(GreedySequenceGenerator): - def __init__( - self, embedding, decoder, log_softmax, language_model, beam_size=1, len_pen=0, fusion_coef=0.0, **kwargs - ): - """ - Beam Search sequence generator based on the decoder followed by log_softmax - with external language model fusion. 
- Args: - *all args of BeamSearchSequenceGenerator class - language_model: nemo TransformerLMModel - fusion_coef: coefficient before language model score, the resulting score is - score = log P_NMT(y|x) + fusion_coef * log P_LM(y) - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ - - super().__init__(embedding, decoder, log_softmax, **kwargs) - self.language_model = language_model - self.beam_size = beam_size - self.len_pen = len_pen - self.fusion_coef = fusion_coef - - def _one_step_forward( - self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - lm_mems_list=None, - pos=0, - ): - - nmt_log_probs, decoder_mems_list = super()._one_step_forward( - decoder_input_ids, - encoder_hidden_states, - encoder_input_mask, - decoder_mems_list, - pos, - ) - input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - lm_hidden_states = self.language_model.encoder.embedding.forward(decoder_input_ids, start_pos=pos) - - lm_mems_list = self.language_model.encoder.encoder.forward( - lm_hidden_states, - input_mask, - lm_mems_list, - return_mems=True, - ) - lm_log_probs = self.language_model.log_softmax.forward(hidden_states=lm_mems_list[-1][:, -1:]) - - log_probs = nmt_log_probs + self.fusion_coef * lm_log_probs - - return log_probs, decoder_mems_list, lm_mems_list - - @staticmethod - def compute_len_penalty(lengths, alpha): - """Returns length penalty according to https://arxiv.org/pdf/1609.08144.pdf""" - return ((5 + lengths) / 6).pow(alpha) - - def _forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) - - # generate initial buffer of beam_size prefixes-hypotheses - log_probs, decoder_mems_list, lm_mems_list = self._one_step_forward( - tgt, encoder_hidden_states, encoder_input_mask, None, None, 0 - ) - scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1) - scores, prefixes = scores.view(-1, 1), prefixes.view(-1, 1) - - # repeat init target prefixes and cached memory states beam_size times - prefixes = torch.cat((tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = decoder_mems_list[j].repeat(self.beam_size, 1, 1) - for j in range(len(lm_mems_list)): - lm_mems_list[j] = lm_mems_list[j].repeat(self.beam_size, 1, 1) - - # repeat source sequence beam_size times for beam search - if encoder_hidden_states is not None: - _, src_length, hidden_size = encoder_hidden_states.size() - encoder_input_mask = encoder_input_mask.repeat(1, self.beam_size).view(-1, src_length) - encoder_hidden_states = encoder_hidden_states.repeat(1, self.beam_size, 1).view( - -1, src_length, hidden_size - ) - else: - hidden_size = decoder_mems_list[0].size(2) - lm_hidden_size = lm_mems_list[0].size(2) - - # pad_profile tracks finished hypotheses to generate only tokens - # if or has been generated - pad_profile = torch.zeros_like(scores).long() - - # prefixes_len tracks lengths of generated hypotheses to perform - # length penalty correction - prefixes_len = torch.zeros_like(scores).fill_(prefixes.size(1) + 1) - - for i in range(max_generation_length): - - # mask all finished hypotheses to exclude them from beam - pad_mask = pad_profile.repeat(1, self.beam_size) - - # generate and score candidates for prefixes continuation - log_probs, decoder_mems_list, 
lm_mems_list = self._one_step_forward( - prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, lm_mems_list, i + 1 - ) - scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) - - # for all prefixes ending with or replace generated - # continuations with - prefixes_i = self.pad * pad_mask + prefixes_i * (1 - pad_mask) - - # force all hypotheses but one generated from already finished - # hypotheses to have extremely low score, so they will not be - # considered during beam re-ranking - pad_mask[:, 1:] = pad_mask[:, 1:] * NEG_INF - scores = scores + scores_i * (1 - pad_mask).to(scores.dtype) - - # choose top-k hypotheses with length penalty applied - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - scores, indices_i = torch.topk(scores.view(-1, self.beam_size**2), self.beam_size, dim=1) - scores = scores.view(-1, 1) * len_penalties - - # select prefixes which correspond to the chosen hypotheses - prefixes = prefixes.unsqueeze(1).repeat(1, self.beam_size, 1) - prefixes = torch.cat((prefixes, prefixes_i.unsqueeze(2)), dim=2) - prefixes = prefixes.view(batch_size, self.beam_size**2, -1) - p_len = prefixes.size(2) - prefixes_ids = indices_i.unsqueeze(2).repeat(1, 1, p_len) - prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len) - - # reshuffle cached decoder memory states to restore the order - # of hypotheses broken after top-k selection - mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = ( - decoder_mems_list[j] - .view(-1, self.beam_size, p_len - 1, hidden_size) - .gather(1, mems_ids) - .view(-1, p_len - 1, hidden_size) - ) - lm_mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, lm_hidden_size) // self.beam_size - for j in range(len(lm_mems_list)): - lm_mems_list[j] = ( - lm_mems_list[j] - .view(-1, self.beam_size, p_len - 1, lm_hidden_size) - .gather(1, lm_mems_ids) - .view(-1, p_len - 1, lm_hidden_size) - ) - - # update prefixes_len and pad_profile - not_eos_pad = prefixes.ne(self.eos) & prefixes.ne(self.pad) - prefixes_len = 1 + not_eos_pad.sum(dim=1, keepdim=True).to(scores.dtype) - pad_profile = (~not_eos_pad[:, -1:]).long() - - # if all hypotheses end with or , interrupt search - if pad_profile.sum() == batch_size * self.beam_size: - break - - # select best performing hypotheses in each element of the batch - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - best_guesses = ( - torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1) - ) - tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses).squeeze(1) - - if return_beam_scores: - return prefixes, scores * len_penalties, tgt - else: - return tgt diff --git a/nemo/collections/nlp/modules/common/transformer/transformer_modules.py b/nemo/collections/nlp/modules/common/transformer/transformer_modules.py deleted file mode 100644 index de9af5ffab75..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/transformer_modules.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import numpy as np -import torch -from torch import nn -from torch.nn.functional import gelu - -from nemo.collections.common.parts import form_attention_mask -from nemo.utils import logging - -__all__ = ["TransformerEmbedding", "AttentionBridge"] - - -class FixedPositionalEncoding(nn.Module): - """ - Fixed positional encoding (embedding layer) from sine and cosine functions - of different frequencies according to https://arxiv.org/abs/1706.03762 - - Args: - hidden_size: size of the embeddings in the model, also known as d_model - max_sequence_length: maximum allowed length of the input sequence - """ - - def __init__(self, hidden_size, max_sequence_length=512): - super().__init__() - - self._hidden_size = hidden_size - self._max_sequence_length = max_sequence_length - self._build_pos_enc(hidden_size=self._hidden_size, max_sequence_length=self._max_sequence_length) - - def _build_pos_enc(self, hidden_size, max_sequence_length, device=None): - """ - Builds/replaces pre-computed positional encoding. - """ - pos_enc = torch.zeros(max_sequence_length, hidden_size, device=device) - position = torch.arange(0.0, max_sequence_length).unsqueeze(1) - coef = -math.log(10000.0) / hidden_size - div_term = torch.exp(coef * torch.arange(0.0, hidden_size, 2)) - pos_enc[:, 0::2] = torch.sin(position * div_term) - pos_enc[:, 1::2] = torch.cos(position * div_term) - pos_enc.div_(math.sqrt(hidden_size)) - self.register_buffer('pos_enc', pos_enc) - - def forward(self, position_ids): - max_pos_id = position_ids.max() - # update positional encoding if needed - if max_pos_id >= self._max_sequence_length: - logging.warning( - f'Max position id {max_pos_id} is greater than max sequence length {self._max_sequence_length}. Expanding position embeddings just for this batch. This is not expected to work very well. Consider chunking your input into smaller sequences.' - ) - self._build_pos_enc( - hidden_size=self._hidden_size, max_sequence_length=max_pos_id + 1, device=position_ids.device, - ) - - embeddings = torch.embedding(self.pos_enc, position_ids) - - # Revert expansion of position embeddings since this wall checkpoint size mismatches. - if max_pos_id >= self._max_sequence_length: - self._build_pos_enc( - hidden_size=self._hidden_size, - max_sequence_length=self._max_sequence_length, - device=position_ids.device, - ) - return embeddings - - -class TransformerEmbedding(nn.Module): - """ - Embedding from token and position embeddings. - Optionally add token_type embedding (e.g. type of the sentence in BERT). - - Args: - vocab_size: size of the vocabulary - hidden_size: size of the embeddings in the model, also known as d_model - max_sequence_length: maximum allowed length of the input sequence - num_token_types: number of different token types - (e.g. 
tokens of sentence A and tokens of sentence B in BERT) - embedding_dropout: probability of dropout applied to embeddings - learn_positional_encodings: whether to learn positional encodings or - use fixed (sine-cosine) ones - """ - - def __init__( - self, - vocab_size: int, - hidden_size: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - padding_idx: int = 0, - ): - super().__init__() - - self.max_sequence_length = max_sequence_length - self.learn_positional_encodings = learn_positional_encodings - self.token_embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=padding_idx) - if learn_positional_encodings: - self.position_embedding = nn.Embedding(max_sequence_length, hidden_size) - else: - self.position_embedding = FixedPositionalEncoding(hidden_size, max_sequence_length) - if num_token_types > 0: - self.token_type_embedding = nn.Embedding(num_token_types, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-5) - self.dropout = nn.Dropout(embedding_dropout) - - def forward(self, input_ids, token_type_ids=None, start_pos=0): - seq_length = input_ids.size(1) - # we fail here only with parametric positional embedding. FixedPositionalEncoding automatically extends. - if self.learn_positional_encodings and (seq_length > self.max_sequence_length): - raise ValueError( - f"Input sequence is longer than maximum allowed sequence length for positional encoding. " - f"Got {seq_length} and {self.max_sequence_length}" - ) - position_ids = torch.arange( - start=start_pos, end=start_pos + seq_length, dtype=torch.long, device=input_ids.device - ) - position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) - - token_embeddings = self.token_embedding(input_ids) - position_embeddings = self.position_embedding(position_ids) - embeddings = token_embeddings + position_embeddings - - if token_type_ids is not None: - token_type_embeddings = self.token_type_embedding(token_type_ids) - embeddings = embeddings + token_type_embeddings - - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - - return embeddings - - -class MultiHeadAttention(nn.Module): - """ - Multi-head scaled dot-product attention layer. 
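The fixed positional encoding used by this embedding precomputes sin/cos features at geometrically spaced frequencies (https://arxiv.org/abs/1706.03762) and, in this implementation, additionally divides the table by sqrt(hidden_size). A condensed sketch of building and looking up that table:

import math

import torch

def build_pos_enc(hidden_size: int, max_len: int) -> torch.Tensor:
    pos_enc = torch.zeros(max_len, hidden_size)
    position = torch.arange(0.0, max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0.0, hidden_size, 2) * (-math.log(10000.0) / hidden_size))
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)
    return pos_enc / math.sqrt(hidden_size)  # extra scaling applied by this module

pos_enc = build_pos_enc(hidden_size=8, max_len=32)
position_ids = torch.tensor([[0, 1, 2, 3]])
embeddings = torch.embedding(pos_enc, position_ids)  # [1, 4, 8]
print(embeddings.shape)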
- - Args: - hidden_size: size of the embeddings in the model, also known as d_model - num_attention_heads: number of heads in multi-head attention - attn_score_dropout: probability of dropout applied to attention scores - attn_layer_dropout: probability of dropout applied to the output of the - whole layer, but before layer normalization - """ - - def __init__(self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0): - super().__init__() - if hidden_size % num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number " - "of attention heads (%d)" % (hidden_size, num_attention_heads) - ) - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.attn_head_size = int(hidden_size / num_attention_heads) - self.attn_scale = math.sqrt(math.sqrt(self.attn_head_size)) - - self.query_net = nn.Linear(hidden_size, hidden_size) - self.key_net = nn.Linear(hidden_size, hidden_size) - self.value_net = nn.Linear(hidden_size, hidden_size) - self.out_projection = nn.Linear(hidden_size, hidden_size) - - self.attn_dropout = nn.Dropout(attn_score_dropout) - self.layer_dropout = nn.Dropout(attn_layer_dropout) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attn_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward(self, queries, keys, values, attention_mask): - - # attention_mask is needed to hide the tokens which correspond to [PAD] - # in the case of BERT, or to hide the future tokens in the case of - # vanilla language modeling and translation - query = self.query_net(queries) - key = self.key_net(keys) - value = self.value_net(values) - query = self.transpose_for_scores(query) / self.attn_scale - key = self.transpose_for_scores(key) / self.attn_scale - value = self.transpose_for_scores(value) - - # for numerical stability we pre-divide query and key by sqrt(sqrt(d)) - attention_scores = torch.matmul(query, key.transpose(-1, -2)) - if attention_mask is not None: - attention_scores = attention_scores + attention_mask.to(attention_scores.dtype) - attention_probs = torch.softmax(attention_scores, dim=-1) - attention_probs = self.attn_dropout(attention_probs) - - context = torch.matmul(attention_probs, value) - context = context.permute(0, 2, 1, 3).contiguous() - new_context_shape = context.size()[:-2] + (self.hidden_size,) - context = context.view(*new_context_shape) - - # output projection - output_states = self.out_projection(context) - output_states = self.layer_dropout(output_states) - return output_states - - -class PositionWiseFF(nn.Module): - """ - Position-wise feed-forward network of Transformer block. 
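For numerical stability, the attention above pre-divides both queries and keys by d_head ** 0.25, so their product carries the usual 1/sqrt(d_head) scaling. A single-head sketch of that score computation (shapes are illustrative):

import math

import torch

def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask=None) -> torch.Tensor:
    # query/key/value: [batch, seq, d_head]; mask is additive (0 for visible, -inf for hidden).
    scale = math.sqrt(math.sqrt(query.size(-1)))          # d_head ** 0.25
    scores = (query / scale) @ (key / scale).transpose(-1, -2)
    if mask is not None:
        scores = scores + mask.to(scores.dtype)
    probs = torch.softmax(scores, dim=-1)
    return probs @ value

q = k = v = torch.randn(2, 5, 16)
print(attention(q, k, v).shape)  # torch.Size([2, 5, 16])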
- - Args: - hidden_size: size of the embeddings in the model, also known as d_model - inner_size: number of neurons in the intermediate part of feed-forward - net, usually is (4-8 x hidden_size) in the papers - ffn_dropout: probability of dropout applied to net output - hidden_act: activation function used between two linear layers - """ - - def __init__(self, hidden_size, inner_size, ffn_dropout=0.0, hidden_act="relu"): - super().__init__() - self.dense_in = nn.Linear(hidden_size, inner_size) - self.dense_out = nn.Linear(inner_size, hidden_size) - self.layer_dropout = nn.Dropout(ffn_dropout) - ACT2FN = {"gelu": gelu, "relu": torch.relu} - self.act_fn = ACT2FN[hidden_act] - - def forward(self, hidden_states): - output_states = self.dense_in(hidden_states) - output_states = self.act_fn(output_states) - output_states = self.dense_out(output_states) - output_states = self.layer_dropout(output_states) - return output_states - - -class AttentionBridge(torch.nn.Module): - """ - A multi-head attention bridge to project a variable-size hidden states - to k hidden states (per attention head). - - Code is based on the paper https://arxiv.org/pdf/1703.03130.pdf - """ - - def __init__(self, hidden_size, k, bridge_size): - """ - hidden_size - size of input hidden state - k - number of attention heads - bridge_size - size of internal feed forward weights (i.e., attention head size) - """ - super().__init__() - - self.hidden_size = hidden_size - self.k = k - self.bridge_size = bridge_size - - self.attn_scale = np.sqrt(np.sqrt(self.bridge_size)) - - # build model - - self.W1 = torch.nn.Linear(hidden_size, bridge_size, bias=False) - self.W2 = torch.nn.Linear(bridge_size, k, bias=False) - self.act = torch.nn.ReLU() - - def forward(self, hidden, hidden_mask=None, return_ortho_loss=False): - """ - Project hidden [B x N x H] to fixed-size [B x k x H] - - return_ortho_loss - if True returns loss term to encourage - orthogonal attention vectors - """ - - attention_scores = self.W2(self.act(self.W1(hidden) / self.attn_scale) / self.attn_scale).transpose(-1, -2) - - attention_mask = form_attention_mask(hidden_mask) - if attention_mask is not None: - attention_mask.squeeze_(1) - attention_scores = attention_scores + attention_mask.to(attention_scores.dtype) - - A = torch.softmax(attention_scores, dim=-1) - M = A @ hidden - - if return_ortho_loss: - ortho_loss = ((A @ A.transpose(-1, -2)) - torch.eye(self.k).type_as(A)).pow(2).sum() - - return M, ortho_loss - else: - return M diff --git a/nemo/collections/nlp/modules/common/transformer/transformer_utils.py b/nemo/collections/nlp/modules/common/transformer/transformer_utils.py deleted file mode 100644 index ba558b9c025a..000000000000 --- a/nemo/collections/nlp/modules/common/transformer/transformer_utils.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
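The AttentionBridge deleted above follows the structured self-attention of https://arxiv.org/pdf/1703.03130.pdf: k attention heads each produce a weighting over the N input positions, yielding a fixed-size summary M = A @ hidden, with an optional penalty that pushes A A^T toward the identity so the heads stay diverse. A bare-bones functional sketch (the internal 1/sqrt(sqrt(bridge_size)) scaling is omitted for brevity):

import torch

def attention_bridge(hidden, w1, w2, k):
    # hidden: [B, N, H]; w1: [H, bridge_size]; w2: [bridge_size, k]
    scores = (torch.relu(hidden @ w1) @ w2).transpose(-1, -2)   # [B, k, N]
    attn = torch.softmax(scores, dim=-1)
    summary = attn @ hidden                                      # [B, k, H]
    ortho = ((attn @ attn.transpose(-1, -2)) - torch.eye(k)).pow(2).sum()
    return summary, ortho

B, N, H, bridge, k = 2, 7, 16, 32, 4
hidden = torch.randn(B, N, H)
summary, ortho_loss = attention_bridge(hidden, torch.randn(H, bridge), torch.randn(bridge, k), k)
print(summary.shape, float(ortho_loss))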
- - -from typing import Optional, Union - -from omegaconf.dictconfig import DictConfig - -from nemo.collections.nlp.modules.common.huggingface.huggingface_decoder import HuggingFaceDecoderModule -from nemo.collections.nlp.modules.common.huggingface.huggingface_encoder import HuggingFaceEncoderModule -from nemo.collections.nlp.modules.common.transformer.transformer import TransformerDecoderNM, TransformerEncoderNM -from nemo.collections.nlp.modules.common.transformer.transformer_bottleneck import TransformerBottleneckEncoderNM - - -def get_nemo_transformer( - model_name: Optional[str] = None, - pretrained: bool = False, - config_dict: Optional[Union[dict, DictConfig]] = None, - encoder: bool = True, - pre_ln_final_layer_norm: bool = True, - padding_idx: int = 0, -) -> Union[TransformerEncoderNM, TransformerDecoderNM]: - """Returns NeMo transformer. - The following configurations are mandatory: - vocab_size: int - hidden_size: int - num_layers: int - inner_size: int - and must be specified if using config_dict. - - Args: - model_name (Optional[str]): model name to download from NGC - pretrained: (bool): False will instantiate the named model architecture with random weights. - config_dict (Optional[dict], optional): model configuration parameters. Defaults to None. - config_file (Optional[str], optional): path to json file containing model configuration. Defaults to None. - checkpoint_file (Optional[str], optional): load weights from path to local checkpoint. Defaults to None. - encoder (bool, optional): True will use EncoderTransformerNM, False will use DecoderTransformerNM. Defaults to True. - """ - if model_name is not None: - raise ValueError(f'NeMo transformers cannot be loaded from NGC yet. model_name should be None') - - if pretrained: - raise ValueError(f'NeMo transformers cannot be loaded from NGC yet. pretrained should be False') - - cfg = None - - if not pretrained: - assert ( - config_dict.get('vocab_size') is not None - and config_dict.get('hidden_size') is not None - and config_dict.get('num_layers') is not None - and config_dict.get('inner_size') is not None - ), f'Using config_dict: {config_dict}. 
vocab_size, hidden_size, num_layers, and inner_size must are mandatory arguments' - - cfg = config_dict - - if encoder: - # if arch exists in cfg we return TransformerBottleneckEncoderNM - arch = cfg.get('arch', '') - if not arch: - model = TransformerEncoderNM( - vocab_size=cfg.get('vocab_size'), - hidden_size=cfg.get('hidden_size'), - num_layers=cfg.get('num_layers'), - inner_size=cfg.get('inner_size'), - max_sequence_length=cfg.get('max_sequence_length', 512), - embedding_dropout=cfg.get('embedding_dropout', 0.0), - learn_positional_encodings=cfg.get('learn_positional_encodings', False), - num_attention_heads=cfg.get('num_attention_heads'), - ffn_dropout=cfg.get('ffn_dropout', 0.0), - attn_score_dropout=cfg.get('attn_score_dropout', 0.0), - attn_layer_dropout=cfg.get('attn_layer_dropout', 0.0), - hidden_act=cfg.get('hidden_act', 'relu'), - mask_future=cfg.get('mask_future', True), - pre_ln=cfg.get('pre_ln', False), - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - num_token_types=cfg.get('num_token_types', 2), - padding_idx=padding_idx, - ) - elif arch in TransformerBottleneckEncoderNM._SUPPORTED_ARCH: - model = TransformerBottleneckEncoderNM( - vocab_size=cfg.get('vocab_size'), - hidden_size=cfg.get('hidden_size'), - num_layers=cfg.get('num_layers'), - inner_size=cfg.get('inner_size'), - max_sequence_length=cfg.get('max_sequence_length', 512), - embedding_dropout=cfg.get('embedding_dropout', 0.0), - learn_positional_encodings=cfg.get('learn_positional_encodings', False), - num_attention_heads=cfg.get('num_attention_heads'), - ffn_dropout=cfg.get('ffn_dropout', 0.0), - attn_score_dropout=cfg.get('attn_score_dropout', 0.0), - attn_layer_dropout=cfg.get('attn_layer_dropout', 0.0), - hidden_act=cfg.get('hidden_act', 'relu'), - mask_future=cfg.get('mask_future', False), - pre_ln=cfg.get('pre_ln', False), - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - num_token_types=cfg.get('num_token_types', 2), - arch=cfg.get('arch', 'full'), - hidden_steps=cfg.get('hidden_steps', -1), - hidden_blocks=cfg.get('hidden_blocks', 1), - hidden_init_method=cfg.get('hidden_init_method', 'default'), - return_mask=cfg.get('return_mask', True), - padding_idx=padding_idx, - ) - else: - raise ValueError(f"Unknown arch = {arch}") - else: - model = TransformerDecoderNM( - vocab_size=cfg.get('vocab_size'), - hidden_size=cfg.get('hidden_size'), - num_layers=cfg.get('num_layers'), - inner_size=cfg.get('inner_size'), - max_sequence_length=cfg.get('max_sequence_length', 512), - embedding_dropout=cfg.get('embedding_dropout', 0.0), - learn_positional_encodings=cfg.get('learn_positional_encodings', False), - num_attention_heads=cfg.get('num_attention_heads'), - ffn_dropout=cfg.get('ffn_dropout', 0.0), - attn_score_dropout=cfg.get('attn_score_dropout', 0.0), - attn_layer_dropout=cfg.get('attn_layer_dropout', 0.0), - hidden_act=cfg.get('hidden_act', 'relu'), - pre_ln=cfg.get('pre_ln', False), - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - num_token_types=cfg.get('num_token_types', 2), - padding_idx=padding_idx, - ) - - return model - - -def get_huggingface_transformer( - model_name: Optional[str] = None, - pretrained: bool = False, - config_dict: Optional[Union[dict, DictConfig]] = None, - encoder: bool = True, -) -> Union[HuggingFaceEncoderModule, HuggingFaceDecoderModule]: - - if encoder: - model = HuggingFaceEncoderModule(model_name, pretrained, config_dict) - else: - model = HuggingFaceDecoderModule(model_name, pretrained, config_dict) - - return model - - -def get_megatron_transformer( - model_name: 
Optional[str] = None, - pretrained: bool = True, - config_dict: Optional[Union[dict, DictConfig]] = None, - encoder: bool = True, - checkpoint_file: str = None, -) -> None: - - raise ValueError( - "megatron-lm bert encoders are deprecated in NeMo 1.5.0. Please use NeMo 1.4.0 until megatron bert support is added again." - ) - - # vocab_file = config_dict.pop('vocab_file', None) - # if encoder: - # model = MegatronEncoderModule( - # model_name=model_name, - # pretrained=pretrained, - # config_dict=config_dict, - # checkpoint_file=checkpoint_file, - # vocab_file=vocab_file, - # ) - # else: - # raise ValueError('Megatron decoders are not currently supported.') - - # return model diff --git a/nemo/collections/speechlm/utils/text_generation/audio_text_generation_utils.py b/nemo/collections/speechlm/utils/text_generation/audio_text_generation_utils.py index e8d47c33c7e3..1dbc1c64cbdb 100644 --- a/nemo/collections/speechlm/utils/text_generation/audio_text_generation_utils.py +++ b/nemo/collections/speechlm/utils/text_generation/audio_text_generation_utils.py @@ -15,18 +15,18 @@ """Utilities for generating text.""" import pickle +import sys from collections.abc import Iterable from typing import List, Optional, Tuple, Union import numpy as np import torch import torch.nn.functional as F -import nemo.collections.nlp.modules.common.text_generation_utils as text_generation_utils +import nemo.collections.multimodal.speech_llm.modules.common.text_generation_utils as text_generation_utils from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer from nemo.collections.multimodal.speech_llm.modules.common.audio_text_generation_strategy import ( model_inference_strategy_dispatcher, ) -from nemo.collections.nlp.modules.common.transformer.text_generation import OutputType from nemo.utils import AppState, logging try: @@ -47,6 +47,11 @@ _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, ) +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + __all__ = [ "get_computeprob_response", "generate", @@ -58,6 +63,15 @@ default_inference_config = {'tokens_to_generate': 64} +class OutputType(TypedDict): + sentences: List[str] # output sentences + tokens: List[List[str]] # output sentences broken into tokens + logprob: List[List[float]] # log prob of generated tokens + full_logprob: List[List[float]] # log prob of all the tokens in the vocab + token_ids: List[List[int]] # output sentence token ids + offsets: List[List[int]] # list of token start positions in text + + def clean_end_string(text: list[str], tokenizer, end_string: Optional[str] = None): if end_string is None: return text From d013eb53d83e5c6f42217a84edc3cc28d1d00c3b Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 12:21:04 +0000 Subject: [PATCH 13/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- ...mm_autoregressive_eval_image_generation.py | 2 +- .../transformer/transformer_encoders_nlp.py | 1 - .../asr/modules/wav2vec_modules.py | 86 +++++++++++-------- .../speech_llm/models/modular_models.py | 2 +- .../modules/common/text_generation_utils.py | 3 +- .../speech_llm/modules/perception_modules.py | 2 +- .../modules/transformer_decoders.py | 1 - 7 files changed, 52 insertions(+), 45 deletions(-) diff --git a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py index d2f7cdc41db5..c5238caf13c8 
100644 --- a/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py +++ b/examples/multimodal_autoregressive/megatron_mm_autoregressive_eval_image_generation.py @@ -14,9 +14,9 @@ import datetime import math -import sys import os import re +import sys import torch import torchvision diff --git a/nemo/collections/asr/modules/transformer/transformer_encoders_nlp.py b/nemo/collections/asr/modules/transformer/transformer_encoders_nlp.py index fbbf7fe079e7..ea6de7cb1848 100644 --- a/nemo/collections/asr/modules/transformer/transformer_encoders_nlp.py +++ b/nemo/collections/asr/modules/transformer/transformer_encoders_nlp.py @@ -172,4 +172,3 @@ def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, return_m return cached_mems_list else: return cached_mems_list[-1] - diff --git a/nemo/collections/asr/modules/wav2vec_modules.py b/nemo/collections/asr/modules/wav2vec_modules.py index 4ca474bee8e0..dc3ba3f2821d 100644 --- a/nemo/collections/asr/modules/wav2vec_modules.py +++ b/nemo/collections/asr/modules/wav2vec_modules.py @@ -27,8 +27,8 @@ from torch import nn from torch.nn import functional as F -from nemo.collections.common.parts import form_attention_mask, transformer_weights_init from nemo.collections.asr.modules.common.transformer.transformer_encoders_nlp import TransformerEncoder +from nemo.collections.common.parts import form_attention_mask, transformer_weights_init from nemo.core.classes.module import NeuralModule from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType @@ -55,10 +55,10 @@ def forward(self, x): class ConvFeatureEncoder(NeuralModule): """ - Encoder used to isolate features in raw audio for Wav2Vec style training. - Treated as preprocessor module in NeMo ASR training. Defaults values are - for base model found in Baeski et al (https://arxiv.org/abs/2006.11477), - save for use of layer normalization as default schema. (Chosen for stability.) + Encoder used to isolate features in raw audio for Wav2Vec style training. + Treated as preprocessor module in NeMo ASR training. Defaults values are + for base model found in Baeski et al (https://arxiv.org/abs/2006.11477), + save for use of layer normalization as default schema. (Chosen for stability.) """ @property @@ -78,7 +78,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. + """Returns definitions of module output ports. For compatibility, processed features are treated as Spectrogram types processed_signal: 0: AxisType(BatchTag) @@ -107,7 +107,13 @@ def __init__( self.normalize_input = normalize_audio def block( - n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False, + n_in, + n_out, + k, + stride, + is_layer_norm=False, + is_group_norm=False, + conv_bias=False, ): def make_conv(): conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias) @@ -123,7 +129,11 @@ def make_conv(): nn.GELU(), ) elif is_group_norm: - return nn.Sequential(make_conv(), nn.GroupNorm(dim, dim, affine=True), nn.GELU(),) + return nn.Sequential( + make_conv(), + nn.GroupNorm(dim, dim, affine=True), + nn.GELU(), + ) else: return nn.Sequential(make_conv(), nn.GELU()) @@ -213,34 +223,34 @@ def get_lengths(self, audio_lengths): class Wav2VecTransformerEncoder(TransformerEncoder): """ - Encoder module following Transformer encoder paradigm - as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). 
Used for Wav2Vec - style encoding of context vectors as described by in Baeski et al (https://arxiv.org/abs/2006.11477). - Takes convolutional encodings of all time steps and adds to features before applying series - of self-attention layers. - - Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec - - Args: - layer_drop: Floating point value specifying proportion of module for layer dropout (See Fan et al. https://arxiv.org/pdf/1909.11556.pdf). - If non-zero, each layer will draw from uniform probability to determine if applied in current forward call. - Occurs only during training step - pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding - to maintain number of time steps - Must contain following: - embedding_dim: Depth/number of channels of each time step from feature encoding - conv_pos: Kernel size for convolution - conv_pos_groups: Number of groups for convolution - transformer: Config for transformer encoder. Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer - Must contain followign: - num_layers: Number of attention layers - hidden_size: Expected input depth (embedding size between model layers) - inner_size: Depth of embeddings within feed-forward sections of encoder layers - num_attention_heads: Number of attention heads - attn_score_dropout: Probability of dropout applied to attention scores - attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization) - ffn_dropout: Probability of dropout applied to feed-forward modules - hidden_act: Activation function for hidden layers + Encoder module following Transformer encoder paradigm + as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). Used for Wav2Vec + style encoding of context vectors as described by in Baeski et al (https://arxiv.org/abs/2006.11477). + Takes convolutional encodings of all time steps and adds to features before applying series + of self-attention layers. + + Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec + + Args: + layer_drop: Floating point value specifying proportion of module for layer dropout (See Fan et al. https://arxiv.org/pdf/1909.11556.pdf). + If non-zero, each layer will draw from uniform probability to determine if applied in current forward call. + Occurs only during training step + pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding + to maintain number of time steps + Must contain following: + embedding_dim: Depth/number of channels of each time step from feature encoding + conv_pos: Kernel size for convolution + conv_pos_groups: Number of groups for convolution + transformer: Config for transformer encoder. 
Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer + Must contain following: + num_layers: Number of attention layers + hidden_size: Expected input depth (embedding size between model layers) + inner_size: Depth of embeddings within feed-forward sections of encoder layers + num_attention_heads: Number of attention heads + attn_score_dropout: Probability of dropout applied to attention scores + attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization) + ffn_dropout: Probability of dropout applied to feed-forward modules + hidden_act: Activation function for hidden layers """ def __init__(self, pos_embed: DictConfig, transformer: DictConfig, layer_drop: float = 0.0): @@ -271,7 +281,7 @@ def __init__(self, pos_embed: DictConfig, transformer: DictConfig, layer_drop: f @property def input_types(self): - """Returns definitions of module output ports. + """Returns definitions of module output ports. We treat features as SpectrogramType for Nemo compatibility audio_signal: 0: AxisType(BatchTag) @@ -287,7 +297,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. + """Returns definitions of module output ports. We're using SpectrogramType for now to keep things Nemo safe processed_signal: 0: AxisType(BatchTag) diff --git a/nemo/collections/multimodal/speech_llm/models/modular_models.py b/nemo/collections/multimodal/speech_llm/models/modular_models.py index 29c5c98337a6..74e744b3a638 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_models.py @@ -55,12 +55,12 @@ MegatronGPTModel = ABC MegatronGPTSFTModel = ABC +from nemo.collections.multimodal.speech_llm.modules.common.text_generation_utils import get_computeprob_response from nemo.collections.nlp.modules.common.megatron.utils import ( average_losses_across_data_parallel_group, build_position_ids, get_iterator_k_split, ) -from nemo.collections.multimodal.speech_llm.modules.common.text_generation_utils import get_computeprob_response from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import ModelPT diff --git a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py index 5e204697d902..04c91c9f373b 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import torch import numpy as np +import torch try: from megatron.core import parallel_state, tensor_parallel @@ -144,4 +144,3 @@ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started logits[i, indices_to_remove] = filter_value return logits - diff --git a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py index 1f35b4fc6529..7f8e0e41cce5 100644 --- a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py +++ b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py @@ -22,8 +22,8 @@ from nemo.collections.asr.models import EncDecSpeakerLabelModel from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder, ConformerMultiLayerFeatureExtractor -from nemo.collections.multimodal.speech_llm.parts.utils.data_utils import align_feat_seq_list from nemo.collections.multimodal.speech_llm.modules.common.transformer.transformer_decoders import TransformerDecoder +from nemo.collections.multimodal.speech_llm.parts.utils.data_utils import align_feat_seq_list from nemo.core.classes import Exportable, NeuralModule from nemo.core.classes.common import typecheck from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType diff --git a/nemo/collections/multimodal/speech_llm/modules/transformer_decoders.py b/nemo/collections/multimodal/speech_llm/modules/transformer_decoders.py index aa1ad355d5ba..80c7638f9709 100644 --- a/nemo/collections/multimodal/speech_llm/modules/transformer_decoders.py +++ b/nemo/collections/multimodal/speech_llm/modules/transformer_decoders.py @@ -216,4 +216,3 @@ def input_example(self, max_batch=1, max_dim=256): input_ids = torch.randint(low=0, high=2048, size=(max_batch, max_dim, 1024), device=sample.device) encoder_mask = torch.randint(low=0, high=1, size=(max_batch, max_dim), device=sample.device) return tuple([input_ids, encoder_mask, input_ids, encoder_mask]) - From 040bcabe1ac592c67953f9ab92b4d6e8d5d2828b Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 05:24:40 -0700 Subject: [PATCH 14/21] fix style Signed-off-by: dimapihtar --- nemo/collections/asr/modules/wav2vec_modules.py | 2 +- .../speech_llm/modules/common/text_generation_utils.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nemo/collections/asr/modules/wav2vec_modules.py b/nemo/collections/asr/modules/wav2vec_modules.py index dc3ba3f2821d..e823943de4e2 100644 --- a/nemo/collections/asr/modules/wav2vec_modules.py +++ b/nemo/collections/asr/modules/wav2vec_modules.py @@ -19,7 +19,7 @@ import math import random -from typing import Dict, List, Tuple +from typing import Dict, List import torch from omegaconf import DictConfig diff --git a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py index 04c91c9f373b..38cad96b640d 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np import torch +import torch.nn.functional as F + +import numpy as np + +from nemo.utils import AppState try: - from megatron.core import parallel_state, tensor_parallel + from megatron.core import parallel_state HAVE_MEGATRON_CORE = True From a69f1864e5631f375e96ef686e53d1d5b54dad9d Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 12:25:42 +0000 Subject: [PATCH 15/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../speech_llm/modules/common/text_generation_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py index 38cad96b640d..dfcf77efe2c3 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_utils.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import torch import torch.nn.functional as F -import numpy as np - from nemo.utils import AppState try: From e6be417d3571d04b50f02d5a8a0678555f412734 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 05:37:43 -0700 Subject: [PATCH 16/21] remove .py files Signed-off-by: dimapihtar --- nemo/collections/nlp/modules/__init__.py | 4 - .../nlp/modules/common/__init__.py | 5 - .../nlp/modules/common/prompt_encoder.py | 361 ------------------ .../nlp/modules/common/sequence_classifier.py | 72 ---- .../nlp/modules/common/sequence_regression.py | 69 ---- .../common/sequence_token_classifier.py | 80 ---- .../nlp/modules/common/token_classifier.py | 164 -------- .../g2p/models/token_classifier.py} | 68 +++- 8 files changed, 63 insertions(+), 760 deletions(-) delete mode 100644 nemo/collections/nlp/modules/common/prompt_encoder.py delete mode 100644 nemo/collections/nlp/modules/common/sequence_classifier.py delete mode 100644 nemo/collections/nlp/modules/common/sequence_regression.py delete mode 100644 nemo/collections/nlp/modules/common/sequence_token_classifier.py delete mode 100644 nemo/collections/nlp/modules/common/token_classifier.py rename nemo/collections/{nlp/modules/common/classifier.py => tts/g2p/models/token_classifier.py} (51%) diff --git a/nemo/collections/nlp/modules/__init__.py b/nemo/collections/nlp/modules/__init__.py index e45960eb0422..d3b35b8a794b 100644 --- a/nemo/collections/nlp/modules/__init__.py +++ b/nemo/collections/nlp/modules/__init__.py @@ -14,9 +14,5 @@ from nemo.collections.nlp.modules.common import BertModule # noqa: F401 -from nemo.collections.nlp.modules.common import PromptEncoder # noqa: F401 -from nemo.collections.nlp.modules.common import SequenceClassifier # noqa: F401 -from nemo.collections.nlp.modules.common import SequenceRegression # noqa: F401 -from nemo.collections.nlp.modules.common import SequenceTokenClassifier # noqa: F401 from nemo.collections.nlp.modules.common import get_lm_model # noqa: F401 from nemo.collections.nlp.modules.common import get_pretrained_lm_models_list # noqa: F401 diff --git a/nemo/collections/nlp/modules/common/__init__.py b/nemo/collections/nlp/modules/common/__init__.py index 8c68a7c09883..cf7bf9de9043 100644 --- a/nemo/collections/nlp/modules/common/__init__.py +++ b/nemo/collections/nlp/modules/common/__init__.py @@ -19,13 +19,8 @@ from nemo.collections.nlp.modules.common.bert_module import BertModule from 
nemo.collections.nlp.modules.common.lm_utils import get_lm_model, get_pretrained_lm_models_list -from nemo.collections.nlp.modules.common.prompt_encoder import PromptEncoder, PromptEncoderType from nemo.collections.nlp.modules.common.prompt_table import ( VirtualPromptPlaceholderToken, VirtualPromptSource, VirtualPromptStyle, ) -from nemo.collections.nlp.modules.common.sequence_classifier import SequenceClassifier -from nemo.collections.nlp.modules.common.sequence_regression import SequenceRegression -from nemo.collections.nlp.modules.common.sequence_token_classifier import SequenceTokenClassifier -from nemo.collections.nlp.modules.common.token_classifier import BertPretrainingTokenClassifier, TokenClassifier diff --git a/nemo/collections/nlp/modules/common/prompt_encoder.py b/nemo/collections/nlp/modules/common/prompt_encoder.py deleted file mode 100644 index 37536bcfd4c7..000000000000 --- a/nemo/collections/nlp/modules/common/prompt_encoder.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import enum -from typing import Dict, Optional - -import torch -import torch.nn.init as init -from torch import nn - -from nemo.collections.nlp.modules.common.megatron.fused_bias_gelu import fused_bias_gelu -from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults, init_method_normal -from nemo.core.classes import Exportable, NeuralModule -from nemo.core.classes.common import typecheck - -try: - from megatron.core import ModelParallelConfig, tensor_parallel - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - ModelParallelConfig = ApexGuardDefaults - - HAVE_MEGATRON_CORE = False - - -__all__ = ["PromptEncoder", "PromptEncoderType"] - - -class PromptEncoderType(enum.Enum): - TPMLP = "tpmlp" # mlp model that support tensor parallel, better work together with a large language model - MLP = "mlp" - LSTM = "lstm" - EMBEDDING = "embedding" - - -class PromptEmbedding(NeuralModule, Exportable): - """Prompt embeddings - - Arugments: - init_from_prompt_text: Whether to intialize prompt embeddings - from from certain lm embeddings - corresponding to a prompt string - hidden_size: hidden size should match lm embedding size - total_virtual_tokens: length of prompt initalized from torch init method - """ - - def __init__( - self, hidden_size, total_virtual_tokens, - ): - super().__init__() - - self.hidden_size = hidden_size - self.total_virtual_tokens = total_virtual_tokens - - # Randomly init token and position embeddings - self.prompt_embeddings = torch.nn.Embedding(self.total_virtual_tokens, self.hidden_size) - self.prompt_embeddings.weight.data.fill_(0.0) - self.prompt_embeddings.weight.requires_grad = False - - # Set fixed indicies for forward pass - self.register_buffer("indices", torch.LongTensor(list(range(self.total_virtual_tokens))), persistent=False) - - def clear_prompt_embedding_weights(self,): - """ - Method sets the prompt embedding weights to 
0.0 - """ - self.prompt_embeddings.weight.fill_(0.0) - - def set_prompt_embedding_weights(self, weight: torch.Tensor): - """ - Method sets the prompt embedding weights with a new weight w - """ - self.prompt_embeddings.weight.data = weight.type_as(self.prompt_embeddings.weight.data) - - def forward(self,): - """ - Does forward pass - """ - return self.prompt_embeddings(self.indices) - - -class InferenceTable(NeuralModule, Exportable): - """ - A wrapper class that holds the output representations of the PromptEncoder Model. - At inference time we do not need to forward pass through the full PromptEncoder and can just use this class. - """ - - def __init__(self, taskname, hidden_size, total_virtual_tokens, is_inference_ready=False): - super().__init__() - self.taskname = taskname - self.hidden_size = hidden_size - self.total_virtual_tokens = total_virtual_tokens - self.prompt_table = torch.nn.ModuleDict() - self.prompt_table[self.taskname] = PromptEmbedding(self.hidden_size, self.total_virtual_tokens) - self.prompt_table[self.taskname].clear_prompt_embedding_weights() - self.is_inference_ready = is_inference_ready - for p in self.prompt_table.parameters(): - p.requires_grad = False - - def set_prompt_table(self, prompt_representation: torch.Tensor): - """ - Method sets the prompt embedding inside self.prompt_table[taskname] with new weights - """ - self.prompt_table[self.taskname].set_prompt_embedding_weights(prompt_representation) - self.is_inference_ready = True - - def get_prompt_table(self,): - """ - Returns the prompt representation cached in the prompt table - """ - return self.prompt_table[self.taskname].forward() - - def clear_prompt_table(self,): - """ - Method "clears" the prompt embedding inside self.prompt_table[taskname] by setting it to zero. - """ - self.prompt_table[self.taskname].clear_prompt_embedding_weights() - self.is_inference_ready = False - - -class TPMLP(NeuralModule, Exportable): - """ - The Tensor Parallel MLP prompt encoder network that is used to generate the virtual - token embeddings for p-tuning. It only have two layers. - """ - - def __init__( - self, - config: ModelParallelConfig, - total_virtual_tokens: int, - hidden_size: int, - output_size: int, - init_std: float, - ): - """ - Initializes the Tensor Model parallel MLP PromptEncoderMLP module. 
- Args: - config: the model parallel config used my megatron core - total_virtual_tokens: the total number of vitural tokens - hidden_size: hidden dimension - output_size: the output dimension - init_std: the MLP init std value - """ - super().__init__() - self.hidden_size = hidden_size - self.output_size = output_size - self.total_virtual_tokens = total_virtual_tokens - self.activation = "gelu" - - config = copy.deepcopy(config) - config.sequence_parallel = False - config.gradient_accumulation_fusion = False - - self.first = tensor_parallel.ColumnParallelLinear( - self.output_size, - self.hidden_size, - config=config, - gather_output=False, - init_method=init_method_normal(init_std), - skip_bias_add=True, - bias=True, - ) - self.second = tensor_parallel.RowParallelLinear( - self.hidden_size, - self.output_size, - config=config, - input_is_parallel=True, - init_method=init_method_normal(init_std), - skip_bias_add=True, - bias=True, - ) - - def forward(self, input_embeds) -> torch.Tensor: - intermediate_parallel, bias_parallel = self.first(input_embeds) - intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) - output_embeds, bias_parallel = self.second(intermediate_parallel) - output_embeds = output_embeds + bias_parallel - return output_embeds - - -class PromptEncoder(NeuralModule, Exportable): - """ - The prompt encoder network that is used to generate the virtual - token embeddings for p-tuning. - """ - - def __init__( - self, - config: ModelParallelConfig, - encoder_type: enum, - total_virtual_tokens: int, - token_dim: int, - hidden_size, - lstm_dropout: float, - num_layers: int, - init_std: float, - taskname: str = "taskname", - ): - """ - Initializes the PromptEncoder module. - Args: - config: the model parallel config used my megatron core - total_virtual_tokens: the total number of vitural tokens - hidden_size: hidden dimension - lstm_dropout: the dropout used for the LSTM - num_layers: number of layers used in the LSTM - init_std: used for TPMLP encoder type to initialize the mlp weights - """ - super().__init__() - self.token_dim = token_dim - self.input_size = token_dim - self.output_size = token_dim - self.hidden_size = hidden_size - self.total_virtual_tokens = total_virtual_tokens - self.encoder_type = encoder_type - self.activation = "gelu" - self.init_std = init_std - self.taskname = taskname - - # Set fixed indicies for forward pass - self.register_buffer("indices", torch.LongTensor(list(range(self.total_virtual_tokens)))) - - # embedding - self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim) - self.inference_table = InferenceTable(taskname, self.token_dim, self.total_virtual_tokens) - - if self.encoder_type == PromptEncoderType.EMBEDDING: - init.xavier_normal_(self.embedding.weight) - elif self.encoder_type == PromptEncoderType.LSTM: - # LSTM - self.lstm_head = torch.nn.LSTM( - input_size=self.input_size, - hidden_size=self.hidden_size, - num_layers=num_layers, - dropout=lstm_dropout, - bidirectional=True, - batch_first=True, - ) - - self.mlp_head = nn.Sequential( - nn.Linear(self.hidden_size * 2, self.hidden_size * 2), - nn.ReLU(), - nn.Linear(self.hidden_size * 2, self.output_size), - ) - - elif self.encoder_type == PromptEncoderType.MLP: - if num_layers <= 1: - raise ValueError( - "The MLP prompt encoder must have at least 2 layers, and exactly 2 layers is recommended." 
- ) - - layers = [nn.Linear(self.input_size, self.hidden_size), nn.ReLU()] - for _ in range(num_layers - 2): - layers.extend([nn.Linear(self.hidden_size, self.hidden_size), nn.ReLU()]) - - layers.append(nn.Linear(self.hidden_size, self.output_size)) - self.mlp_head = nn.Sequential(*layers) - - elif self.encoder_type == PromptEncoderType.TPMLP: - self.tpmlp = TPMLP(config, self.total_virtual_tokens, self.hidden_size, self.output_size, self.init_std,) - else: - raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.") - - def set_inference_table(self, prompt_representation: torch.Tensor): - """ - This method caches the output representation from the Encoder and saves it inside `self.inference_table`. - """ - prompt_representation = prompt_representation.detach().clone() - self.inference_table.set_prompt_table(prompt_representation) - - def clear_inference_table(self,): - self.inference_table.clear_prompt_table() - - def get_inference_table(self,): - return self.inference_table.get_prompt_table() - - def state_dict(self, desination=None, prefix=None, keep_vars=False): - _state_dict = {} - _state_dict[ - 'prompt_table' - ] = ( - self.inference_table.state_dict() - ) # (@adithyare) this key is for backward compatibility with downstream users of the "inference ready" model. - _state_dict['embeddings'] = self.embedding.state_dict() - if self.encoder_type == PromptEncoderType.EMBEDDING: - pass - elif self.encoder_type == PromptEncoderType.LSTM: - _state_dict['mlp_head'] = self.mlp_head.state_dict() - _state_dict['lstm_head'] = self.lstm_head.state_dict() - elif self.encoder_type == PromptEncoderType.MLP: - _state_dict['mlp_head'] = self.mlp_head.state_dict() - elif self.encoder_type == PromptEncoderType.TPMLP: - _state_dict['tpmlp'] = self.tpmlp.state_dict() - else: - raise ValueError("Prompt encoder type not recognized. Pl.") - return _state_dict - - def load_state_dict(self, state_dict, strict=True): - self.inference_table.load_state_dict(state_dict['prompt_table']) - self.embedding.load_state_dict(state_dict['embeddings']) - if self.encoder_type == PromptEncoderType.EMBEDDING: - pass - elif self.encoder_type == PromptEncoderType.LSTM: - self.mlp_head.load_state_dict(state_dict['mlp_head']) - self.lstm_head.state_dict(state_dict['lstm_head']) - elif self.encoder_type == PromptEncoderType.MLP: - self.mlp_head.load_state_dict(state_dict['mlp_head']) - elif self.encoder_type == PromptEncoderType.TPMLP: - self.tpmlp.load_state_dict(state_dict['tpmlp']) - else: - raise ValueError("Prompt encoder type not recognized. Pl.") - return - - def _forward(self,): - input_embeds = self.embedding(self.indices).unsqueeze(0) - if self.encoder_type == PromptEncoderType.EMBEDDING: - output_embeds = input_embeds - elif self.encoder_type == PromptEncoderType.LSTM: - output_embeds = self.mlp_head(self.lstm_head(input_embeds)[0]) - elif self.encoder_type == PromptEncoderType.MLP: - output_embeds = self.mlp_head(input_embeds) - elif self.encoder_type == PromptEncoderType.TPMLP: - output_embeds = self.tpmlp(input_embeds) - else: - raise ValueError("Prompt encoder type not recognized. 
Pl.") - return output_embeds - - @typecheck() - def forward(self, batch_size: int, use_cached_reps: bool) -> torch.Tensor: - """ - Forward pass through the encoder with caching of prompt representations - """ - if use_cached_reps: - output_embeds = self.get_inference_table().unsqueeze(0) - else: - if self.training: - if self.inference_table.is_inference_ready: - self.clear_inference_table() - output_embeds = self._forward() - else: - if not self.inference_table.is_inference_ready: - output_embeds = self._forward() - self.set_inference_table(output_embeds.squeeze(0)) - output_embeds = self.get_inference_table().unsqueeze(0) - - output_embeds = output_embeds.expand(batch_size, self.total_virtual_tokens, self.token_dim) - return output_embeds diff --git a/nemo/collections/nlp/modules/common/sequence_classifier.py b/nemo/collections/nlp/modules/common/sequence_classifier.py deleted file mode 100644 index aac2a2707b50..000000000000 --- a/nemo/collections/nlp/modules/common/sequence_classifier.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -from nemo.collections.common.parts import MultiLayerPerceptron -from nemo.collections.nlp.modules.common.classifier import Classifier -from nemo.core.classes import typecheck -from nemo.core.neural_types import LogitsType, LogprobsType, NeuralType - -__all__ = ['SequenceClassifier'] - - -class SequenceClassifier(Classifier): - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - if not self.log_softmax: - return {"logits": NeuralType(('B', 'D'), LogitsType())} - else: - return {"log_probs": NeuralType(('B', 'D'), LogprobsType())} - - def __init__( - self, - hidden_size: int, - num_classes: int, - num_layers: int = 2, - activation: str = 'relu', - log_softmax: bool = True, - dropout: float = 0.0, - use_transformer_init: bool = True, - idx_conditioned_on: int = 0, - ): - """ - Initializes the SequenceClassifier module. 
- Args: - hidden_size: the hidden size of the mlp head on the top of the encoder - num_classes: number of the classes to predict - num_layers: number of the linear layers of the mlp head on the top of the encoder - activation: type of activations between layers of the mlp head - log_softmax: applies the log softmax on the output - dropout: the dropout used for the mlp head - use_transformer_init: initializes the weights with the same approach used in Transformer - idx_conditioned_on: index of the token to use as the sequence representation for the classification task, default is the first token - """ - super().__init__(hidden_size=hidden_size, dropout=dropout) - self.log_softmax = log_softmax - self._idx_conditioned_on = idx_conditioned_on - self.mlp = MultiLayerPerceptron( - hidden_size=hidden_size, - num_classes=num_classes, - num_layers=num_layers, - activation=activation, - log_softmax=log_softmax, - ) - self.post_init(use_transformer_init=use_transformer_init) - - @typecheck() - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - logits = self.mlp(hidden_states[:, self._idx_conditioned_on]) - return logits diff --git a/nemo/collections/nlp/modules/common/sequence_regression.py b/nemo/collections/nlp/modules/common/sequence_regression.py deleted file mode 100644 index f3150727cef8..000000000000 --- a/nemo/collections/nlp/modules/common/sequence_regression.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -from torch import Tensor - -from nemo.collections.common.parts import MultiLayerPerceptron -from nemo.collections.nlp.modules.common.classifier import Classifier -from nemo.core.classes import typecheck -from nemo.core.neural_types import NeuralType, RegressionValuesType - -__all__ = ['SequenceRegression'] - - -class SequenceRegression(Classifier): - """ - Args: - hidden_size: the hidden size of the mlp head on the top of the encoder - num_layers: number of the linear layers of the mlp head on the top of the encoder - activation: type of activations between layers of the mlp head - dropout: the dropout used for the mlp head - use_transformer_init: initializes the weights with the same approach used in Transformer - idx_conditioned_on: index of the token to use as the sequence representation for the classification task, - default is the first token - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"preds": NeuralType(tuple('B'), RegressionValuesType())} - - def __init__( - self, - hidden_size: int, - num_layers: int = 2, - activation: str = 'relu', - dropout: float = 0.0, - use_transformer_init: bool = True, - idx_conditioned_on: int = 0, - ): - """ Initializes the SequenceRegression module. 
""" - super().__init__(hidden_size=hidden_size, dropout=dropout) - self._idx_conditioned_on = idx_conditioned_on - self.mlp = MultiLayerPerceptron( - hidden_size, num_classes=1, num_layers=num_layers, activation=activation, log_softmax=False, - ) - self.post_init(use_transformer_init=use_transformer_init) - - @typecheck() - def forward(self, hidden_states: Tensor) -> Tensor: - """ Forward pass through the module. - - Args: - hidden_states: hidden states for each token in a sequence, for example, BERT module output - """ - hidden_states = self.dropout(hidden_states) - preds = self.mlp(hidden_states[:, self._idx_conditioned_on]) - return preds.view(-1) diff --git a/nemo/collections/nlp/modules/common/sequence_token_classifier.py b/nemo/collections/nlp/modules/common/sequence_token_classifier.py deleted file mode 100644 index 018fc74a406b..000000000000 --- a/nemo/collections/nlp/modules/common/sequence_token_classifier.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -from nemo.collections.common.parts import MultiLayerPerceptron -from nemo.collections.nlp.modules.common.classifier import Classifier -from nemo.core.classes import typecheck -from nemo.core.neural_types import LogitsType, NeuralType - -__all__ = ['SequenceTokenClassifier'] - - -class SequenceTokenClassifier(Classifier): - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "intent_logits": NeuralType(('B', 'D'), LogitsType()), - "slot_logits": NeuralType(('B', 'T', 'D'), LogitsType()), - } - - def __init__( - self, - hidden_size: int, - num_intents: int, - num_slots: int, - num_layers: int = 2, - activation: str = 'relu', - log_softmax: bool = False, - dropout: float = 0.0, - use_transformer_init: bool = True, - ): - """ - Initializes the SequenceTokenClassifier module, could be used for tasks that train sequence and - token classifiers jointly, for example, for intent detection and slot tagging task. 
- Args: - hidden_size: hidden size of the mlp head on the top of the encoder - num_intents: number of the intents to predict - num_slots: number of the slots to predict - num_layers: number of the linear layers of the mlp head on the top of the encoder - activation: type of activations between layers of the mlp head - log_softmax: applies the log softmax on the output - dropout: the dropout used for the mlp head - use_transformer_init: initializes the weights with the same approach used in Transformer - """ - super().__init__(hidden_size=hidden_size, dropout=dropout) - self.intent_mlp = MultiLayerPerceptron( - hidden_size=hidden_size, - num_classes=num_intents, - num_layers=num_layers, - activation=activation, - log_softmax=log_softmax, - ) - self.slot_mlp = MultiLayerPerceptron( - hidden_size=hidden_size, - num_classes=num_slots, - num_layers=num_layers, - activation=activation, - log_softmax=log_softmax, - ) - self.post_init(use_transformer_init=use_transformer_init) - - @typecheck() - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - # intent is classified by first hidden position - intent_logits = self.intent_mlp(hidden_states[:, 0]) - slot_logits = self.slot_mlp(hidden_states) - return intent_logits, slot_logits diff --git a/nemo/collections/nlp/modules/common/token_classifier.py b/nemo/collections/nlp/modules/common/token_classifier.py deleted file mode 100644 index c2c3a3caff2a..000000000000 --- a/nemo/collections/nlp/modules/common/token_classifier.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Dict, Optional - -from torch import nn as nn - -from nemo.collections.common.parts import MultiLayerPerceptron -from nemo.collections.nlp.modules.common.classifier import Classifier -from nemo.core.classes import typecheck -from nemo.core.neural_types import LogitsType, LogprobsType, NeuralType - -__all__ = ['BertPretrainingTokenClassifier', 'TokenClassifier'] - -ACT2FN = {"gelu": nn.functional.gelu, "relu": nn.functional.relu} - - -@dataclass -class TokenClassifierConfig: - num_layers: int = 1 - activation: str = 'relu' - log_softmax: bool = True - dropout: float = 0.0 - use_transformer_init: bool = True - - -class TokenClassifier(Classifier): - """ - A module to perform token level classification tasks such as Named entity recognition. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - if not self.log_softmax: - return {"logits": NeuralType(('B', 'T', 'C'), LogitsType())} - else: - return {"log_probs": NeuralType(('B', 'T', 'C'), LogprobsType())} - - def __init__( - self, - hidden_size: int, - num_classes: int, - num_layers: int = 1, - activation: str = 'relu', - log_softmax: bool = True, - dropout: float = 0.0, - use_transformer_init: bool = True, - ) -> None: - - """ - Initializes the Token Classifier module. 
- - Args: - hidden_size: the size of the hidden dimension - num_classes: number of classes - num_layers: number of fully connected layers in the multilayer perceptron (MLP) - activation: activation to usee between fully connected layers in the MLP - log_softmax: whether to apply softmax to the output of the MLP - dropout: dropout to apply to the input hidden states - use_transformer_init: whether to initialize the weights of the classifier head with the same approach used in Transformer - """ - super().__init__(hidden_size=hidden_size, dropout=dropout) - self.log_softmax = log_softmax - self.mlp = MultiLayerPerceptron( - hidden_size, num_classes, num_layers=num_layers, activation=activation, log_softmax=log_softmax - ) - self.post_init(use_transformer_init=use_transformer_init) - - @typecheck() - def forward(self, hidden_states): - """ - Performs the forward step of the module. - Args: - hidden_states: batch of hidden states (for example, from the BERT encoder module) - [BATCH_SIZE x SEQ_LENGTH x HIDDEN_SIZE] - Returns: logits value for each class [BATCH_SIZE x SEQ_LENGTH x NUM_CLASSES] - """ - hidden_states = self.dropout(hidden_states) - logits = self.mlp(hidden_states) - return logits - - -class BertPretrainingTokenClassifier(Classifier): - """ - A module to perform token level classification tasks for Bert pretraining. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - if not self.log_softmax: - return {"logits": NeuralType(('B', 'T', 'C'), LogitsType())} - else: - return {"log_probs": NeuralType(('B', 'T', 'C'), LogprobsType())} - - def __init__( - self, - hidden_size: int, - num_classes: int, - num_layers: int = 1, - activation: str = 'relu', - log_softmax: bool = True, - dropout: float = 0.0, - use_transformer_init: bool = True, - ) -> None: - - """ - Initializes the Token Classifier module. - - Args: - hidden_size: the size of the hidden dimension - num_classes: number of classes - num_layers: number of fully connected layers in the multilayer perceptron (MLP) - activation: activation to usee between fully connected layers in the MLP - log_softmax: whether to apply softmax to the output of the MLP - dropout: dropout to apply to the input hidden states - use_transformer_init: whether to initialize the weights of the classifier head with the same approach used in Transformer - """ - super().__init__(hidden_size=hidden_size, dropout=dropout) - - self.log_softmax = log_softmax - - if activation not in ACT2FN: - raise ValueError(f'activation "{activation}" not found') - self.dense = nn.Linear(hidden_size, hidden_size) - self.act = ACT2FN[activation] - self.norm = nn.LayerNorm(hidden_size, eps=1e-12) - self.mlp = MultiLayerPerceptron( - hidden_size, num_classes, num_layers=num_layers, activation=activation, log_softmax=log_softmax - ) - self.post_init(use_transformer_init=use_transformer_init) - - @typecheck() - def forward(self, hidden_states): - """ - Performs the forward step of the module. 
- Args: - hidden_states: batch of hidden states (for example, from the BERT encoder module) - [BATCH_SIZE x SEQ_LENGTH x HIDDEN_SIZE] - Returns: logits value for each class [BATCH_SIZE x SEQ_LENGTH x NUM_CLASSES] - """ - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = self.act(hidden_states) - transform = self.norm(hidden_states) - logits = self.mlp(transform) - return logits diff --git a/nemo/collections/nlp/modules/common/classifier.py b/nemo/collections/tts/g2p/models/token_classifier.py similarity index 51% rename from nemo/collections/nlp/modules/common/classifier.py rename to nemo/collections/tts/g2p/models/token_classifier.py index 7d9e42593c1c..4e1b0258041f 100644 --- a/nemo/collections/nlp/modules/common/classifier.py +++ b/nemo/collections/tts/g2p/models/token_classifier.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,11 +17,11 @@ import torch from torch import nn as nn -from nemo.collections.common.parts import transformer_weights_init -from nemo.core.classes import Exportable, NeuralModule -from nemo.core.neural_types import ChannelType, NeuralType +from nemo.collections.common.parts import transformer_weights_init, MultiLayerPerceptron +from nemo.core.classes import typecheck, Exportable, NeuralModule +from nemo.core.neural_types import LogitsType, LogprobsType, ChannelType, NeuralType -__all__ = ['Classifier'] +__all__ = ['Classifier', 'TokenClassifier'] class Classifier(NeuralModule, Exportable): @@ -83,3 +83,61 @@ def restore_from(cls, restore_path: str): restore_path: Path to restore the module from. """ pass + +class TokenClassifier(Classifier): + """ + A module to perform token level classification tasks such as Named entity recognition. + """ + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + """ + Returns definitions of module output ports. + """ + if not self.log_softmax: + return {"logits": NeuralType(('B', 'T', 'C'), LogitsType())} + else: + return {"log_probs": NeuralType(('B', 'T', 'C'), LogprobsType())} + + def __init__( + self, + hidden_size: int, + num_classes: int, + num_layers: int = 1, + activation: str = 'relu', + log_softmax: bool = True, + dropout: float = 0.0, + use_transformer_init: bool = True, + ) -> None: + + """ + Initializes the Token Classifier module. + + Args: + hidden_size: the size of the hidden dimension + num_classes: number of classes + num_layers: number of fully connected layers in the multilayer perceptron (MLP) + activation: activation to use between fully connected layers in the MLP + log_softmax: whether to apply softmax to the output of the MLP + dropout: dropout to apply to the input hidden states + use_transformer_init: whether to initialize the weights of the classifier head with the same approach used in Transformer + """ + super().__init__(hidden_size=hidden_size, dropout=dropout) + self.log_softmax = log_softmax + self.mlp = MultiLayerPerceptron( + hidden_size, num_classes, num_layers=num_layers, activation=activation, log_softmax=log_softmax + ) + self.post_init(use_transformer_init=use_transformer_init) + + @typecheck() + def forward(self, hidden_states): + """ + Performs the forward step of the module. 
+ Args: + hidden_states: batch of hidden states (for example, from the BERT encoder module) + [BATCH_SIZE x SEQ_LENGTH x HIDDEN_SIZE] + Returns: logits value for each class [BATCH_SIZE x SEQ_LENGTH x NUM_CLASSES] + """ + hidden_states = self.dropout(hidden_states) + logits = self.mlp(hidden_states) + return logits From e1917c734cc1472d29ccba6271af6d8d0832689b Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 12:38:45 +0000 Subject: [PATCH 17/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../collections/tts/g2p/models/token_classifier.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/nemo/collections/tts/g2p/models/token_classifier.py b/nemo/collections/tts/g2p/models/token_classifier.py index 4e1b0258041f..f6257f089974 100644 --- a/nemo/collections/tts/g2p/models/token_classifier.py +++ b/nemo/collections/tts/g2p/models/token_classifier.py @@ -17,9 +17,9 @@ import torch from torch import nn as nn -from nemo.collections.common.parts import transformer_weights_init, MultiLayerPerceptron -from nemo.core.classes import typecheck, Exportable, NeuralModule -from nemo.core.neural_types import LogitsType, LogprobsType, ChannelType, NeuralType +from nemo.collections.common.parts import MultiLayerPerceptron, transformer_weights_init +from nemo.core.classes import Exportable, NeuralModule, typecheck +from nemo.core.neural_types import ChannelType, LogitsType, LogprobsType, NeuralType __all__ = ['Classifier', 'TokenClassifier'] @@ -37,7 +37,11 @@ def input_types(self) -> Optional[Dict[str, NeuralType]]: """ return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} - def __init__(self, hidden_size: int, dropout: float = 0.0,) -> None: + def __init__( + self, + hidden_size: int, + dropout: float = 0.0, + ) -> None: """ Initializes the Classifier base module. Args: @@ -84,6 +88,7 @@ def restore_from(cls, restore_path: str): """ pass + class TokenClassifier(Classifier): """ A module to perform token level classification tasks such as Named entity recognition. @@ -109,7 +114,6 @@ def __init__( dropout: float = 0.0, use_transformer_init: bool = True, ) -> None: - """ Initializes the Token Classifier module. 
From e6df5863ee63df356e541eba167e7bbb146bcab1 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 05:44:48 -0700 Subject: [PATCH 18/21] remove .py files Signed-off-by: dimapihtar --- .../common/audio_text_generation_strategy.py | 2 +- .../common/text_generation_strategy.py | 2 +- .../modules/common/text_generation_server.py | 452 ------------------ 3 files changed, 2 insertions(+), 454 deletions(-) rename nemo/collections/{nlp => multimodal/speech_llm}/modules/common/text_generation_strategy.py (99%) delete mode 100644 nemo/collections/nlp/modules/common/text_generation_server.py diff --git a/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_strategy.py b/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_strategy.py index 763e03b699cd..437087041069 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_strategy.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_strategy.py @@ -16,7 +16,7 @@ import torch -import nemo.collections.nlp.modules.common.text_generation_strategy as text_generation_strategy +import nemo.collections.multimodal.speech_llm.modules.common.text_generation_strategy as text_generation_strategy from nemo.collections.multimodal.speech_llm.parts.utils.data_utils import shift_tokens_by_multi_audios from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py similarity index 99% rename from nemo/collections/nlp/modules/common/text_generation_strategy.py rename to nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py index 7aa1f88aebd9..48d0baedd776 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo/collections/nlp/modules/common/text_generation_server.py b/nemo/collections/nlp/modules/common/text_generation_server.py deleted file mode 100644 index 490f66bf02ae..000000000000 --- a/nemo/collections/nlp/modules/common/text_generation_server.py +++ /dev/null @@ -1,452 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Utilities for generating text.""" - -# flake8: noqa -# pylint: skip-file - -import json -import threading -import time -import uuid - -import torch -from flask import Flask, jsonify, request -from flask_restful import Api, Resource - -try: - from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import ( - _get_header_conversation_type_mask_role, - get_prompt_template_example, - ) - - HAVE_NLP = True -except (ImportError, ModuleNotFoundError): - HAVE_NLP = False - -from nemo.collections.nlp.modules.common.retro_inference_strategies import ( - RetroModelTextGenerationStrategy, - RetroQAModelTextGenerationStrategy, -) -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.utils import logging - -GENERATE_NUM = 0 -lock = threading.Lock() - -API_ALLOWED_KEYS = set( - [ - 'all_probs', - 'sentences', - "task_ids", - "tokens_to_generate", - "temperature", - "add_BOS", - "greedy", - "top_k", - "top_p", - "neighbors", - "repetition_penalty", - "min_tokens_to_generate", - "end_strings", - "compute_logprob", - "random_seed", - ] -) - - -class MegatronGenerate(Resource): - def __init__(self, model, inference_strategy=None): - self.model = model - self.inference_strategy = inference_strategy - - @staticmethod - def send_do_generate(): - choice = torch.cuda.LongTensor([GENERATE_NUM]) - torch.distributed.broadcast(choice, 0) - - def convert_messages(self, input_list): - output_dict = { - 'system': '', - 'conversations': [], - 'mask': 'User', - 'type': 'VALUE_TO_TEXT', - } - - # Extract the system message - for msg in input_list: - if msg['role'] == 'system': - output_dict['system'] = msg['content'] - break # Assuming only one system message - - # Build the conversations list - for msg in input_list: - if msg['role'] != 'system': - conversation_entry = { - 'from': msg['role'].capitalize(), # Capitalize 'user' and 'assistant' - 'value': msg['content'], - 'label': None, - } - output_dict['conversations'].append(conversation_entry) - - return output_dict - - def completion(self, data): - output_sentence = "" - with lock: # Need to get lock to keep multiple threads from hitting code - MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate - extra = {} - if self.inference_strategy is not None: - extra['strategy'] = self.inference_strategy - - all_probs = False - add_BOS = False - top_p = data.get("top_p", 1.0) - top_k = data.get("top_k", 0) - max_tokens = data.get("max_tokens", 32) - temperature = data.get("temperature", 0.0) - logprobs = data.get("logprobs", False) - greedy = temperature == 0.0 - end_strings = ['<|endoftext|>'] + data.get("end_strings", []) - prompt = data["prompt"] - random_seed = data.get("seed", 1234) - - output = generate( - self.model, - [prompt], - tokens_to_generate=max_tokens, - all_probs=all_probs, - temperature=temperature, - add_BOS=add_BOS, - top_k=top_k, - top_p=top_p, - greedy=greedy, - repetition_penalty=1.0, - end_strings=end_strings, - min_tokens_to_generate=0, - compute_logprob=logprobs, - random_seed=random_seed, - **extra, - ) - for k in output: - if isinstance(output[k], torch.Tensor): - output[k] = output[k].tolist() - - output_sentence = output['sentences'][0][len(prompt) :] - tokens = output['tokens'][0] - logprobs = output['logprob'][0] if output['logprob'] is not None else None - num_prompt_tokens = len(prompt.split()) - num_output_sentence = len(output_sentence.split()) - - return jsonify( - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": logprobs, - "text": 
output_sentence, - "tokens": tokens, - } - ], - "created": int(time.time()), - "id": f"cmpl-{uuid.uuid4()}", - "model": "nemo model", - "object": "text_completion", - "usage": { - "completion_tokens": num_output_sentence, - "prompt_tokens": num_prompt_tokens, - "total_tokens": num_output_sentence + num_prompt_tokens, - }, - } - ) - - def chat_completion(self, data): - data['messages'] = data['messages'] + [ - {'role': 'assistant', 'content': ''} - ] # adding trailing assistant message so that prompt ends with Assistant tag. - special_tokens = self.model.cfg.data.chat_prompt_tokens - nemo_source = self.convert_messages(data['messages']) - header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role( - nemo_source, special_tokens - ) - len_strip = len(special_tokens['end_of_turn'] + special_tokens['turn_start']) - conversation = conversation[:-len_strip] - # Return a response mimicking the OpenAI ChatCompletion API format - with lock: # Need to get lock to keep multiple threads from hitting code - MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate - extra = {} - if self.inference_strategy is not None: - extra['strategy'] = self.inference_strategy - - all_probs = False - add_BOS = False - top_k = 0 - greedy = data['temperature'] == 0.0 - logprobs = data.get("logprobs", False) - end_strings = ['<|endoftext|>', special_tokens['turn_start'], special_tokens['label_start']] - random_seed = None - - output = generate( - self.model, - [conversation], - data.get('max_tokens', 32), - all_probs=all_probs, - temperature=data.get('temperature', 1.0), - add_BOS=add_BOS, - top_k=top_k, - top_p=data.get("top_p", 0.95), - greedy=greedy, - repetition_penalty=1.0, - end_strings=end_strings, - min_tokens_to_generate=0, - compute_logprob=logprobs, - random_seed=random_seed, - **extra, - ) - for k in output: - if isinstance(output[k], torch.Tensor): - output[k] = output[k].tolist() - - output_sentence = output['sentences'][0][len(conversation) :] - tokens = output['tokens'][0] - logprobs = output['logprob'][0] if output['logprob'] is not None else None - num_prompt_tokens = len(conversation.split()) # @adithyare only produces an approx. 
number of tokens - num_output_sentence = len(output_sentence.split()) - - return jsonify( - { - "id": f"chatcmpl-{uuid.uuid4()}", - "object": "chat.completion", - "created": int(time.time()), - "model": data.get("model", "nemo model"), - "choices": [ - { - "index": 0, - "message": {"role": "assistant", "content": output_sentence}, - "logprobs": logprobs, - "tokens": tokens, - "finish_reason": "", - } - ], - "usage": { - "prompt_tokens": num_prompt_tokens, - "completion_tokens": num_output_sentence, - "total_tokens": num_output_sentence + num_prompt_tokens, - }, - } - ) - - def post(self): - # Access the request data if needed - if request.endpoint == "oai_completions": - data = request.get_json() - return self.completion(data) - elif request.endpoint == "oai_chat_completions": - data = request.get_json() - return self.chat_completion(data) - else: - raise RuntimeError("Unknown enpoint requested.") - - def put(self): - logging.info("request IP: " + str(request.remote_addr)) - logging.info(json.dumps(request.get_json())) - # check keys - for key in request.get_json().keys(): - if key not in API_ALLOWED_KEYS: - logging.error(f"The request key {key} is not allowed") - - sentences = request.get_json()["sentences"] - if isinstance(sentences, tuple): # Input can be text or tensor - if len(sentences[0]) != len(sentences[1]) or sentences[0] > 128: - return "Maximum number of sentences is 128", 400 - elif len(sentences) > 128: - return "Maximum number of sentences is 128", 400 - - task_ids = None # Used for ptuned/prompt tuned models only - if "task_ids" in request.get_json(): - task_ids = request.get_json()["task_ids"] - if not isinstance(sentences, tuple): - return "Input at 'sentences' must by a tuple of two tensors like:\ - (context_tokens_tensor, context_length_tensor) if task ids are given" - if len(task_ids) != len(sentences[0]): - return "Each sentence must have a corresponding task id for p-tuned/prompt-tuned models" - - tokens_to_generate = 64 # Choosing hopefully sane default. 
Full sequence is slow - if "tokens_to_generate" in request.get_json(): - tokens_to_generate = request.get_json()["tokens_to_generate"] - if not isinstance(tokens_to_generate, int): - return "tokens_to_generate must be an integer greater than 0" - if tokens_to_generate < 1: - return "tokens_to_generate must be an integer greater than 0" - - all_probs = False - if "all_probs" in request.get_json(): - all_probs = request.get_json()["all_probs"] - if not isinstance(all_probs, bool): - return "all_probs must be a boolean value" - - temperature = 1.0 - if "temperature" in request.get_json(): - temperature = request.get_json()["temperature"] - if not (type(temperature) == int or type(temperature) == float): - return "temperature must be a positive number less than or equal to 100.0" - if not (0.0 < temperature <= 100.0): - return "temperature must be a positive number less than or equal to 100.0" - - add_BOS = False - if "add_BOS" in request.get_json(): - add_BOS = request.get_json()["add_BOS"] - if not isinstance(add_BOS, bool): - return "add_BOS must be a boolean value" - - greedy = False - if "greedy" in request.get_json(): - greedy = request.get_json()["greedy"] - if not isinstance(greedy, bool): - return "greedy must be a boolean value" - - top_k = 0 - if "top_k" in request.get_json(): - top_k = request.get_json()["top_k"] - if not (type(top_k) == int or type(top_k) == float): - return "top_k must be a positive integer number" - if not (0 <= top_k): - return "top_k must be a positive integer number" - - top_p = 0.9 - if "top_p" in request.get_json(): - top_p = request.get_json()["top_p"] - if not (type(top_p) == int or type(top_p) == float): - return "top_p must be a positive number less than or equal to 1.0" - if not (0.0 <= top_p <= 1.0): - return "top_p must be a positive number less than or equal to 1.0" - - repetition_penalty = 1.0 - if "repetition_penalty" in request.get_json(): - repetition_penalty = request.get_json()["repetition_penalty"] - if not (type(repetition_penalty) == int or type(repetition_penalty) == float): - return "repetition_penalty must be a positive number no less than 1.0" - if not (1.0 <= repetition_penalty): - return "repetition_penalty must be a positive number no less than 1.0" - - end_strings = ['<|endoftext|>'] - if 'end_strings' in request.get_json(): - end_strings = request.get_json()['end_strings'] - if not isinstance(end_strings, list): - return "expect end_strings to be a list of strings" - if not all([isinstance(s, str) for s in end_strings]): - return "expect end_strings to be a list of strings" - - min_tokens_to_generate = 0 - if "min_tokens_to_generate" in request.get_json(): - min_tokens_to_generate = request.get_json()["min_tokens_to_generate"] - if not isinstance(min_tokens_to_generate, int): - return "min_tokens_to_generate must be an integer no less than 0" - if min_tokens_to_generate < 0: - return "min_tokens_to_generate must be an integer no less than 0" - - neighbors = None - if "neighbors" in request.get_json(): - neighbors = request.get_json()["neighbors"] - if not isinstance(neighbors, int): - return "num of neighbors must be an integer no less than 0" - if neighbors < 0: - return "num of neighbors must be an integer no less than 0" - - compute_logprob = False - if "compute_logprob" in request.get_json(): - compute_logprob = request.get_json()["compute_logprob"] - if not isinstance(compute_logprob, bool): - return "compute_logprob must be a boolean value" - - random_seed = None - if "random_seed" in request.get_json(): - random_seed = 
request.get_json()["random_seed"] - if random_seed is not None and not isinstance(random_seed, int): - return "random_seed must be a positive integer number or None" - if random_seed is not None and random_seed < 0: - return "random_seed must be a positive integer number or None" - - with lock: # Need to get lock to keep multiple threads from hitting code - MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate - extra = {} - if task_ids is not None: - extra['task_ids'] = task_ids - if self.inference_strategy is not None: - extra['strategy'] = self.inference_strategy - # RETRO specific arguments - if isinstance( - self.inference_strategy, (RetroModelTextGenerationStrategy, RetroQAModelTextGenerationStrategy) - ): - if neighbors is not None: - self.inference_strategy.update_neighbors(neighbors) - - output = generate( - self.model, - sentences, - tokens_to_generate, - all_probs, - temperature, - add_BOS, - top_k, - top_p, - greedy, - repetition_penalty, - end_strings=end_strings, - min_tokens_to_generate=min_tokens_to_generate, - compute_logprob=compute_logprob, - random_seed=random_seed, - **extra, - ) - for k in output: - if isinstance(output[k], torch.Tensor): - output[k] = output[k].tolist() - if not all_probs: - del output['full_logprob'] - - if self.inference_strategy is not None: - if isinstance( - self.inference_strategy, (RetroModelTextGenerationStrategy, RetroQAModelTextGenerationStrategy) - ): - retrieved_doc = self.inference_strategy.retrieved_text - output['retrieved'] = retrieved_doc - return jsonify(output) - - -class MegatronServer(object): - def __init__(self, model, inference_strategy=None): - self.app = Flask(__name__, static_url_path='') - api = Api(self.app) - api.add_resource( - MegatronGenerate, - '/generate', - endpoint="generate", - resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, - ) - api.add_resource( - MegatronGenerate, - '/v1/completions', - endpoint="oai_completions", - resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, - ) - api.add_resource( - MegatronGenerate, - '/v1/chat/completions', - endpoint="oai_chat_completions", - resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, - ) - - def run(self, url, port=5000): - self.app.run(url, threaded=True, port=port, debug=False) From 8cdd75d4b28f9b8931bb4cd9ead691b26e7355ce Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 05:59:24 -0700 Subject: [PATCH 19/21] remove .py files Signed-off-by: dimapihtar --- .../common/text_generation_strategy.py | 21 - .../common/retro_inference_strategies.py | 458 ------------------ 2 files changed, 479 deletions(-) delete mode 100644 nemo/collections/nlp/modules/common/retro_inference_strategies.py diff --git a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py index 48d0baedd776..0ea1d8137a52 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py @@ -891,7 +891,6 @@ def model_inference_strategy_dispatcher(model, **args): ) from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel - from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import 
MegatronRetrievalModel from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel except (ImportError, ModuleNotFoundError): from abc import ABC @@ -900,15 +899,8 @@ def model_inference_strategy_dispatcher(model, **args): MegatronGPTPromptLearningModel = ABC MegatronGriffinModel = ABC MegatronMambaModel = ABC - MegatronRetrievalModel = ABC MegatronRetroModel = ABC - from nemo.collections.nlp.modules.common.retro_inference_strategies import ( - RetroFileQAModelTextGenerationStrategy, - RetroModelTextGenerationStrategy, - RetroQAModelTextGenerationStrategy, - ) - if isinstance(model, MegatronGriffinModel): return GriffinModelTextGenerationStrategy(model) if isinstance(model, MegatronMambaModel): @@ -917,19 +909,6 @@ def model_inference_strategy_dispatcher(model, **args): return PromptLearningModelTextGenerationStrategy(model, **args) elif isinstance(model, MegatronGPTModel) and not (isinstance(model, MegatronRetroModel)): return GPTModelTextGenerationStrategy(model) - elif isinstance(model, MegatronRetrievalModel): - strategy_name = args['strategy'] - del args['strategy'] - megatron_lm_compatible = model.model.megatron_lm_compatible - args['megatron_lm_compatible'] = megatron_lm_compatible - if strategy_name == 'RetroModelTextGenerationStrategy': - return RetroModelTextGenerationStrategy(model, **args) - elif strategy_name == 'RetroQAModelTextGenerationStrategy': - return RetroQAModelTextGenerationStrategy(model, **args) - elif strategy_name == 'RetroFileQAModelTextGenerationStrategy': - return RetroFileQAModelTextGenerationStrategy(model, **args) - else: - raise ValueError(f'{strategy_name} is not supported for inference') elif isinstance(model, MegatronRetroModel): return McoreRetroModelTextGenerationStrategy(model) else: diff --git a/nemo/collections/nlp/modules/common/retro_inference_strategies.py b/nemo/collections/nlp/modules/common/retro_inference_strategies.py deleted file mode 100644 index 2c267fe06e64..000000000000 --- a/nemo/collections/nlp/modules/common/retro_inference_strategies.py +++ /dev/null @@ -1,458 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
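For reference, a minimal sketch of the kind of JSON body the /generate PUT handler shown earlier accepts. The host and port are assumptions (MegatronServer.run() defaults to port 5000), the values are illustrative, and the keys mirror API_ALLOWED_KEYS and the per-field validation in put():

import requests  # any HTTP client works; requests is an assumption

payload = {
    "sentences": ["Deep learning is"],   # at most 128 sentences
    "tokens_to_generate": 64,            # positive integer
    "temperature": 1.0,                  # 0.0 < T <= 100.0
    "top_k": 0,
    "top_p": 0.9,                        # 0.0 <= p <= 1.0
    "repetition_penalty": 1.0,           # >= 1.0
    "greedy": False,
    "add_BOS": False,
    "end_strings": ["<|endoftext|>"],
    "min_tokens_to_generate": 0,
    "compute_logprob": False,
    "random_seed": 1234,
}
resp = requests.put("http://localhost:5000/generate", json=payload)
print(resp.json()["sentences"])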
- -import json -import pickle -from typing import List, Tuple - -import numpy as np -import torch -import torch.distributed as dist - -from nemo.collections.nlp.modules.common.lm_utils import pad_batch -from nemo.collections.nlp.modules.common.megatron.retrieval_services.retrieval_service import ComboRetrievalService -from nemo.collections.nlp.modules.common.text_generation_strategy import TextGenerationStrategy - - -class RetroModelTextGenerationStrategy(TextGenerationStrategy): - def __init__(self, model, **args): - super().__init__(model) - self.forward_model = self.model.model - self.frequent_query = args['frequent_query'] - self.pad_token_for_retrieval = args['pad_tokens'] - self.store_retrieved = args['store_retrieved'] - self.store = dist.FileStore('/tmp/filestore_eval', -1) - self.store.set('neighbors', str(args['neighbors'])) - self.megatron_lm_compatible = args['megatron_lm_compatible'] - combo_cfg = args['combo_service'] - self.service = ComboRetrievalService( - tokenizer=self.model.tokenizer, service_ip=combo_cfg['service_ip'], service_port=combo_cfg['service_port'] - ) - self.retrieved = [] - self.retrieved_text = [] - self.chunk_size = self.model.cfg.chunk_size - - def update_neighbors(self, neighbors): - # dynamically change the number of neighbors during the query - self.store.set('neighbors', str(neighbors)) - - @property - def neighbors(self): - return int(self.store.get('neighbors')) - - def tokenize_batch(self, sentences, max_len, add_BOS): - """ - convert the sentences into lists of tokens, pad them to the same length, add bos tokens if it is needed - Args: - sentences (List[str]): list of input sentences in str format. - max_len (int): max number of tokens to generate. - add_BOS (bool): whether to add the BOS token at the beginning - Returns: - Tuple[torch.Tensor], the tokenized and padded torch tensor and the token context length tensor. - """ - tokenizer = self.model.tokenizer - if add_BOS: - context_tokens = [[tokenizer.bos_id] + tokenizer.text_to_ids(s) for s in sentences] - else: - context_tokens = [tokenizer.text_to_ids(s) for s in sentences] - if self.pad_token_for_retrieval: - padded = [] - for line in context_tokens: - if len(line) < self.chunk_size: - pad_len = self.chunk_size - len(line) - if self.megatron_lm_compatible: - # megatron lm use eos to pad - padded.append([tokenizer.eos_id] * pad_len + line) - else: - padded.append([tokenizer.pad_id] * pad_len + line) - else: - padded.append(line) - context_tokens = padded - context_tokens, context_lengths = pad_batch(context_tokens, tokenizer.eos_id, max_len) - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) - context_length_tensor = torch.cuda.LongTensor(context_lengths) - return context_tokens_tensor, context_length_tensor - - def tokenize_batch_with_context_and_completion(self, sentences, max_len, add_BOS): - """ - convert the sentences into lists of tokens, pad them to the same length, add bos tokens if it is needed - Args: - sentences (List[str]): list of input sentences in str format. - max_len (int): max number of tokens to generate. - add_BOS (bool): whether to add the BOS token at the beginning - Returns: - Tuple[torch.Tensor], the tokenized and padded torch tensor and the token context length tensor. 
- """ - tokenizer = self.model.tokenizer - if add_BOS: - context_tokens = [ - [[tokenizer.bos_id] + tokenizer.text_to_ids(s[0]), tokenizer.text_to_ids(s[1])] for s in sentences - ] - else: - context_tokens = [[tokenizer.text_to_ids(s[0]), tokenizer.text_to_ids(s[1])] for s in sentences] - if self.pad_token_for_retrieval: - padded = [] - for line in context_tokens: - if len(line[0]) < self.chunk_size: - pad_len = self.chunk_size - len(line[0]) - if self.megatron_lm_compatible: - # megatron lm use eos to pad - padded.append([tokenizer.eos_id] * pad_len + line[0] + line[1]) - else: - padded.append([tokenizer.pad_id] * pad_len + line[0] + line[1]) - else: - padded.append(line[0] + line[1]) - context_tokens = padded - context_tokens, context_lengths = pad_batch(context_tokens, tokenizer.eos_id, max_len) - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) - context_length_tensor = torch.cuda.LongTensor(context_lengths) - return context_tokens_tensor, context_length_tensor - - def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" - if maxlen > self.model.cfg.encoder_seq_length + 1: - maxlen = self.model.cfg.encoder_seq_length + 1 - return maxlen - - def _store_retrieved(self, tokens, neighbors): - tokenizer = self.model.tokenizer - for batch_id in range(len(tokens)): - item = {} - query_text = tokenizer.ids_to_text(tokens[batch_id]) - item['query'] = query_text - item['neighbors'] = [] - for context_id in range(len(neighbors[batch_id])): - neighbor_text = tokenizer.ids_to_text(neighbors[batch_id][context_id]) - item['neighbors'].append(neighbor_text) - self.retrieved_text.append(item) - - def init_batch(self, context_tokens: torch.Tensor, context_length: int): - self.retrieved = [] - self.retrieved_text = [] - """initialize the batch data before the inference steps.""" - # Move to GPU. - tokenizer = self.model.tokenizer - tokens = context_tokens.contiguous().cuda() - micro_batch_size, seq_length = tokens.size() - position_ids = torch.arange(seq_length, dtype=torch.long, device=tokens.device) - self.position_ids = position_ids.unsqueeze(0).repeat(micro_batch_size, 1) - if self.megatron_lm_compatible: - # all TRUE for megatron lm, there is no attention mask - self.attention_mask = torch.ones_like(tokens, dtype=torch.bool) - else: - self.attention_mask = tokens != tokenizer.pad_id - for i in range(0, context_length, 64): - if i > 0: - tokens = context_tokens[:, i - 64 : i] - chunks = self.service.get_knn(tokens, self.neighbors) - if self.store_retrieved: - self._store_retrieved(tokens, chunks) - self.retrieved.append(chunks) - - def prepare_batch_at_step( - self, tokens: torch.Tensor, maxlen: int, micro_batch_size: int, step: int, context_length: int - ) -> Tuple[List[torch.Tensor], List[int]]: - tokenizer = self.model.tokenizer - - if context_length % 64 == 0: - # added a new retrieval context - token_context = tokens[:, context_length - 64 : context_length] - chunks = self.service.get_knn(token_context, self.neighbors) - if self.store_retrieved: - self._store_retrieved(token_context, chunks) - self.retrieved.append(chunks) - elif self.frequent_query and len(self.retrieved) > 0: - token_context = tokens[:, context_length - 64 : context_length] - chunks = self.service.get_knn(token_context, self.neighbors) - if self.store_retrieved: - self._store_retrieved(token_context, chunks) - self.retrieved[-1] = chunks - - # types2use = None - if step == 0: - # Allocate memory for the entire context. 
- set_inference_key_value_memory = True - tokens2use = tokens[:, :context_length] - positions2use = self.position_ids[:, :context_length] - # not using type2use. uncomment it if it is used - # if type_ids is not None: - # types2use = type_ids[:, :context_length] - else: - # Set this to false so the memory is not reallocated. - set_inference_key_value_memory = False - tokens2use = tokens[:, context_length - 1].view(micro_batch_size, -1) - positions2use = self.position_ids[:, context_length - 1].view(micro_batch_size, -1) - # not using type2use. uncomment it if it is used - # if type_ids is not None: - # types2use = type_ids[:, context_length - 1].view(batch_size, -1) - retrieved = torch.tensor(np.array(self.retrieved), device=torch.cuda.current_device()) - if retrieved.numel() != 0: - retrieved = retrieved.transpose(0, 1).contiguous() - if self.megatron_lm_compatible: - # all TRUE for megatron lm, there is no attention mask - retrieved_mask = torch.ones_like(retrieved, dtype=torch.bool) - else: - retrieved_mask = retrieved != tokenizer.pad_id - if retrieved.numel() == 0: - # add empty retrieved - retrieved = ( - torch.tensor(self.service.get_knn(['a'], 0), device=torch.cuda.current_device()) - .unsqueeze(0) - .repeat(1, len(self.retrieved), 1, 1) - ) - retrieved_mask = retrieved != tokenizer.pad_id - # retrieved = torch.tensor([-1] * micro_batch_size) - # retrieved_mask = torch.tensor([-1] * micro_batch_size) - - """Prepare batch for each of the inference steps""" - # attention_mask_repeat = torch.concat([self.attention_mask for _ in range(micro_batch_size)]) - setkey_value_array = torch.tensor( - [set_inference_key_value_memory] * micro_batch_size, device=torch.cuda.current_device() - ) - len_array = torch.tensor([maxlen] * micro_batch_size, device=torch.cuda.current_device()) - if self.neighbors == 0: - # no retrieval, use 1 padding - neighbors_array = torch.tensor([1] * micro_batch_size, device=torch.cuda.current_device()) - else: - neighbors_array = torch.tensor([self.neighbors] * micro_batch_size, device=torch.cuda.current_device()) - - batch = [ - tokens2use, - self.attention_mask[:, :context_length], - retrieved, - retrieved_mask, - setkey_value_array, - len_array, - neighbors_array, - positions2use, - ] - tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] - return batch, tensor_shape - - -class RetroQAModelTextGenerationStrategy(RetroModelTextGenerationStrategy): - def tokenize_batch(self, questions, max_len, add_BOS): - """ - convert the sentences into lists of tokens, pad them to the same length, add bos tokens if it is needed - Args: - questions (List[str]): list of input questions in str format. - max_len (int): max number of tokens to generate. - add_BOS (bool): whether to add the BOS token at the beginning - Returns: - Tuple[torch.Tensor], the tokenized and padded torch tensor and the token context length tensor. 
- """ - tokenizer = self.model.tokenizer - all_lookups = self.service.get_knn(questions, 1 + self.neighbors) - # hack to add "source: " tag - prepend_ids = np.array(tokenizer.text_to_ids('source: ')) - all_lookups = np.pad(all_lookups, ((0, 0), (0, 0), (len(prepend_ids), 0))) - all_lookups[:, :, : len(prepend_ids)] = prepend_ids - all_lookups = all_lookups[:, :, : -len(prepend_ids)] - reuse_neighbors = all_lookups[:, 1:] - self.store.set('reuse_neighbors', pickle.dumps(reuse_neighbors)) - neighbor_tokens = [neighbors[0].tolist() for neighbors in all_lookups] - - # combine question and context - context_tokens = [ - n + tokenizer.text_to_ids('\nquestion: ' + q + ' \nanswer:') for n, q in zip(neighbor_tokens, questions) - ] - - if add_BOS: - context_tokens = [[tokenizer.bos_id] + s for s in context_tokens] - if self.pad_token_for_retrieval: - padded = [] - for line in context_tokens: - pad_len = (self.chunk_size - len(line) % self.chunk_size) % self.chunk_size - if self.megatron_lm_compatible: - padded.append([tokenizer.eos_id] * pad_len + line) - else: - padded.append([tokenizer.pad_id] * pad_len + line) - context_tokens = padded - context_tokens, context_lengths = pad_batch(context_tokens, tokenizer.eos_id, max_len) - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) - context_length_tensor = torch.cuda.LongTensor(context_lengths) - return context_tokens_tensor, context_length_tensor - - def init_batch(self, context_tokens: torch.Tensor, context_length: int): - self.retrieved = [] - self.retrieved_text = [] - self.reuse_neighbors = pickle.loads(self.store.get('reuse_neighbors')) - """initialize the batch data before the inference steps.""" - # Move to GPU. - tokenizer = self.model.tokenizer - tokens = context_tokens.contiguous().cuda() - micro_batch_size, seq_length = tokens.size() - position_ids = torch.arange(seq_length, dtype=torch.long, device=tokens.device) - self.position_ids = position_ids.unsqueeze(0).repeat(micro_batch_size, 1) - if self.megatron_lm_compatible: - # all TRUE for megatron lm, there is no attention mask - self.attention_mask = torch.ones_like(tokens, dtype=torch.bool) - else: - self.attention_mask = tokens != tokenizer.pad_id - for i in range(0, context_length, 64): - if i > 0: - tokens = context_tokens[:, i - 64 : i] - chunks = self.reuse_neighbors - if self.store_retrieved: - self._store_retrieved(tokens, chunks) - self.retrieved.append(chunks) - - def prepare_batch_at_step( - self, tokens: torch.Tensor, maxlen: int, micro_batch_size: int, step: int, context_length: int - ) -> Tuple[List[torch.Tensor], List[int]]: - tokenizer = self.model.tokenizer - - if context_length % 64 == 0: - # added a new retrieval context - token_context = tokens[:, context_length - 64 : context_length] - chunks = self.reuse_neighbors - if self.store_retrieved: - self._store_retrieved(token_context, chunks) - self.retrieved.append(chunks) - elif self.frequent_query and len(self.retrieved) > 0: - token_context = tokens[:, context_length - 64 : context_length] - chunks = self.reuse_neighbors - if self.store_retrieved: - self._store_retrieved(token_context, chunks) - self.retrieved[-1] = chunks - - # types2use = None - if step == 0: - # Allocate memory for the entire context. - set_inference_key_value_memory = True - tokens2use = tokens[:, :context_length] - positions2use = self.position_ids[:, :context_length] - # not using type2use. 
uncomment it if it is used - # if type_ids is not None: - # types2use = type_ids[:, :context_length] - else: - # Set this to false so the memory is not reallocated. - set_inference_key_value_memory = False - tokens2use = tokens[:, context_length - 1].view(micro_batch_size, -1) - positions2use = self.position_ids[:, context_length - 1].view(micro_batch_size, -1) - # not using type2use. uncomment it if it is used - # if type_ids is not None: - # types2use = type_ids[:, context_length - 1].view(batch_size, -1) - retrieved = torch.tensor(np.array(self.retrieved), device=torch.cuda.current_device()) - if retrieved.numel() != 0: - retrieved = retrieved.transpose(0, 1).contiguous() - if self.megatron_lm_compatible: - # all TRUE for megatron lm, there is no attention mask - retrieved_mask = torch.ones_like(retrieved, dtype=torch.bool) - else: - retrieved_mask = retrieved != tokenizer.pad_id - if retrieved.numel() == 0: - # add empty retrieved - retrieved = ( - torch.tensor(self.service.get_knn(['a'], 0), device=torch.cuda.current_device()) - .unsqueeze(0) - .repeat(1, len(self.retrieved), 1, 1) - ) - retrieved_mask = retrieved != tokenizer.pad_id - - """Prepare batch for each of the inference steps""" - # attention_mask_repeat = torch.concat([self.attention_mask for _ in range(micro_batch_size)]) - setkey_value_array = torch.tensor( - [set_inference_key_value_memory] * micro_batch_size, device=torch.cuda.current_device() - ) - len_array = torch.tensor([maxlen] * micro_batch_size, device=torch.cuda.current_device()) - if self.neighbors == 0: - # no retrieval, use 1 padding - neighbors_array = torch.tensor([1] * micro_batch_size, device=torch.cuda.current_device()) - else: - neighbors_array = torch.tensor([self.neighbors] * micro_batch_size, device=torch.cuda.current_device()) - - batch = [ - tokens2use, - self.attention_mask[:, :context_length], - retrieved, - retrieved_mask, - setkey_value_array, - len_array, - neighbors_array, - positions2use, - ] - tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] - return batch, tensor_shape - - def post_generation_process(self, output): - sentences = output['sentences'] - modified = [] - for sentence in sentences: - sentence = 'answer:' + sentence.split(' \nanswer:')[1] - modified.append(sentence) - output['sentences'] = modified - return output - - -class RetroFileQAModelTextGenerationStrategy(RetroQAModelTextGenerationStrategy): - def __init__(self, model, **args): - super().__init__(model, **args) - # load the DPR to memory - self.context_db = {} - with open('/dataset/FiD/test.jsonl_title', 'r') as f: - for line in f: - obj = json.loads(line) - self.context_db[obj['question']] = obj - - def tokenize_batch(self, questions, max_len, add_BOS): - """ - convert the sentences into lists of tokens, pad them to the same length, add bos tokens if it is needed - Args: - questions (List[str]): list of input questions in str format. - max_len (int): max number of tokens to generate. - add_BOS (bool): whether to add the BOS token at the beginning - Returns: - Tuple[torch.Tensor], the tokenized and padded torch tensor and the token context length tensor. 
- """ - - tokenizer = self.model.tokenizer - - # get context from memory - chunks = [] - first_context = [] - for question in questions: - hash_code = question - if hash_code not in self.context_db: - raise ValueError(f"wrong question is fed: {question}") - contexts = self.context_db[hash_code]['ctxs'] - for i, neighbor in enumerate(contexts[: self.neighbors + 1]): - text = "title: " + neighbor["title"] + ", source: " + neighbor["text"] - if i == 0: - first_context.append(text) - tokens = tokenizer.text_to_ids(text) - tokens = tokens[:128] - if len(tokens) < 128: - tokens = tokens + [tokenizer.eos_id] * (128 - len(tokens)) - chunks.append(tokens) - all_lookups = np.array(chunks).reshape(1, self.neighbors + 1, -1).astype(np.int64) - reuse_neighbors = all_lookups[:, 1:] - self.store.set('reuse_neighbors', pickle.dumps(reuse_neighbors)) - # combine question and context - context_tokens = [ - tokenizer.text_to_ids(n + '\nquestion: ' + q + ' \nanswer:') for n, q in zip(first_context, questions) - ] - - if add_BOS: - context_tokens = [[tokenizer.bos_id] + s for s in context_tokens] - if self.pad_token_for_retrieval: - padded = [] - for line in context_tokens: - pad_len = (self.chunk_size - len(line) % self.chunk_size) % self.chunk_size - padded.append([tokenizer.eos_id] * pad_len + line) - context_tokens = padded - context_tokens, context_lengths = pad_batch(context_tokens, tokenizer.eos_id, max_len) - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) - context_length_tensor = torch.cuda.LongTensor(context_lengths) - return context_tokens_tensor, context_length_tensor From 80ac3f177c82ca932fcaa18caa5070f763556377 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 15 Oct 2025 06:24:54 -0700 Subject: [PATCH 20/21] remove .py files Signed-off-by: dimapihtar --- .../common/text_generation_strategy.py | 12 +- nemo/collections/nlp/modules/__init__.py | 2 - .../nlp/modules/common/__init__.py | 6 - .../nlp/modules/common/chat_css.py | 84 --- .../nlp/modules/common/chatbot_component.py | 193 ------- .../nlp/modules/common/decoder_module.py | 59 --- .../nlp/modules/common/encoder_module.py | 40 -- .../nlp/modules/common/gpt_module.py | 94 ---- .../nlp/modules/common/lm_utils.py | 251 --------- .../nlp/modules/common/megatron_web_server.py | 498 ------------------ .../nlp/modules/common/prompt_table.py | 32 -- .../tts/models/language_modeling/nlp_model.py | 88 +++- 12 files changed, 98 insertions(+), 1261 deletions(-) delete mode 100644 nemo/collections/nlp/modules/common/chat_css.py delete mode 100644 nemo/collections/nlp/modules/common/chatbot_component.py delete mode 100644 nemo/collections/nlp/modules/common/decoder_module.py delete mode 100644 nemo/collections/nlp/modules/common/encoder_module.py delete mode 100644 nemo/collections/nlp/modules/common/gpt_module.py delete mode 100644 nemo/collections/nlp/modules/common/lm_utils.py delete mode 100644 nemo/collections/nlp/modules/common/megatron_web_server.py delete mode 100644 nemo/collections/nlp/modules/common/prompt_table.py diff --git a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py index 0ea1d8137a52..e54e407f80d0 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/text_generation_strategy.py @@ -23,7 +23,6 @@ from transformers import CLIPImageProcessor from nemo.collections.common.tokenizers.chat_template_mixin 
import explode_chat_template_input, is_chat_input -from nemo.collections.nlp.modules.common.lm_utils import pad_batch from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.utils import logging @@ -51,6 +50,17 @@ END_OF_SEQ = '<|endoftext|>' +def pad_batch(batch, pad_id, max_len): + context_lengths = [] + max_context_length = max([len(tokens) for tokens in batch]) + for tokens in batch: + context_length = len(tokens) + if context_length < max_context_length + max_len: + tokens.extend([pad_id] * (max_context_length + max_len - context_length)) + context_lengths.append(context_length) + return batch, context_lengths + + class TextGenerationStrategy: """ Base class for TextGeneration Strategy diff --git a/nemo/collections/nlp/modules/__init__.py b/nemo/collections/nlp/modules/__init__.py index d3b35b8a794b..071d66f23754 100644 --- a/nemo/collections/nlp/modules/__init__.py +++ b/nemo/collections/nlp/modules/__init__.py @@ -14,5 +14,3 @@ from nemo.collections.nlp.modules.common import BertModule # noqa: F401 -from nemo.collections.nlp.modules.common import get_lm_model # noqa: F401 -from nemo.collections.nlp.modules.common import get_pretrained_lm_models_list # noqa: F401 diff --git a/nemo/collections/nlp/modules/common/__init__.py b/nemo/collections/nlp/modules/common/__init__.py index cf7bf9de9043..a252c63c6561 100644 --- a/nemo/collections/nlp/modules/common/__init__.py +++ b/nemo/collections/nlp/modules/common/__init__.py @@ -18,9 +18,3 @@ # pylint: skip-file from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.collections.nlp.modules.common.lm_utils import get_lm_model, get_pretrained_lm_models_list -from nemo.collections.nlp.modules.common.prompt_table import ( - VirtualPromptPlaceholderToken, - VirtualPromptSource, - VirtualPromptStyle, -) diff --git a/nemo/collections/nlp/modules/common/chat_css.py b/nemo/collections/nlp/modules/common/chat_css.py deleted file mode 100644 index e6b9a79c4bfe..000000000000 --- a/nemo/collections/nlp/modules/common/chat_css.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
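A quick usage sketch for the pad_batch helper inlined above; the token ids and pad_id are made-up values for illustration:

# Two sequences of unequal length; the ids and pad_id are illustrative.
batch = [[101, 7592, 2088], [101, 7592]]
padded, context_lengths = pad_batch(batch, pad_id=0, max_len=4)

# Each row is padded in place to max(original lengths) + max_len = 3 + 4 = 7 tokens,
# while context_lengths keeps the original, unpadded lengths.
assert [len(row) for row in padded] == [7, 7]
assert context_lengths == [3, 2]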
- -CSS = """ -#chatbot .hll { background-color: #ffffcc } -#chatbot .c { color: #408080; font-style: italic } -#chatbot .err { border: 1px solid #FF0000 } -#chatbot .k { color: #008000; font-weight: bold } -#chatbot .o { color: #666666 } -#chatbot .ch { color: #408080; font-style: italic } -#chatbot .cm { color: #408080; font-style: italic } -#chatbot .cp { color: #BC7A00 } -#chatbot .cpf { color: #408080; font-style: italic } -#chatbot .c1 { color: #408080; font-style: italic } -#chatbot .cs { color: #408080; font-style: italic } -#chatbot .gd { color: #A00000 } -#chatbot .ge { font-style: italic } -#chatbot .gr { color: #FF0000 } -#chatbot .gh { color: #000080; font-weight: bold } -#chatbot .gi { color: #00A000 } -#chatbot .go { color: #888888 } -#chatbot .gp { color: #000080; font-weight: bold } -#chatbot .gs { font-weight: bold } -#chatbot .gu { color: #800080; font-weight: bold } -#chatbot .gt { color: #0044DD } -#chatbot .kc { color: #008000; font-weight: bold } -#chatbot .kd { color: #008000; font-weight: bold } -#chatbot .kn { color: #008000; font-weight: bold } -#chatbot .kp { color: #008000 } -#chatbot .kr { color: #008000; font-weight: bold } -#chatbot .kt { color: #B00040 } -#chatbot .m { color: #666666 } -#chatbot .s { color: #BA2121 } -#chatbot .na { color: #7D9029 } -#chatbot .nb { color: #008000 } -#chatbot .nc { color: #0000FF; font-weight: bold } -#chatbot .no { color: #880000 } -#chatbot .nd { color: #AA22FF } -#chatbot .ni { color: #999999; font-weight: bold } -#chatbot .ne { color: #D2413A; font-weight: bold } -#chatbot .nf { color: #0000FF } -#chatbot .nl { color: #A0A000 } -#chatbot .nn { color: #0000FF; font-weight: bold } -#chatbot .nt { color: #008000; font-weight: bold } -#chatbot .nv { color: #19177C } -#chatbot .ow { color: #AA22FF; font-weight: bold } -#chatbot .w { color: #bbbbbb } -#chatbot .mb { color: #666666 } -#chatbot .mf { color: #666666 } -#chatbot .mh { color: #666666 } -#chatbot .mi { color: #666666 } -#chatbot .mo { color: #666666 } -#chatbot .sa { color: #BA2121 } -#chatbot .sb { color: #BA2121 } -#chatbot .sc { color: #BA2121 } -#chatbot .dl { color: #BA2121 } -#chatbot .sd { color: #BA2121; font-style: italic } -#chatbot .s2 { color: #BA2121 } -#chatbot .se { color: #BB6622; font-weight: bold } -#chatbot .sh { color: #BA2121 } -#chatbot .si { color: #BB6688; font-weight: bold } -#chatbot .sx { color: #008000 } -#chatbot .sr { color: #BB6688 } -#chatbot .s1 { color: #BA2121 } -#chatbot .ss { color: #19177C } -#chatbot .bp { color: #008000 } -#chatbot .fm { color: #0000FF } -#chatbot .vc { color: #19177C } -#chatbot .vg { color: #19177C } -#chatbot .vi { color: #19177C } -#chatbot .vm { color: #19177C } -#chatbot .il { color: #666666 } -""" diff --git a/nemo/collections/nlp/modules/common/chatbot_component.py b/nemo/collections/nlp/modules/common/chatbot_component.py deleted file mode 100644 index afc86d9defec..000000000000 --- a/nemo/collections/nlp/modules/common/chatbot_component.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" - -Adopted from https://github.com/gradio-app/gradio/blob/main/gradio/components.py -Fix a markdown render problem. -""" -from __future__ import annotations - -import warnings - -from markdown2 import Markdown - -try: - from typing import Any, Callable, Dict, List, Literal, Tuple - - from gradio.components import ( - Changeable, - Component, - Enum, - EventListenerMethod, - IOComponent, - JSONSerializable, - Selectable, - document, - processing_utils, - ) - - GRADIO_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - GRADIO_AVAILABLE = False - - -class _Keywords(Enum): - NO_VALUE = "NO_VALUE" # Used as a sentinel to determine if nothing is provided as a argument for `value` in `Component.update()` - FINISHED_ITERATING = ( - "FINISHED_ITERATING" # Used to skip processing of a component's value (needed for generators + state) - ) - - -@document("style") -class Chatbot(Changeable, Selectable, IOComponent, JSONSerializable): - """ - Displays a chatbot output showing both user submitted messages and responses. Supports a subset of Markdown including bold, italics, code, and images. - Preprocessing: this component does *not* accept input. - Postprocessing: expects function to return a {List[Tuple[str | None | Tuple, str | None | Tuple]]}, a list of tuples with user message and response messages. Messages should be strings, tuples, or Nones. If the message is a string, it can include Markdown. If it is a tuple, it should consist of (string filepath to image/video/audio, [optional string alt text]). Messages that are `None` are not displayed. - - Demos: chatbot_simple, chatbot_multimodal - """ - - def __init__( - self, - value: List[Tuple[str | None, str | None]] | Callable | None = None, - color_map: Dict[str, str] | None = None, # Parameter moved to Chatbot.style() - *, - label: str | None = None, - every: float | None = None, - show_label: bool = True, - visible: bool = True, - elem_id: str | None = None, - elem_classes: List[str] | str | None = None, - **kwargs, - ): - """ - Parameters: - value: Default value to show in chatbot. If callable, the function will be called whenever the app loads to set the initial value of the component. - label: component name in interface. - every: If `value` is a callable, run the function 'every' number of seconds while the client connection is open. Has no effect otherwise. Queue must be enabled. The event can be accessed (e.g. to cancel it) via this component's .load_event attribute. - show_label: if True, will display label. - visible: If False, component will be hidden. - elem_id: An optional string that is assigned as the id of this component in the HTML DOM. Can be used for targeting CSS styles. - elem_classes: An optional list of strings that are assigned as the classes of this component in the HTML DOM. Can be used for targeting CSS styles. - """ - if color_map is not None: - warnings.warn("The 'color_map' parameter has been deprecated.",) - # self.md = utils.get_markdown_parser() - self.md = Markdown(extras=["fenced-code-blocks", "tables", "break-on-newline"]) - self.select: EventListenerMethod - """ - Event listener for when the user selects message from Chatbot. - Uses event data gradio.SelectData to carry `value` referring to text of selected message, and `index` tuple to refer to [message, participant] index. - See EventData documentation on how to use this event data. 
- """ - - IOComponent.__init__( - self, - label=label, - every=every, - show_label=show_label, - visible=visible, - elem_id=elem_id, - elem_classes=elem_classes, - value=value, - **kwargs, - ) - - def get_config(self): - return { - "value": self.value, - "selectable": self.selectable, - **IOComponent.get_config(self), - } - - @staticmethod - def update( - value: Any | Literal[_Keywords.NO_VALUE] | None = _Keywords.NO_VALUE, - label: str | None = None, - show_label: bool | None = None, - visible: bool | None = None, - ): - updated_config = { - "label": label, - "show_label": show_label, - "visible": visible, - "value": value, - "__type__": "update", - } - return updated_config - - def _process_chat_messages(self, chat_message: str | Tuple | List | Dict | None) -> str | Dict | None: - if chat_message is None: - return None - elif isinstance(chat_message, (tuple, list)): - mime_type = processing_utils.get_mimetype(chat_message[0]) - return { - "name": chat_message[0], - "mime_type": mime_type, - "alt_text": chat_message[1] if len(chat_message) > 1 else None, - "data": None, # These last two fields are filled in by the frontend - "is_file": True, - } - elif isinstance(chat_message, dict): # This happens for previously processed messages - return chat_message - elif isinstance(chat_message, str): - # return self.md.render(chat_message) - return str(self.md.convert(chat_message)) - else: - raise ValueError(f"Invalid message for Chatbot component: {chat_message}") - - def postprocess( - self, y: List[Tuple[str | Tuple | List | Dict | None, str | Tuple | List | Dict | None]], - ) -> List[Tuple[str | Dict | None, str | Dict | None]]: - """ - Parameters: - y: List of tuples representing the message and response pairs. Each message and response should be a string, which may be in Markdown format. It can also be a tuple whose first element is a string filepath or URL to an image/video/audio, and second (optional) element is the alt text, in which case the media file is displayed. It can also be None, in which case that message is not displayed. - Returns: - List of tuples representing the message and response. Each message and response will be a string of HTML, or a dictionary with media information. - """ - if y is None: - return [] - processed_messages = [] - for message_pair in y: - assert isinstance( - message_pair, (tuple, list) - ), f"Expected a list of lists or list of tuples. Received: {message_pair}" - assert ( - len(message_pair) == 2 - ), f"Expected a list of lists of length 2 or list of tuples of length 2. Received: {message_pair}" - processed_messages.append( - ( - # '
<pre>' +
-                    #                    message_pair[0] + "</pre>
", - message_pair[0], - self._process_chat_messages(message_pair[1]), - ) - ) - return processed_messages - - def style(self, height: int | None = None, **kwargs): - """ - This method can be used to change the appearance of the Chatbot component. - """ - if height is not None: - self._style["height"] = height - if kwargs.get("color_map") is not None: - warnings.warn("The 'color_map' parameter has been deprecated.") - - Component.style( - self, **kwargs, - ) - return self diff --git a/nemo/collections/nlp/modules/common/decoder_module.py b/nemo/collections/nlp/modules/common/decoder_module.py deleted file mode 100644 index d1cb8ac9b1f0..000000000000 --- a/nemo/collections/nlp/modules/common/decoder_module.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC -from typing import Any, Dict, Optional - -from nemo.core.classes import NeuralModule -from nemo.core.neural_types import ChannelType, EncodedRepresentation, MaskType, NeuralType - -__all__ = ['DecoderModule'] - - -class DecoderModule(NeuralModule, ABC): - """ Base class for decoder neural module to be used in NLP models. """ - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "decoder_mask": NeuralType(('B', 'T'), MaskType(), optional=True), - "encoder_embeddings": NeuralType(('B', 'T', 'D'), ChannelType(), optional=True), - "encoder_mask": NeuralType(('B', 'T'), MaskType(), optional=True), - "decoder_mems": NeuralType(('B', 'D', 'T', 'D'), EncodedRepresentation(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"last_hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} - - @property - def hidden_size(self) -> Optional[int]: - raise NotImplementedError - - @property - def vocab_size(self) -> Optional[int]: - raise NotImplementedError - - @property - def embedding(self) -> Optional[Any]: - raise NotImplementedError - - @property - def decoder(self) -> Optional[Any]: - raise NotImplementedError - - @property - def max_sequence_length(self) -> Optional[int]: - raise NotImplementedError diff --git a/nemo/collections/nlp/modules/common/encoder_module.py b/nemo/collections/nlp/modules/common/encoder_module.py deleted file mode 100644 index bd3912e0e693..000000000000 --- a/nemo/collections/nlp/modules/common/encoder_module.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC -from typing import Dict, Optional - -from nemo.core.classes import NeuralModule -from nemo.core.neural_types import ChannelType, MaskType, NeuralType - -__all__ = ['EncoderModule'] - - -class EncoderModule(NeuralModule, ABC): - """ Base class for encoder neural module to be used in NLP models. """ - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "encoder_mask": NeuralType(('B', 'T'), MaskType()), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"last_hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} - - @property - def hidden_size(self) -> Optional[int]: - raise NotImplementedError diff --git a/nemo/collections/nlp/modules/common/gpt_module.py b/nemo/collections/nlp/modules/common/gpt_module.py deleted file mode 100644 index 2a145b0fc607..000000000000 --- a/nemo/collections/nlp/modules/common/gpt_module.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -from typing import Dict, Optional - -import torch - -from nemo.core.classes import NeuralModule -from nemo.core.classes.exportable import Exportable -from nemo.core.neural_types import ChannelType, FloatType, IntType, MaskType, NeuralType, StringType, VoidType -from nemo.utils import logging - -__all__ = ['GPTModule'] - - -class GPTModule(NeuralModule, Exportable): - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "token_type_ids": NeuralType(('B', 'T'), ChannelType(), optional=True), - "attention_mask": NeuralType(('B', 'T'), MaskType(), optional=True), - "labels": NeuralType(('B', 'T'), ChannelType(), optional=True), - 'past_key_values': [[NeuralType(None, StringType(), optional=True)]], - 'use_cache': NeuralType(None, VoidType(), optional=True), - 'position_ids': NeuralType(('B', 'T'), ChannelType(), optional=True), - "return_dict": NeuralType(None, StringType(), optional=True), - "output_attentions": NeuralType(None, StringType(), optional=True), - "output_hidden_states": NeuralType(None, StringType(), optional=True), - "max_length": NeuralType(None, IntType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - 'loss': NeuralType(None, FloatType(), optional=True), - 'hidden_states': NeuralType(('B', 'T', 'D'), ChannelType()), - } - - def restore_weights(self, restore_path: str): - """Restores module/model's weights""" - logging.info(f"Restoring weights from {restore_path}") - - if not os.path.exists(restore_path): - logging.warning(f'Path {restore_path} not found') - return - - pretrained_dict = torch.load(restore_path) - - # backward compatibility with NeMo0.11 - if "state_dict" in pretrained_dict.keys(): - 
pretrained_dict = pretrained_dict["state_dict"] - - # remove prefix from pretrained dict - m = re.match(r"^gpt.*?\.", list(pretrained_dict.keys())[0]) - if m: - prefix = m.group(0) - pretrained_dict = {k[len(prefix) :]: v for k, v in pretrained_dict.items()} - model_dict = self.state_dict() - pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} - - # starting with transformers 3.1.0, embeddings.position_ids is added to the model's state dict and could be - # missing in checkpoints trained with older transformers version - if 'embeddings.position_ids' in model_dict and 'embeddings.position_ids' not in pretrained_dict: - pretrained_dict['embeddings.position_ids'] = model_dict['embeddings.position_ids'] - - model_dict.update(pretrained_dict) - self.load_state_dict(model_dict) - logging.info(f"Weights for {type(self).__name__} restored from {restore_path}") - - def input_example(self): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - sample = next(self.parameters()) - input_ids = torch.randint(low=0, high=2048, size=(2, 16), device=sample.device) - token_type_ids = torch.randint(low=0, high=1, size=(2, 16), device=sample.device) - attention_mask = torch.randint(low=0, high=1, size=(2, 16), device=sample.device) - return tuple([input_ids, token_type_ids, attention_mask]) diff --git a/nemo/collections/nlp/modules/common/lm_utils.py b/nemo/collections/nlp/modules/common/lm_utils.py deleted file mode 100644 index 5e6d8c14d2f9..000000000000 --- a/nemo/collections/nlp/modules/common/lm_utils.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
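To make the prefix handling in the removed GPTModule.restore_weights() concrete, a small stand-alone sketch; the checkpoint key names are hypothetical:

import re

# Hypothetical checkpoint keys saved with a leading "gpt2." prefix.
pretrained_dict = {
    "gpt2.embeddings.word_embeddings.weight": None,
    "gpt2.encoder.layer.0.attention.weight": None,
}

# Same logic as restore_weights(): detect the "gpt*." prefix on the first key
# and strip it from every key so the dict lines up with this module's state_dict().
m = re.match(r"^gpt.*?\.", list(pretrained_dict.keys())[0])
if m:
    prefix = m.group(0)
    pretrained_dict = {k[len(prefix):]: v for k, v in pretrained_dict.items()}

assert "embeddings.word_embeddings.weight" in pretrained_dict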
- -# pylint: skip-file -# flake8: noqa - -import os -from typing import List, Optional, Union - -from attr import asdict -from lightning.pytorch import Trainer -from omegaconf import DictConfig - -from nemo.collections.nlp.modules.common.bert_module import BertModule -from nemo.collections.nlp.modules.common.decoder_module import DecoderModule -from nemo.collections.nlp.modules.common.encoder_module import EncoderModule -from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import ( - get_huggingface_lm_model, - get_huggingface_pretrained_lm_models_list, -) -from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_pretrained_bert_models -from nemo.collections.nlp.modules.common.transformer.transformer import NeMoTransformerConfig -from nemo.collections.nlp.modules.common.transformer.transformer_utils import ( - get_huggingface_transformer, - get_nemo_transformer, -) -from nemo.utils import AppState, logging - -__all__ = ['get_pretrained_lm_models_list', 'get_lm_model', 'pad_batch'] - - -def pad_batch(batch, pad_id, max_len): - context_lengths = [] - max_context_length = max([len(tokens) for tokens in batch]) - for tokens in batch: - context_length = len(tokens) - if context_length < max_context_length + max_len: - tokens.extend([pad_id] * (max_context_length + max_len - context_length)) - context_lengths.append(context_length) - return batch, context_lengths - - -def get_pretrained_lm_models_list(include_external: bool = False) -> List[str]: - """ - Returns the list of supported pretrained model names - - Args: - include_external if true includes all HuggingFace model names, not only those supported language models in NeMo. - - """ - return get_huggingface_pretrained_lm_models_list(include_external=include_external) - - -def get_lm_model( - config_dict: Optional[dict] = None, - config_file: Optional[str] = None, - vocab_file: Optional[str] = None, - trainer: Optional[Trainer] = None, - cfg: DictConfig = None, -) -> BertModule: - """ - Helper function to instantiate a language model encoder, either from scratch or a pretrained model. - If only pretrained_model_name are passed, a pretrained model is returned. - If a configuration is passed, whether as a file or dictionary, the model is initialized with random weights. - - Args: - config_dict: path to the model configuration dictionary - config_file: path to the model configuration file - vocab_file: path to vocab_file to be used with Megatron-LM - trainer: an instance of a PyTorch Lightning trainer - cfg: a model configuration - Returns: - Pretrained BertModule - """ - - # check valid model type - if cfg.language_model.get('pretrained_model_name'): - if ( - not cfg.language_model.pretrained_model_name - or cfg.language_model.pretrained_model_name not in get_pretrained_lm_models_list(include_external=False) - ): - logging.warning( - f'{cfg.language_model.pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), ' - f'will be using AutoModel from HuggingFace.' - ) - - # warning when user passes both configuration dict and file - if config_dict and config_file: - logging.warning( - f"Both config_dict and config_file were found, defaulting to use config_file: {config_file} will be used." 
- ) - - pretrain_model_name = '' - if cfg.get('language_model') and cfg.language_model.get('pretrained_model_name', ''): - pretrain_model_name = cfg.language_model.get('pretrained_model_name', '') - - from nemo.collections.nlp.modules.common.megatron.megatron_utils import list_available_models - - def get_megatron_pretrained_bert_models() -> List[str]: - - all_pretrained_megatron_bert_models = [model.pretrained_model_name for model in list_available_models()] - return all_pretrained_megatron_bert_models - - all_pretrained_megatron_bert_models = get_megatron_pretrained_bert_models() - if ( - cfg.tokenizer is not None - and cfg.tokenizer.get("tokenizer_name", "") is not None - and "megatron" in cfg.tokenizer.get("tokenizer_name", "") - ) or pretrain_model_name in all_pretrained_megatron_bert_models: - import torch - - try: - from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel - except (ImportError, ModuleNotFoundError): - from abc import ABC - - MegatronBertModel = ABC - - class Identity(torch.nn.Module): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, x, *args): - return x - - if cfg.language_model.get("lm_checkpoint"): - model = MegatronBertModel.restore_from(restore_path=cfg.language_model.lm_checkpoint, trainer=trainer) - else: - model = MegatronBertModel.from_pretrained(cfg.language_model.get('pretrained_model_name'), trainer=trainer) - # remove the headers that are only revelant for pretraining - model.model.lm_head = Identity() - model.model.binary_head = Identity() - model.model.language_model.pooler = Identity() - - else: - model = get_huggingface_lm_model( - config_dict=config_dict, - config_file=config_file, - pretrained_model_name=cfg.language_model.pretrained_model_name, - ) - - if cfg.language_model.get("lm_checkpoint"): - app_state = AppState() - if not app_state.is_model_being_restored and not os.path.exists(cfg.language_model.lm_checkpoint): - raise ValueError(f'{cfg.language_model.lm_checkpoint} not found') - model.restore_weights(restore_path=cfg.language_model.lm_checkpoint) - - return model - - -# @dataclass -# class TransformerConfig: -# library: str = 'nemo' -# model_name: Optional[str] = None -# pretrained: bool = False -# config_dict: Optional[dict] = None -# checkpoint_file: Optional[str] = None -# encoder: bool = True - - -def get_transformer( - library: str = 'nemo', - model_name: Optional[str] = None, - pretrained: bool = False, - config_dict: Optional[dict] = None, - checkpoint_file: Optional[str] = None, - encoder: bool = True, - pre_ln_final_layer_norm: bool = True, - padding_idx: int = 0, -) -> Union[EncoderModule, DecoderModule]: - """Gets Transformer based model to be used as an Encoder or Decoder in NeMo NLP. - First choose the library to get the transformer from. This can be huggingface, - megatron, or nemo. Use the model_name arg to get a named model architecture - and use the pretrained arg to get the named model architecture with pretrained weights. - - If model_name is None, then we can pass in a custom configuration via the config_dict. - For example, to instantiate a HuggingFace BERT model with custom configuration we would do: - encoder = get_transformer(library='huggingface', - config_dict={ - '_target_': 'transformers.BertConfig', - 'hidden_size': 1536 - }) - - - Args: - library (str, optional): Can be 'nemo', 'huggingface', or 'megatron'. Defaults to 'nemo'. - model_name (Optional[str], optional): Named model architecture from the chosen library. Defaults to None. 
- pretrained (bool, optional): Use True to get pretrained weights. - False will use the same architecture but with randomly initialized weights. - Defaults to False. - config_dict (Optional[dict], optional): Use for custom configuration of transformer. Defaults to None. - checkpoint_file (Optional[str], optional): Provide weights for the transformer from a local checkpoint. Defaults to None. - encoder (bool, optional): True returns an EncoderModule, False returns a DecoderModule. Defaults to True. - - Returns: - Union[EncoderModule, DecoderModule]: Ensures that Encoder/Decoder will work in EncDecNLPModel - """ - - model = None - - if library == 'nemo': - if isinstance(config_dict, NeMoTransformerConfig): - config_dict = asdict(config_dict) - model = get_nemo_transformer( - model_name=model_name, - pretrained=pretrained, - config_dict=config_dict, - encoder=encoder, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - padding_idx=padding_idx, - ) - - if checkpoint_file is not None: - if os.path.isfile(checkpoint_file): - raise ValueError(f'Loading transformer weights from checkpoint file has not been implemented yet.') - - elif library == 'huggingface': - model = get_huggingface_transformer( - model_name=model_name, pretrained=pretrained, config_dict=config_dict, encoder=encoder - ) - - elif library == 'megatron': - raise ValueError( - f'megatron-lm bert support has been deprecated in NeMo 1.5+. Please use NeMo 1.4 for support.' - ) - # TODO: enable megatron bert in nemo - # model = get_megatron_transformer( - # model_name=model_name, - # pretrained=pretrained, - # config_dict=config_dict, - # encoder=encoder, - # checkpoint_file=checkpoint_file, - # ) - - else: - raise ValueError("Libary must be 'nemo', 'huggingface' or 'megatron'") - - return model diff --git a/nemo/collections/nlp/modules/common/megatron_web_server.py b/nemo/collections/nlp/modules/common/megatron_web_server.py deleted file mode 100644 index 91fb12af102c..000000000000 --- a/nemo/collections/nlp/modules/common/megatron_web_server.py +++ /dev/null @@ -1,498 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio - -try: - import gradio as gr - - GRADIO_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - GRADIO_AVAILABLE = False - -from nemo.collections.nlp.modules.common.chat_css import CSS -from nemo.collections.nlp.modules.common.megatron.retrieval_services.util import ( - convert_retrieved_to_md, - request_data, - text_generation, -) - -__all__ = ['RetroDemoWebApp', 'get_demo'] - -TURN_TOKEN = '' - -PROMPT_PRESETS = { - "DIALOGUE": { - "SYSTEM_TURN_TOKEN": '', - "USER_TURN_TOKEN": '', - "BOT_TURN_TOKEN": '', - "END_OF_NAME": '', - "END_OF_TURN": '\n', - }, - "DIALOGUE2": { - "SYSTEM_TURN_TOKEN": 'System\n', - "USER_TURN_TOKEN": '', - "BOT_TURN_TOKEN": '', - "END_OF_NAME": '\n', - "END_OF_TURN": '\n', - }, -} - - -PRESETS = { - "K1-Greedy": {"temperature": 1.0, "top_p": 0.9, "top_k": 1, "repetition_penalty": 1.0,}, - "K50": {"temperature": 0.75, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, - "K50-Creative": {"temperature": 0.85, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, - "K50-Precise": {"temperature": 0.1, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, - "K50-Original": {"temperature": 0.9, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, - "Nucleus9": {"temperature": 0.8, "top_p": 0.9, "top_k": 10000, "repetition_penalty": 1.0,}, - "Custom": {"temperature": 0.75, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, -} - - -def check_gradio_import(): - if not GRADIO_AVAILABLE: - msg = ( - f"could not find the gradio library.\n" - f"****************************************************************\n" - f"To install it, please follow the steps below:\n" - f"pip install gradio==3.34.0\n" - ) - raise ImportError(msg) - - -def create_gen_function(port=5555, chat=False): - def get_generation(prompt, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, end_strings): - data = { - "sentences": [prompt], - "tokens_to_generate": int(token_to_gen), - "temperature": temp, - "add_BOS": add_BOS, - "top_k": top_k, - "top_p": top_p, - "greedy": greedy, - "all_probs": False, - "repetition_penalty": repetition, - "min_tokens_to_generate": int(min_tokens), - "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0], - } - response = text_generation(data, port=port) - sentences = response['sentences'] - bot_message = sentences[0] - if bot_message.find(' token - prompt = prompt.replace('', '').replace('', '').replace('', '') - bot_message = bot_message[len(prompt) :] - return bot_message - - return get_generation - - -def get_demo(share, username, password, server_port=5555, web_port=9889, loop=None): - check_gradio_import() - asyncio.set_event_loop(loop) - with gr.Blocks(css=CSS) as demo: - with gr.Row(): - with gr.Column(scale=2, width=200): - # store the mutliple turn conversation - token_to_gen = gr.Number(label='Number of Tokens to generate', value=300, type=int) - min_token_to_gen = gr.Number(label='Min number of Tokens to generate', value=1, type=int) - seed = gr.Number(label='Random seed', value=0, type=int) - end_strings = gr.Textbox(label="End strings (comma separated)", value=",", lines=1,) - add_BOS = gr.Checkbox(label="Add BOS token", value=False) - sampling_method = gr.Dropdown( - list(PRESETS.keys()), label='Sampling Presets', default='K50', value='K50' - ) - temperature = gr.Slider(minimum=0.0, maximum=5.0, value=0.75, label='Temperature', step=0.1) - top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.02, value=0.95, label='Top P') - top_k = gr.Slider(minimum=0, maximum=1024, step=2, 
value=50, label='Top K') - - repetition_penality = gr.Slider( - minimum=1.0, maximum=5.0, step=0.02, value=1.0, label='Repetition penalty' - ) - - def set_sampling(x): - return list(PRESETS[x].values()) - - sampling_method.change( - set_sampling, inputs=[sampling_method], outputs=[temperature, top_p, top_k, repetition_penality] - ) - - with gr.Column(scale=1, min_width=900): - text = gr.Textbox(label="Playground", value="", lines=60, placeholder="Type something here...",) - submit_btn = gr.Button("Generate") - clear = gr.Button("Clear") - - def on_submit( - prompt_text, - token_to_gen, - temperature, - top_p, - top_k, - repetition_penality, - seed, - end_strings, - add_BOS, - min_token_to_gen, - ): - - output = create_gen_function(server_port)( - prompt_text, - False, - add_BOS, - token_to_gen, - min_token_to_gen, - temperature, - top_p, - top_k, - repetition_penality, - end_strings, - ) - print(output) - print('-------------------') - return prompt_text + output - - def clear_fun(): - return '' - - submit_btn.click( - on_submit, - [ - text, - token_to_gen, - temperature, - top_p, - top_k, - repetition_penality, - seed, - end_strings, - add_BOS, - min_token_to_gen, - ], - [text], - queue=False, - ) - clear.click(clear_fun, None, text, queue=False) - demo.queue(concurrency_count=16).launch( - share=share, server_port=web_port, server_name='0.0.0.0', auth=(username, password) - ) - - -def get_chatbot_demo( - share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None, attributes=None, -): - check_gradio_import() - from nemo.collections.nlp.modules.common.chatbot_component import Chatbot - - asyncio.set_event_loop(loop) - with gr.Blocks(css=CSS) as demo: - with gr.Row(): - with gr.Column(scale=2, width=200): - # store the mutliple turn conversation - session_state = gr.State(value=[]) - token_to_gen = gr.Number(label='Number of Tokens to generate', value=300, type=int) - seed = gr.Number(label='Random seed', value=0, type=int) - prompt_presets = gr.Dropdown( - list(PROMPT_PRESETS.keys()), label='Template Presets', default='DIALOGUE2', value='DIALOGUE2' - ) - sampling_method = gr.Dropdown( - list(PRESETS.keys()), label='Sampling Presets', default='K50', value='K50' - ) - with gr.Accordion("Sampling Parameters", open=False): - temperature = gr.Slider( - minimum=0.0, maximum=5.0, value=0.75, label='Temperature', step=0.1, interactive=False - ) - top_p = gr.Slider( - minimum=0.0, maximum=1.0, step=0.02, value=0.95, label='Top P', interactive=False - ) - top_k = gr.Slider(minimum=0, maximum=1024, step=2, value=50, label='Top K', interactive=False) - repetition_penality = gr.Slider( - minimum=1.0, maximum=5.0, step=0.02, value=1.0, label='Repetition penalty', interactive=False - ) - - with gr.Accordion("Value Parameters", open=True, visible=value): - keys = [k.key for k in attributes] - # keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate'] - widgets = [] - for item in attributes: - if item.type == 'int': - slider = gr.Slider( - minimum=item.min, maximum=item.max, step=1, value=item.default, label=item.name - ) - widgets.append(slider) - elif item.type == 'list': - dropdown = gr.Dropdown( - item.choices, label=item.name, default=item.default, value=item.default - ) - widgets.append(dropdown) - used_value = gr.CheckboxGroup(keys, value=keys) - - def change_visibility(x): - values = [] - for key in keys: - if key in x: - values.append(gr.update(visible=True)) - else: - values.append(gr.update(visible=False)) - return 
values - - used_value.change( - change_visibility, inputs=[used_value], outputs=widgets, - ) - - def set_sampling(x): - if x == 'Custom': - values = [gr.update(value=v, interactive=True) for v in PRESETS[x].values()] - return values - else: - values = [gr.update(value=v, interactive=False) for v in PRESETS[x].values()] - return values - - sampling_method.change( - set_sampling, inputs=[sampling_method], outputs=[temperature, top_p, top_k, repetition_penality] - ) - - gr.HTML("
") - human_name = gr.Textbox(label="Human Name", value=defaults['user'], line=1,) - assistant_name = gr.Textbox(label="Assistant Name", value=defaults['assistant'], line=1,) - preamble = gr.Textbox(label="System", value=defaults['system'], lines=2,) - - def set_prompt(x): - if x == "DIALOGUE": - return '', '' - return defaults['user'], defaults['assistant'] - - prompt_presets.change(set_prompt, inputs=[prompt_presets], outputs=[human_name, assistant_name]) - - with gr.Column(scale=1, min_width=900): - chatbot = Chatbot(elem_id="chatbot").style(height=800) - msg = gr.Textbox(label="User", value="", lines=1,) - clear = gr.Button("Clear") - - def user(user_message, history, session_state): - session_state.append(user_message) - user_message = user_message.replace('\n', '
') - return "", history + [[user_message, None]] - - def get_value_str(values_array, used_value): - if len(used_value) == 0: - return '' - assert len(values_array) == len(keys) - value_str = '' - elements = [] - for i, key in enumerate(keys): - if key in used_value: - elements.append(f'{key}:{values_array[i]}') - value_str += ','.join(elements) + '\n' - return value_str - - def bot( - history, - preamble, - token_to_gen, - temperature, - top_p, - top_k, - repetition_penality, - seed, - human_name, - assistant_name, - session_state, - prompts_presets, - used_value, - *values, - ): - - values_array = values - if value: - value_str = get_value_str(values_array, used_value) - else: - value_str = '' - - prompt_preset = PROMPT_PRESETS[prompts_presets] - prompt_text = '' - names = [human_name, assistant_name] - turn_tokens = [prompt_preset['USER_TURN_TOKEN'], prompt_preset['BOT_TURN_TOKEN']] - for i, meg in enumerate(session_state): - name = names[i % 2] - turn = turn_tokens[i % 2] - prompt_text += turn + name + prompt_preset['END_OF_NAME'] + meg + prompt_preset['END_OF_TURN'] - prompt_text += ( - prompt_preset['BOT_TURN_TOKEN'] + assistant_name + prompt_preset['END_OF_NAME'] + value_str - ) - prompt_text = prompt_preset['SYSTEM_TURN_TOKEN'] + preamble + prompt_text - bot_message = create_gen_function(server_port)( - prompt_text, - False, - False, - token_to_gen, - 1, - temperature, - top_p, - top_k, - repetition_penality, - '', - ) - if bot_message.endswith(TURN_TOKEN): - bot_message = bot_message[: -len(TURN_TOKEN)] - history[-1][1] = bot_message - print(prompt_text) - print(bot_message) - print('-------------------') - session_state.append(value_str + bot_message.strip()) - return history - - msg.submit(user, [msg, chatbot, session_state], [msg, chatbot], queue=False).then( - bot, - [ - chatbot, - preamble, - token_to_gen, - temperature, - top_p, - top_k, - repetition_penality, - seed, - human_name, - assistant_name, - session_state, - prompt_presets, - used_value, - *widgets, - ], - [chatbot], - ) - - def clear_fun(session_state): - session_state.clear() - return None - - clear.click(clear_fun, [session_state], chatbot, queue=False) - demo.launch(share=share, server_port=web_port, server_name='0.0.0.0', auth=(username, password)) - - -class RetroDemoWebApp: - def __init__(self, text_service_ip, text_service_port, combo_service_ip, combo_service_port): - self.text_service_ip = text_service_ip - self.text_service_port = text_service_port - self.combo_service_ip = combo_service_ip - self.combo_service_port = combo_service_port - - def get_retro_generation( - self, - prompt, - greedy, - add_BOS, - token_to_gen, - min_tokens, - temp, - top_p, - top_k, - repetition, - neighbors, - weight, - end_strings, - ): - data = { - "sentences": [prompt], - "tokens_to_generate": int(token_to_gen), - "temperature": temp, - "add_BOS": add_BOS, - "top_k": top_k, - "top_p": top_p, - "greedy": greedy, - "all_probs": False, - "repetition_penalty": repetition, - "min_tokens_to_generate": int(min_tokens), - "neighbors": int(neighbors), - "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0], - } - self.update_weight(weight) - output_json = text_generation(data, self.text_service_ip, self.text_service_port) - sentences = output_json['sentences'] - retrieved = output_json['retrieved'] - return sentences[0], convert_retrieved_to_md(retrieved) - - def update_weight(self, weight): - data = {"update_weight": [weight, 1.0 - weight]} - return request_data(data, self.combo_service_ip, 
self.combo_service_port) - - def add_doc(self, doc, add_eos): - data = { - "sentences": [doc], - "add_eos": add_eos, - } - return request_data(data, self.combo_service_ip, self.combo_service_port) - - def reset_index(self): - data = {"reset": None} - return request_data(data, self.combo_service_ip, self.combo_service_port) - - def run_demo(self, share, username, password, port): - check_gradio_import() - with gr.Blocks(css="table, th, td { border: 1px solid blue; table-layout: fixed; width: 100%; }") as demo: - with gr.Row(): - with gr.Column(scale=2, width=200): - greedy_flag = gr.Checkbox(label="Greedy", value=True) - add_BOS = gr.Checkbox(label="Add BOS token", value=False) - token_to_gen = gr.Number(label='Number of Tokens to generate', value=30, type=int) - min_token_to_gen = gr.Number(label='Min number of Tokens to generate', value=1, type=int) - temperature = gr.Slider(minimum=0.0, maximum=10.0, value=1.0, label='Temperature', step=0.1) - top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.02, value=0.9, label='Top P') - top_k = gr.Slider(minimum=0, maximum=10000, step=2, value=0, label='Top K') - repetition_penality = gr.Slider( - minimum=1.0, maximum=5.0, step=0.02, value=1.2, label='Repetition penalty' - ) - end_strings = gr.Textbox(label="End strings (comma separated)", value="<|endoftext|>,", lines=1,) - k_neighbors = gr.Slider(minimum=0, maximum=50, step=1, value=2, label='Retrieved Documents') - weight = gr.Slider( - minimum=0.0, maximum=1.0, value=1.0, label='Weight for the Static Retrieval DB', step=0.02 - ) - add_retrival_doc = gr.Textbox(label="Add New Retrieval Doc", value="", lines=5,) - add_EOS = gr.Checkbox(label="Add EOS token to Retrieval Doc", value=False) - with gr.Row(): - add_btn = gr.Button(value="Add") - reset_btn = gr.Button(value="Reset Index") - output_status = gr.Label(value='') - add_btn.click(self.add_doc, inputs=[add_retrival_doc, add_EOS], outputs=[output_status]) - reset_btn.click(self.reset_index, inputs=[], outputs=[output_status]) - - with gr.Column(scale=1, min_width=800): - input_prompt = gr.Textbox( - label="Input", - value="Ariel was playing basketball. 1 of her shots went in the hoop. 2 of her shots did not go in the hoop. How many shots were there in total?", - lines=5, - ) - output_box = gr.Textbox(value="", label="Output") - btn = gr.Button(value="Submit") - output_retrieval = gr.HTML() - btn.click( - self.get_retro_generation, - inputs=[ - input_prompt, - greedy_flag, - add_BOS, - token_to_gen, - min_token_to_gen, - temperature, - top_p, - top_k, - repetition_penality, - k_neighbors, - weight, - end_strings, - ], - outputs=[output_box, output_retrieval], - ) - demo.launch(share=share, server_port=port, server_name='0.0.0.0', auth=(username, password)) diff --git a/nemo/collections/nlp/modules/common/prompt_table.py b/nemo/collections/nlp/modules/common/prompt_table.py deleted file mode 100644 index 4cb2262837ee..000000000000 --- a/nemo/collections/nlp/modules/common/prompt_table.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import enum - -__all__ = ['VirtualPromptSource', 'VirtualPromptStyle', 'VirtualPromptPlaceholderToken'] - - -class VirtualPromptStyle(enum.Enum): - P_TUNING = 'p-tuning' - NO_PROMPT = 'no-prompts' - - -class VirtualPromptSource(enum.Enum): - PROMPT_ENCODER = 'prompt_encoder' - NO_PROMPT = 'no-prompts' - - -class VirtualPromptPlaceholderToken(enum.Enum): - BASE = ' BertModule: + """ + Helper function to instantiate a language model encoder, either from scratch or a pretrained model. + If only pretrained_model_name are passed, a pretrained model is returned. + If a configuration is passed, whether as a file or dictionary, the model is initialized with random weights. + + Args: + config_dict: path to the model configuration dictionary + config_file: path to the model configuration file + vocab_file: path to vocab_file to be used with Megatron-LM + trainer: an instance of a PyTorch Lightning trainer + cfg: a model configuration + Returns: + Pretrained BertModule + """ + + # check valid model type + if cfg.language_model.get('pretrained_model_name'): + if ( + not cfg.language_model.pretrained_model_name + or cfg.language_model.pretrained_model_name not in get_pretrained_lm_models_list(include_external=False) + ): + logging.warning( + f'{cfg.language_model.pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), ' + f'will be using AutoModel from HuggingFace.' + ) + + # warning when user passes both configuration dict and file + if config_dict and config_file: + logging.warning( + f"Both config_dict and config_file were found, defaulting to use config_file: {config_file} will be used." + ) + + pretrain_model_name = '' + if cfg.get('language_model') and cfg.language_model.get('pretrained_model_name', ''): + pretrain_model_name = cfg.language_model.get('pretrained_model_name', '') + + from nemo.collections.nlp.modules.common.megatron.megatron_utils import list_available_models + + def get_megatron_pretrained_bert_models() -> List[str]: + + all_pretrained_megatron_bert_models = [model.pretrained_model_name for model in list_available_models()] + return all_pretrained_megatron_bert_models + + all_pretrained_megatron_bert_models = get_megatron_pretrained_bert_models() + if ( + cfg.tokenizer is not None + and cfg.tokenizer.get("tokenizer_name", "") is not None + and "megatron" in cfg.tokenizer.get("tokenizer_name", "") + ) or pretrain_model_name in all_pretrained_megatron_bert_models: + import torch + + try: + from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel + except (ImportError, ModuleNotFoundError): + from abc import ABC + + MegatronBertModel = ABC + + class Identity(torch.nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x, *args): + return x + + if cfg.language_model.get("lm_checkpoint"): + model = MegatronBertModel.restore_from(restore_path=cfg.language_model.lm_checkpoint, trainer=trainer) + else: + model = MegatronBertModel.from_pretrained(cfg.language_model.get('pretrained_model_name'), trainer=trainer) + # remove the headers that are only revelant for pretraining + model.model.lm_head = Identity() + model.model.binary_head = Identity() + model.model.language_model.pooler = Identity() + + else: + raise ValueError("Model type is not supported.") + + return model + + class NLPModel(ModelPT, Exportable): """Base class for NLP Models.""" From 
96b1a8b2663fb1022c5c6e7b27b7ae7c76dba3ca Mon Sep 17 00:00:00 2001
From: dimapihtar
Date: Wed, 15 Oct 2025 06:32:29 -0700
Subject: [PATCH 21/21] fix style

Signed-off-by: dimapihtar
---
 nemo/collections/tts/models/language_modeling/nlp_model.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/nemo/collections/tts/models/language_modeling/nlp_model.py b/nemo/collections/tts/models/language_modeling/nlp_model.py
index ea1087a1d7ae..0d6b14253405 100644
--- a/nemo/collections/tts/models/language_modeling/nlp_model.py
+++ b/nemo/collections/tts/models/language_modeling/nlp_model.py
@@ -16,7 +16,7 @@
 import hashlib
 import json
 import os
-from typing import Any, Mapping, Optional, Union
+from typing import Any, List, Mapping, Optional, Union
 
 import torch
 from lightning.fabric.utilities.cloud_io import _load as pl_load
@@ -92,10 +92,7 @@ def get_lm_model(
 
     # check valid model type
     if cfg.language_model.get('pretrained_model_name'):
-        if (
-            not cfg.language_model.pretrained_model_name
-            or cfg.language_model.pretrained_model_name not in get_pretrained_lm_models_list(include_external=False)
-        ):
+        if not cfg.language_model.pretrained_model_name:
             logging.warning(
                 f'{cfg.language_model.pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), '
                 f'will be using AutoModel from HuggingFace.'
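
Editor's note for reviewers (not part of the patch series): the relocated get_lm_model helper now keeps only the Megatron BERT branch and raises ValueError for any other model type, so a minimal driver may help sanity-check the new location. Everything below is a hypothetical sketch; the config values, the pretrained model name, and the Trainer settings are placeholder assumptions for illustration, not code taken from these patches.

# Hypothetical usage sketch of the relocated get_lm_model helper; values are illustrative only.
from lightning.pytorch import Trainer
from omegaconf import OmegaConf

from nemo.collections.tts.models.language_modeling.nlp_model import get_lm_model

# The tokenizer_name must contain "megatron" (or the pretrained name must be a known
# Megatron BERT model) to take the supported branch; anything else now raises
# ValueError("Model type is not supported.").
cfg = OmegaConf.create(
    {
        "language_model": {
            "pretrained_model_name": "megatron-bert-345m-cased",  # placeholder model name
            "lm_checkpoint": None,  # or a path to a local checkpoint to restore instead
        },
        "tokenizer": {"tokenizer_name": "megatron-bert-345m-cased"},
    }
)

# A minimal CPU trainer; real runs would configure devices/precision as needed.
trainer = Trainer(accelerator="cpu", devices=1)

encoder = get_lm_model(cfg=cfg, trainer=trainer)
print(type(encoder).__name__)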