NVIDIA-NeMo · terrykong · Feb 25, 2026 · Feb 19, 2026
@@ -1,7 +1,7 @@
 [submodule "3rdparty/Megatron-LM"]
 	path = 3rdparty/Megatron-LM-workspace/Megatron-LM
-	url = https://github.com/yaoyu-33/Megatron-LM.git
-	branch = yifu/remove_do_not_average_loss
+	url = https://github.com/NVIDIA/Megatron-LM.git
+	branch = main
 	shallow = true
 [submodule "3rdparty/Megatron-Bridge"]
 	path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge

@@ -27,7 +27,7 @@
 
 CACHED_DEPENDENCIES = [
     "transformers<5.0.0",
-    "datasets",
+    "datasets>=2.20.0",
     "accelerate",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
@@ -41,13 +41,15 @@
     "hydra-core>1.3,<=1.3.2",
     "megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
     "qwen-vl-utils",
-    "transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
+    "transformer-engine[pytorch,core_cu13]>=2.10.0a0,<2.13.0",
     "mamba-ssm",
     "nvidia-resiliency-ext",
     "causal-conv1d",
     "flash-linear-attention",
     "timm",
     "open-clip-torch>=3.2.0",
+    "mlflow>=3.5.0",
+    "torch>=2.6.0",
 ]
 
 # If the bridge source exists, compare cached dependencies with the submodule's pyproject

@@ -43,7 +43,7 @@
 # VCS dependencies use full "pkg @ git+URL@rev" format matching pyproject.toml [tool.uv.sources]
 CACHED_DEPENDENCIES = [
     # Default dependencies from pyproject.toml
-    "torch",
+    "torch>=2.6.0",
     "numpy",
     "packaging>=24.2",
     # Dev dependencies from pyproject.toml
@@ -58,7 +58,7 @@
     "opentelemetry-api~=1.33.1",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
-    "flash-linear-attention~=0.3.2",
+    "flash-linear-attention~=0.4.0",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
     "av",
@@ -69,6 +69,9 @@
     "emerging_optimizers @ git+https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git@v0.1.0",
     "datasets",
     "fastapi~=0.50",
+    "flask[async]",
+    "hypercorn",
+    "openai",
 ]
 
 

@@ -36,9 +36,7 @@
 from megatron.core.distributed.fsdp.mcore_fsdp_adapter import (
     FullyShardedDataParallel as custom_FSDP,
 )
-from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
-    InferenceWrapperConfig,
-)
+from megatron.core.inference.config import InferenceConfig
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
@@ -702,14 +700,8 @@ def generate(
             )
 
         model_cfg = self.megatron_cfg.model
-        inference_wrapper_config = InferenceWrapperConfig(
-            hidden_size=model_cfg.hidden_size,
-            inference_batch_times_seqlen_threshold=1000000,
-            fp32_residual_connection=model_cfg.fp32_residual_connection,
-            params_dtype=model_cfg.params_dtype,
-            padded_vocab_size=self.final_padded_vocab_size,  # Use the potentially updated value
-            inference_max_seq_length=self.cfg["generation"]["max_new_tokens"],  # type: ignore
-            inference_max_requests=self.cfg["generation_batch_size"],
+        mcore_generation_config = cast(
+            MegatronGenerationConfig, self.cfg["generation"]["mcore_generation_config"]
         )
 
         from megatron.core.inference.contexts.dynamic_context import (
@@ -723,45 +715,32 @@ def generate(
         )
         from megatron.core.inference.sampling_params import SamplingParams
 
-        mcore_generation_config = cast(
-            MegatronGenerationConfig, self.cfg["generation"]["mcore_generation_config"]
-        )
-        buffer_size_gb = mcore_generation_config["buffer_size_gb"]
-
-        num_cuda_graphs = mcore_generation_config["num_cuda_graphs"]
-        block_size_tokens = mcore_generation_config["block_size_tokens"]
-        use_cuda_graphs_for_non_decode_steps = mcore_generation_config[
-            "use_cuda_graphs_for_non_decode_steps"
-        ]
-        enable_chunked_prefill = mcore_generation_config["enable_chunked_prefill"]
-        unified_memory_level = mcore_generation_config["unified_memory_level"]
-        max_tokens = mcore_generation_config["max_tokens"]
-
         model_config = self.model.config
         model_config.cuda_graph_impl = "local"
 
-        dynamic_context = DynamicInferenceContext(
-            params_dtype=inference_wrapper_config.params_dtype,
-            num_layers=model_config.num_layers,
-            kv_channels=model_config.kv_channels,
-            num_attention_heads=model_config.num_query_groups,
+        local_rank = torch.cuda.current_device()
+        num_gpus_per_node = torch.cuda.device_count()
+        node_idx = self.rank // num_gpus_per_node if num_gpus_per_node > 0 else 0
+        model_config.inference_sampling_seed = (node_idx * 1024) + local_rank
+
+        inference_config = InferenceConfig(
             max_sequence_length=self.cfg["generation"]["max_new_tokens"],
-            buffer_size_gb=buffer_size_gb,
-            materialize_only_last_token_logits=False,
-            num_cuda_graphs=num_cuda_graphs,
-            block_size_tokens=block_size_tokens,
-            tensor_model_parallel_size=self.cfg["megatron_cfg"][
-                "tensor_model_parallel_size"
+            buffer_size_gb=mcore_generation_config["buffer_size_gb"],
+            num_cuda_graphs=mcore_generation_config["num_cuda_graphs"],
+            block_size_tokens=mcore_generation_config["block_size_tokens"],
+            use_cuda_graphs_for_non_decode_steps=mcore_generation_config[
+                "use_cuda_graphs_for_non_decode_steps"
             ],
-            use_cuda_graphs_for_non_decode_steps=use_cuda_graphs_for_non_decode_steps,
+            enable_chunked_prefill=mcore_generation_config["enable_chunked_prefill"],
+            unified_memory_level=mcore_generation_config["unified_memory_level"],
+            max_tokens=mcore_generation_config["max_tokens"],
+            materialize_only_last_token_logits=False,
             use_flashinfer_fused_rope=False,
-            unified_memory_level=unified_memory_level,
-            max_tokens=max_tokens,
-        )
-        inference_wrapped_model = GPTInferenceWrapper(
-            self.model, inference_wrapper_config, dynamic_context
         )
 
+        dynamic_context = DynamicInferenceContext(model_config, inference_config)
+        inference_wrapped_model = GPTInferenceWrapper(self.model, dynamic_context)
+
         inference_wrapped_model.prep_model_for_inference()
         # Set pipeline parallel flag
         inference_wrapped_model.model_is_pipeline_parallel = (
@@ -773,21 +752,9 @@ def generate(
             tokenizer=self.megatron_tokenizer,
         )
 
-        # Calculate seed based on node and rank to ensure reproducibility across workers
-        local_rank = torch.cuda.current_device()  # Local GPU index on the node
-        num_gpus_per_node = torch.cuda.device_count()
-        node_idx = self.rank // num_gpus_per_node if num_gpus_per_node > 0 else 0
-        seed = (node_idx * 1024) + local_rank
-
-        # New API: DynamicInferenceEngine has additional parameters
         dynamic_engine = DynamicInferenceEngine(
             text_generation_controller,
             dynamic_context,
-            enable_cuda_graph=True,
-            random_seed=seed,
-            track_paused_request_events=False,
-            enable_chunked_prefill=enable_chunked_prefill,
-            inference_logging_step_interval=0,
         )
 
         # Handle None values for top_k - convert to integer as required by Megatron

@@ -66,7 +66,7 @@ automodel = [
   "mamba-ssm",
   "causal-conv1d",
   "nv-grouped-gemm",
-  "transformer-engine[pytorch]==2.8.0",
+  "transformer-engine[pytorch]>=2.9.0a0,<2.12.0",
   "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480",
 ]
 vllm = [
@@ -108,7 +108,7 @@ mcore = [
   # This dependency also needs to be compatible with the spec in Megatron-Bridge/pyproject.toml.
   # It is specified here since we don't directly use Megatron-Bridge/pyproject.toml, but a proxy setup.py+pyproject.toml combo
   # outside to allow "optionally" installing the megatron path. It's simpler to deal with transformer-engine here in the NeMo RL pyproject.toml
-  "transformer-engine[pytorch]==2.8.0",
+  "transformer-engine[pytorch]>=2.9.0a0,<2.12.0",
   "megatron-core",
   "megatron-bridge",
   # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular)
@@ -235,12 +235,12 @@ default-groups = ["dev", "build"]
 #  --link-mode=copy (slower but more reliable; suppresses warning)
 #  --link-mode=symlink (fastest option when uv cache and venv on different file-system; caveat: venv is brittle since it depends on the environment/container)
 link-mode = "copy"
-# The TE override is needed because automodel/mbridge we are on is still on 2.5.0
+# The TE override is needed because the automodel/mbridge revisions we are on are still on an older TE version
 # The opencv-python-headless override is needed because automodel pins it to 4.10.0.84, whereas vllm>=0.11.0 needs >= 4.11.0
 # The timm override is needed because current automodel pins to 1.0.16. This can be removed once we move to ToT automodel
 # The nvidia-modelopt override is needed because mcore is still on 0.33
 override-dependencies = [
-  "transformer-engine[pytorch]==2.8.0",
+  "transformer-engine[pytorch]>=2.9.0a0,<2.12.0",
   "opencv-python-headless>=4.11.0",
   "timm<=1.0.22",
   "nvidia-modelopt[torch]>=0.39.0",