Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[submodule "3rdparty/Megatron-LM"]
path = 3rdparty/Megatron-LM-workspace/Megatron-LM
url = https://github.com/yaoyu-33/Megatron-LM.git
branch = yifu/remove_do_not_average_loss
url = https://github.com/NVIDIA/Megatron-LM.git
branch = main
shallow = true
[submodule "3rdparty/Megatron-Bridge"]
path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Submodule Megatron-Bridge updated 432 files
6 changes: 4 additions & 2 deletions 3rdparty/Megatron-Bridge-workspace/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

CACHED_DEPENDENCIES = [
"transformers<5.0.0",
"datasets",
"datasets>=2.20.0",
"accelerate",
"omegaconf>=2.3.0",
"tensorboard>=2.19.0",
Expand All @@ -41,13 +41,15 @@
"hydra-core>1.3,<=1.3.2",
"megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
"qwen-vl-utils",
"transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
"transformer-engine[pytorch,core_cu13]>=2.10.0a0,<2.13.0",
"mamba-ssm",
"nvidia-resiliency-ext",
"causal-conv1d",
"flash-linear-attention",
"timm",
"open-clip-torch>=3.2.0",
"mlflow>=3.5.0",
"torch>=2.6.0",
]

# If the bridge source exists, compare cached dependencies with the submodule's pyproject
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM-workspace/Megatron-LM
Submodule Megatron-LM updated 751 files
7 changes: 5 additions & 2 deletions 3rdparty/Megatron-LM-workspace/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
# VCS dependencies use full "pkg @ git+URL@rev" format matching pyproject.toml [tool.uv.sources]
CACHED_DEPENDENCIES = [
# Default dependencies from pyproject.toml
"torch",
"torch>=2.6.0",
"numpy",
"packaging>=24.2",
# Dev dependencies from pyproject.toml
Expand All @@ -58,7 +58,7 @@
"opentelemetry-api~=1.33.1",
"mamba-ssm~=2.2",
"causal-conv1d~=1.5",
"flash-linear-attention~=0.3.2",
"flash-linear-attention~=0.4.0",
"nv-grouped-gemm~=1.1",
"megatron-energon[av_decode]~=6.0",
"av",
Expand All @@ -69,6 +69,9 @@
"emerging_optimizers @ git+https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git@v0.1.0",
"datasets",
"fastapi~=0.50",
"flask[async]",
"hypercorn",
"openai",
]


Expand Down
75 changes: 21 additions & 54 deletions nemo_rl/models/policy/workers/megatron_policy_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@
from megatron.core.distributed.fsdp.mcore_fsdp_adapter import (
FullyShardedDataParallel as custom_FSDP,
)
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from megatron.core.inference.config import InferenceConfig
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
Expand Down Expand Up @@ -702,14 +700,8 @@ def generate(
)

model_cfg = self.megatron_cfg.model
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=model_cfg.hidden_size,
inference_batch_times_seqlen_threshold=1000000,
fp32_residual_connection=model_cfg.fp32_residual_connection,
params_dtype=model_cfg.params_dtype,
padded_vocab_size=self.final_padded_vocab_size, # Use the potentially updated value
inference_max_seq_length=self.cfg["generation"]["max_new_tokens"], # type: ignore
inference_max_requests=self.cfg["generation_batch_size"],
mcore_generation_config = cast(
MegatronGenerationConfig, self.cfg["generation"]["mcore_generation_config"]
)

from megatron.core.inference.contexts.dynamic_context import (
Expand All @@ -723,45 +715,32 @@ def generate(
)
from megatron.core.inference.sampling_params import SamplingParams

mcore_generation_config = cast(
MegatronGenerationConfig, self.cfg["generation"]["mcore_generation_config"]
)
buffer_size_gb = mcore_generation_config["buffer_size_gb"]

num_cuda_graphs = mcore_generation_config["num_cuda_graphs"]
block_size_tokens = mcore_generation_config["block_size_tokens"]
use_cuda_graphs_for_non_decode_steps = mcore_generation_config[
"use_cuda_graphs_for_non_decode_steps"
]
enable_chunked_prefill = mcore_generation_config["enable_chunked_prefill"]
unified_memory_level = mcore_generation_config["unified_memory_level"]
max_tokens = mcore_generation_config["max_tokens"]

model_config = self.model.config
model_config.cuda_graph_impl = "local"

dynamic_context = DynamicInferenceContext(
params_dtype=inference_wrapper_config.params_dtype,
num_layers=model_config.num_layers,
kv_channels=model_config.kv_channels,
num_attention_heads=model_config.num_query_groups,
local_rank = torch.cuda.current_device()
num_gpus_per_node = torch.cuda.device_count()
node_idx = self.rank // num_gpus_per_node if num_gpus_per_node > 0 else 0
model_config.inference_sampling_seed = (node_idx * 1024) + local_rank

inference_config = InferenceConfig(
max_sequence_length=self.cfg["generation"]["max_new_tokens"],
buffer_size_gb=buffer_size_gb,
materialize_only_last_token_logits=False,
num_cuda_graphs=num_cuda_graphs,
block_size_tokens=block_size_tokens,
tensor_model_parallel_size=self.cfg["megatron_cfg"][
"tensor_model_parallel_size"
buffer_size_gb=mcore_generation_config["buffer_size_gb"],
num_cuda_graphs=mcore_generation_config["num_cuda_graphs"],
block_size_tokens=mcore_generation_config["block_size_tokens"],
use_cuda_graphs_for_non_decode_steps=mcore_generation_config[
"use_cuda_graphs_for_non_decode_steps"
],
use_cuda_graphs_for_non_decode_steps=use_cuda_graphs_for_non_decode_steps,
enable_chunked_prefill=mcore_generation_config["enable_chunked_prefill"],
unified_memory_level=mcore_generation_config["unified_memory_level"],
max_tokens=mcore_generation_config["max_tokens"],
materialize_only_last_token_logits=False,
use_flashinfer_fused_rope=False,
unified_memory_level=unified_memory_level,
max_tokens=max_tokens,
)
inference_wrapped_model = GPTInferenceWrapper(
self.model, inference_wrapper_config, dynamic_context
)

dynamic_context = DynamicInferenceContext(model_config, inference_config)
inference_wrapped_model = GPTInferenceWrapper(self.model, dynamic_context)

inference_wrapped_model.prep_model_for_inference()
# Set pipeline parallel flag
inference_wrapped_model.model_is_pipeline_parallel = (
Expand All @@ -773,21 +752,9 @@ def generate(
tokenizer=self.megatron_tokenizer,
)

# Calculate seed based on node and rank to ensure reproducibility across workers
local_rank = torch.cuda.current_device() # Local GPU index on the node
num_gpus_per_node = torch.cuda.device_count()
node_idx = self.rank // num_gpus_per_node if num_gpus_per_node > 0 else 0
seed = (node_idx * 1024) + local_rank

# New API: DynamicInferenceEngine has additional parameters
dynamic_engine = DynamicInferenceEngine(
text_generation_controller,
dynamic_context,
enable_cuda_graph=True,
random_seed=seed,
track_paused_request_events=False,
enable_chunked_prefill=enable_chunked_prefill,
inference_logging_step_interval=0,
)

# Handle None values for top_k - convert to integer as required by Megatron
Expand Down
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ automodel = [
"mamba-ssm",
"causal-conv1d",
"nv-grouped-gemm",
"transformer-engine[pytorch]==2.8.0",
"transformer-engine[pytorch]>=2.9.0a0,<2.12.0",
"deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480",
]
vllm = [
Expand Down Expand Up @@ -108,7 +108,7 @@ mcore = [
# This dependency also needs to be compatible with the spec in Megatron-Bridge/pyproject.toml.
# It is specified here since we don't directly use Megatron-Bridge/pyproject.toml, but a proxy setup.py+pyproject.toml combo
# outside to allow "optionally" installing the megatron path. It's simpler to deal with transformer-engine here in the NeMo RL pyproject.toml
"transformer-engine[pytorch]==2.8.0",
"transformer-engine[pytorch]>=2.9.0a0,<2.12.0",
"megatron-core",
"megatron-bridge",
# Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular)
Expand Down Expand Up @@ -235,12 +235,12 @@ default-groups = ["dev", "build"]
# --link-mode=copy (slower but more reliable; suppresses warning)
# --link-mode=symlink (fastest option when uv cache and venv on different file-system; caveat: venv is brittle since it depends on the environment/container)
link-mode = "copy"
# The TE override is needed because automodel/mbridge we are on is still on 2.5.0
# The TE override is needed because the automodel/mbridge revisions we are on are still on an older TE version
# The opencv-python-headless override is needed because automodel pins it to 4.10.0.84, whereas vllm>=0.11.0 needs >= 4.11.0
# The timm override is needed because current automodel pins to 1.0.16. This can be removed once we move to ToT automodel
# The nvidia-modelopt override is needed because mcore is still on 0.33
override-dependencies = [
"transformer-engine[pytorch]==2.8.0",
"transformer-engine[pytorch]>=2.9.0a0,<2.12.0",
"opencv-python-headless>=4.11.0",
"timm<=1.0.22",
"nvidia-modelopt[torch]>=0.39.0",
Expand Down
Loading
Loading