14 changes: 2 additions & 12 deletions recipe/fully_async_policy/megatron_worker.py
@@ -27,7 +27,7 @@
     get_device_name,
     get_torch_device,
 )
-from verl.utils.megatron_utils import load_megatron_model_to_gpu, offload_megatron_model_to_cpu, per_tensor_generator
+from verl.utils.megatron_utils import load_megatron_model_to_gpu, offload_megatron_model_to_cpu
 from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
 
 logger = logging.getLogger(__file__)
@@ -113,17 +113,7 @@ def clear_cpu_model(self, n):
 class DetachActorWorker(DetachNcclSync):
     def _get_actor_params_generator(self):
         assert self._is_actor
-        if self.bridge is not None:
-            generator = self.bridge.export_weights(self.actor.actor_module)
-        else:
-            generator = per_tensor_generator(
-                self.actor.actor_module,
-                self.actor_model_config,
-                self.weight_converter,
-                self.tf_config,
-                self.layer_name_mapping,
-            )
-
+        generator = self.bridge.export_weights(self.actor.actor_module)
         return generator
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
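
Both hunks above drop the per_tensor_generator fallback so that weight export always goes through self.bridge.export_weights(...). A minimal consumer-side sketch, assuming (as the deleted branch suggests) that the export lazily yields (name, tensor) pairs; sync_actor_weights and send_fn are hypothetical names, not verl API:

# Sketch only: assumes bridge.export_weights(module) yields (param_name, tensor)
# pairs one at a time; send_fn is a hypothetical transport callback.
def sync_actor_weights(bridge, actor_module, send_fn):
    for name, tensor in bridge.export_weights(actor_module):
        # Streaming tensor-by-tensor keeps peak memory bounded during sync.
        send_fn(name, tensor)
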
16 changes: 1 addition & 15 deletions recipe/one_step_off_policy/megatron_workers.py
@@ -123,21 +123,7 @@ class DetachActorWorker(DetachSync):
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def _get_actor_params_generator(self):
         assert self._is_actor
-        from verl.models.mcore import get_mcore_weight_converter
-        from verl.utils.megatron_utils import per_tensor_generator
-
-        layer_name_mapping = {
-            "qkv_layer_name": "self_attention.linear_qkv.",
-            "gate_proj_layer_name": "linear_fc1.",
-        }
-        weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype)
-        generator = per_tensor_generator(
-            self.actor.actor_module,
-            self.actor_model_config,
-            weight_converter,
-            self.tf_config,
-            layer_name_mapping,
-        )
+        generator = self.bridge.export_weights(self.actor.actor_module)
         return generator
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
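
The same simplification as in the fully-async worker: the mcore weight-converter path is deleted, so these workers now hard-require a configured bridge. A possible hardening, not part of this diff, would fail fast when the bridge is missing:

# Hypothetical guard (not in the PR): make the new mbridge requirement explicit.
def _get_actor_params_generator(self):
    assert self._is_actor
    assert self.bridge is not None, "mbridge required; set megatron.use_mbridge=True"
    return self.bridge.export_weights(self.actor.actor_module)
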
100 changes: 0 additions & 100 deletions tests/special_distributed/test_mcore_config_converter.py

This file was deleted.

4 changes: 3 additions & 1 deletion tests/special_e2e/run_fully_async_policy.sh
@@ -185,10 +185,12 @@ elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then
         actor_rollout_ref.actor.megatron.grad_offload=${actor_offload} \
         actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
         actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+        actor_rollout_ref.actor.megatron.use_mbridge=True \
         actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+        actor_rollout_ref.ref.megatron.use_mbridge=True \
         actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
         actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-        actor_rollout_ref.ref.megatron.param_offload=${ref_offload} $@
+        actor_rollout_ref.ref.megatron.param_offload=${ref_offload} $@
 else
     echo "Error: Unknown strategy ${ACTOR_STRATEGY}. Please use 'fsdp2' or 'megatron'"
     exit 1
2 changes: 2 additions & 0 deletions tests/special_e2e/run_one_step_off_policy.sh
@@ -162,7 +162,9 @@ elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then
         actor_rollout_ref.actor.megatron.grad_offload=${actor_offload} \
         actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
         actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+        actor_rollout_ref.actor.megatron.use_mbridge=True \
         actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+        actor_rollout_ref.ref.megatron.use_mbridge=True \
         actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
         actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
         actor_rollout_ref.ref.megatron.param_offload=${ref_offload} $@
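
Both e2e scripts enable mbridge for the actor and the ref worker via dotted Hydra-style overrides. For illustration, the same dotted paths resolve into nested config nodes under OmegaConf (which verl's Hydra configs build on):

from omegaconf import OmegaConf

# Dotted overrides become nested config nodes.
cfg = OmegaConf.from_dotlist([
    "actor_rollout_ref.actor.megatron.use_mbridge=True",
    "actor_rollout_ref.ref.megatron.use_mbridge=True",
])
print(cfg.actor_rollout_ref.actor.megatron.use_mbridge)  # True
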
2 changes: 1 addition & 1 deletion tests/special_e2e/run_ppo_trainer_megatron.sh
@@ -103,7 +103,7 @@ CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
 CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
 CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
 RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
-USE_MBRIDGE=${USE_MBRIDGE:-False}
+USE_MBRIDGE=${USE_MBRIDGE:-True}
 VANILLA_MBRIDGE=${VANILLA_MBRIDGE:-True}
 VALUE_VANILLA_MBRIDGE=${VALUE_VANILLA_MBRIDGE:-$VANILLA_MBRIDGE}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-False}
7 changes: 3 additions & 4 deletions verl/utils/megatron_peft_utils.py
@@ -59,7 +59,7 @@ def get_adapter_state_dict(model):
     Returns:
         Dict of adapter parameter names to tensors
     """
-    from verl.utils.megatron_utils import unwrap_model
+    from megatron.core.utils import unwrap_model
 
     # Unwrap model from DDP/Float16Module
     unwrapped = unwrap_model(model)
@@ -138,8 +138,7 @@ def load_adapter_checkpoint(
         strict: Whether to strictly enforce parameter name matching
     """
     from megatron.core import mpu
-
-    from verl.utils.megatron_utils import unwrap_model
+    from megatron.core.utils import unwrap_model
 
     # Get rank-specific path
     rank_path = _get_rank_checkpoint_path(checkpoint_path)
@@ -192,7 +191,7 @@ def count_adapter_parameters(model):
     Returns:
         Tuple of (adapter_params, total_params, percentage)
     """
-    from verl.utils.megatron_utils import unwrap_model
+    from megatron.core.utils import unwrap_model
 
     unwrapped = unwrap_model(model)
     if isinstance(unwrapped, list):
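
All three hunks replace verl's local unwrap_model with the upstream megatron.core.utils implementation; the surrounding logic is untouched. A self-contained sketch of the unwrap-then-count pattern used by count_adapter_parameters; the "lora" name filter is an illustrative assumption, not verl's exact matching rule:

from megatron.core.utils import unwrap_model

def count_adapter_parameters(model):
    # Strip DDP/Float16Module wrappers before walking parameters.
    unwrapped = unwrap_model(model)
    if not isinstance(unwrapped, list):
        unwrapped = [unwrapped]
    adapter_params = total_params = 0
    for chunk in unwrapped:
        for name, param in chunk.named_parameters():
            total_params += param.numel()
            if "lora" in name:  # assumed adapter naming convention
                adapter_params += param.numel()
    percentage = 100.0 * adapter_params / max(total_params, 1)
    return adapter_params, total_params, percentage
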
2 changes: 1 addition & 1 deletion verl/workers/actor/megatron_actor.py
@@ -33,6 +33,7 @@
 # from megatron.core.optimizer import DistributedOptimizer
 from megatron.core.optimizer import DistributedOptimizer
 from megatron.core.pipeline_parallel import get_forward_backward_func
+from megatron.core.utils import get_model_config, unwrap_model
 from omegaconf import OmegaConf
 from torch import nn
 
@@ -49,7 +50,6 @@
     set_router_replay_data,
 )
 from verl.utils.megatron.tensor_parallel import vocab_parallel_entropy, vocab_parallel_log_probs_from_logits
-from verl.utils.megatron_utils import get_model_config, unwrap_model
 from verl.utils.profiler import GPUMemoryLogger
 from verl.utils.profiler.profile import Profiler
 from verl.utils.py_functional import append_to_dict
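
The actor gets the same import cleanup: get_model_config and unwrap_model now come from megatron.core.utils rather than verl.utils.megatron_utils, with no call-site changes. A minimal usage sketch, assuming actor_module is a (possibly wrapped) megatron-core model:

from megatron.core.utils import get_model_config, unwrap_model

# get_model_config walks wrapper layers (DDP, Float16Module) to find .config;
# unwrap_model returns the bare module(s) underneath the wrappers.
config = get_model_config(actor_module)
bare_module = unwrap_model(actor_module)
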