diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 03d5415134f..883534bdc1f 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -491,9 +491,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO # Only support Qwen2VLImageProcessor for multi-modal processing currently # TODO: support other multi-modal inputs multi_modal_inputs = None - if self.processor is not None and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__: - from verl.models.transformers.qwen2_vl import get_rope_index - + if self.processor is not None: images = getattr(output, "multi_modal_data", {}).get("image", None) current_text = self.tokenizer.decode(input_ids.squeeze(0), skip_special_tokens=True) multi_modal_inputs = self.processor(text=[current_text], images=images, return_tensors="pt") @@ -502,7 +500,9 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO # We must use dict(multi_modal_inputs) to convert BatchFeature values to a new dict # because np.array() only keeps the keys for BatchFeature. 
- multi_modal_inputs = dict(multi_modal_inputs) + multi_modal_inputs = dict(multi_modal_inputs.convert_to_tensors("pt")) + if self.processor is not None and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__: + from verl.models.transformers.qwen2_vl import get_rope_index image_grid_thw = multi_modal_inputs.get("image_grid_thw") video_grid_thw = multi_modal_inputs.get("video_grid_thw") diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index 73ebaa35929..80d1ee3d0c2 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -27,6 +27,7 @@ from verl.interactions.utils.interaction_registry import initialize_interactions_from_config from verl.tools.schemas import ToolResponse from verl.tools.utils.tool_registry import initialize_tools_from_config +from verl.utils.chat_template import initialize_system_prompt from verl.utils.profiler import simple_timer from verl.utils.rollout_trace import rollout_trace_op @@ -108,9 +109,8 @@ def init_class(cls, config, tokenizer, processor, **kwargs): cls.apply_chat_template_kwargs = config.data.get("apply_chat_template_kwargs", {}) cls.prompt_length = config.actor_rollout_ref.rollout.prompt_length cls.response_length = config.actor_rollout_ref.rollout.response_length - cls.system_prompt = tokenizer.apply_chat_template( - [{}], add_generation_prompt=False, tokenize=True, **cls.apply_chat_template_kwargs - ) + cls.system_prompt = initialize_system_prompt(cls.tokenizer, **cls.apply_chat_template_kwargs) + # Initialize interactions from config file cls.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path if cls.interaction_config_file: diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index f92ba0586af..1c42f66b22d 100644 --- 
a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -506,6 +506,7 @@ reward_model: path: ~/models/FsfairX-LLaMA3-RM-v0.1 external_lib: ${actor_rollout_ref.model.external_lib} trust_remote_code: false + override_config: {} micro_batch_size: null micro_batch_size_per_gpu: null max_length: null diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index c9f88f013da..3ce021d675a 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -437,6 +437,7 @@ reward_model: path: ~/models/FsfairX-LLaMA3-RM-v0.1 external_lib: ${actor_rollout_ref.model.external_lib} trust_remote_code: false + override_config: {} use_shm: false use_remove_padding: false use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} diff --git a/verl/trainer/config/reward_model/reward_model.yaml b/verl/trainer/config/reward_model/reward_model.yaml index e9ffc60fbc6..dde8a814f14 100644 --- a/verl/trainer/config/reward_model/reward_model.yaml +++ b/verl/trainer/config/reward_model/reward_model.yaml @@ -34,6 +34,9 @@ model: # Whether to enable loading a remote code model, default to False trust_remote_code: False + # override hf config + override_config: {} + # [Deprecated] Global micro batch size # will be deprecated, use micro_batch_size_per_gpu micro_batch_size: null diff --git a/verl/utils/chat_template.py b/verl/utils/chat_template.py new file mode 100644 index 00000000000..70b30452c01 --- /dev/null +++ b/verl/utils/chat_template.py @@ -0,0 +1,28 @@ +# Copyright 2025 Bytedance Ltd. 
# and/or its affiliates
import logging
import os

from jinja2 import TemplateError

logger = logging.getLogger(__name__)
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))


def initialize_system_prompt(tokenizer, **apply_chat_template_kwargs) -> list[int]:
    """
    Initialize system prompt tokens for chat templates that support them.

    The chat template is rendered on a conversation consisting of a single
    empty message (``[{}]``) with tokenization enabled and without a
    generation prompt; whatever the tokenizer returns is handed back.

    Args:
        tokenizer: The tokenizer with a chat template. Only its
            ``apply_chat_template`` method is used.
        **apply_chat_template_kwargs: Additional arguments forwarded to
            ``apply_chat_template``. ``add_generation_prompt`` and ``tokenize``
            are always forced to ``False`` / ``True``; caller-supplied values
            for these two keys are ignored instead of raising ``TypeError``
            for a duplicated keyword argument.

    Returns:
        List of token IDs for the system prompt, or empty list if the template
        raises ``jinja2.TemplateError`` for the empty message.
    """
    # Merge the forced options last so they win over any duplicates that the
    # user-provided kwargs may carry.
    template_kwargs = {
        **apply_chat_template_kwargs,
        "add_generation_prompt": False,
        "tokenize": True,
    }
    try:
        return tokenizer.apply_chat_template([{}], **template_kwargs)
    except TemplateError as e:
        # Best effort: a template that rejects the empty message simply
        # yields no system prompt tokens.
        logger.warning("Chat template does not support system prompt: %s", e)
        return []