From dccce95b0a487acb32a6c386e06ec6f4ff6b6c9a Mon Sep 17 00:00:00 2001
From: Nancy
Date: Thu, 23 Oct 2025 16:03:31 -0700
Subject: [PATCH 1/2] add verl submodule to latest main and then enable
 processor support so multimodal models can work with rllm. that way for
 issue #242, multimodal ReAct agents can process and generate images for RL
 training

---
 rllm/trainer/verl/agent_workflow_trainer.py           | 7 ++++---
 rllm/trainer/verl/agent_workflow_trainer_fireworks.py | 6 ++++--
 rllm/trainer/verl/train_workflow_pipeline.py          | 5 +++--
 verl                                                  | 2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/rllm/trainer/verl/agent_workflow_trainer.py b/rllm/trainer/verl/agent_workflow_trainer.py
index bcd6eecd5..859168ccf 100644
--- a/rllm/trainer/verl/agent_workflow_trainer.py
+++ b/rllm/trainer/verl/agent_workflow_trainer.py
@@ -38,15 +38,16 @@ def __init__(
         self,
         config,
         tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
+        processor=None,
+        role_worker_mapping: dict[Role, WorkerType] = None,
+        resource_pool_manager: ResourcePoolManager = None,
         ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
         reward_fn=None,
         val_reward_fn=None,
         workflow_class=None,
         workflow_args=None,
     ):
-        super().__init__(config=config, tokenizer=tokenizer, role_worker_mapping=role_worker_mapping, resource_pool_manager=resource_pool_manager, ray_worker_group_cls=ray_worker_group_cls, reward_fn=reward_fn, val_reward_fn=val_reward_fn)
+        super().__init__(config=config, tokenizer=tokenizer, processor=processor, role_worker_mapping=role_worker_mapping, resource_pool_manager=resource_pool_manager, ray_worker_group_cls=ray_worker_group_cls, reward_fn=reward_fn, val_reward_fn=val_reward_fn)
         self.workflow_class = workflow_class
         self.workflow_args = workflow_args or {}
 
diff --git a/rllm/trainer/verl/agent_workflow_trainer_fireworks.py b/rllm/trainer/verl/agent_workflow_trainer_fireworks.py
index 239348e7e..e296f0276 100644
--- a/rllm/trainer/verl/agent_workflow_trainer_fireworks.py
+++ b/rllm/trainer/verl/agent_workflow_trainer_fireworks.py
@@ -37,8 +37,9 @@ def __init__(
         self,
         config,
         tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
+        processor=None,
+        role_worker_mapping: dict[Role, WorkerType] = None,
+        resource_pool_manager: ResourcePoolManager = None,
         ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
         reward_fn=None,
         val_reward_fn=None,
@@ -48,6 +49,7 @@ def __init__(
         super().__init__(
             config=config,
             tokenizer=tokenizer,
+            processor=processor,
             role_worker_mapping=role_worker_mapping,
             resource_pool_manager=resource_pool_manager,
             ray_worker_group_cls=ray_worker_group_cls,
diff --git a/rllm/trainer/verl/train_workflow_pipeline.py b/rllm/trainer/verl/train_workflow_pipeline.py
index 46e13d585..362b93f17 100644
--- a/rllm/trainer/verl/train_workflow_pipeline.py
+++ b/rllm/trainer/verl/train_workflow_pipeline.py
@@ -82,12 +82,12 @@ def run(self, config, workflow_class=None, workflow_args=None):
         local_path = copy_to_local(config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False))
 
         # Instantiate the tokenizer and processor.
-        from verl.utils import hf_tokenizer
+        from verl.utils import hf_processor, hf_tokenizer
 
         trust_remote_code = config.data.get("trust_remote_code", False)
         tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
 
         # Used for multimodal LLM, could be None
-        # processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
+        processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
         # Define worker classes based on the actor strategy.
         if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
@@ -173,6 +173,7 @@ def run(self, config, workflow_class=None, workflow_args=None):
         trainer = FireworksAgentWorkflowPPOTrainer(
             config=config,
             tokenizer=tokenizer,
+            processor=processor,
             role_worker_mapping=role_worker_mapping,
             resource_pool_manager=resource_pool_manager,
             ray_worker_group_cls=ray_worker_group_cls,
diff --git a/verl b/verl
index 8fdc4d3f2..7df2afb93 160000
--- a/verl
+++ b/verl
@@ -1 +1 @@
-Subproject commit 8fdc4d3f202f41461f4de9f42a637228e342668b
+Subproject commit 7df2afb936cd37b7b3a262edc119b2a57f070e3b

From af5e9e0f8da829c7a96c3fbd8731b67e56f3e538 Mon Sep 17 00:00:00 2001
From: Nancy
Date: Thu, 23 Oct 2025 18:22:05 -0700
Subject: [PATCH 2/2] bump transformers to >=4.57.0 for Qwen3-VL processor
 support, as Qwen3VLProcessor requires transformers 4.57.0+

---
 examples/solver_judge/solver_judge_flow_colab.ipynb | 2 +-
 scripts/install_verl.sh                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/solver_judge/solver_judge_flow_colab.ipynb b/examples/solver_judge/solver_judge_flow_colab.ipynb
index 3f3643bd1..76e44220c 100644
--- a/examples/solver_judge/solver_judge_flow_colab.ipynb
+++ b/examples/solver_judge/solver_judge_flow_colab.ipynb
@@ -87,7 +87,7 @@
   },
   "outputs": [],
   "source": [
-    "!pip install \"transformers[hf_xet]>=4.51.0\" accelerate datasets peft hf-transfer \\\n",
+    "!pip install \"transformers[hf_xet]>=4.57.0\" accelerate datasets peft hf-transfer \\\n",
     "    \"numpy<2.0.0\" \"pyarrow>=15.0.0\" pandas \\\n",
     "    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \\\n",
     "    pytest py-spy pyext pre-commit ruff tensorboard\n",
diff --git a/scripts/install_verl.sh b/scripts/install_verl.sh
index 2dfcda415..d3239ee16 100644
--- a/scripts/install_verl.sh
+++ b/scripts/install_verl.sh
@@ -7,7 +7,7 @@
 pip install --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0"
 
 echo "2. install basic packages"
-pip install "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
+pip install "transformers[hf_xet]>=4.57.0" accelerate datasets peft hf-transfer \
     "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
     "ray[default]" codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
     pytest py-spy pyext pre-commit ruff