diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
new file mode 100644
index 00000000..5e8a0b1a
--- /dev/null
+++ b/.github/workflows/doc.yml
@@ -0,0 +1,50 @@
+name: doc_test
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+    paths:  # PR runs only when Python sources, docs, or this workflow change
+      - "**/*.py"
+      - "docs/**"
+      - .github/workflows/doc.yml
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # one group per workflow + ref
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # runs on main are never cancelled
+
+permissions:
+  contents: read  # read-only access to repository contents
+
+jobs:
+  doc_test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Set up Python 3.11
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: "3.11"
+      - name: Install dependencies  # the package without its dependencies, then the docs requirements
+        run: |
+          pip install --no-deps -e .
+          pip install -r docs/requirements-docs.txt
+      - name: Build docs  # fails on Sphinx ERROR lines or documents missing from every toctree
+        run: |
+          cd docs
+          make clean
+          make html SPHINXOPTS="--keep-going -w _build/sphinx.log"
+          if grep -q ": ERROR:" _build/sphinx.log; then
+            echo "Sphinx build contained ERRORs - see _build/sphinx.log"
+            cat _build/sphinx.log
+            exit 1
+          fi
+          if grep -q "WARNING: document isn't included in any toctree" _build/sphinx.log; then
+            echo "Sphinx build WARNING: document not included in any toctree"
+            cat _build/sphinx.log
+            exit 1
+          fi
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 00000000..19a3d9f4
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,18 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"  # same Python version the doc_test workflow sets up
+
+sphinx:
+  configuration: docs/conf.py
+
+python:
+  install:
+    - requirements: docs/requirements-docs.txt
+    - method: pip  # pip-installs the project at the path below
+      path: .
diff --git a/docs/.gitkeep b/docs/.gitkeep deleted file mode 100644 index 691a6584..00000000 --- a/docs/.gitkeep +++ /dev/null @@ -1 +0,0 @@ -drop later diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..20a59f75 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,15 @@ +# Minimal makefile for Sphinx documentation + +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = verl-omni +SOURCEDIR = . +BUILDDIR = _build + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/algo/flowgrpo.md b/docs/algo/flowgrpo.md new file mode 100644 index 00000000..b2ee8eef --- /dev/null +++ b/docs/algo/flowgrpo.md @@ -0,0 +1,272 @@ +# Flow-GRPO + +Last updated: 04/23/2026. + +Flow-GRPO ([paper](https://arxiv.org/abs/2505.05470), [code](https://github.com/yifan123/flow_grpo)) is the first method to integrate online policy gradient reinforcement learning into **flow matching** generative models (e.g., Stable Diffusion 3, FLUX). It enables direct reward optimization for tasks such as compositional text-to-image generation, visual text rendering, and human preference alignment, without modifying the standard inference pipeline. + +Two core technical contributions make this possible: + +1. **ODE-to-SDE Conversion**: Flow matching models natively use a deterministic ODE sampler. Flow-GRPO converts this ODE into an equivalent SDE that preserves the model's marginal distribution at every timestep. This introduces the stochasticity required for group sampling and RL exploration. + +2. **Denoising Reduction**: Training on all denoising steps is expensive. Flow-GRPO reduces the number of *training* steps while keeping the original number of *inference* steps, significantly improving sampling efficiency without sacrificing reward performance. 
+ +Empirically, RL-tuned SD3.5-M with Flow-GRPO raises GenEval accuracy from 63% to 95% and visual text rendering accuracy from 59% to 92%. + +## Key Components + +- **Flow Matching Backbone**: operates on continuous-time flow matching models (e.g., SD3.5, FLUX) rather than discrete-token LLMs. +- **ODE-to-SDE Rollout**: generates a group of diverse image trajectories by injecting controlled noise via SDE sampling at selected denoising steps. +- **Denoising Reduction**: trains on a reduced subset of denoising steps (configurable via `sde_window_size` and `sde_window_range`) while inference uses the full step count. +- **Image Reward Models**: rewards are assigned by external reward models (e.g., GenEval, OCR, PickScore, aesthetic score) rather than rule-based verifiers. +- **No Critic**: like GRPO for LLMs, no separate value network is trained; advantages are computed from group-relative rewards. + +## Key Differences: GRPO vs. Flow-GRPO + +| Dimension | GRPO (LLM) | Flow-GRPO (Diffusion) | +|---|---|---| +| **Model type** | Autoregressive language model | Flow matching / diffusion model | +| **Action space** | Discrete token sequences | Continuous denoising trajectories (SDE paths) | +| **Rollout mechanism** | Sample `n` token sequences per prompt | Convert ODE to SDE; sample `n` image trajectories per prompt via stochastic denoising | +| **Log-probability** | Standard next-token log-prob | Log-prob of the SDE noise prediction at each selected denoising step | +| **Training steps** | All decoding steps are trivially identical in cost | Denoising Reduction: train on a small window of steps, infer with full steps | +| **Reward signal** | Rule-based verifiers or LLM judges on text | Image reward models (GenEval, OCR, PickScore, aesthetic, etc.) 
| +| **KL regularization** | KL penalty added to reward or directly to loss | KL-style regularization is available, but the exact setup depends on the training config | +| **CFG (guidance)** | Not applicable | CFG distillation occurs naturally; CFG can be disabled at both train and test time | +| **Advantage estimator** | `algorithm.adv_estimator=grpo` | `algorithm.adv_estimator=flow_grpo` | +| **Loss mode** | `actor_rollout_ref.actor.policy_loss.loss_mode` (not diffusion-specific) | `actor_rollout_ref.actor.diffusion_loss.loss_mode=flow_grpo` | + +## Configuration + +Diffusion training now uses dedicated diffusion config blocks. In `verl/trainer/config/diffusion_trainer.yaml`, +the main sections are: + +- `algorithm`: diffusion-specific advantage computation and normalization +- `actor_rollout_ref.actor`: optimization and diffusion loss settings +- `actor_rollout_ref.rollout`: rollout backend, sampling, and SDE controls +- `actor_rollout_ref.model`: model path plus diffusion-model / LoRA settings +- `reward`: reward manager, reward model, and custom reward function + +The default diffusion model YAML mirrors several rollout fields +(`num_inference_steps`, `true_cfg_scale`, `max_sequence_length`, +`guidance_scale`, and `algo`) into `actor_rollout_ref.model.*`, so in practice +the rollout section is the main place to override sampling behavior. + +### Core parameters + +#### Algorithm + +- `algorithm.adv_estimator`: Set to `flow_grpo`. + +#### Actor / loss + +- `actor_rollout_ref.actor.diffusion_loss.loss_mode`: Set to `flow_grpo`. + +- `actor_rollout_ref.actor.diffusion_loss.clip_ratio`: Clipping + factor used in the diffusion loss. + +- `actor_rollout_ref.actor.diffusion_loss.adv_clip_max`: Maximum absolute + advantage used before computing the policy loss. + +- `actor_rollout_ref.actor.use_kl_loss`: Enables KL loss against the reference + policy. + +- `actor_rollout_ref.actor.kl_loss_coef`: Coefficient for the KL term when KL loss is enabled. 
+ +#### Rollout / sampling + +- `actor_rollout_ref.rollout.name`: Selects the rollout backend. Currently supports `vllm_omni`. + +- `actor_rollout_ref.rollout.n`: Number of sampled image trajectories per + prompt. This is the FlowGRPO group size and should be greater than `1`. + +- `actor_rollout_ref.rollout.algo.noise_level`: Magnitude of SDE noise injected + during rollout. Larger values increase diversity but can hurt image quality. + +- `actor_rollout_ref.rollout.algo.sde_type`: SDE variant for rollout. The + current example uses `sde`. + +- `actor_rollout_ref.rollout.algo.sde_window_size`: Number of denoising steps + included in the active training window. Smaller values reduce training cost. + +- `actor_rollout_ref.rollout.algo.sde_window_range`: Range used to sample the + start of that active denoising window. + +- `actor_rollout_ref.rollout.num_inference_steps`: Number of denoising steps + used for rollout generation during training. + +- `actor_rollout_ref.rollout.val_kwargs.num_inference_steps`: Number of + denoising steps used during validation / evaluation. + +- `actor_rollout_ref.rollout.true_cfg_scale`: True classifier-free guidance + scale used during rollout. Used in `Qwen-Image`. + +- `actor_rollout_ref.rollout.guidance_scale`: Distilled guidance scale for + models that expose a guidance embedding; keep `null` to disable it. + +- `actor_rollout_ref.rollout.external_lib`: Python module path imported on + every rollout worker before the engine starts. Use this to register custom + pipeline implementations (e.g., `examples.flowgrpo_trainer.vllm_omni_impl` + for the Qwen-Image `vllm_omni` example). The module must call + `@VllmOmniPipelineBase.register(...)` at import time. + +#### Model + +- `actor_rollout_ref.model.path`: Base diffusion model path. + +- `actor_rollout_ref.model.tokenizer_path`: Optional tokenizer path if it is + not located under the model path. 
+ +#### Batch size + +FlowGRPO uses three nested batch-size parameters that operate at different +stages of the training loop. They address different concerns (RL sample +diversity, multi-epoch reuse, and GPU memory) and must be understood together. + +**Step 1 — Rollout (`data.train_batch_size`)** + +`data.train_batch_size` is the number of **unique prompts** drawn from the +dataset per training step. Before rollout, each prompt is replicated +`actor_rollout_ref.rollout.n` times so that the rollout engine generates `n` +independent image trajectories per prompt. The in-memory batch after rollout +therefore holds `train_batch_size × n` image samples. GRPO advantage +normalization runs over this **full** batch — it needs all `n` trajectories +for every prompt to compute group-relative rewards before any splitting occurs. + +**Step 2 — Actor update (`actor_rollout_ref.actor.ppo_mini_batch_size`)** + +`ppo_mini_batch_size` controls how the full post-rollout batch is sliced for +actor gradient updates. **Important:** this value is specified in **prompts**, +not image samples. The trainer internally scales it by `rollout.n` to get +the actual mini-batch size in samples: + +``` +effective mini-batch = ppo_mini_batch_size × rollout.n (image samples) +number of mini-batches per epoch = train_batch_size / ppo_mini_batch_size +``` + +All `n` trajectories belonging to the same prompt are kept in the same +mini-batch. This is not optional: although advantages are already computed +globally before this split, the gradient update for each image depends on its +advantage relative to the other images in its group. Scattering a prompt's +trajectories across different mini-batches would break that correspondence. +`ppo_mini_batch_size` must divide `train_batch_size` evenly. 
+ +**Step 3 — FSDP sharding and gradient accumulation +(`actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`)** + +Each mini-batch is distributed across GPUs by FSDP data parallelism, so each +GPU receives `(ppo_mini_batch_size × n) / n_gpus` image samples. That +per-GPU shard is then **chunked into micro-batches** of +`ppo_micro_batch_size_per_gpu` for the actual forward/backward passes, with +gradients accumulated across chunks before the optimizer step. This is pure +gradient accumulation: the effective gradient is identical to running the full +per-GPU shard in one shot; only peak activation memory changes. + +For diffusion models the accumulation is two-dimensional: the engine also +loops over each active denoising timestep inside every micro-batch, so the +total gradient accumulation steps per GPU per mini-batch is: + +``` +gradient_accumulation_steps = (per_gpu_samples / ppo_micro_batch_size_per_gpu) + × sde_window_size +``` + +`ppo_micro_batch_size_per_gpu` must satisfy: +`(ppo_mini_batch_size × n) / n_gpus` is divisible by +`ppo_micro_batch_size_per_gpu`. + +**Concrete walkthrough** (reference OCR script, 4 GPUs, `sde_window_size=2`): + +``` +data.train_batch_size = 32 # 32 prompts loaded +actor_rollout_ref.rollout.n = 16 # 16 images generated per prompt + → post-rollout batch = 512 # advantage computed over all 512 + +ppo_mini_batch_size (config) = 16 # in prompts + → effective mini-batch = 16 × 16 = 256 samples + → mini-batches per epoch = 512 / 256 = 2 actor gradient steps + +FSDP shards 256 samples across 4 GPUs: + → per-GPU samples = 256 / 4 = 64 + +ppo_micro_batch_size_per_gpu = 16 + → micro-batches per GPU = 64 / 16 = 4 + → gradient_accumulation_steps = 4 × 2 (sde_window_size) = 8 +``` + +#### Reward + +- `reward.reward_manager.name`: Selects the reward manager. + +- `reward.custom_reward_function.path` and + `reward.custom_reward_function.name`: Register the task-specific reward + post-processing function such as `compute_score_ocr`. 
+ +For an end-to-end OCR training walkthrough, including dataset preparation and +the full runnable command, see `docs/start/flowgrpo_quickstart.md`. + + +## Reference Example + +Standard LoRA training with OCR reward (Qwen-Image, 4 GPUs) using the current +`vllm_omni` rollout example: + +```bash +bash examples/flowgrpo_trainer/run_qwen_image_ocr_lora.sh +``` + +## Variants + +### Rule-Based Reward Training: JPEG incompressibility + +FlowGRPO also supports rule-based rewards that score images directly without a +VLM reward model, using the same `reward.reward_manager.name=visual` setup. + +`verl/utils/reward_score/jpeg_compressibility.py` rewards images that are +harder to JPEG-compress (richer texture, more complex content). No extra +dependencies or reward model process are required. + +Minimal dataset row: + +```python +{ + "data_source": "jpeg_compressibility", + "prompt": [{"role": "user", "content": ""}], + "reward_model": {"ground_truth": ""}, # required by schema, ignored by scorer +} +``` + +Config changes relative to the OCR example — **remove** these lines: + +```bash +reward.reward_model.enable=True +reward.reward_model.model_path=... +reward.reward_model.rollout.name=... +reward.reward_model.rollout.tensor_model_parallel_size=... +reward.custom_reward_function.path=... +reward.custom_reward_function.name=... +``` + +Keep `reward.reward_manager.name=visual` and all actor/rollout settings +unchanged. + +### Async Reward + + +For reward models that are expensive to evaluate (e.g., a VLM judge), the reward model can be allocated its own dedicated GPU resource pool and run asynchronously alongside the policy. This avoids blocking policy training on reward computation. 
+ +```bash +bash examples/flowgrpo_trainer/run_qwen_image_ocr_lora_async_reward.sh +``` + + +## Citation + +```bibtex +@article{liu2025flow, + title={Flow-GRPO: Training Flow Matching Models via Online RL}, + author={Liu, Jie and Liu, Gongye and Liang, Jiajun and Li, Yangguang and Liu, Jiaheng and Wang, Xintao and Wan, Pengfei and Zhang, Di and Ouyang, Wanli}, + journal={arXiv preprint arXiv:2505.05470}, + year={2025} +} +``` diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..f0528246 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,51 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configuration file for the Sphinx documentation builder. +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +project = "VeRL-Omni" +copyright = "2026 Bytedance Ltd. 
and/or its affiliates" +author = "VeRL-Omni contributors" + +master_doc = "index" + +extensions = [ + "myst_parser", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.autosectionlabel", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", +] + +myst_enable_extensions = [ + "dollarmath", + "amsmath", +] + +napoleon_google_docstring = True +napoleon_numpy_docstring = False + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +language = "en" +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +html_theme = "sphinx_rtd_theme" + +suppress_warnings = ["ref.duplicate", "ref.myst"] diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..0b3fbba4 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,58 @@ +# Welcome to VeRL-Omni's documentation! + +Last updated: 04/23/2026 + +[VeRL-Omni](https://github.com/verl-project/verl-omni) is a general RL training framework focused on diffusion and omni-modality generative models. It starts from the multimodal generation RL work incubated in [verl](https://github.com/verl-project/verl) and provides a dedicated home for building and evolving this stack in a more focused way. + +Key capabilities: + +- **Specialized rollout support** via [vLLM-Omni](https://github.com/vllm-project/vllm-omni) for concurrent diffusion and multimodal generation. +- **Efficient diffusion RL training** for image and other non-autoregressive models. +- **Flexible reward pipelines** spanning rule-based rewards, model-based rewards, and multimodal reward computation. +- **Modular training backends** that integrate various parallelism strategies (FSDP, USP) without rebuilding the full stack. 
+ +```{toctree} +:maxdepth: 2 +:caption: Getting Started + +start/install.md +start/flowgrpo_quickstart.md +``` + +```{toctree} +:maxdepth: 1 +:caption: Algorithms + +algo/flowgrpo.md +``` + +## Contribution + +VeRL-Omni is free software; you can redistribute it and/or modify it under the terms +of the Apache License 2.0. We welcome contributions. +Join us on [GitHub](https://github.com/verl-project/verl-omni) for discussions. + +See the [2026 Q2 roadmap](https://github.com/verl-project/verl/issues/5755) for planned work. + +### Code Linting and Formatting + +We use pre-commit to help improve code quality. To initialize pre-commit, run: + +```bash +pip install pre-commit +pre-commit install +``` + +To resolve CI errors locally, you can also manually run pre-commit by: + +```bash +pre-commit run +``` + +### Adding CI tests + +If possible, please add CI test(s) for your new feature: + +1. Find the most relevant workflow yml file, which usually corresponds to a `hydra` default config (e.g. `ppo_trainer`, `ppo_megatron_trainer`, `sft_trainer`, etc). +2. Add related path patterns to the `paths` section if not already included. +3. Minimize the workload of the test script(s) (see existing scripts for examples). diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt new file mode 100644 index 00000000..de4e92da --- /dev/null +++ b/docs/requirements-docs.txt @@ -0,0 +1,3 @@ +myst_parser +sphinx-markdown-tables +sphinx-rtd-theme diff --git a/docs/start/flowgrpo_quickstart.md b/docs/start/flowgrpo_quickstart.md new file mode 100644 index 00000000..696f4bac --- /dev/null +++ b/docs/start/flowgrpo_quickstart.md @@ -0,0 +1,195 @@ +(flowgrpo_quickstart)= +# Quickstart: FlowGRPO training on Qwen-Image OCR dataset + +Last updated: 04/23/2026 + +Post-train a diffusion image generation model with FlowGRPO. + +## Introduction + +In this example, we post-train a `Qwen-Image` policy with FlowGRPO for OCR-style image generation tasks. 
The rollout uses `vllm-omni` for multimodal generation, and the reward is computed by a visual generative reward model (*Qwen3-VL-8B-Instruct* in this example) that compares OCR text extracted from generated images against the dataset ground truth. + +## Prerequisite + +- Install VeRL-Omni and its dependencies following the {doc}`installation guide <install>`. Also install the FlowGRPO-specific reward dependency: + +```bash +pip install Levenshtein +``` + +- Use a machine with `4` GPUs for the provided example script. +- Run the commands below from the repository root. + +## Dataset Introduction + +We use the OCR dataset from the original Flow-GRPO repository: [dataset/ocr](https://github.com/yifan123/flow_grpo/tree/main/dataset/ocr). Each sample asks the model to generate an image that contains specific text, and the reward model scores the generated image by reading the rendered text and comparing it with the reference OCR string. + +The raw dataset is a plain-text file (`train.txt` / `test.txt`) where each line is one generation prompt. The OCR target — the text the model must render in the image — is enclosed in double quotes within the prompt. A few representative samples: + +```text +A close-up of a medicine bottle with a clear, red warning label that reads "Take With Food" prominently displayed, set against a neutral background. +A close-up of a robot's chest panel, with a digital display blinking "System Override Active" in red, set against a dimly lit industrial background. +A detailed textbook diagram labeled "Photosynthesis Process", viewed under a high-powered microscope, showcasing the intricate cellular structures and chemical reactions involved. +An ancient, leather-bound wizard's spellbook lies open, revealing a worn, yellowed page. A delicate bookmark rests precisely on "Page 666", casting a subtle glow that illuminates the arcane text. 
+An astronaut's boot print on the Martian surface, clearly reading "First Steps", surrounded by the red, dusty terrain under a pale, distant sky. +``` + +The preprocessing script converts the raw dataset into parquet files that contain: + +- the multimodal prompt used for image generation, +- a negative prompt for true CFG sampling, +- OCR ground truth stored under `reward_model.ground_truth`, +- auxiliary metadata such as split and sample index. + +## Step 1: Prepare the dataset + +Set the `WORKSPACE` environment variable to any writable directory you prefer (defaults to `$HOME` if unset): + +```bash +export WORKSPACE=${WORKSPACE:-$HOME} +``` + +Obtain the raw OCR dataset from the original Flow-GRPO repository and place it under `$WORKSPACE/data/ocr`. Then preprocess it into `train.parquet` and `test.parquet`: + +```bash +python3 examples/flowgrpo_trainer/data_process/qwenimage_ocr.py \ + --input_dir $WORKSPACE/data/ocr \ + --output_dir $WORKSPACE/data/ocr +``` + +The command above writes: + +- `$WORKSPACE/data/ocr/train.parquet` +- `$WORKSPACE/data/ocr/test.parquet` + +These parquet files are the inputs consumed by the FlowGRPO training script. + +### Preparing a custom dataset + +To train on your own OCR-style data, create `train.txt` and `test.txt` following the same one-prompt-per-line convention. Each prompt must contain the target OCR string enclosed in double quotes — the preprocessing script extracts the text between the first pair of quotes as the ground truth. For example: + +```text +A vintage storefront sign above the door reads "Open 24 Hours" in bold neon letters. +A handwritten sticky note on a refrigerator says "Buy milk" in blue ink. 
+``` + +Place the files in `$WORKSPACE/data/ocr/` (or any directory you prefer) and run the same preprocessing command, adjusting `--input_dir` and `--output_dir` as needed: + +```bash +python3 examples/flowgrpo_trainer/data_process/qwenimage_ocr.py \ + --input_dir $WORKSPACE/data/ocr \ + --output_dir $WORKSPACE/data/ocr +``` + +For datasets with a different ground-truth extraction scheme (e.g. a CSV with an explicit label column), modify `extract_solution` and the `process_fn` function in `examples/flowgrpo_trainer/data_process/qwenimage_ocr.py` to match your format, then re-run the script to regenerate the parquet files. + +## Step 2: Obtain models for RL training + +In this example, we train `Qwen/Qwen-Image` with LoRA and use `Qwen/Qwen3-VL-8B-Instruct` as the OCR reward model. + +**Policy model (Qwen-Image):** download the weights to a local directory (e.g. `$WORKSPACE/models/Qwen/Qwen-Image`). + +**Reward model (Qwen3-VL-8B-Instruct):** the script defaults to the Hugging Face Hub ID `Qwen/Qwen3-VL-8B-Instruct`, so no manual download is required — Hugging Face will cache it automatically on first run. To use a local copy instead, edit the `reward_model_name` variable in the script directly. 
+ +The run script exposes the following environment variables to override model and data paths without editing the script: + +```bash +WORKSPACE # base directory for data and models (default: $HOME) +MODEL_PATH # policy model path (default: $WORKSPACE/models/Qwen/Qwen-Image) +ACTOR_TOKENIZER_PATH # tokenizer path (default: $WORKSPACE/models/Qwen/Qwen-Image/tokenizer) +``` + +## Step 3: Perform FlowGRPO training + +The provided example script launches `python3 -m verl_omni.trainer.main_flowgrpo` with the FlowGRPO-specific config needed for this OCR task: + +- `algorithm.adv_estimator=flow_grpo` +- `actor_rollout_ref.rollout.name=vllm_omni` +- `reward.reward_manager.name=visual` +- `reward.custom_reward_function.name=compute_score_ocr` +- LoRA fine-tuning on `Qwen-Image` +- a single-node, `4`-GPU layout + +Run the training script: + +```bash +bash examples/flowgrpo_trainer/run_qwen_image_ocr_lora.sh +``` + +Optional KL loss tuning: + +- `actor_rollout_ref.actor.use_kl_loss=True` +- `actor_rollout_ref.actor.kl_loss_coef=0.001` + +The script uses `$WORKSPACE` (default: `$HOME`) as the base directory. Override any path via the environment variables described in Step 2, or set `WORKSPACE` to point to a volume with enough free space before launching. + +You should see training, validation, actor, critic, and reward metrics logged through the configured backends. By default, checkpoints are saved under: + +```bash +checkpoints/${trainer.project_name}/${trainer.experiment_name} +``` + +## Wandb logging + +The provided script already enables: + +```bash +trainer.logger='["console", "wandb"]' \ +trainer.project_name=flow_grpo \ +trainer.experiment_name=qwen_image_ocr_lora +``` + +Set your W&B credentials before launching if you want remote tracking: + +```bash +export WANDB_API_KEY=<your_wandb_api_key> +``` + +You can also override `trainer.project_name` and `trainer.experiment_name` from the command line to organize runs under your own project names. 
+ +## Diffusion-specific metrics + +The following metrics are specific to diffusion FlowGRPO training. + +**critic/rewards/zero_std_ratio** — the fraction of prompt groups (out of +`train_batch_size` prompts) where every one of the `n` generated images +received the same reward, giving a within-group standard deviation of zero. +GRPO derives its learning signal from *relative* rewards within a group, so a +group with zero std contributes no gradient regardless of the absolute reward +value. A persistently high ratio (e.g. above 0.5) means the reward model is +saturated or the task difficulty is poorly calibrated — either all images are +rewarded or none are — and the policy is not receiving useful training signal. + +**critic/rewards/std_mean** — the mean of the per-prompt reward standard +deviations across all prompt groups in the batch. Complements +`zero_std_ratio`: while `zero_std_ratio` flags completely collapsed groups, +`std_mean` tracks the average spread of rewards within a group across the +whole batch. A healthy, rising `std_mean` indicates the reward model is +producing diverse signal; a declining `std_mean` is an early warning of +reward saturation before `zero_std_ratio` spikes. + +**actor/pg_clipfrac_higher** and **actor/pg_clipfrac_lower** — these break +down PPO clipping by direction. `pg_clipfrac_higher` is the fraction of +`(image, denoising-timestep)` pairs where the probability ratio +`π_new / π_old` exceeded `1 + clip_ratio`, meaning the policy is trying +to increase the probability of high-advantage images more than the clip +allows. `pg_clipfrac_lower` is the fraction where the ratio fell below +`1 - clip_ratio`, meaning the policy is suppressing low-advantage images +more aggressively than allowed. A large asymmetry between the two (e.g. +`higher` >> `lower`) indicates the dominant learning direction and can +guide tuning of `clip_ratio` or the learning rate. 
+ +**timing_per_image_ms/{stage}** — per-image latency in milliseconds for each +core compute stage: `gen` (rollout), `ref` (reference log-prob), `old_log_prob`, +`adv` (advantage computation), and `update_actor`. Use these to pinpoint which stage +dominates step time and where to focus optimisation effort. + +**perf/throughput** — images processed per second per GPU, computed as +`(train_batch_size × rollout.n) / (time_per_step × n_gpus)`. + +## Further reading + +For the algorithm background, detailed configuration notes, async reward, and rule-based +reward training (e.g. JPEG incompressibility), see: + +- {doc}`../algo/flowgrpo` diff --git a/docs/start/install.md b/docs/start/install.md new file mode 100644 index 00000000..9b0873d0 --- /dev/null +++ b/docs/start/install.md @@ -0,0 +1,43 @@ +# Installation + +Last updated: 04/23/2026 + +## Requirements + +| Dependency | Version | +|---|---| +| Python | >= 3.10 | +| CUDA | >= 12.1 | +| GPU | NVIDIA GPU (≥ 24 GB VRAM recommended) | + +## Install + +Install in this order to avoid dependency conflicts: + +```bash +# 1. vLLM and vLLM-Omni rollout backend +pip install "vllm==0.18" "vllm-omni==0.18" + +# 2. verl +pip install git+https://github.com/verl-project/verl.git@3eab8ccc6143c624e7f11c871896f941b3fec900 + +# 3. VeRL-Omni +pip install git+https://github.com/verl-project/verl-omni.git@main +``` + +Note: Install vLLM and vLLM-Omni first — they may override your existing PyTorch installation, +so installing them before verl and VeRL-Omni ensures a compatible CUDA-aware torch version. 
+ +## Optional Dependencies + +| Extra | Install | When needed | +|---|---|---| +| OCR reward | `pip install Levenshtein` | FlowGRPO training with OCR-based reward | + +## Post-Installation Verification + +```bash +python -c "import torch; print('torch', torch.__version__, '| CUDA', torch.version.cuda)" +python -c "import vllm; print('vllm', vllm.__version__)" +python -c "import verl, verl_omni; print('VeRL-Omni ready')" +``` diff --git a/examples/flowgrpo_trainer/README.md b/examples/flowgrpo_trainer/README.md new file mode 100644 index 00000000..d1cef8a0 --- /dev/null +++ b/examples/flowgrpo_trainer/README.md @@ -0,0 +1,177 @@ +# FlowGRPO Trainer + +This example shows how to post-train `Qwen-Image` with FlowGRPO on an OCR-style image generation task using `vllm-omni` rollout and a visual generative reward model (`Qwen3-VL-8B-Instruct` in this example). + +For the full installation and quickstart guide, see `docs/start/flowgrpo_quickstart.md`. For algorithm details and rule-based reward training (e.g. JPEG incompressibility), see `docs/algo/flowgrpo.md`. + +## Installation + +Install dependencies in this order to avoid conflicts: + +```bash +# 1. vLLM and vLLM-Omni rollout backend +pip install "vllm==0.18" "vllm-omni==0.18" + +# 2. verl +pip install git+https://github.com/verl-project/verl.git@3eab8ccc6143c624e7f11c871896f941b3fec900 + +# 3. verl-omni +pip install git+https://github.com/verl-project/verl-omni.git@main + +# 4. FlowGRPO example-specific dependency +pip install Levenshtein +``` + +For full installation details see `docs/start/install.md`. + +The provided script is configured for a single node with `4` GPUs. 
+ +## Prepare the dataset + +Obtain the raw OCR dataset from the original Flow-GRPO repository: + +- https://github.com/yifan123/flow_grpo/tree/main/dataset/ocr + +Place the raw dataset under `$WORKSPACE/data/ocr` (where `WORKSPACE` defaults to `$HOME`), then preprocess it into parquet files: + +```bash +python3 examples/flowgrpo_trainer/data_process/qwenimage_ocr.py \ + --input_dir $WORKSPACE/data/ocr \ + --output_dir $WORKSPACE/data/ocr +``` + +This produces: + +- `$WORKSPACE/data/ocr/train.parquet` +- `$WORKSPACE/data/ocr/test.parquet` + +## Prepare the models + +The scripts use `WORKSPACE` (default: `$HOME`) as the base directory. Set it to any writable location before launching: + +```bash +export WORKSPACE=/path/to/your/workspace # optional, defaults to $HOME +``` + +**Policy model (Qwen-Image):** download the weights locally. The default expected path is `$WORKSPACE/models/Qwen/Qwen-Image` (with the tokenizer at `$WORKSPACE/models/Qwen/Qwen-Image/tokenizer`). + +**Reward model (Qwen3-VL-8B-Instruct):** the script defaults to the Hugging Face Hub ID `Qwen/Qwen3-VL-8B-Instruct`, so no manual download is required — Hugging Face will cache it automatically on first run. 
+ +Override any path without editing the script via environment variables: + +```bash +MODEL_PATH # policy model path (default: $WORKSPACE/models/Qwen/Qwen-Image) +ACTOR_TOKENIZER_PATH # tokenizer path (default: $WORKSPACE/models/Qwen/Qwen-Image/tokenizer) +``` + +## Run training + +Launch the example from the repository root: + +```bash +bash examples/flowgrpo_trainer/run_qwen_image_ocr_lora.sh +``` + +Optional KL loss tuning (append these overrides to the command above; the script forwards extra arguments to the trainer): + +- `actor_rollout_ref.actor.use_kl_loss=True` +- `actor_rollout_ref.actor.kl_loss_coef=0.001` + +The script runs `python3 -m verl_omni.trainer.main_flowgrpo` with: + +- `algorithm.adv_estimator=flow_grpo` +- `actor_rollout_ref.model.path=${MODEL_PATH:-$WORKSPACE/models/Qwen/Qwen-Image}` +- `actor_rollout_ref.model.lora_rank=64` +- `actor_rollout_ref.model.lora_alpha=128` +- `actor_rollout_ref.rollout.name=vllm_omni` +- `reward.reward_manager.name=visual` +- `reward.custom_reward_function.name=compute_score_ocr` +- `trainer.n_gpus_per_node=4` + +## Logging + +W&B logging is enabled by default in the example script; export your API key before launching: + +```bash +export WANDB_API_KEY=<your_wandb_api_key> +``` + +The script sets: + +```bash +trainer.logger='["console", "wandb"]' +trainer.project_name=flow_grpo +trainer.experiment_name=qwen_image_ocr_lora +``` + +Override these values on the command line if you want to log under a different project or run name. + +### Diffusion-specific metrics + +The following metrics are specific to diffusion FlowGRPO training. + +**`critic/rewards/zero_std_ratio`** — the fraction of prompt groups (out of +`train_batch_size` prompts) where every one of the `n` generated images +received the same reward, giving a within-group standard deviation of zero. +GRPO derives its learning signal from *relative* rewards within a group, so a +group with zero std contributes no gradient regardless of the absolute reward +value. A persistently high ratio (e.g.
above 0.5) means the reward model is +saturated or the task difficulty is poorly calibrated — either all images are +rewarded or none are — and the policy is not receiving useful training signal. + +**`critic/rewards/std_mean`** — the mean of the per-prompt reward standard +deviations across all prompt groups in the batch. Complements +`zero_std_ratio`: while `zero_std_ratio` flags completely collapsed groups, +`std_mean` tracks the average reward spread within a group across the whole +batch. A healthy, rising `std_mean` indicates the reward model is producing +diverse signal; a declining `std_mean` is an early warning of reward +saturation before `zero_std_ratio` spikes. + +**`actor/pg_clipfrac_higher`** and **`actor/pg_clipfrac_lower`** — these +break down PPO clipping by direction. `pg_clipfrac_higher` is the fraction of +`(image, denoising-timestep)` pairs where the probability ratio +`π_new / π_old` exceeded `1 + clip_ratio`, meaning the policy is trying to +increase the probability of high-advantage images more than the clip allows. +`pg_clipfrac_lower` is the fraction where the ratio fell below +`1 - clip_ratio`, meaning the policy is trying to suppress low-advantage +images more aggressively than allowed. A large asymmetry between the two +(e.g. `higher` >> `lower`) indicates the dominant learning direction and can +guide tuning of `clip_ratio` or the learning rate. + +**`timing_per_image_ms/{stage}`** — per-image latency in milliseconds for +each core compute stage: `gen` (rollout), `ref` (reference log-prob), +`old_log_prob`, `adv` (advantage computation), and `update_actor`. Use +these to pinpoint which stage dominates step time. + +**`perf/throughput`** — images processed per second per GPU, computed as +`(train_batch_size × rollout.n) / (time_per_step × n_gpus)`. 
+ +## Variants + +For reward models that are expensive to evaluate (e.g., a VLM judge), the reward model can be allocated its own dedicated GPU resource pool and run asynchronously alongside the policy. This avoids blocking policy training on reward computation. The provided variant uses `4` GPUs for actor/rollout plus `1` dedicated reward GPU (`5` in total). + +```bash +bash examples/flowgrpo_trainer/run_qwen_image_ocr_lora_async_reward.sh +``` + + +## Performance + +> All experiments were conducted on *NVIDIA H800* GPUs using the OCR reward. + +The experiment settings and throughputs are shown in the table below. + +| Script | Model | Algorithm | Hybrid Engine | # Cards | Reward Fn | # GPUs for Actor | # GPUs for Rollout | # GPUs for Async Reward | Batch Size | `rollout.n` | lr | # Val Samples | Training Samples per Step | `ppo_micro_batch_size_per_gpu` | Throughput (Samples / Second / GPU) | Time per Step (Seconds) | +| --- | --- | --- | --- | --- | --- | --- | --- |-------------------------| --- | --- |------| --- | --- | --- |------------------------------| --------------------------------| +| `run_qwen_image_ocr_lora.sh` | Qwen-Image | Flow-GRPO | True | 4 | qwenvl-ocr-vllm | 4 | 4 | 0 (sync) | 32 | 16 | 3e-4 | 1k (full set) | 32×16=512 | 16 | 0.305 | 420 | +| `run_qwen_image_ocr_lora_async_reward.sh` | Qwen-Image | Flow-GRPO | True | 5 | qwenvl-ocr-vllm | 4 | 4 | 1 | 32 | 16 | 3e-4 | 1k (full set) | 32×16=512 | 16 | 0.280 | 360 | + +- Validation reward curve (evaluated with `trainer.val_before_train=True`): + +
+2p_comparison +
+qwen_image_ocr_lora: corresponds to the script `run_qwen_image_ocr_lora.sh`; +
+qwen_image_ocr_lora_async_reward: corresponds to the script `run_qwen_image_ocr_lora_async_reward.sh`. +
diff --git a/examples/flowgrpo_trainer/data_process/qwenimage_ocr.py b/examples/flowgrpo_trainer/data_process/qwenimage_ocr.py index 96c93621..0c1a67e8 100644 --- a/examples/flowgrpo_trainer/data_process/qwenimage_ocr.py +++ b/examples/flowgrpo_trainer/data_process/qwenimage_ocr.py @@ -30,18 +30,12 @@ def extract_solution(solution_str): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--local_dir", default=None) parser.add_argument("--hdfs_dir", default=None) - parser.add_argument( - "--local_dataset_path", default="~/dataset/ocr/", help="The local path to the raw dataset, if it exists." - ) - parser.add_argument( - "--local_save_dir", default="~/data/ocr", help="The save directory for the preprocessed dataset." - ) + parser.add_argument("--input_dir", default="~/dataset/ocr/", help="Path to the raw OCR dataset directory.") + parser.add_argument("--output_dir", default="~/data/ocr", help="Directory to save the preprocessed parquet files.") args = parser.parse_args() - if args.local_dataset_path is not None: - local_dataset_path = os.path.expanduser(args.local_dataset_path) + local_dataset_path = os.path.expanduser(args.input_dir) data_source = "flow_grpo/ocr" @@ -88,11 +82,7 @@ def process_fn(example, idx): test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) hdfs_dir = args.hdfs_dir - local_save_dir = args.local_dir - if local_save_dir is not None: - print("Warning: Argument 'local_dir' is deprecated. 
Please use 'local_save_dir' instead.") - else: - local_save_dir = args.local_save_dir + local_save_dir = args.output_dir local_save_dir = os.path.expanduser(local_save_dir) train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) diff --git a/examples/flowgrpo_trainer/run_qwen_image_ocr_lora.sh b/examples/flowgrpo_trainer/run_qwen_image_ocr_lora.sh index 984e22d2..1772c9da 100644 --- a/examples/flowgrpo_trainer/run_qwen_image_ocr_lora.sh +++ b/examples/flowgrpo_trainer/run_qwen_image_ocr_lora.sh @@ -1,14 +1,21 @@ # Qwen-Image lora RL, vllm_omni rollout set -x -ocr_train_path=$HOME/data/ocr/train.parquet -ocr_test_path=$HOME/data/ocr/test.parquet +# Set WORKSPACE to any writable directory; defaults to $HOME +WORKSPACE=${WORKSPACE:-$HOME} + +ocr_train_path=$WORKSPACE/data/ocr/train.parquet +ocr_test_path=$WORKSPACE/data/ocr/test.parquet ENGINE=vllm_omni REWARD_ENGINE=vllm reward_path=examples/flowgrpo_trainer/reward_fn.py -reward_model_name=$HOME/models/Qwen/Qwen3-VL-8B-Instruct +reward_model_name=Qwen/Qwen3-VL-8B-Instruct + +NUM_GPUS_ACTOR_ROLLOUT_REWARD=4 +ROLLOUT_TP=1 +REWARD_TP=4 python3 -m verl_omni.trainer.main_flowgrpo \ @@ -17,8 +24,8 @@ python3 -m verl_omni.trainer.main_flowgrpo \ data.val_files=$ocr_test_path \ data.train_batch_size=32 \ data.max_prompt_length=256 \ - actor_rollout_ref.model.path=$HOME/models/Qwen/Qwen-Image \ - actor_rollout_ref.model.tokenizer_path=$HOME/models/Qwen/Qwen-Image/tokenizer \ + actor_rollout_ref.model.path=${MODEL_PATH:-$WORKSPACE/models/Qwen/Qwen-Image} \ + actor_rollout_ref.model.tokenizer_path=${ACTOR_TOKENIZER_PATH:-$WORKSPACE/models/Qwen/Qwen-Image/tokenizer} \ actor_rollout_ref.model.external_lib="examples.flowgrpo_trainer.diffusers_impl" \ actor_rollout_ref.model.lora_rank=64 \ actor_rollout_ref.model.lora_alpha=128 \ @@ -35,7 +42,7 @@ python3 -m verl_omni.trainer.main_flowgrpo \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=$ENGINE \ 
actor_rollout_ref.rollout.n=16 \ - actor_rollout_ref.rollout.agent.num_workers=4 \ + actor_rollout_ref.rollout.agent.num_workers=$((NUM_GPUS_ACTOR_ROLLOUT_REWARD / ROLLOUT_TP)) \ actor_rollout_ref.rollout.load_format=safetensors \ actor_rollout_ref.rollout.layered_summon=True \ actor_rollout_ref.rollout.true_cfg_scale=4.0 \ @@ -48,7 +55,7 @@ python3 -m verl_omni.trainer.main_flowgrpo \ actor_rollout_ref.rollout.val_kwargs.algo.noise_level=0.0 \ actor_rollout_ref.rollout.external_lib=examples.flowgrpo_trainer.vllm_omni_impl \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ - reward.num_workers=4 \ + reward.num_workers=$((NUM_GPUS_ACTOR_ROLLOUT_REWARD / REWARD_TP)) \ reward.reward_manager.name=visual \ reward.reward_model.enable=True \ reward.reward_model.model_path=$reward_model_name \ @@ -61,9 +68,9 @@ python3 -m verl_omni.trainer.main_flowgrpo \ trainer.experiment_name=qwen_image_ocr_lora \ trainer.log_val_generations=8 \ trainer.val_before_train=False \ - trainer.n_gpus_per_node=4 \ + trainer.n_gpus_per_node=$NUM_GPUS_ACTOR_ROLLOUT_REWARD \ trainer.nnodes=1 \ trainer.save_freq=30 \ trainer.test_freq=30 \ trainer.total_epochs=15 \ - trainer.total_training_steps=300 $@ + trainer.total_training_steps=300 "$@" diff --git a/examples/flowgrpo_trainer/run_qwen_image_ocr_lora_async_reward.sh b/examples/flowgrpo_trainer/run_qwen_image_ocr_lora_async_reward.sh new file mode 100644 index 00000000..7bc0a091 --- /dev/null +++ b/examples/flowgrpo_trainer/run_qwen_image_ocr_lora_async_reward.sh @@ -0,0 +1,83 @@ +# Qwen-Image lora RL, vllm_omni rollout +set -x + +# Set WORKSPACE to any writable directory; defaults to $HOME +WORKSPACE=${WORKSPACE:-$HOME} + +ocr_train_path=$WORKSPACE/data/ocr/train.parquet +ocr_test_path=$WORKSPACE/data/ocr/test.parquet + +ENGINE=vllm_omni +REWARD_ENGINE=vllm + +reward_path=examples/flowgrpo_trainer/reward_fn.py +reward_model_name=Qwen/Qwen3-VL-8B-Instruct + +NUM_GPUS_ACTOR_ROLLOUT=4 +NUM_GPUS_REWARD=1 +ROLLOUT_TP=1 +REWARD_TP=1 + + 
+python3 -m verl_omni.trainer.main_flowgrpo \
+    algorithm.adv_estimator=flow_grpo \
+    data.train_files="$ocr_train_path" \
+    data.val_files="$ocr_test_path" \
+    data.train_batch_size=32 \
+    data.max_prompt_length=256 \
+    actor_rollout_ref.model.path="${MODEL_PATH:-$WORKSPACE/models/Qwen/Qwen-Image}" \
+    actor_rollout_ref.model.tokenizer_path="${ACTOR_TOKENIZER_PATH:-$WORKSPACE/models/Qwen/Qwen-Image/tokenizer}" \
+    actor_rollout_ref.model.external_lib="examples.flowgrpo_trainer.diffusers_impl" \
+    actor_rollout_ref.model.lora_rank=64 \
+    actor_rollout_ref.model.lora_alpha=128 \
+    actor_rollout_ref.model.target_modules="['to_q','to_k','to_v','to_out.0','add_q_proj','add_k_proj','add_v_proj','to_add_out','img_mlp.net.0.proj','img_mlp.net.2','txt_mlp.net.0.proj','txt_mlp.net.2']" \
+    actor_rollout_ref.actor.optim.lr=3e-4 \
+    actor_rollout_ref.actor.optim.weight_decay=0.0001 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.actor.fsdp_config.param_offload=True \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+    actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
+    actor_rollout_ref.actor.diffusion_loss.loss_mode=flow_grpo \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP \
+    actor_rollout_ref.rollout.name=$ENGINE \
+    actor_rollout_ref.rollout.n=16 \
+    actor_rollout_ref.rollout.agent.num_workers=$((NUM_GPUS_ACTOR_ROLLOUT / ROLLOUT_TP)) \
+    actor_rollout_ref.rollout.load_format=safetensors \
+    actor_rollout_ref.rollout.layered_summon=True \
+    actor_rollout_ref.rollout.true_cfg_scale=4.0 \
+    actor_rollout_ref.rollout.max_sequence_length=256 \
+    actor_rollout_ref.rollout.algo.noise_level=1.2 \
+    actor_rollout_ref.rollout.algo.sde_type="sde" \
+    actor_rollout_ref.rollout.algo.sde_window_size=2 \
+    actor_rollout_ref.rollout.algo.sde_window_range="[0,5]" \
+    actor_rollout_ref.rollout.val_kwargs.num_inference_steps=50 \
+    actor_rollout_ref.rollout.val_kwargs.algo.noise_level=0.0 \
+    actor_rollout_ref.rollout.external_lib=examples.flowgrpo_trainer.vllm_omni_impl \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
+    reward.num_workers=$((NUM_GPUS_REWARD / REWARD_TP)) \
+    reward.reward_manager.name=visual \
+    reward.reward_model.enable=True \
+    reward.reward_model.model_path="$reward_model_name" \
+    reward.reward_model.rollout.name=$REWARD_ENGINE \
+    reward.reward_model.enable_resource_pool=True \
+    reward.reward_model.nnodes=1 \
+    reward.reward_model.n_gpus_per_node=$NUM_GPUS_REWARD \
+    reward.reward_model.rollout.gpu_memory_utilization=0.9 \
+    reward.reward_model.rollout.free_cache_engine=False \
+    reward.reward_model.rollout.tensor_model_parallel_size=$REWARD_TP \
+    reward.reward_model.rollout.enforce_eager=False \
+    reward.custom_reward_function.path="$reward_path" \
+    reward.custom_reward_function.name=compute_score_ocr \
+    trainer.logger='["console", "wandb"]' \
+    trainer.project_name=flow_grpo \
+    trainer.experiment_name=qwen_image_ocr_lora_async_reward \
+    trainer.log_val_generations=8 \
+    trainer.val_before_train=False \
+    trainer.n_gpus_per_node=$NUM_GPUS_ACTOR_ROLLOUT \
+    trainer.nnodes=1 \
+    trainer.save_freq=30 \
+    trainer.test_freq=30 \
+    trainer.total_epochs=15 \
+    trainer.total_training_steps=300 "$@"