Merged
Commits
718 commits
6d3073b
0707_1
antoinegg1 Jul 7, 2025
32100c6
0707_1
antoinegg1 Jul 7, 2025
79af776
0707_2
antoinegg1 Jul 7, 2025
5f6bdcc
0707_2
antoinegg1 Jul 7, 2025
17ed423
Merge branch 'lcy/refactor' into fw/refactor
antoinegg1 Jul 7, 2025
9cdf903
Merge branch 'lcy/refactor' into fw/refactor
antoinegg1 Jul 7, 2025
7d9f41b
0703_3
antoinegg1 Jul 7, 2025
3132862
0703_3
antoinegg1 Jul 7, 2025
d15f131
r
antoinegg1 Jul 7, 2025
1dfe91c
add api
garrett4wade Jul 7, 2025
1006be8
add directory structure
garrett4wade Jul 7, 2025
28c9479
add tests template
garrett4wade Jul 7, 2025
db590ea
p
antoinegg1 Jul 7, 2025
fe3c27f
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 7, 2025
b6e19db
format
garrett4wade Jul 7, 2025
5010107
Merge branch 'fw/refactor' of https://code.alipay.com/inclusionAI/ARe…
garrett4wade Jul 7, 2025
43b07b3
fix
antoinegg1 Jul 7, 2025
5409498
fix
antoinegg1 Jul 7, 2025
132b755
fix
antoinegg1 Jul 7, 2025
6710d5f
Merge branch 'lite' of https://code.alipay.com/inclusionAI/AReaL into…
garrett4wade Jul 7, 2025
3a0f1e5
checkout previous impl
garrett4wade Jul 7, 2025
95c315e
checkout previous implementations
garrett4wade Jul 7, 2025
3b2f43a
checkout prev impl
garrett4wade Jul 7, 2025
9c0c094
refactor
antoinegg1 Jul 7, 2025
e251abb
add remote sglang engine
garrett4wade Jul 7, 2025
7ab6755
Merge branch 'lite' of https://github.com/inclusionAI/AReaL into lite
garrett4wade Jul 7, 2025
cf0db6a
format
garrett4wade Jul 7, 2025
34a64a9
0707_6
antoinegg1 Jul 7, 2025
9dd893c
0707_7
antoinegg1 Jul 7, 2025
57b9b94
add readme
garrett4wade Jul 7, 2025
645b58c
refactor1
antoinegg1 Jul 7, 2025
90f4cf0
0707_undone
antoinegg1 Jul 7, 2025
b006b31
f
antoinegg1 Jul 7, 2025
aced39b
0708_1
antoinegg1 Jul 8, 2025
6018376
Merge remote-tracking branch 'origin/lcy/refactor' into lcy/refactor
antoinegg1 Jul 8, 2025
74a2eba
0708_2
antoinegg1 Jul 8, 2025
fcfa067
0708_3
antoinegg1 Jul 8, 2025
b584cd2
0708_7
antoinegg1 Jul 8, 2025
3d3f682
0708_4
antoinegg1 Jul 8, 2025
184f9e8
0709_1
antoinegg1 Jul 9, 2025
2b6f962
0709_2
antoinegg1 Jul 9, 2025
e7991fc
0709_3
antoinegg1 Jul 9, 2025
8771778
PullRequest: 331 [lite] Support remote sglang engine with correspondi…
garrett4wade Jul 9, 2025
223cafd
0709_4
antoinegg1 Jul 9, 2025
c01052a
0709_5
antoinegg1 Jul 9, 2025
605342d
0709_
antoinegg1 Jul 9, 2025
7379a9d
0709_6
antoinegg1 Jul 9, 2025
8a7d656
0709_7
antoinegg1 Jul 9, 2025
7a438c0
PullRequest: 336 add wrapper
kdada Jul 9, 2025
92f144e
0709_7
antoinegg1 Jul 9, 2025
3eaf620
0709_8
antoinegg1 Jul 9, 2025
2edcd2a
0709_9
antoinegg1 Jul 9, 2025
15dfbe8
PullRequest: 332 [lite] Support FSDP engines
garrett4wade Jul 9, 2025
7be4ab0
PullRequest: 339 [Fix] Fix some minor issues to pass all tests.
garrett4wade Jul 9, 2025
ee6f5a8
chore: empty commit
futrime Jul 9, 2025
8e201ef
ci: build images on demand
futrime Jul 9, 2025
a70cd28
ci: fix on demand condition
futrime Jul 9, 2025
a203c7c
ci: fix env sha
futrime Jul 9, 2025
c38cffc
PullRequest: 340 [lite] Refactor trainer API into utilities and remov…
garrett4wade Jul 10, 2025
42c717b
Merge branch 'lite' of https://github.com/inclusionAI/AReaL into lite
garrett4wade Jul 10, 2025
0cd58b5
0710_1
antoinegg1 Jul 10, 2025
d48bf00
Merge branch 'main' of https://github.com/inclusionAI/AReaL into lite
garrett4wade Jul 10, 2025
496413f
0710_2
antoinegg1 Jul 10, 2025
3bf9c85
[Fix] Merge previous contributions from fw/refactor to lite (#163)
garrett4wade Jul 10, 2025
e57cb20
0710_2
antoinegg1 Jul 10, 2025
3122d90
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 10, 2025
50cf951
0710_3
antoinegg1 Jul 10, 2025
27c06b9
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 10, 2025
8d4b8dc
[Doc] Add an instruction about how to run the SFT example. (#164)
garrett4wade Jul 10, 2025
622781d
0710_3
antoinegg1 Jul 10, 2025
0d3c579
0710_3
antoinegg1 Jul 10, 2025
0640d5a
0710_5
antoinegg1 Jul 10, 2025
e1f2853
0710_4
antoinegg1 Jul 10, 2025
8c7affe
merge_lite
antoinegg1 Jul 10, 2025
395affa
merge_2
antoinegg1 Jul 10, 2025
4c0cd02
merge_3
antoinegg1 Jul 10, 2025
16e087d
0711_1
antoinegg1 Jul 11, 2025
fc51cd5
0711_2
antoinegg1 Jul 11, 2025
2af8cd5
0711_3
antoinegg1 Jul 11, 2025
b3fed3c
0711_4
antoinegg1 Jul 11, 2025
cad4488
0711_6
antoinegg1 Jul 11, 2025
437f7a7
0711_7
antoinegg1 Jul 11, 2025
35ab78b
0711_8
antoinegg1 Jul 11, 2025
04e432e
0711_8
antoinegg1 Jul 11, 2025
d6ff9e7
0711_9
antoinegg1 Jul 11, 2025
036aa9a
0711_10
antoinegg1 Jul 11, 2025
10a3731
0711-11
antoinegg1 Jul 11, 2025
c5f0235
[Fix] Fix CI running condition for lite. (#172)
garrett4wade Jul 12, 2025
434d2f5
PullRequest: 353 [Lite] Add gradient checkpointing to FSDPEngine
nuzant Jul 14, 2025
d8038b2
PullRequest: 354 [lite] GRPO pre-commit: minor changes in FSDP engine
garrett4wade Jul 14, 2025
d79f0dc
0714_1
antoinegg1 Jul 14, 2025
eb524e8
0714_2
antoinegg1 Jul 14, 2025
84cd936
0714_3
antoinegg1 Jul 14, 2025
b74f240
0714_3
antoinegg1 Jul 14, 2025
101c4e9
0714_5
antoinegg1 Jul 14, 2025
724628e
PullRequest: 355 [Lite] GRPO pre-commit 2: Refactor RemoteSGLangEngin…
garrett4wade Jul 14, 2025
8a15551
PullRequest: 357 [lite] GRPO pre-commit 3: Fix typos and experiment u…
garrett4wade Jul 14, 2025
8cc9b1f
added LocalSGlangEngine and test (#170)
PrinsYin Jul 15, 2025
9ed043f
format (#174)
garrett4wade Jul 15, 2025
3f95968
PullRequest: 358 [lite] Support GRPO training locally with the GSM8k …
garrett4wade Jul 15, 2025
69f5450
merge1
antoinegg1 Jul 15, 2025
0435aa5
merge2
antoinegg1 Jul 15, 2025
e960c17
0715_1
antoinegg1 Jul 15, 2025
ec60071
0715_2
antoinegg1 Jul 15, 2025
325ef6e
0715_2
antoinegg1 Jul 15, 2025
ef4215d
[Feat][Refactor]Support DeepSpeed AutoTP; Refactor hf_engine.py and u…
Jayon02 Jul 16, 2025
517353c
fix ci (#175)
garrett4wade Jul 16, 2025
4490b11
[Feature] Add pre-commit (#178)
garrett4wade Jul 16, 2025
e13db01
[lite] [refactor] Add GSM8k GRPO example. (#179)
garrett4wade Jul 16, 2025
c75dcaf
merge
garrett4wade Jul 16, 2025
712a4ab
0716_1
antoinegg1 Jul 16, 2025
5efd861
0716_2
antoinegg1 Jul 16, 2025
b2bd639
PullRequest: 368 [lite] Refactor train engine after merging contribut…
garrett4wade Jul 16, 2025
b56f599
PullRequest: 371 [lite] [fix] fix misc bugs in GRPO implementation
garrett4wade Jul 16, 2025
0283cfa
change doc (#180)
garrett4wade Jul 16, 2025
29e164a
[Fix] [lite] Merge from the internal repo to fix GRPO bugs and refact…
garrett4wade Jul 16, 2025
74fcc38
0716_3
antoinegg1 Jul 16, 2025
1cbb642
0716_4
antoinegg1 Jul 16, 2025
2419d44
0716_4
antoinegg1 Jul 16, 2025
8596ef4
0716_5
antoinegg1 Jul 16, 2025
3c7c739
0717_1
antoinegg1 Jul 17, 2025
871b25a
0717_3
antoinegg1 Jul 17, 2025
0a2b9db
0717_3
antoinegg1 Jul 17, 2025
510313b
0717_4
antoinegg1 Jul 17, 2025
ce796f2
0717_5
antoinegg1 Jul 17, 2025
e9dc112
0717_6
antoinegg1 Jul 17, 2025
587544b
0717_6
antoinegg1 Jul 17, 2025
a032333
0717_6
antoinegg1 Jul 17, 2025
c0176b5
0718_1
antoinegg1 Jul 18, 2025
0e27a10
0718_2
antoinegg1 Jul 18, 2025
a08043e
0718_4
antoinegg1 Jul 18, 2025
090850a
0718_5
antoinegg1 Jul 18, 2025
ddabd9c
PullRequest: 370 [lite] Add Slurm Launcher and Ray Launcher
nuzant Jul 21, 2025
ade6a1d
Merge remote-tracking branch 'origin/lite' into lcy/refactor
antoinegg1 Jul 21, 2025
c8952f0
merge_0721
antoinegg1 Jul 21, 2025
25b65a2
0721_1
antoinegg1 Jul 21, 2025
2f1b679
PullRequest: 392 [lite] Fix several bugs regarding RL learning and ad…
garrett4wade Jul 21, 2025
588ffd2
0721_2
antoinegg1 Jul 21, 2025
a157510
0721_3
antoinegg1 Jul 21, 2025
8f26371
merge_0721_2
antoinegg1 Jul 21, 2025
f68a4f6
Implement fsdp distributed update (#183)
PrinsYin Jul 21, 2025
9c4da33
Merge branch 'lite' of https://github.com/inclusionAI/AReaL into lite
garrett4wade Jul 21, 2025
18f8a05
[Feature] [lite] Merge from internal dev repo (#189)
garrett4wade Jul 21, 2025
4804b05
[Refactor] Rename files in arealite before release. (#190)
garrett4wade Jul 21, 2025
9fcc177
0721_4
antoinegg1 Jul 21, 2025
ab5db3f
.
garrett4wade Jul 21, 2025
4dd4a22
.
garrett4wade Jul 21, 2025
339e87a
0721_formal
antoinegg1 Jul 21, 2025
67760d3
0721_formal
antoinegg1 Jul 21, 2025
60ac722
0721_merge3
antoinegg1 Jul 21, 2025
a2d6d21
0721_merge4
antoinegg1 Jul 21, 2025
b4e8215
0721_merge5
antoinegg1 Jul 21, 2025
475c35c
0721_6
antoinegg1 Jul 21, 2025
aed6a90
Merge remote-tracking branch 'backup/lite' into lcy/refactor
antoinegg1 Jul 21, 2025
c295614
0721_merge6
antoinegg1 Jul 21, 2025
f451dbd
0721_merge7
antoinegg1 Jul 21, 2025
80862b7
0721_8
antoinegg1 Jul 21, 2025
79e2a81
0722_1
antoinegg1 Jul 22, 2025
3d2f7a9
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 22, 2025
7199ce2
0722_2
antoinegg1 Jul 22, 2025
eba0b5f
0722_3
antoinegg1 Jul 22, 2025
ba16d4e
add quickstart (#194)
nuzant Jul 22, 2025
6239633
[doc] [lite] Add customization docs for AReaLite. (#191)
garrett4wade Jul 22, 2025
229f101
0722_4
antoinegg1 Jul 22, 2025
ea12141
0722_4
antoinegg1 Jul 22, 2025
c27a51b
0722_5
antoinegg1 Jul 22, 2025
5c0662f
0722_6
antoinegg1 Jul 22, 2025
af2f80c
0722_7
antoinegg1 Jul 22, 2025
8815be6
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 22, 2025
eff8f09
0723_1
antoinegg1 Jul 23, 2025
6bde86a
reformatted
antoinegg1 Jul 23, 2025
52c9447
clang-reformatted
antoinegg1 Jul 23, 2025
25884f5
clang-reformatted2
antoinegg1 Jul 23, 2025
391bd85
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 23, 2025
dd14838
0723_1
antoinegg1 Jul 23, 2025
9ec2c3f
0723_1
antoinegg1 Jul 23, 2025
4041afb
0723_1
antoinegg1 Jul 23, 2025
2a2e2fe
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 23, 2025
8e82c59
0723_merge3
antoinegg1 Jul 23, 2025
d12dec2
0723_4
antoinegg1 Jul 23, 2025
82442b8
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 23, 2025
00b5d87
0723_reformatted_5
antoinegg1 Jul 23, 2025
9a16605
0724_1
antoinegg1 Jul 24, 2025
6c28d52
0724_1
antoinegg1 Jul 24, 2025
c816a3c
0724_merge1
antoinegg1 Jul 24, 2025
e97e33f
0724_merge2
antoinegg1 Jul 24, 2025
176ec4b
0724_merge3
antoinegg1 Jul 24, 2025
5118cfa
0724_merge3
antoinegg1 Jul 24, 2025
5690b52
0724_merge4
antoinegg1 Jul 24, 2025
311bcd7
[lite] [feature] Bump to SGLang v0.4.9.post2 and use NCCL to update w…
garrett4wade Jul 24, 2025
84e2d75
Merge remote-tracking branch 'backup/lite' into lcy/refactor
antoinegg1 Jul 24, 2025
1bc9310
0724_merge5
antoinegg1 Jul 24, 2025
13fc236
0724_merge6
antoinegg1 Jul 24, 2025
27fd51a
0724_merge7
antoinegg1 Jul 24, 2025
e705db1
0724_merge8
antoinegg1 Jul 24, 2025
e26a43a
[Docs] [lite] Add example code walkthrough documentation. (#197)
nuzant Jul 24, 2025
f299740
[lite] [doc] Add AReaLite design doc as README (#198)
garrett4wade Jul 24, 2025
aa6c28e
Merge branch 'main' into lite
garrett4wade Jul 24, 2025
6aeeabf
0724_4
antoinegg1 Jul 24, 2025
f5924b1
0724_merge7
antoinegg1 Jul 24, 2025
84be9c9
Merge remote-tracking branch 'backup/lite' into lcy/refactor
antoinegg1 Jul 24, 2025
6255ad5
0724-merge8
antoinegg1 Jul 24, 2025
b8549ac
0724_merge8
antoinegg1 Jul 24, 2025
4198cd6
0725_1
antoinegg1 Jul 25, 2025
3c272ff
0725_6
antoinegg1 Jul 25, 2025
8eaced4
0725_7
antoinegg1 Jul 25, 2025
4f8b17f
0725_4padded_image
antoinegg1 Jul 25, 2025
cc3c6bb
0725_9padded_image
antoinegg1 Jul 25, 2025
60ac19a
0725_10padded_image
antoinegg1 Jul 25, 2025
fb1796d
0725_11
antoinegg1 Jul 25, 2025
a4ad671
0725
antoinegg1 Jul 25, 2025
6b8bfcf
0725_12
antoinegg1 Jul 25, 2025
4ff813a
0725_format
antoinegg1 Jul 25, 2025
e2a3579
Add self-hosted runner support (#199)
futrime Jul 28, 2025
7fb6a80
[WIP][feat] Initial support for VLMs, add Qwen2VL SFT test and Qwen2.…
antoinegg1 Jul 28, 2025
68b4b02
0731
antoinegg1 Jul 31, 2025
1ae006c
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Jul 31, 2025
d6a6240
0731_1
antoinegg1 Jul 31, 2025
c5cd21d
0731_2
antoinegg1 Jul 31, 2025
78d0367
0731_2
antoinegg1 Jul 31, 2025
2e0af5d
0731_3
antoinegg1 Jul 31, 2025
c3c986a
0731_4
antoinegg1 Jul 31, 2025
47972dd
0801_1
antoinegg1 Aug 1, 2025
d3084c3
0801_2
antoinegg1 Aug 1, 2025
6724134
0804_1
antoinegg1 Aug 4, 2025
c5ec8d1
0804_2
antoinegg1 Aug 4, 2025
866905e
0804_2
antoinegg1 Aug 4, 2025
3062f95
0804_merge
antoinegg1 Aug 4, 2025
6bb9082
0804_merge2
antoinegg1 Aug 4, 2025
be294c6
0804_5
antoinegg1 Aug 4, 2025
82646dc
Merge branch 'lcy/refactor' of https://code.alipay.com/inclusionAI/AR…
antoinegg1 Aug 4, 2025
7be94c0
0805_1
antoinegg1 Aug 5, 2025
3f741ff
0805_2
antoinegg1 Aug 5, 2025
3368fe6
0805_3
antoinegg1 Aug 5, 2025
0761668
0805_2
antoinegg1 Aug 5, 2025
78c62c1
0806
antoinegg1 Aug 6, 2025
6d6ddc9
0806_2
antoinegg1 Aug 6, 2025
6224db6
0806_merge1
antoinegg1 Aug 6, 2025
506b28c
0806_merge2
antoinegg1 Aug 6, 2025
818fa4f
0806_format1
antoinegg1 Aug 6, 2025
00f49f6
0806_merge3
antoinegg1 Aug 6, 2025
77210b5
0806_4
antoinegg1 Aug 6, 2025
94a7360
0806_6
antoinegg1 Aug 6, 2025
975b09b
0806_7
antoinegg1 Aug 6, 2025
96a9ea8
0806_formatted2
antoinegg1 Aug 6, 2025
2ca7fb7
Merge branch 'main' of https://github.com/inclusionAI/AReaL into lcy/…
garrett4wade Aug 7, 2025
06990e9
fix
garrett4wade Aug 7, 2025
847fdd8
revert examples
garrett4wade Aug 7, 2025
6bc6ef3
.
garrett4wade Aug 7, 2025
63d780d
.
garrett4wade Aug 7, 2025
2 changes: 1 addition & 1 deletion areal/README.md
@@ -558,4 +558,4 @@ class MyRolloutWorkflow:
`TrainEngine` respectively. Controllers handle engine deployment across the cluster and
manage data distribution, invoking engine methods through remote procedure calls (RPCs).
This architecture enables distributed operation while maintaining familiar interfaces
for users.
for users.
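
As a rough illustration of the controller-to-engine RPC pattern described above, here is a minimal sketch; the names (EngineClient, Controller, train_batch) and the sharding scheme are illustrative assumptions, not AReaL's actual API.

# Hypothetical sketch of a controller fanning one engine call out over RPC.
from concurrent.futures import ThreadPoolExecutor
from typing import Any, List


class EngineClient:
    """Stand-in for an RPC client bound to one remote TrainEngine replica."""

    def call(self, method: str, payload: Any) -> Any:
        raise NotImplementedError  # a real client would issue the remote procedure call


class Controller:
    def __init__(self, clients: List[EngineClient]):
        self.clients = clients

    def train_batch(self, batch: List[Any]) -> List[Any]:
        # Shard the batch across engines, then invoke the same method on every replica.
        shards = [batch[i :: len(self.clients)] for i in range(len(self.clients))]
        with ThreadPoolExecutor(max_workers=len(self.clients)) as pool:
            futures = [
                pool.submit(client.call, "train_batch", shard)
                for client, shard in zip(self.clients, shards)
            ]
            return [f.result() for f in futures]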
3 changes: 3 additions & 0 deletions areal/api/cli_args.py
@@ -633,6 +633,9 @@ class DatasetConfig:
default=0, metadata={"help": "Number of worker processes for data loading"}
)
drop_last: bool = field(default=True)
reward_fn: Optional[str] = field(
default=None,
)


@dataclass
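A small usage sketch for the new reward_fn field. Resolving the string mirrors areal/reward/__init__.py added below; how the value reaches the config (CLI flag, YAML, etc.) is an assumption here, and other DatasetConfig fields are not shown in this diff.

# Hypothetical wiring of DatasetConfig.reward_fn to the reward registry.
from areal.reward import get_custom_reward_fn

reward_fn_name = "geometry3k"  # e.g. taken from cfg.train_dataset.reward_fn
reward_fn = get_custom_reward_fn(reward_fn_name) if reward_fn_name else None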
22 changes: 17 additions & 5 deletions areal/dataset/__init__.py
@@ -2,7 +2,7 @@

import transformers

VALID_DATASETS = ["gsm8k", "clevr_count_70k"]
VALID_DATASETS = ["gsm8k", "clevr_count_70k", "geometry3k"]


def get_custom_dataset(
@@ -17,25 +17,37 @@
):

if "gsm8k" in path and type == "sft":
from areal.dataset.gsm8k import get_gsm8k_sft_dataset
from .gsm8k import get_gsm8k_sft_dataset

return get_gsm8k_sft_dataset(path, split, tokenizer, rank, world_size, **kwargs)
elif "gsm8k" in path and type == "rl":
from areal.dataset.gsm8k import get_gsm8k_rl_dataset
from .gsm8k import get_gsm8k_rl_dataset

return get_gsm8k_rl_dataset(path, split, rank, world_size, **kwargs)
elif "clevr_count_70k" in path and type == "sft":
from areal.dataset.clevr_count_70k import get_clevr_count_70k_sft_dataset
from .clevr_count_70k import get_clevr_count_70k_sft_dataset

return get_clevr_count_70k_sft_dataset(
path, split, processor, rank, world_size, **kwargs
)
elif "clevr_count_70k" in path and type == "rl":
from areal.dataset.clevr_count_70k import get_clevr_count_70k_rl_dataset
from .clevr_count_70k import get_clevr_count_70k_rl_dataset

return get_clevr_count_70k_rl_dataset(
path, split, processor, rank, world_size, **kwargs
)
elif "geometry3k" in path and type == "sft":
from .geometry3k import get_geometry3k_sft_dataset

return get_geometry3k_sft_dataset(
path, split, processor, rank, world_size, **kwargs
)
elif "geometry3k" in path and type == "rl":
from .geometry3k import get_geometry3k_rl_dataset

return get_geometry3k_rl_dataset(
path, split, processor, rank, world_size, **kwargs
)
else:
raise ValueError(
f"Dataset {path} with split {split} and training type {type} is not supported. "
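For reference, a sketch of how the new geometry3k RL branch might be reached. The keyword names follow the dispatch body above (the full signature is collapsed in this view), and the HF dataset path and processor checkpoint are only examples, not fixed by this PR.

# Illustrative call into the new geometry3k RL branch.
from transformers import AutoProcessor

from areal.dataset import get_custom_dataset

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
dataset = get_custom_dataset(
    path="hiyouga/geometry3k",  # any path containing "geometry3k"
    split="train",
    type="rl",
    processor=processor,
    rank=0,
    world_size=1,
)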
141 changes: 141 additions & 0 deletions areal/dataset/geometry3k.py
@@ -0,0 +1,141 @@
import math
from io import BytesIO
from typing import Any, Dict, Optional, Union

from datasets import load_dataset
from datasets.distributed import split_dataset_by_node
from PIL import Image
from PIL.Image import Image as ImageObject
from torchvision import transforms


def pad_to_square(img: Image.Image, fill=(0, 0, 0)) -> Image.Image:

w, h = img.size
side = max(w, h)
new_img = Image.new(img.mode, (side, side), color=fill)
offset = ((side - w) // 2, (side - h) // 2)
new_img.paste(img, offset)
return new_img


def convert_image(
image: Union[Dict[str, Any], ImageObject, str],
fixed_width: Optional[int] = None,
fixed_height: Optional[int] = None,
) -> ImageObject:
if (
fixed_width is not None
and fixed_height is not None
and (image.width != fixed_width or image.height != fixed_height)
):
preprocess = transforms.Compose(
[
transforms.CenterCrop((fixed_width, fixed_height)),  # core operation: center-crop to the fixed size
]
)
image = preprocess(image)
if image.mode != "RGB":
image = image.convert("RGB")
with BytesIO() as output:
image.save(output, format="JPEG")
return output.getvalue()


def get_geometry3k_sft_dataset(path, split, processor, rank, world_size):
"""
"geometry3k": {
"image_key": "images",
"question_key": "problem",
"answer_key": "answer"
},
"""
dataset = load_dataset(path=path, split=split)
dataset = split_dataset_by_node(dataset, rank=rank, world_size=world_size)
tokenizer = processor.tokenizer

def process_example(example, idx):
# Replace <image> placeholders, resize images, and build the SFT target sequence
images = example["images"]
if "qwen" in processor.image_processor.image_processor_type.lower():
image_token = "<|vision_start|><|image_pad|><|vision_end|>"
else:
image_token = processor.image_token if processor is not None else "<image>"
example["problem"] = (
example["problem"].replace("<image>", image_token).replace("different", "")
)
processed_images = []
for image in images:
processed_images.append(convert_image(image, 512, 512))
example["images"] = processed_images
example["seq"] = example["problem"] + example["answer"] + tokenizer.eos_token

return example

dataset = dataset.map(
lambda example, idx: process_example(example, idx),
with_indices=True,
)

def _process(example):
text = example["seq"]
processed_input = processor(
text=[text],
images=example["images"],
padding=False,
return_tensors="pt",
return_length=True,
return_attention_mask=False,
)

example["input_ids"] = processed_input["input_ids"].squeeze(0)
example["pixel_values"] = processed_input["pixel_values"]
example["image_grid_thw"] = processed_input["image_grid_thw"]
answer_token = tokenizer.encode(example["answer"])
loss_mask = [0] * (len(example["input_ids"]) - len(answer_token)) + [1] * len(
answer_token
)
example["loss_mask"] = loss_mask
return example

dataset = dataset.map(
lambda x: _process(x), remove_columns=["images", "seq", "problem", "answer"]
)
return dataset


def get_geometry3k_rl_dataset(path, split, processor, rank, world_size):
dataset = load_dataset(path=path, split=split)
dataset = split_dataset_by_node(dataset, rank=rank, world_size=world_size)

def process(sample):
processed_images = [
convert_image(image, 448, 448) for image in sample["images"]
]
if "qwen" in processor.image_processor.image_processor_type.lower():
image_token = "<|vision_start|><|image_pad|><|vision_end|>"
else:
image_token = processor.image_token if processor is not None else "<image>"
system_prompt = {
"role": "system",
"content": (
"Solve the following geometric problem based on the image. You may explain your reasoning before providing the final answer. The answer should be enclosed in [ ] and can be a number, decimal, or LaTeX format (e.g. \frac { 4 }{ 9 } \sqrt { 3 }).\n"
),
}

messages = [
{
"role": "user",
"content": sample["problem"]
.replace("<image>", image_token)
.replace("different", ""),
}
]
messages.insert(0, system_prompt)
messages = processor.tokenizer.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False
)
return {"messages": messages, "images": processed_images}

dataset = dataset.map(process).remove_columns(["problem"])
return dataset
34 changes: 30 additions & 4 deletions areal/engine/base_hf_engine.py
@@ -31,7 +31,11 @@
unsqueeze_mb_list,
)
from areal.utils.fsdp import get_cosine_schedule_with_warmup
from areal.utils.model import VALID_VISION_MODELS, disable_dropout_in_model
from areal.utils.model import (
VALID_VISION_MODELS,
disable_dropout_in_model,
is_qwen2_vl_model,
)
from realhf.api.core.data_api import load_hf_processor_and_tokenizer, load_hf_tokenizer
from realhf.base import constants, logging

@@ -253,10 +257,26 @@ def prepare_mb_list(self, input_: TensorDict) -> MicroBatchList:
assert (
"pixel_values" in input_ and "image_grid_thw" in input_
), "For vision-language models, pixel_values and image_grid_thw must be present in input_"

if isinstance(input_, dict):
input_ = TensorDict(input_, batch_size=[input_["input_ids"].shape[0]])
input_ = amend_position_ids(input_)
if is_qwen2_vl_model(self.model_config.model_type):
# Create the special t,h,w position IDs for qwen 2.5 VL
attn_mask = input_["attention_mask"]
input_ids = input_["input_ids"]
image_grid_thw = input_.get("image_grid_thw", None)
video_grid_thw = input_.get("video_grid_thw", None)
if image_grid_thw is not None:
image_grid_thw = image_grid_thw.squeeze(1)
if video_grid_thw is not None:
video_grid_thw = video_grid_thw.squeeze(1)
position_ids, _ = self.model.model.get_rope_index(
input_ids, image_grid_thw, video_grid_thw, attn_mask
)
# [3, bs, seqlen] -> [bs, seqlen, 3]
position_ids = torch.einsum("ijk->jki", position_ids)
input_["position_ids"] = position_ids
else:
input_ = amend_position_ids(input_)

mb_list = split_padded_tensor_dict_into_mb_list(input_, self.config.mb_spec)
mb_list.mbs = [pack_tensor_dict(mb) for mb in mb_list.mbs]
@@ -272,6 +292,10 @@ def prepare_mb_list(self, input_: TensorDict) -> MicroBatchList:
# NOTE: We unsqueeze here because huggingface transformer models requires
# packed input to be of shape [1, total_seqlen].
mb_list = unsqueeze_mb_list(mb_list)
if is_qwen2_vl_model(self.model_config.model_type):
for mb in mb_list.padded_mbs:
# [1, total_seqlen, 3] -> [3, 1, total_seqlen]
mb["position_ids"] = torch.einsum("ijk->kij", mb["position_ids"])

# FIXME: the resulting max_seqlen is a tensor rather than an integer
# TODO: remove the usage of tensordict
@@ -283,10 +307,12 @@ def prepare_mb_list(self, input_: TensorDict) -> MicroBatchList:
mb_list.padded_mbs[i] = dict(**mb)
for mb in mb_list.mbs:
mb["max_seqlen"] = int(mb["max_seqlen"])
mb["cu_seqlens_q"] = mb["cu_seqlens_k"] = mb["cu_seqlens"]
mb["use_cache"] = False
mb["attention_mask"] = dict(full_attention=None)
for mb in mb_list.padded_mbs:
mb["max_seqlen"] = int(mb["max_seqlen"])
mb["cu_seqlens_q"] = mb["cu_seqlens_k"] = mb["cu_seqlens"]
mb["use_cache"] = False
mb["attention_mask"] = dict(full_attention=None)

@@ -317,7 +343,6 @@ def train_batch(
for i, (pad_length, padded_mb_input, mb_input) in enumerate(
zip(mb_list.padding_lengths, mb_list.padded_mbs, mb_list.mbs)
):

outputs = self.model(**padded_mb_input)

logits = outputs.logits.squeeze(0)
@@ -408,6 +433,7 @@ def forward(
for pad_length, padded_mb_input, mb_input in zip(
mb_list.padding_lengths, mb_list.padded_mbs, mb_list.mbs
):

outputs = self.model(**padded_mb_input)
logits = outputs.logits.squeeze(0)
logits = logits[:-pad_length] if pad_length > 0 else logits
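To make the two einsum permutations in prepare_mb_list concrete, here is a standalone shape check; the batch size and sequence length are illustrative.

# Stand-alone illustration of the position-id layout changes above.
import torch

bs, seqlen = 2, 7
# get_rope_index-style output for Qwen2-VL models: [3, bs, seqlen] (t, h, w axes first)
position_ids = torch.zeros(3, bs, seqlen, dtype=torch.long)

# [3, bs, seqlen] -> [bs, seqlen, 3], so the ids can be split and packed per sequence
per_seq = torch.einsum("ijk->jki", position_ids)
assert per_seq.shape == (bs, seqlen, 3)

# After packing into [1, total_seqlen, 3], restore the layout the model forward expects
packed = per_seq.reshape(1, bs * seqlen, 3)
restored = torch.einsum("ijk->kij", packed)
assert restored.shape == (3, 1, bs * seqlen)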
17 changes: 17 additions & 0 deletions areal/reward/__init__.py
@@ -0,0 +1,17 @@
VALID_REWARD_FN = ["clevr_count_70k", "geometry3k"]


def get_custom_reward_fn(path: str, **kwargs):
if "clevr_count_70k" in path:
from .clevr_count_70k import clevr_count_70k_reward_fn

return clevr_count_70k_reward_fn
elif "geometry3k" in path:
from .geometry3k import geometry3k_reward_fn

return geometry3k_reward_fn
else:
raise ValueError(
f"Reward function {path} is not supported. "
f"Supported reward functions are: {VALID_REWARD_FN}. "
)
27 changes: 27 additions & 0 deletions areal/reward/clevr_count_70k.py
@@ -0,0 +1,27 @@
import re


def extract_answer(pred_str, data_name, use_last_number=True):
match = re.findall(r"\[([0-9\.]+)\]", pred_str)
if match:
return match[-1]

return ""


def clevr_count_70k_reward_fn(
prompt, completions, prompt_ids, completion_ids, answer, **kwargs
):
sol = extract_answer(completions, data_name="") # str number
ans = answer

if sol is None:
return 0
if ans is None:
return 0

if sol.strip() == ans.strip():
print(f"completions: {completions}, answer: {answer}")
return 1

return 0
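
A quick illustration of the bracket-extraction contract this reward function relies on; the prompt and completion strings are made up.

# Example of the expected completion format for clevr_count_70k_reward_fn.
from areal.reward.clevr_count_70k import clevr_count_70k_reward_fn

completion = "I count the objects one by one. The final answer is [3]."
score = clevr_count_70k_reward_fn(
    prompt="How many objects are there?",
    completions=completion,
    prompt_ids=None,
    completion_ids=None,
    answer="3",
)
assert score == 1  # "[3]" is extracted and matches the reference answer "3"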
29 changes: 29 additions & 0 deletions areal/reward/geometry3k.py
@@ -0,0 +1,29 @@
import re


def extract_answer(pred_str, data_name, use_last_number=True):
matches = re.findall(r"\[([^\]]+)\]", pred_str)
if matches:
return matches[-1]

return ""


def geometry3k_reward_fn(
prompt, completions, prompt_ids, completion_ids, answer, **kwargs
):
sol = extract_answer(completions, data_name="")  # bracketed answer string, "" if no match
ans = answer
if sol is None:
return 0
if ans is None:
return 0
sol = sol.replace(" ", "")
ans = ans.replace(" ", "")
# print(f"sol: {sol}, ans: {ans}")
from realhf.impl.dataset.math_parser import math_equal

if math_equal(sol, ans):
# print(f"completions: {completions}, answer: {answer}")
return 1
return 0
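
A similar sketch for the geometry3k reward; whether math_equal accepts a given LaTeX or numeric form depends on realhf's math parser, so the strings below are only assumptions.

# Example call for geometry3k_reward_fn; assumes math_equal treats identical
# numeric strings as equal.
from areal.reward.geometry3k import geometry3k_reward_fn

completion = "Using the Pythagorean theorem, the side length is [7.5]."
score = geometry3k_reward_fn(
    prompt="Find the side length based on the image.",
    completions=completion,
    prompt_ids=None,
    completion_ids=None,
    answer="7.5",
)
# Expected to be 1 if math_equal("7.5", "7.5") holds.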