Skip to content
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
84e6134
init
lim4349 Nov 17, 2025
e065b77
first
Nov 17, 2025
d71c66d
second
lim4349 Nov 17, 2025
ef0daf8
third
lim4349 Nov 18, 2025
ec7faf2
force
lim4349 Nov 18, 2025
9bb9c83
May
lim4349 Nov 19, 2025
9f37dc5
May
lim4349 Nov 19, 2025
6c55be2
remove annotations
lim4349 Nov 19, 2025
14453ec
changes discarded gpu_runner
lim4349 Nov 19, 2025
c5688f7
last
lim4349 Nov 20, 2025
9d9b7d7
feat: add registry
lim4349 Nov 20, 2025
9207e8c
[Model] Add OpenCUA-7B support
lim4349 Nov 20, 2025
2a38534
Update vllm/model_executor/models/opencua.py
lim4349 Nov 20, 2025
f5ee87b
Update vllm/model_executor/models/opencua.py
lim4349 Nov 20, 2025
423a670
Fix OpenCUA implementation
lim4349 Nov 20, 2025
cbb4294
Merge branch 'main' into main
lim4349 Nov 20, 2025
e8ba966
Fix OpenCUA __init__ to properly initialize language_model
lim4349 Nov 20, 2025
ab90b52
Fix OpenCUA implementation: remove video support and fix config handling
lim4349 Nov 20, 2025
705b533
Merge branch 'main' into main
lim4349 Nov 20, 2025
9387247
Remove OpenCUAConfig from _CONFIG_REGISTRY
lim4349 Nov 20, 2025
37e5bde
refactor: Replace OpenCUAVisionTransformer subclass with import alias
lim4349 Nov 20, 2025
d47eecc
refactor: Clean up OpenCUA model implementation
lim4349 Nov 20, 2025
1565dab
refactor: Remove OpenCUAConfig
lim4349 Nov 20, 2025
6f9dca1
Merge branch 'main' into main
lim4349 Nov 20, 2025
9d71d6d
Merge branch 'main' into main
lim4349 Nov 23, 2025
b6bccec
fix: Add _hf_processor_applies_updates override for OpenCUA processor
lim4349 Nov 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
| `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
Expand Down
3 changes: 3 additions & 0 deletions tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,9 @@ def check_available_online(
"NemotronH_Nano_VL_V2": _HfExamplesInfo(
"nano_vl_dummy", is_available_online=False, trust_remote_code=True
),
"OpenCUAForConditionalGeneration": _HfExamplesInfo(
"xlangai/OpenCUA-7B", trust_remote_code=True
),
"Ovis": _HfExamplesInfo(
"AIDC-AI/Ovis2-1B",
trust_remote_code=True,
Expand Down
261 changes: 261 additions & 0 deletions vllm/model_executor/models/opencua.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Adapted from Qwen2.5-VL implementation
# Copyright 2025 The vLLM team.
# Copyright 2025 XLANG Lab, The University of Hong Kong

"""Inference-only OpenCUA-7B model compatible with HuggingFace weights."""

from collections.abc import Mapping, Sequence
from typing import Any

import torch
import torch.nn as nn
from transformers import BatchFeature
from transformers.models.qwen2_vl import (
Qwen2VLImageProcessor,
Qwen2VLProcessor,
Qwen2VLVideoProcessor,
)

from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalFieldConfig,
MultiModalKwargs,
)
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
PromptReplacement,
PromptUpdate,
)
from vllm.transformers_utils.tokenizer import AnyTokenizer

from .qwen2_5_vl import (
Qwen2_5_VisionTransformer as OpenCUAVisionTransformer,
)
from .qwen2_5_vl import (
Qwen2_5_VLForConditionalGeneration,
)
from .qwen2_vl import (
Qwen2VLDummyInputsBuilder,
Qwen2VLMultiModalDataParser,
Qwen2VLProcessingInfo,
_create_qwen2vl_field_factory,
)
from .utils import (
WeightsMapper,
init_vllm_registered_model,
maybe_prefix,
)


class OpenCUAProcessingInfo(Qwen2VLProcessingInfo):
    """Processing info for OpenCUA, built on the Qwen2-VL processing info."""

    def get_hf_config(self):
        """Return the HF config object provided by the processing context."""
        return self.ctx.get_hf_config()

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        """Return the per-modality limits: only ``"image"``, mapped to None."""
        return {"image": None}

    def get_hf_processor(self, **kwargs: object):
        """Construct an ``OpenCUAProcessor`` for this model.

        The tokenizer comes from ``self.get_tokenizer()`` and the vision
        settings from the context's HF image-processor config; any extra
        keyword arguments are forwarded to the processor unchanged.
        """
        tok = self.get_tokenizer()
        image_processor_config = self.ctx.get_hf_image_processor_config()
        return OpenCUAProcessor(
            tokenizer=tok,
            vision_config=image_processor_config,
            **kwargs,
        )


class OpenCUAProcessor(Qwen2VLProcessor):
    """Qwen2-VL processor variant whose image token is ``<|media_placeholder|>``.

    ``__call__`` is overridden to tokenize the text and run the image
    processor independently, then merge both results into one BatchFeature.
    """

    def check_argument_for_proper_class(self, attribute_name: str, arg: object) -> None:
        """Skip the class check for the tokenizer; defer to the parent otherwise."""
        if attribute_name != "tokenizer":
            return super().check_argument_for_proper_class(attribute_name, arg)
        return None

    def __init__(
        self,
        vision_config: dict,
        tokenizer: AnyTokenizer,
        **kwargs,
    ):
        """Create image/video processors from ``vision_config`` and init the parent.

        Args:
            vision_config: Keyword arguments used to build both the
                ``Qwen2VLImageProcessor`` and the ``Qwen2VLVideoProcessor``.
            tokenizer: Tokenizer handed to the parent processor.
            **kwargs: Forwarded to the parent; ``chat_template`` is pulled
                out and passed explicitly (``None`` when absent).
        """
        img_proc = Qwen2VLImageProcessor(**vision_config)
        vid_proc = Qwen2VLVideoProcessor(**vision_config)
        template = kwargs.pop("chat_template", None)

        super().__init__(
            image_processor=img_proc,
            tokenizer=tokenizer,
            video_processor=vid_proc,
            chat_template=template,
            **kwargs,
        )

        # Set after the parent init so this value is the one left on the instance.
        self.image_token = "<|media_placeholder|>"

    def __call__(
        self,
        text=None,
        images=None,
        return_tensors=None,
        **kwargs,
    ):
        """Tokenize ``text`` and preprocess ``images`` into a single BatchFeature.

        Args:
            text: A prompt or list of prompts; a non-list value is wrapped
                in a list. ``None`` skips tokenization.
            images: An image or list of images; a non-list value is wrapped
                in a list. ``None`` or an empty list skips image processing.
            return_tensors: Tensor type for the resulting BatchFeature; the
                image processor falls back to ``"pt"`` when this is falsy.
            **kwargs: Forwarded to the tokenizer only.

        Returns:
            A BatchFeature holding the tokenizer outputs merged with the
            image-processor outputs (image keys win on collision).
        """
        text_inputs = {}
        if text is not None:
            prompts = text if isinstance(text, list) else [text]
            text_inputs = self.tokenizer(prompts, **kwargs)

        image_inputs = {}
        if images is not None:
            image_list = images if isinstance(images, list) else [images]
            if image_list:
                image_inputs = self.image_processor(
                    image_list, return_tensors=return_tensors or "pt"
                )

        return BatchFeature(
            {**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )


class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo]):
    """Multi-modal processor for OpenCUA (image inputs only)."""

    def _get_data_parser(self) -> MultiModalDataParser:
        """Return a Qwen2-VL data parser configured with the vision merge size."""
        return Qwen2VLMultiModalDataParser(
            self.info.get_hf_config().vision_config.spatial_merge_size
        )

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Build the field config by applying the Qwen2-VL field factory to ``hf_inputs``."""
        return _create_qwen2vl_field_factory(
            self.info.get_hf_config().vision_config.spatial_merge_size
        )(hf_inputs)

    def _hf_processor_applies_updates(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ) -> bool:
        """Report that the HF processor does not expand image placeholders.

        ``OpenCUAProcessor.__call__`` only tokenizes the text and runs the
        image processor; it never replaces ``<|media_placeholder|>`` with
        the per-image token run. Returning ``False`` asks vLLM to apply the
        replacements from ``_get_prompt_updates`` itself instead of assuming
        the processor output is already expanded.
        """
        return False

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        """Describe how each image placeholder token expands in the prompt.

        Returns:
            A single ``PromptReplacement`` for the ``"image"`` modality that
            replaces one placeholder token id with ``N`` copies of it, where
            ``N = prod(image_grid_thw) // merge_size**2`` for that image.
        """
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
        tokenizer = self.info.get_tokenizer()
        vocab = tokenizer.get_vocab()
        hf_config = self.info.get_hf_config()

        # Resolve the placeholder id from the vocab first; fall back to the
        # config attribute, then to the hard-coded default id.
        image_token_str = getattr(hf_processor, "image_token", "<|media_placeholder|>")
        image_token_id = vocab.get(
            image_token_str,
            getattr(hf_config, "media_placeholder_token_id", 151664),
        )

        merge_length = image_processor.merge_size**2

        def get_replacement_opencua(item_idx: int):
            """Return the repeated placeholder ids for image ``item_idx``."""
            out_item = out_mm_kwargs["image"][item_idx]
            grid_thw = out_item["image_grid_thw"].data
            assert isinstance(grid_thw, torch.Tensor)

            num_tokens = int(grid_thw.prod()) // merge_length
            return [image_token_id] * num_tokens

        return [
            PromptReplacement(
                modality="image",
                target=[image_token_id],
                replacement=get_replacement_opencua,
            )
        ]


class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder):
    """Dummy-input builder that emits OpenCUA's image placeholder token."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Return one ``<|media_placeholder|>`` per requested image.

        A missing ``"image"`` entry in ``mm_counts`` counts as zero, which
        yields an empty string.
        """
        image_count = mm_counts.get("image", 0)
        return "<|media_placeholder|>" * image_count


@MULTIMODAL_REGISTRY.register_processor(
    OpenCUAMultiModalProcessor,
    info=OpenCUAProcessingInfo,
    dummy_inputs=OpenCUADummyInputsBuilder,
)
class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
    """OpenCUA model: a Qwen2.5-VL vision tower with a Qwen2 causal LM.

    ``__init__`` calls ``nn.Module.__init__`` directly rather than the
    Qwen2.5-VL parent initializer and builds the submodules itself.
    """

    merge_by_field_config = True
    multimodal_cpu_fields = {"image_grid_thw"}

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    # Checkpoint-prefix -> vLLM-prefix rewrites. The more specific "model.*"
    # prefixes are listed before the catch-all "model." entry.
    # NOTE(review): presumably WeightsMapper applies the first matching
    # prefix, making this order significant — confirm before reordering.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.language_model.": "language_model.model.",
            "model.visual.": "visual.",
            "vision_tower.": "visual.",
            "lm_head.": "language_model.lm_head.",
            "model.": "language_model.model.",
        }
    )

    supports_encoder_tp_data = True

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        """Return the prompt placeholder string for a modality.

        Raises:
            ValueError: If ``modality`` does not start with ``"image"``.
        """
        if modality.startswith("image"):
            return "<|media_placeholder|>"
        raise ValueError("Only image modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the vision tower (when images are enabled) and the language model.

        Args:
            vllm_config: Engine configuration; the HF config, quantization
                config and multimodal config are read from it.
            prefix: Parameter-name prefix for the submodules.
        """
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config

        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
        self.config = config
        self.vllm_config = vllm_config
        self.multimodal_config = multimodal_config
        self.quant_config = quant_config
        self.is_multimodal_pruning_enabled = (
            multimodal_config.is_multimodal_pruning_enabled()
        )

        if multimodal_config.get_limit_per_prompt("image"):
            # multimodal_config has already been dereferenced above, so it
            # cannot be None here; the former `is not None` guard was dead.
            self.visual = OpenCUAVisionTransformer(
                vision_config=config.vision_config,
                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                quant_config=self.quant_config,
                prefix=maybe_prefix(prefix, "visual"),
                use_data_parallel=self.use_data_parallel,
                attn_backend_override=multimodal_config.mm_encoder_attn_backend,
            )
        else:
            # No vision tower is built when the image limit is falsy.
            self.visual = None

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Qwen2ForCausalLM"],
        )

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )
4 changes: 4 additions & 0 deletions vllm/model_executor/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,10 @@
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
"OpenCUAForConditionalGeneration": (
"opencua",
"OpenCUAForConditionalGeneration",
),
"InternS1ForConditionalGeneration": (
"interns1",
"InternS1ForConditionalGeneration",
Expand Down