-
-
Notifications
You must be signed in to change notification settings - Fork 18.9k
[Model] Add OpenCUA-7B support #29068
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 25 commits
Commits
Show all changes
26 commits
Select commit
Hold shift + click to select a range
84e6134
init
lim4349 e065b77
first
d71c66d
second
lim4349 ef0daf8
third
lim4349 ec7faf2
force
lim4349 9bb9c83
May
lim4349 9f37dc5
May
lim4349 6c55be2
remove annotations
lim4349 14453ec
changes discarded gpu_runner
lim4349 c5688f7
last
lim4349 9d9b7d7
feat: add registry
lim4349 9207e8c
[Model] Add OpenCUA-7B support
lim4349 2a38534
Update vllm/model_executor/models/opencua.py
lim4349 f5ee87b
Update vllm/model_executor/models/opencua.py
lim4349 423a670
Fix OpenCUA implementation
lim4349 cbb4294
Merge branch 'main' into main
lim4349 e8ba966
Fix OpenCUA __init__ to properly initialize language_model
lim4349 ab90b52
Fix OpenCUA implementation: remove video support and fix config handling
lim4349 705b533
Merge branch 'main' into main
lim4349 9387247
Remove OpenCUAConfig from _CONFIG_REGISTRY
lim4349 37e5bde
refactor: Replace OpenCUAVisionTransformer subclass with import alias
lim4349 d47eecc
refactor: Clean up OpenCUA model implementation
lim4349 1565dab
refactor: Remove OpenCUAConfig
lim4349 6f9dca1
Merge branch 'main' into main
lim4349 9d71d6d
Merge branch 'main' into main
lim4349 b6bccec
fix: Add _hf_processor_applies_updates override for OpenCUA processor
lim4349 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,261 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| # | ||
| # Adapted from Qwen2.5-VL implementation | ||
| # Copyright 2025 The vLLM team. | ||
| # Copyright 2025 XLANG Lab, The University of Hong Kong | ||
|
|
||
| """Inference-only OpenCUA-7B model compatible with HuggingFace weights.""" | ||
|
|
||
| from collections.abc import Mapping, Sequence | ||
| from typing import Any | ||
|
|
||
| import torch | ||
| import torch.nn as nn | ||
| from transformers import BatchFeature | ||
| from transformers.models.qwen2_vl import ( | ||
| Qwen2VLImageProcessor, | ||
| Qwen2VLProcessor, | ||
| Qwen2VLVideoProcessor, | ||
| ) | ||
|
|
||
| from vllm.config import VllmConfig | ||
| from vllm.multimodal import MULTIMODAL_REGISTRY | ||
| from vllm.multimodal.inputs import ( | ||
| MultiModalFieldConfig, | ||
| MultiModalKwargs, | ||
| ) | ||
| from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser | ||
| from vllm.multimodal.processing import ( | ||
| BaseMultiModalProcessor, | ||
| PromptReplacement, | ||
| PromptUpdate, | ||
| ) | ||
| from vllm.transformers_utils.tokenizer import AnyTokenizer | ||
|
|
||
| from .qwen2_5_vl import ( | ||
| Qwen2_5_VisionTransformer as OpenCUAVisionTransformer, | ||
| ) | ||
| from .qwen2_5_vl import ( | ||
| Qwen2_5_VLForConditionalGeneration, | ||
| ) | ||
| from .qwen2_vl import ( | ||
| Qwen2VLDummyInputsBuilder, | ||
| Qwen2VLMultiModalDataParser, | ||
| Qwen2VLProcessingInfo, | ||
| _create_qwen2vl_field_factory, | ||
| ) | ||
| from .utils import ( | ||
| WeightsMapper, | ||
| init_vllm_registered_model, | ||
| maybe_prefix, | ||
| ) | ||
|
|
||
|
|
||
class OpenCUAProcessingInfo(Qwen2VLProcessingInfo):
    """Processing info for OpenCUA, layered on the Qwen2-VL processing info."""

    def get_hf_config(self):
        """Return whatever HF config the processing context exposes."""
        return self.ctx.get_hf_config()

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        """Declare the supported modalities; only ``"image"`` is listed."""
        # NOTE(review): ``None`` presumably means "no fixed per-prompt cap";
        # confirm against the base-class contract.
        return {"image": None}

    def get_hf_processor(self, **kwargs: object):
        """Build an ``OpenCUAProcessor`` from the tokenizer and image config.

        Extra keyword arguments are forwarded to the processor unchanged.
        """
        return OpenCUAProcessor(
            tokenizer=self.get_tokenizer(),
            vision_config=self.ctx.get_hf_image_processor_config(),
            **kwargs,
        )
|
|
||
|
|
||
class OpenCUAProcessor(Qwen2VLProcessor):
    """Qwen2-VL processor variant using the ``<|media_placeholder|>`` token."""

    def check_argument_for_proper_class(self, attribute_name: str, arg: object) -> None:
        """Validate a constructor argument, exempting the tokenizer.

        NOTE(review): presumably the OpenCUA tokenizer is not of the class the
        parent processor expects, hence the bypass -- confirm.
        """
        if attribute_name != "tokenizer":
            return super().check_argument_for_proper_class(attribute_name, arg)
        return None

    def __init__(
        self,
        vision_config: dict,
        tokenizer: AnyTokenizer,
        **kwargs,
    ):
        """Create the image/video processors and initialise the parent.

        Args:
            vision_config: Keyword arguments used to construct both the image
                processor and the video processor.
            tokenizer: Tokenizer handed to the parent processor.
            **kwargs: Forwarded to the parent; ``chat_template`` is extracted
                and passed explicitly (``None`` when absent).
        """
        template = kwargs.pop("chat_template", None)
        super().__init__(
            image_processor=Qwen2VLImageProcessor(**vision_config),
            tokenizer=tokenizer,
            video_processor=Qwen2VLVideoProcessor(**vision_config),
            chat_template=template,
            **kwargs,
        )

        # Assigned after the parent initialiser so this value is the final one.
        self.image_token = "<|media_placeholder|>"

    def __call__(
        self,
        text=None,
        images=None,
        return_tensors=None,
        **kwargs,
    ):
        """Tokenize ``text`` and preprocess ``images`` into one ``BatchFeature``.

        A single (non-list) ``text`` or ``images`` value is wrapped in a list.
        The image processor is invoked only for a non-empty image list and
        defaults to ``"pt"`` tensors when ``return_tensors`` is not given.
        ``return_tensors`` is not forwarded to the tokenizer; it is passed to
        ``BatchFeature`` as ``tensor_type`` instead.
        """
        text_inputs = {}
        if text is not None:
            text_batch = text if isinstance(text, list) else [text]
            text_inputs = self.tokenizer(text_batch, **kwargs)

        image_inputs = {}
        if images is not None:
            image_batch = images if isinstance(images, list) else [images]
            if image_batch:
                image_inputs = self.image_processor(
                    image_batch, return_tensors=return_tensors or "pt"
                )

        # Image entries win on key collisions, as they are unpacked last.
        return BatchFeature(
            {**text_inputs, **image_inputs}, tensor_type=return_tensors
        )
|
|
||
|
|
||
class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo]):
    """Multimodal processor that expands OpenCUA image placeholders."""

    def _get_data_parser(self) -> MultiModalDataParser:
        """Return a Qwen2-VL data parser configured with the spatial merge size."""
        vision_config = self.info.get_hf_config().vision_config
        return Qwen2VLMultiModalDataParser(vision_config.spatial_merge_size)

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Derive the field configuration via the Qwen2-VL field factory."""
        vision_config = self.info.get_hf_config().vision_config
        field_factory = _create_qwen2vl_field_factory(
            vision_config.spatial_merge_size
        )
        return field_factory(hf_inputs)

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        """Replace each image placeholder token with one token per merged patch.

        The placeholder id is looked up in the tokenizer vocabulary under the
        processor's ``image_token``; when missing, the config's
        ``media_placeholder_token_id`` is used, and 151664 as the last resort.
        """
        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        img_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
        token_to_id = self.info.get_tokenizer().get_vocab()
        config = self.info.get_hf_config()

        placeholder = getattr(processor, "image_token", "<|media_placeholder|>")
        fallback_id = getattr(config, "media_placeholder_token_id", 151664)
        placeholder_id = token_to_id.get(placeholder, fallback_id)

        # Divisor applied to the grid product: merge_size squared.
        patches_per_token = img_processor.merge_size**2

        def expand_placeholder(item_idx: int):
            """Return the placeholder id repeated for image ``item_idx``."""
            grid = out_mm_kwargs["image"][item_idx]["image_grid_thw"].data
            assert isinstance(grid, torch.Tensor)

            repeat = int(grid.prod()) // patches_per_token
            return [placeholder_id] * repeat

        replacement = PromptReplacement(
            modality="image",
            target=[placeholder_id],
            replacement=expand_placeholder,
        )
        return [replacement]
|
|
||
|
|
||
class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder):
    """Dummy-input builder that emits OpenCUA image placeholders."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Return one ``<|media_placeholder|>`` per requested image.

        A missing ``"image"`` entry counts as zero, giving an empty string.
        """
        return "<|media_placeholder|>" * mm_counts.get("image", 0)
|
|
||
|
|
||
@MULTIMODAL_REGISTRY.register_processor(
    OpenCUAMultiModalProcessor,
    info=OpenCUAProcessingInfo,
    dummy_inputs=OpenCUADummyInputsBuilder,
)
class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
    """OpenCUA model: a Qwen2.5-VL vision tower with a Qwen2 language model.

    Only the image modality is supported.
    """

    merge_by_field_config = True
    multimodal_cpu_fields = {"image_grid_thw"}

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    # Checkpoint-name prefix rewrites. The bare "model." entry is listed last,
    # presumably as the catch-all after the more specific prefixes -- keep the
    # order unless the mapper's matching rules are confirmed to ignore it.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.language_model.": "language_model.model.",
            "model.visual.": "visual.",
            "vision_tower.": "visual.",
            "lm_head.": "language_model.lm_head.",
            "model.": "language_model.model.",
        }
    )

    supports_encoder_tp_data = True

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        """Return the prompt placeholder for the ``i``-th item of ``modality``.

        Raises:
            ValueError: If ``modality`` does not start with ``"image"``.
        """
        if modality.startswith("image"):
            return "<|media_placeholder|>"
        raise ValueError("Only image modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the optional vision tower and the Qwen2 language model.

        Args:
            vllm_config: Engine configuration; the HF config, quantization
                config and multimodal config are read from it.
            prefix: Module-name prefix applied to the sub-modules.
        """
        # Calls nn.Module.__init__ directly rather than the Qwen2.5-VL parent
        # initialiser; the sub-modules are constructed explicitly below.
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config

        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
        self.config = config
        self.vllm_config = vllm_config
        self.multimodal_config = multimodal_config
        self.quant_config = quant_config
        self.is_multimodal_pruning_enabled = (
            multimodal_config.is_multimodal_pruning_enabled()
        )

        if multimodal_config.get_limit_per_prompt("image"):
            # multimodal_config has already been dereferenced several times
            # above, so it cannot be None here; the former
            # `... if multimodal_config is not None else None` guard was
            # unreachable and has been dropped.
            attn_backend_override = multimodal_config.mm_encoder_attn_backend
            self.visual = OpenCUAVisionTransformer(
                vision_config=config.vision_config,
                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                quant_config=self.quant_config,
                prefix=maybe_prefix(prefix, "visual"),
                use_data_parallel=self.use_data_parallel,
                attn_backend_override=attn_backend_override,
            )
        else:
            # A falsy image limit means no vision tower is instantiated.
            self.visual = None

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Qwen2ForCausalLM"],
        )

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.