Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 17 additions & 14 deletions tests/models/multimodal/generation/vlm_utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,13 +489,14 @@ def __init__(self, hf_runner: HfRunner):
self.image_size = self.vision_config.image_size

def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
from vllm.transformers_utils.processors.h2ovl import (
image_to_pixel_values_h2ovl,
)

IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_h2ovl(
Expand Down Expand Up @@ -751,16 +752,17 @@ def __init__(self, hf_runner: HfRunner):
self.image_size = self.vision_config.image_size

def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.skyworkr1v import (
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
from vllm.transformers_utils.processors.internvl import (
image_to_pixel_values_internvl,
)

IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_skyworkr1v(
image_to_pixel_values_internvl(
image,
input_size=self.image_size,
min_num=self.min_num,
Expand Down Expand Up @@ -815,14 +817,15 @@ def __call__(
videos: npt.NDArray | list[npt.NDArray] = None,
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
from vllm.transformers_utils.processors.internvl import (
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)

IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
Expand Down
3 changes: 2 additions & 1 deletion tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,8 @@ def check_available_online(
"rednote-hilab/dots.ocr", trust_remote_code=True
),
"Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
"nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False
"nvidia/Eagle2.5-8B",
trust_remote_code=True,
),
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
Expand Down
38 changes: 32 additions & 6 deletions vllm/model_executor/models/eagle2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .interfaces import (
Expand Down Expand Up @@ -68,12 +71,35 @@ class Eagle2_5_VLImageEmbeddingInputs(TensorSchema):
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
    """Processing info for Eagle2.5-VL model.

    The processor is assembled from the shared InternVL image processor and
    processor classes, configured from this model's HF config.
    """

    def get_image_processor(self, **kwargs):
        """Build the image processor, filling unset options from the HF config.

        ``kwargs`` are first passed through ``self.ctx.get_merged_mm_kwargs``;
        config values are then applied with ``setdefault``, so any option
        already present after the merge is left untouched.

        Returns:
            An ``InternVLImageProcessor`` built from the resulting options.
        """
        config = self.get_hf_config()
        vision_config = config.vision_config

        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
        # ``force_image_size`` wins over the vision tower's size when truthy.
        kwargs.setdefault(
            "image_size", config.force_image_size or vision_config.image_size
        )
        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
        kwargs.setdefault("use_thumbnail", config.use_thumbnail)

        return InternVLImageProcessor(**kwargs)

    def get_hf_processor(self, **kwargs) -> InternVLProcessor:
        """Build the processor from the tokenizer and the image processor.

        ``kwargs`` are forwarded to ``get_image_processor`` only; they are not
        passed a second time to the ``InternVLProcessor`` constructor.

        Returns:
            An ``InternVLProcessor`` whose ``image_seq_length`` is
            ``(image_size // patch_size) ** 2 * downsample_ratio ** 2``,
            truncated to ``int``.
        """
        config = self.get_hf_config()
        vision_config = config.vision_config

        image_processor = self.get_image_processor(**kwargs)
        # Use the size the image processor actually ended up with, so that a
        # caller override of ``image_size`` is reflected in the sequence length.
        image_size = image_processor.image_size
        patch_size = vision_config.patch_size
        downsample_ratio = config.downsample_ratio
        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))

        return InternVLProcessor(
            tokenizer=self.get_tokenizer(),
            image_processor=image_processor,
            image_seq_length=image_seq_length,
        )


Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/models/glm4v.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,13 +395,13 @@ def get_image_processor(self, **kwargs):
vision_config = config.vision_config

image_size = vision_config["image_size"]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size})

return GLM4VImageProcessorFast(**kwargs)

def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
    """Build the GLM4V processor from the tokenizer and image processor.

    ``kwargs`` are forwarded to ``get_image_processor`` only; the processor
    itself is constructed directly from the two components.
    """
    return GLM4VProcessor(
        tokenizer=self.get_tokenizer(),
        image_processor=self.get_image_processor(**kwargs),
    )
Expand Down
34 changes: 28 additions & 6 deletions vllm/model_executor/models/h2ovl.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
PromptUpdate,
TimingContext,
)
from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor

from .intern_vit import InternVisionModel
from .internvl import (
Expand All @@ -40,12 +40,34 @@


class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
    """Build the H2OVL image processor, defaulting unset options from config.

    ``kwargs`` go through ``self.ctx.get_merged_mm_kwargs`` first; every
    option still missing afterwards is read from the HF config (or its
    vision config, for ``image_size``).
    """
    hf_config = self.get_hf_config()
    vision_cfg = hf_config.vision_config

    processor_kwargs = self.ctx.get_merged_mm_kwargs(kwargs)

    # (option name, config object holding the attribute of the same name).
    # The attribute is read inside the loop so each lookup happens right
    # before its ``setdefault``, in the listed order.
    default_sources = (
        ("image_size", vision_cfg),
        ("min_dynamic_patch", hf_config),
        ("max_dynamic_patch", hf_config),
        ("dynamic_image_size", hf_config),
        ("use_thumbnail", hf_config),
        ("use_msac", hf_config),
    )
    for option, source in default_sources:
        processor_kwargs.setdefault(option, getattr(source, option))

    return H2OVLImageProcessor(**processor_kwargs)

def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
    """Build the H2OVL processor from the tokenizer and the image processor.

    ``kwargs`` are forwarded to ``get_image_processor`` only; they are not
    passed a second time to the ``H2OVLProcessor`` constructor.

    Returns:
        An ``H2OVLProcessor`` whose ``image_seq_length`` is
        ``(image_size // patch_size) ** 2 * downsample_ratio ** 2``,
        truncated to ``int``.
    """
    config = self.get_hf_config()
    vision_config = config.vision_config

    image_processor = self.get_image_processor(**kwargs)
    # Use the size the image processor actually ended up with, so that a
    # caller override of ``image_size`` is reflected in the sequence length.
    image_size = image_processor.image_size
    patch_size = vision_config.patch_size
    downsample_ratio = config.downsample_ratio
    image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))

    return H2OVLProcessor(
        tokenizer=self.get_tokenizer(),
        image_processor=image_processor,
        image_seq_length=image_seq_length,
    )

def get_num_image_tokens(
Expand Down Expand Up @@ -106,7 +128,7 @@ def get_replacement_internvl(item_idx: int):
if num_patches is not None:
assert isinstance(num_patches, int)

return hf_processor.get_image_repl(feature_size, num_patches)
return hf_processor.get_image_repl(num_patches, num_features=feature_size)

return [
PromptReplacement(
Expand Down
Loading
Loading