From c697b405f8f8ba0eee2f2d4a90bdb5a41fd39686 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 10 Sep 2025 05:10:50 +0000 Subject: [PATCH 1/2] fix shape mismatch Signed-off-by: wwl2755 --- vllm/model_executor/models/molmo.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index b2fc7be1af22..3fcc2b4dfad7 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -77,19 +77,21 @@ class MolmoImageInputs(TensorSchema): Dimensions: - bn: Batch size * number of images - nc: Number of crops - - np: Number of patches + - np: Number of patches (dynamic) + - tp: Token sequence positions - pd: Patch dimension """ images: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", "nc", "np", "pd")] + TensorShape("bn", "nc", "np", "pd", dynamic_dims={"nc"})] + # Number of crops may vary per batch and image, so pass it as a list. image_masks: Annotated[Optional[Union[torch.Tensor, list[torch.Tensor]]], - TensorShape("bn", "nc", "np")] + TensorShape("bn", "nc", "np", dynamic_dims={"nc"})] - feat_is_patch: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", "nc", "np")] + feat_is_patch: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "nc", "tp", dynamic_dims={"nc"})] # A boolean mask indicating which image features correspond to patch tokens. - num_crops: Annotated[torch.Tensor, TensorShape("bn")] From 571ffeae0fbf600919e31bc12b272be239bd9233 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 10 Sep 2025 05:17:09 +0000 Subject: [PATCH 2/2] fix comment Signed-off-by: wwl2755 --- vllm/model_executor/models/molmo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 3fcc2b4dfad7..5d999a02b4e6 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -76,8 +76,8 @@ class MolmoImageInputs(TensorSchema): """ Dimensions: - bn: Batch size * number of images - - nc: Number of crops - - np: Number of patches (dynamic) + - nc: Number of crops (dynamic) + - np: Number of patches - tp: Token sequence positions - pd: Patch dimension """