36 changes: 26 additions & 10 deletions vllm/model_executor/models/mllama.py
@@ -17,7 +17,7 @@
"""PyTorch Mllama model."""
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import Literal, Optional, TypedDict, Union
from typing import Annotated, Literal, Optional, Union

import numpy as np
import torch
@@ -64,6 +64,7 @@
EncDecMultiModalProcessor,
PromptReplacement, PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .clip import CLIPMLP
from .interfaces import SupportsMultiModal, SupportsV0Only
@@ -73,15 +74,30 @@
logger = init_logger(__name__)


class MllamaImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: """
"""(batch_size, max_num_image, max_num_chunk, num_channel, height, width)"""
aspect_ratio_ids: torch.Tensor
"""Shape: `(batch_size, max_num_image)`"""
aspect_ratio_mask: torch.Tensor
"""Shape: `(batch_size, max_num_image, max_num_tiles)`"""
class MllamaImagePixelInputs(TensorSchema):
"""
Dimensions:
- batch_size: Batch size
- max_num_image: Max number of images
- max_num_chunk: Max number of chunks
- max_num_tiles: Max number of tiles per image
- num_channel: Number of channels
- height: Height
- width: Width
"""

type: Literal["pixel_values"] = "pixel_values"

data: Annotated[torch.Tensor,
TensorShape("batch_size", "max_num_image", "max_num_chunk",
"num_channel", "height", "width")]

aspect_ratio_ids: Annotated[torch.Tensor,
TensorShape("batch_size", "max_num_image")]

aspect_ratio_mask: Annotated[
torch.Tensor,
TensorShape("batch_size", "max_num_image", "max_num_tiles")]


# TODO: support LlamaImageEmbeddingInputs
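The switch from `TypedDict` to `TensorSchema` turns the shape information that previously lived only in docstring comments into machine-checkable annotations. A minimal usage sketch follows; it assumes that `TensorSchema` subclasses accept their annotated fields as keyword arguments and validate each tensor against its `TensorShape` at construction time, and the concrete sizes below are illustrative placeholders rather than values taken from the Mllama config.

```python
import torch

from vllm.model_executor.models.mllama import MllamaImagePixelInputs

# Hypothetical sizes, chosen only for illustration: a batch of 2 requests,
# at most 1 image each, 4 chunks/tiles per image, 3 channels, 448x448 pixels.
inputs = MllamaImagePixelInputs(
    data=torch.empty(2, 1, 4, 3, 448, 448),
    aspect_ratio_ids=torch.zeros(2, 1, dtype=torch.long),
    aspect_ratio_mask=torch.ones(2, 1, 4, dtype=torch.long),
)

# Assumption: dimensions that share a symbolic name across fields
# (here batch_size and max_num_image) are checked for consistency, so
# e.g. an aspect_ratio_ids tensor of shape (3, 1) passed alongside a
# data tensor with batch_size 2 would be rejected during construction.
```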