@@ -4,7 +4,6 @@

import torch

from vllm.config import MultiModalConfig
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.models.vision import get_vit_attn_backend
@@ -32,7 +31,6 @@ def __init__(
scale: float | None = None,
num_kv_heads: int | None = None,
prefix: str = "",
multimodal_config: MultiModalConfig | None = None,
) -> None:
"""
Args:
@@ -42,7 +40,6 @@ def __init__(
num_kv_heads: number of kv heads.
prefix: This has no effect, it is only here to make it easier to
swap between Attention and MultiHeadAttention
multimodal_config: configs for multi-modal.
"""
super().__init__()

@@ -62,16 +59,10 @@ def __init__(
# weight and activation dtype.
dtype = torch.get_default_dtype()

# Try to get vision attention backend from multimodal_config.
attn_backend_override = None
if multimodal_config is not None:
attn_backend_override = multimodal_config.mm_encoder_attn_backend

# Get device-specific vision attention backend.
self.attn_backend = get_vit_attn_backend(
head_size=head_size,
dtype=dtype,
attn_backend_override=attn_backend_override,
)

self.is_flash_attn_backend = self.attn_backend in {
28 changes: 4 additions & 24 deletions vllm/model_executor/models/clip.py
@@ -16,7 +16,7 @@

from vllm.attention.layer import Attention
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
@@ -59,6 +59,7 @@
VisionFeatureSelectStrategy,
VisionFeatureSelectStrategyStr,
get_num_selected_vision_tokens,
is_vit_use_data_parallel,
resolve_visual_encoder_outputs,
)

@@ -353,7 +354,6 @@ def __init__(
self,
config: CLIPTextConfig | CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
*,
prefix: str = "",
attn_cls: type[Attention] | type[MMEncoderAttention],
@@ -372,11 +372,7 @@ def __init__(
)
self.scale = self.head_dim**-0.5

use_data_parallel = (
multimodal_config.mm_encoder_tp_mode == "data"
if multimodal_config
else False
)
use_data_parallel = is_vit_use_data_parallel()
self.qkv_proj = QKVParallelLinear(
hidden_size=self.embed_dim,
head_size=self.head_dim,
@@ -405,7 +401,6 @@ def __init__(
self.head_dim,
self.scale,
prefix=f"{prefix}.attn",
multimodal_config=multimodal_config,
)
else:
self.attn = attn_cls(
@@ -434,17 +429,12 @@ def __init__(
self,
config: CLIPTextConfig | CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
) -> None:
super().__init__()

self.config = config
use_data_parallel = (
multimodal_config.mm_encoder_tp_mode == "data"
if multimodal_config
else False
)
use_data_parallel = is_vit_use_data_parallel()
self.activation_fn = get_act_fn(config.hidden_act)

self.fc1 = ColumnParallelLinear(
@@ -477,7 +467,6 @@ def __init__(
self,
config: CLIPTextConfig | CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
*,
prefix: str = "",
attn_cls: type[Attention] | type[MMEncoderAttention],
@@ -487,15 +476,13 @@ def __init__(
self.self_attn = CLIPAttention(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.self_attn",
attn_cls=attn_cls,
)
self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.mlp = CLIPMLP(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.mlp",
)
self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -528,7 +515,6 @@ def __init__(
self,
config: CLIPTextConfig | CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
num_hidden_layers_override: int | None = None,
*,
prefix: str = "",
@@ -548,7 +534,6 @@ def __init__(
CLIPEncoderLayer(
config=config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.layers.{layer_idx}",
attn_cls=attn_cls,
)
@@ -658,7 +643,6 @@ def __init__(
self,
config: CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
*,
num_hidden_layers_override: int | None = None,
require_post_norm: bool | None = None,
@@ -678,7 +662,6 @@ def __init__(
self.encoder = CLIPEncoder(
config=config,
quant_config=quant_config,
multimodal_config=multimodal_config,
num_hidden_layers_override=num_hidden_layers_override,
prefix=f"{prefix}.encoder",
attn_cls=MMEncoderAttention,
@@ -780,7 +763,6 @@ def __init__(
self,
config: CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
*,
num_hidden_layers_override: int | None = None,
require_post_norm: bool | None = None,
@@ -791,7 +773,6 @@ def __init__(
self.vision_model = CLIPVisionTransformer(
config=config,
quant_config=quant_config,
multimodal_config=multimodal_config,
num_hidden_layers_override=num_hidden_layers_override,
require_post_norm=require_post_norm,
prefix=f"{prefix}.vision_model",
@@ -869,7 +850,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.vision_model = CLIPVisionTransformer(
vision_config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=maybe_prefix(prefix, "vision_model"),
)
self.visual_projection = nn.Linear(
3 changes: 0 additions & 3 deletions vllm/model_executor/models/deepencoder.py
@@ -18,7 +18,6 @@
import torch.nn.functional as F
from transformers import CLIPVisionConfig

from vllm.config import MultiModalConfig
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -609,7 +608,6 @@ def __init__(
self,
config: CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
*,
num_hidden_layers_override: int | None = None,
prefix: str = "",
@@ -628,7 +626,6 @@ def __init__(
self.transformer = CLIPEncoder(
config=config,
quant_config=quant_config,
multimodal_config=multimodal_config,
num_hidden_layers_override=num_hidden_layers_override,
prefix=f"{prefix}.encoder",
attn_cls=MMEncoderAttention,
1 change: 0 additions & 1 deletion vllm/model_executor/models/deepseek_ocr.py
@@ -398,7 +398,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.vision_model = DeepCLIPVisionTransformer(
config=clip_vision_config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=maybe_prefix(prefix, "vision_model"),
)

39 changes: 5 additions & 34 deletions vllm/model_executor/models/dots_ocr.py
@@ -8,7 +8,7 @@
from torch.nn import LayerNorm
from transformers.models.qwen2_vl import Qwen2VLProcessor

from vllm.config import MultiModalConfig, VllmConfig
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import utils as dist_utils
from vllm.distributed.parallel_state import (
@@ -60,7 +60,7 @@
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from .vision import run_dp_sharded_mrope_vision_model
from .vision import is_vit_use_data_parallel, run_dp_sharded_mrope_vision_model

IMAGE_TOKEN = "<|imgpad|>"

@@ -183,9 +183,9 @@ def __init__(
spatial_merge_size: int = 2,
pre_norm="layernorm",
prefix: str = "",
use_data_parallel: bool = False,
) -> None:
super().__init__()
use_data_parallel = is_vit_use_data_parallel()
self.hidden_size = context_dim * (spatial_merge_size**2)
self.pre_norm = pre_norm
if self.pre_norm == "layernorm":
@@ -230,15 +230,10 @@ def __init__(
bias: bool = True,
*,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
) -> None:
super().__init__()
use_data_parallel = (
multimodal_config.mm_encoder_tp_mode == "data"
if multimodal_config
else False
)
use_data_parallel = is_vit_use_data_parallel()

self.embed_dim = dim
self.tp_size = (
@@ -272,7 +267,6 @@ def __init__(
num_heads=self.num_attention_heads_per_partition,
head_size=self.hidden_size_per_attention_head,
scale=self.hidden_size_per_attention_head**-0.5,
multimodal_config=multimodal_config,
prefix=f"{prefix}.attn",
)

@@ -329,19 +323,14 @@ def __init__(
config,
*,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
):
super().__init__()
hidden_features = config.intermediate_size
in_features = config.embed_dim
bias = config.use_bias

use_data_parallel = (
multimodal_config.mm_encoder_tp_mode == "data"
if multimodal_config
else False
)
use_data_parallel = is_vit_use_data_parallel()
# Referenced aimv2.py AIMv2SwiGLUFFN
self.fc13 = MergedColumnParallelLinear(
in_features,
@@ -447,7 +436,6 @@ def __init__(
config,
*,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
):
super().__init__()
@@ -458,14 +446,12 @@ def __init__(
num_heads=config.num_attention_heads,
bias=config.use_bias,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.attn",
)
self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
self.mlp = DotsSwiGLUFFN(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.mlp",
)
self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
@@ -493,7 +479,6 @@ def __init__(
self,
config: DotsVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
*,
num_hidden_layers_override: int | None = None,
require_post_norm: bool | None = None,
@@ -507,15 +492,9 @@ def __init__(

head_dim = config.embed_dim // config.num_attention_heads
self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
attn_backend_override = (
multimodal_config.mm_encoder_attn_backend
if multimodal_config is not None
else None
)
self.attn_backend = get_vit_attn_backend(
head_size=head_dim,
dtype=torch.get_default_dtype(),
attn_backend_override=attn_backend_override,
)
self.out_hidden_size = config.hidden_size
# Keep blocks for compatibility with other vision towers
@@ -529,7 +508,6 @@ def __init__(
DotsVisionBlock(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.blocks.{i}",
)
for i in range(num_layers)
@@ -542,16 +520,10 @@ def __init__(
else:
self.post_trunk_norm = None

use_data_parallel = (
multimodal_config.mm_encoder_tp_mode == "data"
if multimodal_config
else False
)
self.merger = PatchMerger(
dim=config.hidden_size,
context_dim=config.embed_dim,
spatial_merge_size=config.spatial_merge_size,
use_data_parallel=use_data_parallel,
)

@property
@@ -693,7 +665,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.vision_tower = DotsVisionTransformer(
vision_config,
quant_config=self.quant_config,
multimodal_config=multimodal_config,
prefix=maybe_prefix(prefix, "vision_tower"),
)

1 change: 0 additions & 1 deletion vllm/model_executor/models/eagle2_5_vl.py
@@ -270,7 +270,6 @@ def _init_vision_model(
return SiglipVisionModel(
vision_config,
quant_config=quant_config,
multimodal_config=self.multimodal_config,
num_hidden_layers_override=num_hidden_layers,
prefix=prefix,
)