[Model] Add BNB quantization support for Idefics3 #10310

Merged · 15 commits · Nov 14, 2024
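
For context, a hedged sketch of how this support is exercised from the user side once merged. The checkpoint id, prompt, and sampling settings below are illustrative assumptions, not part of this PR; requesting quantization="bitsandbytes" together with load_format="bitsandbytes" is the usual way to ask vLLM for in-flight BitsAndBytes quantization.

from vllm import LLM, SamplingParams

# Illustrative only: model id and prompt are assumptions, not asserted by this PR.
llm = LLM(
    model="HuggingFaceM4/Idefics3-8B-Llama3",
    quantization="bitsandbytes",
    load_format="bitsandbytes",
)
out = llm.generate("Describe the image.", SamplingParams(max_tokens=64))
print(out[0].outputs[0].text)
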
47 changes: 42 additions & 5 deletions vllm/model_executor/models/idefics3.py
@@ -22,6 +22,7 @@
from PIL import Image
from torch import nn
# Temporary solution for transformers below 4.46.0.
from transformers import PretrainedConfig as Idefics3Config
from transformers import ProcessorMixin as Idefics3ImageProcessor

from vllm.attention import AttentionMetadata
@@ -31,6 +32,7 @@
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.module_mapping import MultiModelKeys
@@ -374,12 +376,17 @@ def dummy_data_for_idefics3(

class Idefics3SimpleMLP(nn.Module):

def __init__(self, config):
def __init__(self,
config: Idefics3Config,
quant_config: Optional[QuantizationConfig] = None):
super().__init__()
input_size = config.vision_config.hidden_size * (config.scale_factor**
2)
output_size = config.text_config.hidden_size
self.proj = ReplicatedLinear(input_size, output_size, bias=False)
self.proj = ReplicatedLinear(input_size,
output_size,
bias=False,
quant_config=quant_config)

def forward(self, x: torch.Tensor) -> torch.Tensor:
out, _ = self.proj(x)
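
A quick dimensional note on the projection above, since this change only threads quant_config through it: the connector flattens each scale_factor × scale_factor group of vision patches into one token before projecting into the text hidden size. The numbers below are illustrative assumptions (check the checkpoint's config), not values asserted by this PR.

# Illustrative sizes only (assumed SigLIP-style vision dims and Llama text dims).
vision_hidden = 1152
scale_factor = 2
text_hidden = 4096
input_size = vision_hidden * scale_factor**2   # 4608 features per pixel-shuffled token
output_size = text_hidden                      # 4096
# -> ReplicatedLinear(4608, 4096, bias=False, quant_config=quant_config)
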
@@ -388,10 +395,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

class Idefics3Connector(nn.Module):

def __init__(self, config):
def __init__(self,
config: Idefics3Config,
quant_config: Optional[QuantizationConfig] = None):
super().__init__()
self.scale_factor = config.scale_factor
self.modality_projection = Idefics3SimpleMLP(config)
self.modality_projection = Idefics3SimpleMLP(config, quant_config)

def pixel_shuffle(self,
x: torch.Tensor,
@@ -433,7 +442,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.vocab_size = self.config.text_config.vocab_size
self.vision_model = Idefics3VisionTransformer(config.vision_config,
quant_config)
self.connector = Idefics3Connector(config)
self.connector = Idefics3Connector(config, quant_config)
self.text_model = LlamaModel(
vllm_config=vllm_config.with_hf_config(config.text_config),
prefix=maybe_prefix(prefix, "text_model"),
@@ -637,9 +646,37 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
"gate_up_proj",
"down_proj",
]

# BitsAndBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
# vision_model
".fc1.",
".fc2.",
".out_proj.",
# connector
".proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
"k_proj": ("qkv_proj", 1),
"v_proj": ("qkv_proj", 2),
"gate_proj": ("gate_up_proj", 0),
"up_proj": ("gate_up_proj", 1),
}

embedding_modules = {}
embedding_padding_modules = []

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()

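
To make the new class attributes concrete: default_bitsandbytes_target_modules lists the linear-layer name fragments whose weights BnB should quantize (the text model's attention and MLP projections, the vision tower's fc1/fc2/out_proj, and the connector's proj), while bitsandbytes_stacked_params_mapping tells the loader which checkpoint names land in which shard of vLLM's fused parameters. Below is a minimal sketch of that lookup under simplified assumptions; it is not vLLM's actual loader code.

# Hedged sketch of how the mapping is consumed; names are illustrative.
from typing import Dict, List, Optional, Tuple

def remap_for_bnb(
    ckpt_name: str,
    target_modules: List[str],
    stacked_mapping: Dict[str, Tuple[str, int]],
) -> Tuple[str, Optional[int], bool]:
    """Return (vllm_param_name, shard_id or None, should_quantize)."""
    should_quantize = any(frag in ckpt_name for frag in target_modules)
    for shard_name, (fused_name, shard_id) in stacked_mapping.items():
        if shard_name in ckpt_name:
            # e.g. "...self_attn.q_proj.weight" -> "...self_attn.qkv_proj.weight", shard 0
            return ckpt_name.replace(shard_name, fused_name), shard_id, should_quantize
    return ckpt_name, None, should_quantize

# Example: q_proj maps onto shard 0 of the fused qkv_proj and gets quantized.
print(remap_for_bnb(
    "model.layers.0.self_attn.q_proj.weight",
    [".q_proj.", ".k_proj.", ".v_proj.", ".o_proj."],
    {"q_proj": ("qkv_proj", 0), "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2)},
))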