Merged
46 commits
4f76390
update conversion script
Nov 19, 2024
32b1443
update for bias again
Nov 19, 2024
1e6695e
remove pdv
Nov 19, 2024
5ac86ec
use my dir
Rocketknight1 Nov 20, 2024
2448e23
Update how we initialize the tokenizer
Rocketknight1 Nov 20, 2024
08ae1d8
Convert in bfloat16
Rocketknight1 Nov 20, 2024
e4658f0
Undo that one again
Rocketknight1 Nov 20, 2024
72a1460
fix config dump
Rocketknight1 Nov 20, 2024
399731b
.to() was broken for BatchMixFeature
Rocketknight1 Nov 21, 2024
22fa744
quick debug breakpoint
Rocketknight1 Nov 21, 2024
e95af1b
put the breakpoint in the right place
Rocketknight1 Nov 21, 2024
9df78af
Add a config flag for the multimodal projector bias
Rocketknight1 Nov 21, 2024
b756909
Add a config flag for the multimodal projector bias
Rocketknight1 Nov 21, 2024
069accf
Conversion script can load chat templates
Rocketknight1 Nov 21, 2024
90670a5
Indent config for comparison
Rocketknight1 Nov 25, 2024
e124a93
Stop clobbering the config
Rocketknight1 Nov 25, 2024
bbaa6f1
Re-enable the config clobber
Rocketknight1 Nov 25, 2024
7a7acbb
Get rid of the config manual save - it has no effect!
Rocketknight1 Nov 25, 2024
65e3336
Handle adapter bias correctly
Rocketknight1 Nov 25, 2024
4c98a65
Default vision transformer activation to silu
Rocketknight1 Nov 27, 2024
17bfbdd
Remove legacy processing path
Rocketknight1 Dec 2, 2024
2e50b89
One commit with all the debug breakpoints before I delete them all, i…
Rocketknight1 Dec 20, 2024
e2b6531
Update conversion
Rocketknight1 Dec 20, 2024
2320c14
Remove vLLM debugging instrumentation
Rocketknight1 Dec 20, 2024
624f3c2
Drop xformers
Rocketknight1 Dec 20, 2024
81bde8d
Remove debug enumerates
Rocketknight1 Dec 20, 2024
aa0630f
make fixup
Rocketknight1 Dec 20, 2024
63282bb
make fixup
Rocketknight1 Dec 20, 2024
436104d
Break copied from in pixtral
Rocketknight1 Dec 20, 2024
3a87990
Propagate multimodal_projector_bias change
Rocketknight1 Dec 23, 2024
97ebda3
Propagate multimodal_projector_bias change
Rocketknight1 Dec 23, 2024
05294a1
Remove debug device .to()
Rocketknight1 Dec 23, 2024
984a55e
Restore attention weights output
Rocketknight1 Dec 23, 2024
8884969
Fix Pixtral test
Rocketknight1 Dec 23, 2024
8074e95
Drop image_seq_length
Rocketknight1 Dec 23, 2024
0d29bc3
Drop image_seq_length
Rocketknight1 Dec 23, 2024
99ea497
Put the legacy processing code back
Rocketknight1 Dec 23, 2024
5fe8af6
Merge branch 'main' into pixtral-large-script
Rocketknight1 Dec 23, 2024
5eb3f6c
Add the bias option to the llava_next_video config
Rocketknight1 Dec 23, 2024
6f026c1
Add the bias option to the llava_next_video config
Rocketknight1 Dec 23, 2024
e39d0ea
Make certain args required in converter
Rocketknight1 Jan 7, 2025
9562cd9
Make certain args required in converter
Rocketknight1 Jan 7, 2025
41454f7
Merge branch 'main' into pixtral-large-script
Rocketknight1 Jan 8, 2025
d48f25d
typo
Rocketknight1 Jan 8, 2025
5f125b7
make fixup
Rocketknight1 Jan 8, 2025
bf2ddda
Reverting some dtype changes since it seems to work without them
Rocketknight1 Jan 8, 2025
4 changes: 4 additions & 0 deletions src/transformers/models/llava/configuration_llava.py
@@ -50,6 +50,8 @@ class LlavaConfig(PretrainedConfig):
             The index of the layer to select the vision feature.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
 
@@ -85,6 +87,7 @@ def __init__(
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_seq_length=576,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.ignore_index = ignore_index
@@ -127,6 +130,7 @@ def __init__(
             text_config = CONFIG_MAPPING["llama"]()
 
         self.text_config = text_config
+        self.multimodal_projector_bias = multimodal_projector_bias
 
         super().__init__(**kwargs)
9 changes: 6 additions & 3 deletions src/transformers/models/llava/modeling_llava.py
@@ -86,10 +86,13 @@ class LlavaCausalLMOutputWithPast(ModelOutput):
 class LlavaMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
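The net effect of the change, as a minimal sketch (assuming a transformers build that includes this PR; LlavaMultiModalProjector is an internal class, imported here only for illustration):

import torch
from transformers import LlavaConfig
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector

# With the flag off, neither projector layer carries a bias term.
config = LlavaConfig(multimodal_projector_bias=False)
projector = LlavaMultiModalProjector(config)
assert projector.linear_1.bias is None and projector.linear_2.bias is None

# The forward pass is unchanged: vision hidden size -> text hidden size.
image_features = torch.randn(1, 576, config.vision_config.hidden_size)
print(projector(image_features).shape)  # torch.Size([1, 576, 4096]) with the default sub-configs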
4 changes: 4 additions & 0 deletions src/transformers/models/llava_next/configuration_llava_next.py
@@ -55,6 +55,8 @@ class LlavaNextConfig(PretrainedConfig):
             Whether the model's input and output word embeddings should be tied.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
 
@@ -92,12 +94,14 @@ def __init__(
         image_grid_pinpoints=None,
         tie_word_embeddings=False,
         image_seq_length=576,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.ignore_index = ignore_index
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
         self.image_seq_length = image_seq_length
+        self.multimodal_projector_bias = multimodal_projector_bias
 
         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(
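Since the flag lives on the config, it serializes with it, which is what lets converted checkpoints remember whether their projector used bias. A small sketch of the round trip (assuming this PR):

from transformers import LlavaNextConfig

config = LlavaNextConfig(multimodal_projector_bias=False)
config_dict = config.to_dict()
assert config_dict["multimodal_projector_bias"] is False

# Rebuilding from the dict preserves the setting.
restored = LlavaNextConfig.from_dict(config_dict)
assert restored.multimodal_projector_bias is False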
9 changes: 6 additions & 3 deletions src/transformers/models/llava_next/modeling_llava_next.py
@@ -194,10 +194,13 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput):
 class LlavaNextMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
4 changes: 4 additions & 0 deletions src/transformers/models/llava_next_video/configuration_llava_next_video.py
@@ -44,6 +44,8 @@ class LlavaNextVideoConfig(PretrainedConfig):
             The image token index to encode the image prompt.
         projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
             The activation function used by the multimodal projector.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
@@ -95,6 +97,7 @@ def __init__(
         ignore_index=-100,
         image_token_index=32001,
         projector_hidden_act="gelu",
+        multimodal_projector_bias=True,
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_grid_pinpoints=None,
@@ -114,6 +117,7 @@
         self.ignore_index = ignore_index
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
+        self.multimodal_projector_bias = multimodal_projector_bias
 
         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(
9 changes: 6 additions & 3 deletions src/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -179,10 +179,13 @@ def _init_weights(self, module):
 class LlavaNextVideoMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextVideoConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
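Defaulting the flag to True keeps existing checkpoints loadable unchanged: the bias tensors stay in the projector's state dict under the same keys as before, and only opting out removes them. A sketch (assuming this PR; the projector class is internal):

from transformers import LlavaNextVideoConfig
from transformers.models.llava_next_video.modeling_llava_next_video import (
    LlavaNextVideoMultiModalProjector,
)

with_bias = LlavaNextVideoMultiModalProjector(LlavaNextVideoConfig())
without_bias = LlavaNextVideoMultiModalProjector(
    LlavaNextVideoConfig(multimodal_projector_bias=False)
)
print(sorted(with_bias.state_dict()))
# ['linear_1.bias', 'linear_1.weight', 'linear_2.bias', 'linear_2.weight']
print(sorted(without_bias.state_dict()))
# ['linear_1.weight', 'linear_2.weight']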
4 changes: 4 additions & 0 deletions src/transformers/models/llava_next_video/modular_llava_next_video.py
@@ -58,6 +58,8 @@ class LlavaNextVideoConfig(PretrainedConfig):
             The image token index to encode the image prompt.
         projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
             The activation function used by the multimodal projector.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
@@ -109,6 +111,7 @@ def __init__(
         ignore_index=-100,
         image_token_index=32001,
         projector_hidden_act="gelu",
+        multimodal_projector_bias=True,
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_grid_pinpoints=None,
@@ -128,6 +131,7 @@
         self.ignore_index = ignore_index
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
+        self.multimodal_projector_bias = multimodal_projector_bias
 
         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(
4 changes: 4 additions & 0 deletions src/transformers/models/llava_onevision/configuration_llava_onevision.py
@@ -58,6 +58,8 @@ class LlavaOnevisionConfig(PretrainedConfig):
             of the form `(height, width)`.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether the model's input and output word embeddings should be tied.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
 
@@ -95,11 +97,13 @@ def __init__(
         vision_aspect_ratio="anyres_max_9",
         image_grid_pinpoints=None,
         tie_word_embeddings=False,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.image_token_index = image_token_index
         self.video_token_index = video_token_index
         self.projector_hidden_act = projector_hidden_act
+        self.multimodal_projector_bias = multimodal_projector_bias
 
         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(
9 changes: 6 additions & 3 deletions src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -201,10 +201,13 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
 class LlavaOnevisionMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaOnevisionConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
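The flag is propagated across the whole Llava family because the conversion script on this branch (pixtral-large-script) has to describe source checkpoints whose vision-language adapter carries no bias terms (see the "update for bias" and "Handle adapter bias correctly" commits above). A hypothetical conversion-time sketch; the checkpoint key name below is made up for illustration:

from transformers import LlavaOnevisionConfig

def build_config(original_state_dict: dict) -> LlavaOnevisionConfig:
    # Record on the config whether the source projector carried bias terms.
    has_projector_bias = "vision_language_adapter.w_in.bias" in original_state_dict
    return LlavaOnevisionConfig(multimodal_projector_bias=has_projector_bias)

config = build_config({"vision_language_adapter.w_in.weight": None})
assert config.multimodal_projector_bias is False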