Merged
Changes from 3 commits
Commits
50 commits
3a87a41
i guess reverted all CdGen classes
zucchini-nlp Mar 26, 2025
8d7088a
style
zucchini-nlp Mar 26, 2025
95ac049
llava onevision
zucchini-nlp Mar 26, 2025
f0e917e
fix copies
zucchini-nlp Mar 27, 2025
85b1e7a
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 27, 2025
5e4d0e8
fix some tests
zucchini-nlp Mar 27, 2025
02e7b6e
some more tests
zucchini-nlp Mar 27, 2025
c0e41e6
dump
zucchini-nlp Mar 27, 2025
ef70523
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 28, 2025
06b8227
skip these
zucchini-nlp Mar 28, 2025
5655657
nevermind, i am dumb
zucchini-nlp Mar 28, 2025
083b9bc
revert fix not needed
zucchini-nlp Mar 28, 2025
4fe8a82
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 31, 2025
2e6caa4
fixup
zucchini-nlp Mar 31, 2025
d397075
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 31, 2025
0d1409f
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 31, 2025
a32e47e
Merge branch 'main' into vlm-base-models
zucchini-nlp Apr 1, 2025
5c019fe
fixup
zucchini-nlp Apr 4, 2025
32a67b1
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp Apr 4, 2025
a9b3816
another fixup
zucchini-nlp Apr 4, 2025
1f7172c
more fixup to make ci finally happy
zucchini-nlp Apr 4, 2025
1e5ee3b
merge main
zucchini-nlp Apr 22, 2025
c6bfa8d
fixup after rebasing
zucchini-nlp Apr 22, 2025
7631fdb
fix qwen tests
zucchini-nlp Apr 22, 2025
da33a04
add internVL + typos here and there
zucchini-nlp Apr 22, 2025
141c102
image token index -> id
zucchini-nlp Apr 22, 2025
ba58575
style
zucchini-nlp Apr 22, 2025
4a73546
fix init weights
zucchini-nlp Apr 22, 2025
4d4ae05
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp Apr 22, 2025
6298cc4
Merge branch 'main' into vlm-base-models
zucchini-nlp Apr 24, 2025
a25e02d
revert blip-2 not supported
zucchini-nlp May 1, 2025
3bbf3fd
address comments
zucchini-nlp May 1, 2025
8087394
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp May 1, 2025
32cbc87
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp May 1, 2025
43999e8
fix copies
zucchini-nlp May 1, 2025
43639f4
revert blip2 test file as well
zucchini-nlp May 1, 2025
d31a4c9
as discussed internally, revert back CdGen models
zucchini-nlp May 2, 2025
e7ff08c
fix some tests
zucchini-nlp May 2, 2025
c265726
fix more tests for compile
zucchini-nlp May 2, 2025
db069f1
CI red
zucchini-nlp May 2, 2025
d309ead
fix copies
zucchini-nlp May 2, 2025
f5b18eb
enumerate explicitly allowed models
zucchini-nlp May 2, 2025
c58c4f2
address comments
zucchini-nlp May 6, 2025
9971e7f
fix tests
zucchini-nlp May 7, 2025
f601c52
fixup
zucchini-nlp May 7, 2025
4e617b4
merge main
zucchini-nlp May 7, 2025
df62bdf
style again
zucchini-nlp May 7, 2025
2509f77
add tests for new model class
zucchini-nlp May 7, 2025
ce4374b
another fixup ( x _ x )
zucchini-nlp May 7, 2025
24d127f
[fixup] unused attributes can be removed post-deprecation
zucchini-nlp May 7, 2025
37 changes: 34 additions & 3 deletions src/transformers/__init__.py
@@ -1522,6 +1522,7 @@
"AriaPreTrainedModel",
"AriaTextForCausalLM",
"AriaTextModel",
"AriaModel",
"AriaTextPreTrainedModel",
]
)
@@ -1626,7 +1627,9 @@
"AutoformerPreTrainedModel",
]
)
_import_structure["models.aya_vision"].extend(["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel"])
_import_structure["models.aya_vision"].extend(
["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel", "AyaVisionModel"]
)
_import_structure["models.bamba"].extend(
[
"BambaForCausalLM",
@@ -2338,6 +2341,7 @@
"Emu3PreTrainedModel",
"Emu3TextModel",
"Emu3VQVAE",
"Emu3Model",
]
)
_import_structure["models.encodec"].extend(
@@ -2457,7 +2461,7 @@
"load_tf_weights_in_funnel",
]
)
_import_structure["models.fuyu"].extend(["FuyuForCausalLM", "FuyuPreTrainedModel"])
_import_structure["models.fuyu"].extend(["FuyuForCausalLM", "FuyuPreTrainedModel", "FuyuModel"])
_import_structure["models.gemma"].extend(
[
"GemmaForCausalLM",
@@ -2482,6 +2486,7 @@
"Gemma3ForConditionalGeneration",
"Gemma3PreTrainedModel",
"Gemma3TextModel",
"Gemma3Model",
]
)
_import_structure["models.git"].extend(
@@ -2512,6 +2517,7 @@
[
"GotOcr2ForConditionalGeneration",
"GotOcr2PreTrainedModel",
"GotOcr2Model",
]
)
_import_structure["models.gpt2"].extend(
@@ -2701,6 +2707,7 @@
"InstructBlipPreTrainedModel",
"InstructBlipQFormerModel",
"InstructBlipVisionModel",
"InstructBlipModel",
]
)
_import_structure["models.instructblipvideo"].extend(
@@ -2802,11 +2809,13 @@
[
"LlavaForConditionalGeneration",
"LlavaPreTrainedModel",
"LlavaModel",
]
)
_import_structure["models.llava_next"].extend(
[
"LlavaNextForConditionalGeneration",
"LlavaNextModel",
"LlavaNextPreTrainedModel",
]
)
@@ -2825,12 +2834,14 @@
[
"LlavaNextVideoForConditionalGeneration",
"LlavaNextVideoPreTrainedModel",
"LlavaNextVideoModel",
]
)
_import_structure["models.llava_onevision"].extend(
[
"LlavaOnevisionForConditionalGeneration",
"LlavaOnevisionPreTrainedModel",
"LlavaOnevisionModel",
]
)
_import_structure["models.longformer"].extend(
@@ -2975,6 +2986,7 @@
[
"Mistral3ForConditionalGeneration",
"Mistral3PreTrainedModel",
"Mistral3Model",
]
)
_import_structure["models.mixtral"].extend(
@@ -2995,6 +3007,7 @@
"MllamaProcessor",
"MllamaTextModel",
"MllamaVisionModel",
"MllamaModel",
]
)
_import_structure["models.mobilebert"].extend(
@@ -3243,6 +3256,7 @@
[
"PaliGemmaForConditionalGeneration",
"PaliGemmaPreTrainedModel",
"PaliGemmaModel",
"PaliGemmaProcessor",
]
)
@@ -3878,6 +3892,7 @@
"VideoLlavaForConditionalGeneration",
"VideoLlavaPreTrainedModel",
"VideoLlavaProcessor",
"VideoLlavaModel",
]
)
_import_structure["models.videomae"].extend(
@@ -3903,6 +3918,7 @@
[
"VipLlavaForConditionalGeneration",
"VipLlavaPreTrainedModel",
"VipLlavaModel",
]
)
_import_structure["models.vision_encoder_decoder"].extend(["VisionEncoderDecoderModel"])
@@ -6753,6 +6769,7 @@
)
from .models.aria import (
AriaForConditionalGeneration,
AriaModel,
AriaPreTrainedModel,
AriaTextForCausalLM,
AriaTextModel,
@@ -6853,7 +6870,7 @@
AutoformerModel,
AutoformerPreTrainedModel,
)
from .models.aya_vision import AyaVisionForConditionalGeneration, AyaVisionPreTrainedModel
from .models.aya_vision import AyaVisionForConditionalGeneration, AyaVisionModel, AyaVisionPreTrainedModel
from .models.bamba import BambaForCausalLM, BambaModel, BambaPreTrainedModel
from .models.bark import (
BarkCausalModel,
@@ -7420,6 +7437,7 @@
from .models.emu3 import (
Emu3ForCausalLM,
Emu3ForConditionalGeneration,
Emu3Model,
Emu3PreTrainedModel,
Emu3TextModel,
Emu3VQVAE,
@@ -7525,6 +7543,7 @@
)
from .models.fuyu import (
FuyuForCausalLM,
FuyuModel,
FuyuPreTrainedModel,
)
from .models.gemma import (
@@ -7544,6 +7563,7 @@
from .models.gemma3 import (
Gemma3ForCausalLM,
Gemma3ForConditionalGeneration,
Gemma3Model,
Gemma3PreTrainedModel,
Gemma3TextModel,
)
@@ -7567,6 +7587,7 @@
)
from .models.got_ocr2 import (
GotOcr2ForConditionalGeneration,
GotOcr2Model,
GotOcr2PreTrainedModel,
)
from .models.gpt2 import (
@@ -7709,6 +7730,7 @@
)
from .models.instructblip import (
InstructBlipForConditionalGeneration,
InstructBlipModel,
InstructBlipPreTrainedModel,
InstructBlipQFormerModel,
InstructBlipVisionModel,
@@ -7788,18 +7810,22 @@
)
from .models.llava import (
LlavaForConditionalGeneration,
LlavaModel,
LlavaPreTrainedModel,
)
from .models.llava_next import (
LlavaNextForConditionalGeneration,
LlavaNextModel,
LlavaNextPreTrainedModel,
)
from .models.llava_next_video import (
LlavaNextVideoForConditionalGeneration,
LlavaNextVideoModel,
LlavaNextVideoPreTrainedModel,
)
from .models.llava_onevision import (
LlavaOnevisionForConditionalGeneration,
LlavaOnevisionModel,
LlavaOnevisionPreTrainedModel,
)
from .models.longformer import (
@@ -7910,6 +7936,7 @@
)
from .models.mistral3 import (
Mistral3ForConditionalGeneration,
Mistral3Model,
Mistral3PreTrainedModel,
)
from .models.mixtral import (
@@ -7923,6 +7950,7 @@
from .models.mllama import (
MllamaForCausalLM,
MllamaForConditionalGeneration,
MllamaModel,
MllamaPreTrainedModel,
MllamaProcessor,
MllamaTextModel,
@@ -8118,6 +8146,7 @@
)
from .models.paligemma import (
PaliGemmaForConditionalGeneration,
PaliGemmaModel,
PaliGemmaPreTrainedModel,
PaliGemmaProcessor,
)
@@ -8620,6 +8649,7 @@
)
from .models.video_llava import (
VideoLlavaForConditionalGeneration,
VideoLlavaModel,
VideoLlavaPreTrainedModel,
VideoLlavaProcessor,
)
@@ -8640,6 +8670,7 @@
)
from .models.vipllava import (
VipLlavaForConditionalGeneration,
VipLlavaModel,
VipLlavaPreTrainedModel,
)
from .models.vision_encoder_decoder import VisionEncoderDecoderModel
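The hunks above only change the public export surface: each of these VLMs now also exposes a plain `*Model` class (the base model without the language-modeling head) alongside its existing `*ForConditionalGeneration` class. A minimal usage sketch, assuming the new base classes behave like other Transformers base models; the checkpoint name and prompt format below are only illustrative:

```python
from transformers import AutoProcessor, LlavaModel
from PIL import Image
import requests

# Illustrative LLaVA checkpoint from the Hub; any compatible checkpoint should work the same way.
checkpoint = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(checkpoint)
model = LlavaModel.from_pretrained(checkpoint)  # base model: no lm_head, returns hidden states

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, text="USER: <image>\nWhat is shown here? ASSISTANT:", return_tensors="pt")

outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # hidden states for the merged image + text sequence
```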
4 changes: 3 additions & 1 deletion src/transformers/modeling_utils.py
@@ -1787,6 +1787,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
main_input_name = "input_ids"
model_tags = None

_key_mapping = None # used for BC support in VLMs, not meant to be used by new models

_auto_class = None
_no_split_modules = None
_skip_keys_device_placement = None
@@ -4067,7 +4069,7 @@ def from_pretrained(
generation_config = kwargs.pop("generation_config", None)
gguf_file = kwargs.pop("gguf_file", None)
tp_plan = kwargs.pop("tp_plan", None)
key_mapping = kwargs.pop("key_mapping", None)
key_mapping = kwargs.pop("key_mapping", cls._key_mapping)
Member

And so only use it here if it exists

Suggested change
key_mapping = kwargs.pop("key_mapping", cls._key_mapping)
key_mapping = kwargs.pop("key_mapping", getattr(cls, "._key_mapping", None))

But I'm just a bit worried it could become an issue when saving and then reloading?

Member Author

Yeah, that's the only case. I verified by saving and reloading that it works. We can remove it if you think it's not needed; imo the new classes do not have to be loadable in transformers. In vLLM they use their own loader, and I am adding these mappings on their end as well
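For context, a rough sketch of how a class-level `_key_mapping` is meant to interact with `from_pretrained`: the class declares a mapping from legacy checkpoint key patterns to the refactored module layout, and the loader falls back to it when no explicit `key_mapping` is passed. The class name, mapping entries, and helper below are illustrative assumptions, not code from this PR:

```python
import re

class MyVLMForConditionalGeneration:  # stand-in for a PreTrainedModel subclass
    # Hypothetical entries: remap keys from checkpoints saved before the base-model refactor.
    _key_mapping = {
        r"^language_model\.model": "model.language_model",
        r"^vision_tower": "model.vision_tower",
    }

def remap_state_dict_keys(state_dict, key_mapping):
    """Apply a regex-based key mapping, roughly what the loader does with `key_mapping`."""
    if key_mapping is None:
        return state_dict
    remapped = {}
    for key, value in state_dict.items():
        new_key = key
        for pattern, replacement in key_mapping.items():
            new_key = re.sub(pattern, replacement, new_key)
        remapped[new_key] = value
    return remapped
```

The sketch only covers the load direction; whether re-applying the mapping after saving from the new classes stays a no-op is exactly the save/reload concern raised above.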


if state_dict is not None and (pretrained_model_name_or_path is not None or gguf_file is not None):
raise ValueError(