Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
574f25a
first commit
yaswanth19 Jun 3, 2025
31dfdac
changes
yaswanth19 Jun 12, 2025
9eb25fb
dump
yaswanth19 Jun 14, 2025
c479731
Add ae model arc🚀
yaswanth19 Jun 14, 2025
ad2fea1
Merge branch 'main' into add-bagel
yaswanth19 Jun 14, 2025
319aa06
Merge branch 'main' into add-bagel
yaswanth19 Jun 24, 2025
4326649
Dump changes for now
yaswanth19 Jun 24, 2025
7706ed5
config working
yaswanth19 Jun 24, 2025
f1b1615
added processor file draft
Shakib-IO Jun 26, 2025
f0d16d0
A few more missing pieces
yaswanth19 Jun 28, 2025
a722f37
cleanup
yaswanth19 Jun 28, 2025
9a20d92
some more progress
yaswanth19 Jun 28, 2025
99fb8d8
vision forward pass succesfull
yaswanth19 Jun 28, 2025
764452f
remove
yaswanth19 Jun 28, 2025
00d0178
Merge branch 'main' into add-bagel
yaswanth19 Jun 28, 2025
e8a9887
Image Processor
Shakib-IO Jul 1, 2025
941fda9
Soem progress and generate is working
yaswanth19 Jul 5, 2025
22b6bf9
add converstion mapping
yaswanth19 Jul 5, 2025
b179da2
Merge branch 'main' into add-bagel
yaswanth19 Jul 5, 2025
53f405b
Merge branch 'main' into add-bagel
yaswanth19 Jul 10, 2025
ea9c1de
up
yaswanth19 Jul 10, 2025
a32fcd5
Merge branch 'main' into add-bagel
yaswanth19 Jul 10, 2025
348ece1
import fixes
yaswanth19 Jul 11, 2025
2ec43ac
Update Image Processor
Shakib-IO Jul 12, 2025
49f31a7
add processing_bagel
Shakib-IO Jul 15, 2025
a27f131
Update bagel
Shakib-IO Jul 15, 2025
d88508f
Processor update
yaswanth19 Jul 16, 2025
6fe0785
raw logits matching with some hacks
yaswanth19 Jul 18, 2025
817c3b2
Some fixes
yaswanth19 Jul 19, 2025
c33fe42
support both transforms
yaswanth19 Jul 20, 2025
48a56d2
Merge branch 'main' into add-bagel
yaswanth19 Jul 25, 2025
f7b01a2
Don't know where it's going
yaswanth19 Aug 2, 2025
3a1c27c
This is working atleast
yaswanth19 Aug 2, 2025
84ff02f
rough forward step logic - still need to fix cfg
yaswanth19 Aug 3, 2025
aacb512
enough for today
yaswanth19 Aug 3, 2025
f5df973
Merge branch 'main' into add-bagel
yaswanth19 Aug 10, 2025
757bb26
Merge branch 'main' into add-bagel
yaswanth19 Aug 12, 2025
c480758
make style
yaswanth19 Aug 10, 2025
cad7bd1
temp push
yaswanth19 Aug 12, 2025
fb5c5b5
docs
yaswanth19 Aug 12, 2025
6349ee6
cleanup
yaswanth19 Aug 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,8 @@
title: Aria
- local: model_doc/aya_vision
title: AyaVision
- local: model_doc/bagel
title: Bagel
- local: model_doc/blip
title: BLIP
- local: model_doc/blip-2
Expand Down
52 changes: 52 additions & 0 deletions docs/source/en/model_doc/bagel.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<!--Copyright 2025 ByteDance and The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>

# Bagel


## BagelConfig

[[autodoc]] BagelConfig

## BagelProcessor

[[autodoc]] BagelProcessor

## BagelImageProcessor

[[autodoc]] BagelImageProcessor

## BagelImageProcessorFast

[[autodoc]] BagelImageProcessorFast

## BagelModel

[[autodoc]] BagelModel
- forward

## BagelForConditionalGeneration

[[autodoc]] BagelForConditionalGeneration
- forward
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .auto import *
from .autoformer import *
from .aya_vision import *
from .bagel import *
from .bamba import *
from .bark import *
from .bart import *
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
("audio-spectrogram-transformer", "ASTConfig"),
("autoformer", "AutoformerConfig"),
("aya_vision", "AyaVisionConfig"),
("bagel", "BagelConfig"),
("bamba", "BambaConfig"),
("bark", "BarkConfig"),
("bart", "BartConfig"),
Expand Down Expand Up @@ -441,6 +442,7 @@
("audio-spectrogram-transformer", "Audio Spectrogram Transformer"),
("autoformer", "Autoformer"),
("aya_vision", "AyaVision"),
("bagel", "Bagel"),
("bamba", "Bamba"),
("bark", "Bark"),
("bart", "BART"),
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("audio-spectrogram-transformer", "ASTModel"),
("autoformer", "AutoformerModel"),
("aya_vision", "AyaVisionModel"),
("bagel", "BagelModel"),
("bamba", "BambaModel"),
("bark", "BarkModel"),
("bart", "BartModel"),
Expand Down Expand Up @@ -409,6 +410,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
[
# Model for pre-training mapping
("albert", "AlbertForPreTraining"),
("bagel", "BagelForConditionalGeneration"),
("bart", "BartForConditionalGeneration"),
("bert", "BertForPreTraining"),
("big_bird", "BigBirdForPreTraining"),
Expand Down Expand Up @@ -959,6 +961,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
[
("aria", "AriaForConditionalGeneration"),
("aya_vision", "AyaVisionForConditionalGeneration"),
("bagel", "BagelForConditionalGeneration"),
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
("altclip", "AltCLIPProcessor"),
("aria", "AriaProcessor"),
("aya_vision", "AyaVisionProcessor"),
("bagel", "BagelProcessor"),
("bark", "BarkProcessor"),
("blip", "BlipProcessor"),
("blip-2", "Blip2Processor"),
Expand Down
29 changes: 29 additions & 0 deletions src/transformers/models/bagel/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2025 ByteDance and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    # Static type checkers (and IDEs) see the real symbols; at runtime the lazy
    # module below defers these imports until an attribute is first accessed.
    from .configuration_bagel import *
    from .image_processing_bagel import *
    from .modeling_bagel import *
    from .processing_bagel import *
else:
    import sys

    # Replace this module object with a lazy proxy so heavy submodules
    # (e.g. modeling_bagel, which imports torch) are only loaded on demand.
    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
190 changes: 190 additions & 0 deletions src/transformers/models/bagel/configuration_bagel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/bagel/modular_bagel.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_bagel.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 ByteDance and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Optional

from ...configuration_utils import PretrainedConfig
from ...modeling_utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class BagelVQVAEConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of the Bagel VQ-VAE image auto-encoder, used by
    [`BagelModel`] for image generation. It is used to instantiate the auto-encoder according to the specified
    arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        double_latent (`bool`, *optional*, defaults to `False`):
            Whether the encoder outputs twice the latent channels (e.g. to hold a mean/log-variance pair).
        latent_channels (`int`, *optional*, defaults to 16):
            Number of channels of the latent representation.
        num_patches (`int`, *optional*, defaults to 32):
            Number of latent patches per image side.
        latent_patch_size (`int`, *optional*, defaults to 2):
            Patch size used when (un)patchifying the latent grid.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        out_channels (`int`, *optional*, defaults to 3):
            Number of output image channels.
        base_channels (`int`, *optional*, defaults to 128):
            Base channel count of the encoder/decoder, scaled per stage by `channel_multiplier`.
        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 2, 4, 4]`):
            Per-resolution-stage multipliers applied to `base_channels`.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Number of residual blocks per resolution stage.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the weight initializer.
        scale_factor (`float`, *optional*, defaults to 0.2611):
            Scaling factor applied to the latents.
        shift_factor (`float`, *optional*, defaults to 0.1159):
            Shift applied to the latents together with `scale_factor`.
        downsample (`int`, *optional*, defaults to 8):
            Total spatial downsampling factor of the encoder.
    """

    model_type = "bagel_vqvae"
    base_config_key = "vq_config"

    def __init__(
        self,
        double_latent: bool = False,
        latent_channels: int = 16,
        num_patches: int = 32,
        latent_patch_size: int = 2,
        in_channels: int = 3,
        out_channels: int = 3,
        base_channels: int = 128,
        channel_multiplier: Optional[list[int]] = None,
        num_res_blocks: int = 2,
        dropout: float = 0.0,
        initializer_range: float = 0.02,
        scale_factor: float = 0.2611,
        shift_factor: float = 0.1159,
        downsample: int = 8,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.double_latent = double_latent
        self.latent_channels = latent_channels
        self.in_channels = in_channels
        self.base_channels = base_channels
        # `None` sentinel instead of a mutable default argument: each instance gets a fresh list.
        self.channel_multiplier = [1, 2, 4, 4] if channel_multiplier is None else channel_multiplier
        self.num_res_blocks = num_res_blocks
        self.dropout = dropout
        self.initializer_range = initializer_range
        self.num_patches = num_patches
        self.out_channels = out_channels
        self.scale_factor = scale_factor
        self.shift_factor = shift_factor
        self.latent_patch_size = latent_patch_size
        self.downsample = downsample


# Default text-backbone configuration used when `BagelConfig` is built without an explicit
# `text_config`. Hyper-parameters match a 28-layer Qwen2 model (presumably Qwen2-7B — confirm
# against the released BAGEL checkpoint).
llm_config = {
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "eos_token_id": 151645,
    "hidden_act": "silu",
    "hidden_size": 3584,
    "initializer_range": 0.02,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "max_window_layers": 28,
    "model_type": "qwen2",
    "num_attention_heads": 28,
    # Restored to the full depth; it had been truncated to 1 layer as a debugging shortcut.
    "num_hidden_layers": 28,
    "num_key_value_heads": 4,
    "rms_norm_eps": 1e-06,
    "rope_theta": 1000000.0,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.43.1",
    "use_cache": True,
    "vocab_size": 152064,
}

# Default vision-encoder configuration used when `BagelConfig` is built without an explicit
# `vision_config` (a SigLIP vision tower without the pooling head).
vit_config = {
    "hidden_size": 1152,
    "image_size": 980,
    "intermediate_size": 4304,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 16,
    # Restored to the full depth; it had been truncated to 1 layer as a debugging shortcut.
    "num_hidden_layers": 26,
    "patch_size": 14,
    "vision_use_head": False,
}


class BagelConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BagelModel`]. It is used to instantiate a
    Bagel model according to the specified arguments, defining the model architecture: a text backbone
    (Qwen2 by default), a vision encoder (SigLIP by default) and a VQ-VAE image auto-encoder.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict` or `PretrainedConfig`, *optional*):
            Configuration of the text backbone. When `None`, defaults to a Qwen2 configuration (`llm_config`).
        vision_config (`dict` or `PretrainedConfig`, *optional*):
            Configuration of the vision encoder. When `None`, defaults to a SigLIP vision configuration
            (`vit_config`).
        vq_config (`dict` or `BagelVQVAEConfig`, *optional*):
            Configuration of the VQ-VAE image auto-encoder. When `None`, defaults to `BagelVQVAEConfig()`.

    Raises:
        ValueError: If any sub-config argument is of an unsupported type.
    """

    model_type = "bagel"
    sub_configs = {
        "text_config": AutoConfig,
        "vision_config": AutoConfig,
        "vq_config": BagelVQVAEConfig,
    }

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        vq_config=None,
        **kwargs,
    ):
        if isinstance(text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "qwen2")
            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            logger.info("`text_config` is None. Initializing with default values")
            self.text_config = CONFIG_MAPPING["qwen2"](**llm_config)
        elif isinstance(text_config, PretrainedConfig):
            self.text_config = text_config
        else:
            raise ValueError(
                "Invalid type for `text_config`. Must be either `dict` or `Qwen2Config`."
                f" Type found: {type(text_config)}"
            )

        # Fixed copy-paste bug: this branch previously dispatched on `text_config` and built
        # `self.vision_config` from the *text* config; it must key off `vision_config`.
        if isinstance(vision_config, dict):
            vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            logger.info("`vision_config` is None. Initializing with default values")
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](**vit_config)
        elif isinstance(vision_config, PretrainedConfig):
            self.vision_config = vision_config
        else:
            raise ValueError(
                "Invalid type for `vision_config`. Must be either `dict` or `SiglipVisionConfig`."
                f" Type found: {type(vision_config)}"
            )

        if vq_config is None:
            logger.info("`vq_config` is None. Initializing with default BagelVQVAEConfig values")
            self.vq_config = BagelVQVAEConfig()
        elif isinstance(vq_config, dict):
            self.vq_config = BagelVQVAEConfig(**vq_config)
        elif isinstance(vq_config, BagelVQVAEConfig):
            self.vq_config = vq_config
        else:
            raise ValueError(
                "Invalid type for `vq_config`. Must be either `dict` or `BagelVQVAEConfig`."
                f" Type found: {type(vq_config)}"
            )

        # Fixed architecture constants (not yet exposed as init arguments).
        self.vit_max_num_patch_per_side = 70  # max ViT patches per image side
        self.max_latent_size = 64  # max VQ-VAE latent grid side
        self.timestep_shift = 1.0  # NOTE(review): presumably a diffusion/flow timestep shift — confirm
        super().__init__(**kwargs)


# Public API of this module, re-exported lazily via the package `__init__.py`.
__all__ = ["BagelVQVAEConfig", "BagelConfig"]
Loading