Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
574f25a
first commit
yaswanth19 Jun 3, 2025
31dfdac
changes
yaswanth19 Jun 12, 2025
9eb25fb
dump
yaswanth19 Jun 14, 2025
c479731
Add ae model arc🚀
yaswanth19 Jun 14, 2025
ad2fea1
Merge branch 'main' into add-bagel
yaswanth19 Jun 14, 2025
319aa06
Merge branch 'main' into add-bagel
yaswanth19 Jun 24, 2025
4326649
Dump changes for now
yaswanth19 Jun 24, 2025
7706ed5
config working
yaswanth19 Jun 24, 2025
f1b1615
added processor file draft
Shakib-IO Jun 26, 2025
f0d16d0
A few more missing pieces
yaswanth19 Jun 28, 2025
a722f37
cleanup
yaswanth19 Jun 28, 2025
9a20d92
some more progress
yaswanth19 Jun 28, 2025
99fb8d8
vision forward pass succesfull
yaswanth19 Jun 28, 2025
764452f
remove
yaswanth19 Jun 28, 2025
00d0178
Merge branch 'main' into add-bagel
yaswanth19 Jun 28, 2025
e8a9887
Image Processor
Shakib-IO Jul 1, 2025
941fda9
Soem progress and generate is working
yaswanth19 Jul 5, 2025
22b6bf9
add converstion mapping
yaswanth19 Jul 5, 2025
b179da2
Merge branch 'main' into add-bagel
yaswanth19 Jul 5, 2025
53f405b
Merge branch 'main' into add-bagel
yaswanth19 Jul 10, 2025
ea9c1de
up
yaswanth19 Jul 10, 2025
a32fcd5
Merge branch 'main' into add-bagel
yaswanth19 Jul 10, 2025
348ece1
import fixes
yaswanth19 Jul 11, 2025
2ec43ac
Update Image Processor
Shakib-IO Jul 12, 2025
49f31a7
add processing_bagel
Shakib-IO Jul 15, 2025
a27f131
Update bagel
Shakib-IO Jul 15, 2025
d88508f
Processor update
yaswanth19 Jul 16, 2025
6fe0785
raw logits matching with some hacks
yaswanth19 Jul 18, 2025
817c3b2
Some fixes
yaswanth19 Jul 19, 2025
c33fe42
support both transforms
yaswanth19 Jul 20, 2025
48a56d2
Merge branch 'main' into add-bagel
yaswanth19 Jul 25, 2025
f7b01a2
Don't know where it's going
yaswanth19 Aug 2, 2025
3a1c27c
This is working atleast
yaswanth19 Aug 2, 2025
84ff02f
rough forward step logic - still need to fix cfg
yaswanth19 Aug 3, 2025
aacb512
enough for today
yaswanth19 Aug 3, 2025
f5df973
Merge branch 'main' into add-bagel
yaswanth19 Aug 10, 2025
757bb26
Merge branch 'main' into add-bagel
yaswanth19 Aug 12, 2025
c480758
make style
yaswanth19 Aug 10, 2025
cad7bd1
temp push
yaswanth19 Aug 12, 2025
fb5c5b5
docs
yaswanth19 Aug 12, 2025
6349ee6
cleanup
yaswanth19 Aug 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,8 @@
title: Aria
- local: model_doc/aya_vision
title: AyaVision
- local: model_doc/bagel
title: Bagel
- local: model_doc/blip
title: BLIP
- local: model_doc/blip-2
Expand Down
52 changes: 52 additions & 0 deletions docs/source/en/model_doc/bagel.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<!--Copyright 2025 ByteDance and The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>

# Bagel


## BagelConfig

[[autodoc]] BagelConfig

## BagelProcessor

[[autodoc]] BagelProcessor

## BagelImageProcessor

[[autodoc]] BagelImageProcessor

## BagelImageProcessorFast

[[autodoc]] BagelImageProcessorFast

## BagelModel

[[autodoc]] BagelModel
- forward

## BagelForConditionalGeneration

[[autodoc]] BagelForConditionalGeneration
- forward
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .auto import *
from .autoformer import *
from .aya_vision import *
from .bagel import *
from .bamba import *
from .bark import *
from .bart import *
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
("audio-spectrogram-transformer", "ASTConfig"),
("autoformer", "AutoformerConfig"),
("aya_vision", "AyaVisionConfig"),
("bagel", "BagelConfig"),
("bamba", "BambaConfig"),
("bark", "BarkConfig"),
("bart", "BartConfig"),
Expand Down Expand Up @@ -441,6 +442,7 @@
("audio-spectrogram-transformer", "Audio Spectrogram Transformer"),
("autoformer", "Autoformer"),
("aya_vision", "AyaVision"),
("bagel", "Bagel"),
("bamba", "Bamba"),
("bark", "Bark"),
("bart", "BART"),
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("audio-spectrogram-transformer", "ASTModel"),
("autoformer", "AutoformerModel"),
("aya_vision", "AyaVisionModel"),
("bagel", "BagelModel"),
("bamba", "BambaModel"),
("bark", "BarkModel"),
("bart", "BartModel"),
Expand Down Expand Up @@ -409,6 +410,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
[
# Model for pre-training mapping
("albert", "AlbertForPreTraining"),
("bagel", "BagelForConditionalGeneration"),
("bart", "BartForConditionalGeneration"),
("bert", "BertForPreTraining"),
("big_bird", "BigBirdForPreTraining"),
Expand Down Expand Up @@ -959,6 +961,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
[
("aria", "AriaForConditionalGeneration"),
("aya_vision", "AyaVisionForConditionalGeneration"),
("bagel", "BagelForConditionalGeneration"),
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
("altclip", "AltCLIPProcessor"),
("aria", "AriaProcessor"),
("aya_vision", "AyaVisionProcessor"),
("bagel", "BagelProcessor"),
("bark", "BarkProcessor"),
("blip", "BlipProcessor"),
("blip-2", "Blip2Processor"),
Expand Down
29 changes: 29 additions & 0 deletions src/transformers/models/bagel/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2025 ByteDance and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    # Static type checkers (and IDEs) see the real symbols; at runtime the lazy
    # module below defers these imports until an attribute is first accessed.
    from .configuration_bagel import *
    from .image_processing_bagel import *
    from .modeling_bagel import *
    from .processing_bagel import *
else:
    import sys

    # Replace this module object with a lazy proxy so heavy submodules
    # (e.g. modeling_bagel, which imports torch) are only loaded on demand.
    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
190 changes: 190 additions & 0 deletions src/transformers/models/bagel/configuration_bagel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/bagel/modular_bagel.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_bagel.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 ByteDance and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Optional

from ...configuration_utils import PretrainedConfig
from ...modeling_utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class BagelVQVAEConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of the Bagel VQ-VAE image auto-encoder, used by
    [`BagelModel`] for image generation. It is used to instantiate the auto-encoder according to the specified
    arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        double_latent (`bool`, *optional*, defaults to `False`):
            Whether the encoder outputs twice the latent channels (e.g. to hold a mean/log-variance pair).
        latent_channels (`int`, *optional*, defaults to 16):
            Number of channels of the latent representation.
        num_patches (`int`, *optional*, defaults to 32):
            Number of latent patches per image side.
        latent_patch_size (`int`, *optional*, defaults to 2):
            Patch size used when (un)patchifying the latent grid.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        out_channels (`int`, *optional*, defaults to 3):
            Number of output image channels.
        base_channels (`int`, *optional*, defaults to 128):
            Base channel count of the encoder/decoder, scaled per stage by `channel_multiplier`.
        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 2, 4, 4]`):
            Per-resolution-stage multipliers applied to `base_channels`.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Number of residual blocks per resolution stage.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the weight initializer.
        scale_factor (`float`, *optional*, defaults to 0.2611):
            Scaling factor applied to the latents.
        shift_factor (`float`, *optional*, defaults to 0.1159):
            Shift applied to the latents together with `scale_factor`.
        downsample (`int`, *optional*, defaults to 8):
            Total spatial downsampling factor of the encoder.
    """

    model_type = "bagel_vqvae"
    base_config_key = "vq_config"

    def __init__(
        self,
        double_latent: bool = False,
        latent_channels: int = 16,
        num_patches: int = 32,
        latent_patch_size: int = 2,
        in_channels: int = 3,
        out_channels: int = 3,
        base_channels: int = 128,
        channel_multiplier: Optional[list[int]] = None,
        num_res_blocks: int = 2,
        dropout: float = 0.0,
        initializer_range: float = 0.02,
        scale_factor: float = 0.2611,
        shift_factor: float = 0.1159,
        downsample: int = 8,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.double_latent = double_latent
        self.latent_channels = latent_channels
        self.in_channels = in_channels
        self.base_channels = base_channels
        # `None` sentinel instead of a mutable default argument: each instance gets a fresh list.
        self.channel_multiplier = [1, 2, 4, 4] if channel_multiplier is None else channel_multiplier
        self.num_res_blocks = num_res_blocks
        self.dropout = dropout
        self.initializer_range = initializer_range
        self.num_patches = num_patches
        self.out_channels = out_channels
        self.scale_factor = scale_factor
        self.shift_factor = shift_factor
        self.latent_patch_size = latent_patch_size
        self.downsample = downsample


# Default text-backbone configuration used when `BagelConfig` is built without an explicit
# `text_config`. Hyper-parameters match a 28-layer Qwen2 model (presumably Qwen2-7B — confirm
# against the released BAGEL checkpoint).
llm_config = {
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "eos_token_id": 151645,
    "hidden_act": "silu",
    "hidden_size": 3584,
    "initializer_range": 0.02,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "max_window_layers": 28,
    "model_type": "qwen2",
    "num_attention_heads": 28,
    # Restored to the full depth; it had been truncated to 1 layer as a debugging shortcut.
    "num_hidden_layers": 28,
    "num_key_value_heads": 4,
    "rms_norm_eps": 1e-06,
    "rope_theta": 1000000.0,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.43.1",
    "use_cache": True,
    "vocab_size": 152064,
}

# Default vision-encoder configuration used when `BagelConfig` is built without an explicit
# `vision_config` (a SigLIP vision tower without the pooling head).
vit_config = {
    "hidden_size": 1152,
    "image_size": 980,
    "intermediate_size": 4304,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 16,
    # Restored to the full depth; it had been truncated to 1 layer as a debugging shortcut.
    "num_hidden_layers": 26,
    "patch_size": 14,
    "vision_use_head": False,
}


class BagelConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BagelModel`]. It is used to instantiate a
    Bagel model according to the specified arguments, defining the model architecture: a text backbone
    (Qwen2 by default), a vision encoder (SigLIP by default) and a VQ-VAE image auto-encoder.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict` or `PretrainedConfig`, *optional*):
            Configuration of the text backbone. When `None`, defaults to a Qwen2 configuration (`llm_config`).
        vision_config (`dict` or `PretrainedConfig`, *optional*):
            Configuration of the vision encoder. When `None`, defaults to a SigLIP vision configuration
            (`vit_config`).
        vq_config (`dict` or `BagelVQVAEConfig`, *optional*):
            Configuration of the VQ-VAE image auto-encoder. When `None`, defaults to `BagelVQVAEConfig()`.

    Raises:
        ValueError: If any sub-config argument is of an unsupported type.
    """

    model_type = "bagel"
    sub_configs = {
        "text_config": AutoConfig,
        "vision_config": AutoConfig,
        "vq_config": BagelVQVAEConfig,
    }

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        vq_config=None,
        **kwargs,
    ):
        if isinstance(text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "qwen2")
            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            logger.info("`text_config` is None. Initializing with default values")
            self.text_config = CONFIG_MAPPING["qwen2"](**llm_config)
        elif isinstance(text_config, PretrainedConfig):
            self.text_config = text_config
        else:
            raise ValueError(
                "Invalid type for `text_config`. Must be either `dict` or `Qwen2Config`."
                f" Type found: {type(text_config)}"
            )

        # Fixed copy-paste bug: this branch previously dispatched on `text_config` and built
        # `self.vision_config` from the *text* config; it must key off `vision_config`.
        if isinstance(vision_config, dict):
            vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            logger.info("`vision_config` is None. Initializing with default values")
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](**vit_config)
        elif isinstance(vision_config, PretrainedConfig):
            self.vision_config = vision_config
        else:
            raise ValueError(
                "Invalid type for `vision_config`. Must be either `dict` or `SiglipVisionConfig`."
                f" Type found: {type(vision_config)}"
            )

        if vq_config is None:
            logger.info("`vq_config` is None. Initializing with default BagelVQVAEConfig values")
            self.vq_config = BagelVQVAEConfig()
        elif isinstance(vq_config, dict):
            self.vq_config = BagelVQVAEConfig(**vq_config)
        elif isinstance(vq_config, BagelVQVAEConfig):
            self.vq_config = vq_config
        else:
            raise ValueError(
                "Invalid type for `vq_config`. Must be either `dict` or `BagelVQVAEConfig`."
                f" Type found: {type(vq_config)}"
            )

        # Fixed architecture constants (not yet exposed as init arguments).
        self.vit_max_num_patch_per_side = 70  # max ViT patches per image side
        self.max_latent_size = 64  # max VQ-VAE latent grid side
        self.timestep_shift = 1.0  # NOTE(review): presumably a diffusion/flow timestep shift — confirm
        super().__init__(**kwargs)


# Public API of this module, re-exported lazily via the package `__init__.py`.
__all__ = ["BagelVQVAEConfig", "BagelConfig"]
Loading