ggml-org · ngxson · Jun 5, 2026 · May 6, 2026 · May 5, 2026 · May 6, 2026
@@ -253,6 +253,7 @@
     "Glm4vMoeForConditionalGeneration": "qwen3vl",
     "GlmOcrForConditionalGeneration": "qwen3vl",
     "GlmasrModel": "ultravox",
+    "Granite4VisionForConditionalGeneration": "granite",
     "GraniteSpeechForConditionalGeneration": "granite",
     "HunYuanVLForConditionalGeneration": "hunyuan",
     "Idefics3ForConditionalGeneration": "smolvlm",

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from typing import Any, Callable, Iterable, TYPE_CHECKING
 
 import torch
@@ -13,7 +14,7 @@
 from .mamba import Mamba2Model
 
 
-@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
+@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE
@@ -46,11 +47,29 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_logit_scale(logits_scale)
             logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 
+        # If being used as the base for Granite4 Vision, add deepstack_layer_arr
+        if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"):
+            normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams)
+            deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels
+            for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map):
+                # Skip the first projector which is handled as the base embedding
+                # stream like normal
+                if proj_idx == 0:
+                    continue
+                deepstack_mapping_arr[llm_layer] = proj_idx
+            self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr)
+
     @classmethod
     def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         name, gen = item
-        if name.startswith("encoder."):
-            return None
+        # Skip multimodal tensors
+        if (
+            name.startswith(("encoder."))
+            or "image_" in name
+            or "layerwise_projectors" in name
+            or "spatial_projectors" in name
+        ):
+            return
         return super().filter_tensors(item)
 
 
@@ -241,7 +260,8 @@ def set_gguf_parameters(self):
         assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
 
     def set_vocab(self):
-        self.hparams["pad_vocab_size_multiple"] = 8
+        # For models with no ssm layers, don't pad for mamba2
+        self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1
         Mamba2Model.set_vocab(self)
 
 
@@ -326,3 +346,133 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 data_torch = data_torch.squeeze(1)
 
         yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Granite4VisionForConditionalGeneration")
+class Granite4VisionMmprojModel(MmprojModel):
+    has_vision_encoder = True
+    has_audio_encoder = False
+
+    @staticmethod
+    def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
+        """Normalize both deepstack and spatial projector maps to the form:
+        (vision_layer, llm_layer, <type>, type_index)
+
+        This is then used to populate the following mappings:
+        - vision_feature_layers (mmproj hparam): ordered list of all
+          vision_layer values where order corresponds with the order of the
+          stacked projector tensors
+          NOTE: Values may appear multiple times for spatial projectors
+        - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to
+          the index of the corresponding projector in the stacked tensors
+        - deepstack_layer_arr (llm hparam): per-text-layer array indicating
+          which input vision feature should be injected at that layer
+          (-1 if none)
+
+        Output: (vision_layer, llm_layer, <type>, type_index)
+        """
+        deepstack_map = global_config.get("deepstack_layer_map", [])  # [[vis_layer, llm_layer], ...]
+        spatial_layers = global_config.get("spatial_target_layers", [])  # [llm_layer, ...]
+        n_text_layers = global_config["text_config"]["num_hidden_layers"]
+        n_vision_layers = global_config["vision_config"]["num_hidden_layers"]
+        normalized_projector_map = []
+        if deepstack_map:
+            for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
+                if vision_layer < 0:
+                    vision_layer = n_vision_layers + vision_layer
+                if llm_layer < 0:
+                    llm_layer = n_text_layers + llm_layer
+                normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))
+        if spatial_layers:
+            spatial_vision_layer = global_config.get("spatial_vision_layer", -1)
+            if spatial_vision_layer < 0:
+                spatial_vision_layer = n_vision_layers + spatial_vision_layer
+            for spatial_idx, llm_layer in enumerate(spatial_layers):
+                normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx))
+        return list(sorted(normalized_projector_map, key=(lambda entry: entry[1])))
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        normalized_projector_map = self.get_normalized_projector_map(self.global_config)
+        self._n_proj = len(normalized_projector_map)
+
+        self._tensor_prefix_map = {
+            f"model.{proj_type}_projectors.{type_idx}": proj_idx
+            for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map)
+        }
+        self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map]
+        self._spatial_offsets = [
+            type_idx if proj_type == "spatial" else -1
+            for _, _, proj_type, type_idx in normalized_projector_map
+        ]
+
+    def set_gguf_parameters(self):
+        assert self.hparams_vision is not None
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)
+
+        # SigLIP encoder hparams
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        # Preprocessor
+        self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))
+
+        # QFormer projector config
+        ds_rate = self.global_config["downsample_rate"]
+        ds_parts = ds_rate.split("/")
+        assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}"
+        query_side, window_side = [int(p) for p in ds_parts]
+        self.gguf_writer.add_vision_projector_query_side(query_side)
+        self.gguf_writer.add_vision_projector_window_side(window_side)
+
+        # Set vision feature layers
+        self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)
+
+        # Set the spatial offests per projector
+        self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)
+
+        # Add flattened image grind pinpoints (resolution candidates internally)
+        if pinpoints := self.global_config.get("image_grid_pinpoints"):
+            # Flatten with h, w -> w, h inversion
+            pinpoints = [val for h, w in pinpoints for val in (w, h)]
+            self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, _ = item
+        if ("vision_model.head" in name or name.startswith("lm_head")):
+            return None
+        return super().filter_tensors(item)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # Detect projector tensors and bin them
+        projector_idx = None
+        for prefix, proj_idx in self._tensor_prefix_map.items():
+            if name.startswith(prefix):
+                projector_idx = proj_idx
+                break
+        if projector_idx is not None:
+            # If this projector tensor has a block id within the projector,
+            # alias the bid to projector_idx
+            #
+            # TODO: currently, none of the Granite 4 Vision models have
+            # projectors with multiple QFormer layers, so the `layer.{}` index
+            # is always 0. This allows us to simply map to a single `bid` that
+            # matches the projector index. If this changes, we'll need a
+            # convention that merges the two IDs.
+            id_matches = list(re.finditer(r"\.([0-9]+)\.", name))
+            all_ids = [int(m.group(1)) for m in id_matches]
+            assert len(all_ids) >= 1 and len(all_ids) <= 2, "Must have at least 1 and at most 2 ids in tensor names"
+            # If not layer id, just use the projector index
+            new_bid = projector_idx
+            if len(all_ids) == 1:
+                new_name = name[:id_matches[0].span(1)[0]] + str(new_bid) + name[id_matches[0].span(1)[1]:]
+            else: # len(all_ids) == 2
+                new_bid = projector_idx # + all_ids[1]
+                new_name = name[:id_matches[0].span(0)[0]] + name[id_matches[0].span(1)[1]:id_matches[1].span(1)[0]] + str(new_bid) + name[id_matches[1].span(1)[1]:]
+            yield from super().modify_tensors(data_torch, new_name, new_bid)
+            return
+        yield from super().modify_tensors(data_torch, name, bid)
@@ -311,6 +311,10 @@ def parse_args() -> argparse.Namespace:
         "--base-model-id", type=str,
         help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
     )
+    parser.add_argument(
+        "--trust-remote-code", default=False, action="store_true",
+        help="trust remote code in the model",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
-def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
+def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]:
     from huggingface_hub import try_to_load_from_cache
 
     # normally, adapter does not come with base model config, we need to load it from AutoConfig
-    config = AutoConfig.from_pretrained(hf_model_id)
+    config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
     cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
     cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
 
@@ -372,13 +376,13 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]
     # load base model
     if base_model_id is not None:
         logger.info(f"Loading base model from Hugging Face: {base_model_id}")
-        hparams, dir_base_model = load_hparams_from_hf(base_model_id)
+        hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code)
     elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
             try:
-                hparams, dir_base_model = load_hparams_from_hf(model_id)
+                hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code)
             except OSError as e:
                 logger.error(f"Failed to load base model config: {e}")
                 logger.error("Please try downloading the base model and add its path to --base")
@@ -393,7 +397,9 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]
 
     with torch.inference_mode():
         try:
-            model_class = get_model_class(hparams["architectures"][0])
+            model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0]
+            logger.info("Using model architecture: %s", model_arch)
+            model_class = get_model_class(model_arch)
         except NotImplementedError:
             logger.error(f"Model {hparams['architectures'][0]} is not supported")
             sys.exit(1)

@@ -128,6 +128,7 @@ class LLM:
         MOE_LATENT_SIZE                   = "{arch}.moe_latent_size"
         NEXTN_PREDICT_LAYERS              = "{arch}.nextn_predict_layers"
         NUM_DEEPSTACK_LAYERS              = "{arch}.n_deepstack_layers"
+        DEEPSTACK_MAPPING                 = "{arch}.deepstack_mapping"
         POOLING_TYPE                      = "{arch}.pooling_type"
         LOGIT_SCALE                       = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID            = "{arch}.decoder_start_token_id"
@@ -325,6 +326,8 @@ class ClipVision:
         WA_PATTERN_MODE       = "clip.vision.wa_pattern_mode"  # used by mimovl, per-layer -1/0/1
         IS_DEEPSTACK_LAYERS   = "clip.vision.is_deepstack_layers"
         WINDOW_SIZE           = "clip.vision.window_size"
+        FEATURE_LAYERS        = "clip.vision.feature_layer" # Granite4 Vision
+        IMAGE_GRID_PINPOINTS  = "clip.vision.image_grid_pinpoints" # Granite4 Vision
 
         class Attention:
             HEAD_COUNT      = "clip.vision.attention.head_count"
@@ -333,6 +336,9 @@ class Attention:
 
         class Projector:
             SCALE_FACTOR    = "clip.vision.projector.scale_factor"
+            QUERY_SIDE      = "clip.vision.projector.query_side"
+            WINDOW_SIDE     = "clip.vision.projector.window_side"
+            SPATIAL_OFFSETS = "clip.vision.projector.spatial_offsets"
 
         class SAM:
             BLOCK_COUNT         = "clip.vision.sam.block_count"
@@ -821,6 +827,31 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY_768   = auto() # Deepseek-OCR-2
     V_RESMPL_QUERY_1024  = auto() # Deepseek-OCR-2
 
+    # qformer projector (vision) - Granite4 Vision
+    V_QF_PROJ_QUERY      = auto()
+    V_QF_PROJ_NORM       = auto()
+    V_QF_PROJ_LINEAR     = auto()
+    V_QF_SELF_ATTN_Q     = auto()
+    V_QF_SELF_ATTN_K     = auto()
+    V_QF_SELF_ATTN_V     = auto()
+    V_QF_SELF_ATTN_O     = auto()
+    V_QF_SELF_ATTN_NORM  = auto()
+    V_QF_CROSS_ATTN_Q    = auto()
+    V_QF_CROSS_ATTN_K    = auto()
+    V_QF_CROSS_ATTN_V    = auto()
+    V_QF_CROSS_ATTN_O    = auto()
+    V_QF_CROSS_ATTN_NORM = auto()
+    V_QF_FFN_UP          = auto()
+    V_QF_FFN_DOWN        = auto()
+    V_QF_FFN_NORM        = auto()
+    V_PROJ_NORM          = auto()
+    # multi-projector (bid => projector id) - Granite4 vision
+    V_MULTI_PROJ_IMG_POS   = auto()
+    V_MULTI_PROJ_QUERY     = auto()
+    V_MULTI_PROJ_NORM      = auto()
+    V_MULTI_PROJ_LINEAR    = auto()
+    V_MULTI_PROJ_POST_NORM = auto()
+
     # audio (mtmd)
     A_ENC_EMBD_POS        = auto()
     A_ENC_EMBD_NORM       = auto()
@@ -885,7 +916,7 @@ class MODEL_TENSOR(IntEnum):
     A_CTC_OUT              = auto()
     A_CTC_OUT_MID          = auto()
     A_ENC_ATTN_REL_POS_EMB = auto()
-    # qformer projector
+    # audio qformer projector
     A_QF_PROJ_QUERY        = auto()
     A_QF_PROJ_NORM         = auto()
     A_QF_PROJ_LINEAR       = auto()
@@ -1337,10 +1368,33 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_SAM_NECK:                "v.sam.neck.{bid}",
     MODEL_TENSOR.V_SAM_NET_2:               "v.sam.net_2",
     MODEL_TENSOR.V_SAM_NET_3:               "v.sam.net_3",
-    MODEL_TENSOR.V_ENC_EMBD_IMGNL:          "v.image_newline", # Deepseek-OCR
+    MODEL_TENSOR.V_ENC_EMBD_IMGNL:          "v.image_newline", # Deepseek-OCR, Granite4Vision
     MODEL_TENSOR.V_ENC_EMBD_VSEP:           "v.view_seperator", # Deepseek-OCR
     MODEL_TENSOR.V_RESMPL_QUERY_768:        "v.resample_query_768", # Deepseek-OCR-2 qwen2
     MODEL_TENSOR.V_RESMPL_QUERY_1024:       "v.resample_query_1024", # Deepseek-OCR-2 qwen2
+    # Granite4 Vision
+    # qformer layers (bid => proj_id)
+    # NOTE: Names align with A_QF_*
+    MODEL_TENSOR.V_QF_SELF_ATTN_Q:          "v.proj_blk.{bid}.self_attn_q",
+    MODEL_TENSOR.V_QF_SELF_ATTN_K:          "v.proj_blk.{bid}.self_attn_k",
+    MODEL_TENSOR.V_QF_SELF_ATTN_V:          "v.proj_blk.{bid}.self_attn_v",
+    MODEL_TENSOR.V_QF_SELF_ATTN_O:          "v.proj_blk.{bid}.self_attn_out",
+    MODEL_TENSOR.V_QF_SELF_ATTN_NORM:       "v.proj_blk.{bid}.self_attn_norm",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_Q:         "v.proj_blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_K:         "v.proj_blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_V:         "v.proj_blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_O:         "v.proj_blk.{bid}.cross_attn_out",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_NORM:      "v.proj_blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.V_QF_FFN_UP:               "v.proj_blk.{bid}.ffn_up",
+    MODEL_TENSOR.V_QF_FFN_DOWN:             "v.proj_blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_QF_FFN_NORM:             "v.proj_blk.{bid}.ffn_norm",
+    # multi-projector (bid => projector ID)
+    MODEL_TENSOR.V_MULTI_PROJ_IMG_POS:   "v.proj_blk.{bid}.img_pos",
+    MODEL_TENSOR.V_MULTI_PROJ_QUERY:     "v.proj_blk.{bid}.query",
+    MODEL_TENSOR.V_MULTI_PROJ_NORM:      "v.proj_blk.{bid}.norm",
+    MODEL_TENSOR.V_MULTI_PROJ_LINEAR:    "v.proj_blk.{bid}.linear",
+    MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: "v.proj_blk.{bid}.post_norm",
+
     # audio (mtmd)
     # note: all audio tensor names must use prefix "a." or "mm.a."
     MODEL_TENSOR.A_ENC_EMBD_POS:            "a.position_embd",
@@ -1522,6 +1576,29 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_SAM_NET_3,
         MODEL_TENSOR.V_RESMPL_QUERY_768,
         MODEL_TENSOR.V_RESMPL_QUERY_1024,
+        MODEL_TENSOR.V_PROJ_NORM,
+        MODEL_TENSOR.V_QF_PROJ_QUERY,
+        MODEL_TENSOR.V_QF_PROJ_NORM,
+        MODEL_TENSOR.V_QF_PROJ_LINEAR,
+        MODEL_TENSOR.V_QF_SELF_ATTN_Q,
+        MODEL_TENSOR.V_QF_SELF_ATTN_K,
+        MODEL_TENSOR.V_QF_SELF_ATTN_V,
+        MODEL_TENSOR.V_QF_SELF_ATTN_O,
+        MODEL_TENSOR.V_QF_SELF_ATTN_NORM,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_Q,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_K,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_V,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_O,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_NORM,
+        MODEL_TENSOR.V_QF_FFN_UP,
+        MODEL_TENSOR.V_QF_FFN_DOWN,
+        MODEL_TENSOR.V_QF_FFN_NORM,
+        MODEL_TENSOR.V_QF_PROJ_NORM,
+        MODEL_TENSOR.V_MULTI_PROJ_IMG_POS,
+        MODEL_TENSOR.V_MULTI_PROJ_QUERY,
+        MODEL_TENSOR.V_MULTI_PROJ_LINEAR,
+        MODEL_TENSOR.V_MULTI_PROJ_NORM,
+        MODEL_TENSOR.V_MULTI_PROJ_POST_NORM,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_EMBD_NORM,
@@ -4388,6 +4465,7 @@ class VisionProjectorType:
     MINICPMV4_6    = "minicpmv4_6"
     GRANITE_SPEECH = "granite_speech"  # audio
     MIMOVL         = "mimovl"
+    GRANITE4_VISION = "granite4_vision"
 
 
 # Items here are (block size, type size)