Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@
MMPROJ_MODEL_MAP: dict[str, str] = {
"AudioFlamingo3ForConditionalGeneration": "ultravox",
"CogVLMForCausalLM": "cogvlm",
"DeepseekOCR2ForCausalLM": "deepseek",
"DeepseekOCRForCausalLM": "deepseek",
"DotsOCRForCausalLM": "dotsocr",
"Gemma3ForConditionalGeneration": "gemma",
Expand Down
2 changes: 1 addition & 1 deletion conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
# Skip multimodal tensors
if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
or "vision_" in name or "audio_" in name or "sam_model" in name \
or "vision_" in name or "audio_" in name \
or "token2wav." in name or "code2wav." in name \
or "projector." in name or "pre_mm_projector_norm" in name \
or "image_newline" in name or "view_seperator" in name \
Expand Down
68 changes: 56 additions & 12 deletions conversion/deepseek.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@

@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor and record the projector type."""
    super().__init__(*args, **kwargs)
    # Kept as an instance attribute so it is read back later via self.clip_projector_type.
    projector_type = gguf.VisionProjectorType.DEEPSEEKOCR
    self.clip_projector_type = projector_type

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
# default values below are taken from the HF transformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
Expand Down Expand Up @@ -49,22 +53,27 @@ def get_vision_config(self) -> dict[str, Any]:
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")

vision_config['sam'] = vision_config['width']['sam_vit_b']
vision_config.update(vision_config['width']['clip-l-14-224'])
vision_config['hidden_size'] = vision_config['width']
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4
if vision_config['width'].get('clip-l-14-224') is not None:
vision_config.update(vision_config['width']['clip-l-14-224'])
if isinstance(vision_config['width'], int):
Comment thread
CISC marked this conversation as resolved.
vision_config['hidden_size'] = vision_config['width']
if vision_config.get('heads') is not None:
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4

return vision_config

def tensor_force_quant(self, name, new_name, bid, n_dims):
    """Return F32 for tensors whose source name contains a listed marker.

    Any other tensor is handed to the parent implementation unchanged.
    """
    f32_markers = ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_')
    # A single substring scan covers every marker.
    if any(marker in name for marker in f32_markers):
        return gguf.GGMLQuantizationType.F32
    return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Yield the parent's converted tensors, first adding a leading axis to the view separator.

    Tensors whose name ends with "view_seperator" get ``unsqueeze(0)`` applied;
    every other tensor is forwarded to the parent implementation as-is.
    """
    is_view_separator = name.endswith("view_seperator")
    tensor = data_torch.unsqueeze(0) if is_view_separator else data_torch
    yield from super().modify_tensors(tensor, name, bid)

@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
Expand All @@ -81,6 +90,33 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
return super().filter_tensors((name, gen))


@ModelBase.register("DeepseekOCR2ForCausalLM")
class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
    """DeepSeek-OCR-2 variant of the DeepSeek-OCR vision converter.

    Overrides the projector type, supplies fallback hyper-parameters for the
    qwen2 vision encoder, and derives ``hidden_size`` / ``layers`` for the
    vision config.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set after the parent constructor runs so this value is the one that sticks.
        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2

    def set_gguf_parameters(self):
        """Fill in missing vision hparams, then write the GGUF metadata."""
        # the vision tower's qwen2 encoder is built from fixed defaults,
        # see build_qwen2_decoder_as_encoder() in deepencoderv2.py
        fallbacks = (
            ("patch_size", 16),
            ("intermediate_size", 4864),
            ("num_attention_heads", 14),
        )
        for key, value in fallbacks:
            # Only fill values that are absent or explicitly None.
            if self.hparams.get(key) is None:
                self.hparams[key] = value

        super().set_gguf_parameters()

        # qwen2 encoder is GQA: 14 Q heads, 2 KV heads
        self.gguf_writer.add_vision_head_count_kv(2)

    def get_vision_config(self) -> dict[str, Any]:
        """Return the parent's vision config with qwen2-specific fields added."""
        config = super().get_vision_config()
        qwen2_width = config['width']['qwen2-0-5b']
        config['hidden_size'] = qwen2_width['dim']
        # Fall back to 24 layers when the config does not provide a value.
        if config.get('layers') is None:
            config['layers'] = 24
        return config


@ModelBase.register("DeepseekForCausalLM")
class DeepseekModel(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK
Expand Down Expand Up @@ -188,13 +224,21 @@ def __init__(self, *args, **kwargs):
self.origin_hf_arch = hparams.get('architectures', [None])[0]

# special handling for Deepseek OCR
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
# default jinja template
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")

@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
    """Drop DeepSeek-OCR vision-encoder tensors; defer everything else to the parent filter.

    Returns None when the tensor name contains "sam_model" or "qwen2_model",
    otherwise whatever the parent class's filter returns for ``item``.
    """
    tensor_name, _loader = item
    # DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower)
    vision_markers = ("sam_model", "qwen2_model")
    if any(marker in tensor_name for marker in vision_markers):
        return None
    return super().filter_tensors(item)

def set_vocab(self):
try:
self._set_vocab_gpt2()
Expand Down
7 changes: 7 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,8 @@ class MODEL_TENSOR(IntEnum):
V_SAM_NET_3 = auto() # Deepseek-OCR
V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR
V_ENC_EMBD_VSEP = auto() # Deepseek-OCR
V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2
V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2

# audio (mtmd)
A_ENC_EMBD_POS = auto()
Expand Down Expand Up @@ -1327,6 +1329,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2
MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2
# audio (mtmd)
# note: all audio tensor names must use prefix "a." or "mm.a."
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
Expand Down Expand Up @@ -1505,6 +1509,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_SAM_NECK,
MODEL_TENSOR.V_SAM_NET_2,
MODEL_TENSOR.V_SAM_NET_3,
MODEL_TENSOR.V_RESMPL_QUERY_768,
MODEL_TENSOR.V_RESMPL_QUERY_1024,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_EMBD_NORM,
Expand Down Expand Up @@ -4283,6 +4289,7 @@ class VisionProjectorType:
JANUS_PRO = "janus_pro"
DOTSOCR = "dots_ocr"
DEEPSEEKOCR = "deepseekocr"
DEEPSEEKOCR2 = "deepseekocr2"
LFM2A = "lfm2a" # audio
MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
Expand Down
18 changes: 18 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
Expand All @@ -1509,6 +1510,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
Expand All @@ -1533,6 +1535,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_INPUT_NORM: (
Expand All @@ -1554,6 +1557,7 @@ class TensorNameMap:
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
"vision_tower.blocks.{bid}.norm1", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_ATTN_O: (
Expand All @@ -1574,6 +1578,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
"model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
"vision_tower.blocks.{bid}.attn.proj", # dots.ocr
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
Expand Down Expand Up @@ -1603,6 +1608,7 @@ class TensorNameMap:
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
"vision_tower.blocks.{bid}.norm2", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_FFN_UP: (
Expand All @@ -1625,13 +1631,15 @@ class TensorNameMap:
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
"vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
"model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_FFN_DOWN: (
Expand All @@ -1652,6 +1660,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
"model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
),
Expand Down Expand Up @@ -1699,6 +1708,7 @@ class TensorNameMap:
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
"siglip2.vision_model.post_layernorm",
"model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_MM_POST_NORM: (
Expand Down Expand Up @@ -1879,6 +1889,14 @@ class TensorNameMap:
"model.sam_model.net_3",
),

MODEL_TENSOR.V_RESMPL_QUERY_768: (
"model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_RESMPL_QUERY_1024: (
"model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_MM_POST_FC_NORM: (
"model.vision.linear_proj.norm1", # cogvlm
),
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ add_library(mtmd
models/siglip.cpp
models/whisper-enc.cpp
models/deepseekocr.cpp
models/deepseekocr2.cpp
models/mobilenetv5.cpp
models/youtuvl.cpp
models/yasa2.cpp
Expand Down
7 changes: 7 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@
#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s"
#define TN_SAM_NECK "v.sam.neck.%d.%s"
#define TN_SAM_NET "v.sam.net_%d.%s"
// deepseek-ocr-2
#define TN_RESMPL_QUERY "v.resample_query_%d.%s"
// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
Expand Down Expand Up @@ -337,6 +339,7 @@ enum projector_type {
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_DOTS_OCR,
PROJECTOR_TYPE_DEEPSEEKOCR,
PROJECTOR_TYPE_DEEPSEEKOCR2,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
Expand Down Expand Up @@ -386,6 +389,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"},
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
{ PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
Expand Down Expand Up @@ -424,6 +428,9 @@ struct clip_image_f32 {
int ny;

std::vector<float> buf;

// marks the global view (e.g. in DeepSeek-OCR models)
bool add_viewsep = false;
};

//
Expand Down
5 changes: 5 additions & 0 deletions tools/mtmd/clip-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,11 @@ struct clip_model {
int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder

std::vector<clip_layer> sam_layers;

// deepseek-ocr-2
ggml_tensor * resample_query_768 = nullptr;
ggml_tensor * resample_query_1024 = nullptr;

// lfm2 audio
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
Expand Down
Loading