Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@
MMPROJ_MODEL_MAP: dict[str, str] = {
"AudioFlamingo3ForConditionalGeneration": "ultravox",
"CogVLMForCausalLM": "cogvlm",
"DeepseekOCR2ForCausalLM": "deepseek",
"DeepseekOCRForCausalLM": "deepseek",
"DotsOCRForCausalLM": "dotsocr",
"Gemma3ForConditionalGeneration": "gemma",
Expand Down
2 changes: 1 addition & 1 deletion conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
# Skip multimodal tensors
if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
or "vision_" in name or "audio_" in name or "sam_model" in name \
or "vision_" in name or "audio_" in name \
or "token2wav." in name or "code2wav." in name \
or "projector." in name or "pre_mm_projector_norm" in name \
or "image_newline" in name or "view_seperator" in name \
Expand Down
68 changes: 56 additions & 12 deletions conversion/deepseek.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@

@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
# Forward every argument unchanged to the parent initialiser.
super().__init__(*args, **kwargs)
# Projector type later passed to gguf_writer.add_clip_projector_type();
# stored on the instance so that a subclass can assign a different value.
self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
# default values below are taken from the HF transformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
Expand Down Expand Up @@ -49,22 +53,27 @@ def get_vision_config(self) -> dict[str, Any]:
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")

vision_config['sam'] = vision_config['width']['sam_vit_b']
vision_config.update(vision_config['width']['clip-l-14-224'])
vision_config['hidden_size'] = vision_config['width']
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4
if vision_config['width'].get('clip-l-14-224') is not None:
vision_config.update(vision_config['width']['clip-l-14-224'])
if isinstance(vision_config['width'], int):
Comment thread
CISC marked this conversation as resolved.
vision_config['hidden_size'] = vision_config['width']
if vision_config.get('heads') is not None:
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4

return vision_config

def tensor_force_quant(self, name, new_name, bid, n_dims):
    """Force selected vision tensors to be stored as F32.

    Returns ``gguf.GGMLQuantizationType.F32`` when ``name`` contains any of
    the substrings listed below; every other tensor is delegated to the
    parent implementation with the arguments passed through unchanged.
    """
    # One table for all "never quantize" name fragments, so each fragment is
    # tested exactly once instead of being repeated in several branches.
    f32_fragments = (
        '.embeddings.',
        'pos_embed',
        '.rel_pos_h',
        '.rel_pos_w',
        '.neck.',
        '.net_',
    )
    if any(fragment in name for fragment in f32_fragments):
        return gguf.GGMLQuantizationType.F32
    return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Yield the parent's converted tensors for ``name``.

    A tensor whose name ends with "view_seperator" gets a new leading
    dimension (``unsqueeze(0)``) before it is handed to the parent.
    """
    is_view_separator = name.endswith("view_seperator")
    tensor = data_torch.unsqueeze(0) if is_view_separator else data_torch
    yield from super().modify_tensors(tensor, name, bid)

@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
Expand All @@ -81,6 +90,33 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
return super().filter_tensors((name, gen))


@ModelBase.register("DeepseekOCR2ForCausalLM")
class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
    """Converter registered for the "DeepseekOCR2ForCausalLM" architecture.

    Builds on DeepseekOCRVisionModel and overrides the projector type, the
    fallback hyper-parameters and the derived vision config.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Replace the projector type assigned by the parent initialiser.
        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2

    def set_gguf_parameters(self):
        """Fill in missing hyper-parameters, then write the GGUF metadata."""
        # the vision tower's qwen2 encoder is built from fixed defaults,
        # see build_qwen2_decoder_as_encoder() in deepencoderv2.py
        fallback_hparams = {
            "patch_size": 16,
            "intermediate_size": 4864,
            "num_attention_heads": 14,
        }
        for key, fallback in fallback_hparams.items():
            # A key that is present but holds None counts as missing.
            if self.hparams.get(key) is None:
                self.hparams[key] = fallback

        super().set_gguf_parameters()

        # qwen2 encoder is GQA: 14 Q heads, 2 KV heads
        self.gguf_writer.add_vision_head_count_kv(2)

    def get_vision_config(self) -> dict[str, Any]:
        """Return the parent's vision config adjusted for the qwen2 tower."""
        config = super().get_vision_config()
        qwen2_width = config['width']['qwen2-0-5b']
        config['hidden_size'] = qwen2_width['dim']
        # Fall back to 24 layers when the config does not provide a count.
        if config.get('layers') is None:
            config['layers'] = 24
        return config


@ModelBase.register("DeepseekForCausalLM")
class DeepseekModel(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK
Expand Down Expand Up @@ -188,13 +224,21 @@ def __init__(self, *args, **kwargs):
self.origin_hf_arch = hparams.get('architectures', [None])[0]

# special handling for Deepseek OCR
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
# default jinja template
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")

@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
    """Drop DeepSeek-OCR vision-encoder tensors before the parent filter runs.

    Returns ``None`` when the tensor name contains "sam_model" or
    "qwen2_model" (SAM encoder and the DeepSeek-OCR-2 qwen2 tower);
    otherwise returns whatever the parent ``filter_tensors`` returns for
    the same item.
    """
    tensor_name, _loader = item
    vision_markers = ("sam_model", "qwen2_model")
    if any(marker in tensor_name for marker in vision_markers):
        return None
    return super().filter_tensors(item)

def set_vocab(self):
try:
self._set_vocab_gpt2()
Expand Down
7 changes: 7 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,8 @@ class MODEL_TENSOR(IntEnum):
V_SAM_NET_3 = auto() # Deepseek-OCR
V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR
V_ENC_EMBD_VSEP = auto() # Deepseek-OCR
V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2
V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2

# audio (mtmd)
A_ENC_EMBD_POS = auto()
Expand Down Expand Up @@ -1327,6 +1329,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2
MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2
# audio (mtmd)
# note: all audio tensor names must use prefix "a." or "mm.a."
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
Expand Down Expand Up @@ -1505,6 +1509,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_SAM_NECK,
MODEL_TENSOR.V_SAM_NET_2,
MODEL_TENSOR.V_SAM_NET_3,
MODEL_TENSOR.V_RESMPL_QUERY_768,
MODEL_TENSOR.V_RESMPL_QUERY_1024,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_EMBD_NORM,
Expand Down Expand Up @@ -4283,6 +4289,7 @@ class VisionProjectorType:
JANUS_PRO = "janus_pro"
DOTSOCR = "dots_ocr"
DEEPSEEKOCR = "deepseekocr"
DEEPSEEKOCR2 = "deepseekocr2"
LFM2A = "lfm2a" # audio
MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
Expand Down
18 changes: 18 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
Expand All @@ -1509,6 +1510,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
Expand All @@ -1533,6 +1535,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_INPUT_NORM: (
Expand All @@ -1554,6 +1557,7 @@ class TensorNameMap:
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
"vision_tower.blocks.{bid}.norm1", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_ATTN_O: (
Expand All @@ -1574,6 +1578,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
"model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
"vision_tower.blocks.{bid}.attn.proj", # dots.ocr
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
Expand Down Expand Up @@ -1603,6 +1608,7 @@ class TensorNameMap:
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
"vision_tower.blocks.{bid}.norm2", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_FFN_UP: (
Expand All @@ -1625,13 +1631,15 @@ class TensorNameMap:
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
"vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
"model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_ENC_FFN_DOWN: (
Expand All @@ -1652,6 +1660,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
"model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
),
Expand Down Expand Up @@ -1699,6 +1708,7 @@ class TensorNameMap:
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
"siglip2.vision_model.post_layernorm",
"model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_MM_POST_NORM: (
Expand Down Expand Up @@ -1879,6 +1889,14 @@ class TensorNameMap:
"model.sam_model.net_3",
),

MODEL_TENSOR.V_RESMPL_QUERY_768: (
"model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_RESMPL_QUERY_1024: (
"model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2
),

MODEL_TENSOR.V_MM_POST_FC_NORM: (
"model.vision.linear_proj.norm1", # cogvlm
),
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ add_library(mtmd
models/siglip.cpp
models/whisper-enc.cpp
models/deepseekocr.cpp
models/deepseekocr2.cpp
models/mobilenetv5.cpp
models/youtuvl.cpp
models/yasa2.cpp
Expand Down
7 changes: 7 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@
#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s"
#define TN_SAM_NECK "v.sam.neck.%d.%s"
#define TN_SAM_NET "v.sam.net_%d.%s"
// deepseek-ocr-2
#define TN_RESMPL_QUERY "v.resample_query_%d.%s"
// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
Expand Down Expand Up @@ -337,6 +339,7 @@ enum projector_type {
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_DOTS_OCR,
PROJECTOR_TYPE_DEEPSEEKOCR,
PROJECTOR_TYPE_DEEPSEEKOCR2,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
Expand Down Expand Up @@ -386,6 +389,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"},
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
{ PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
Expand Down Expand Up @@ -424,6 +428,9 @@ struct clip_image_f32 {
int ny;

std::vector<float> buf;

// marks the global view, e.g. in DeepSeek-OCR models
bool add_viewsep = false;
};

//
Expand Down
5 changes: 5 additions & 0 deletions tools/mtmd/clip-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,11 @@ struct clip_model {
int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder

std::vector<clip_layer> sam_layers;

// deepseek-ocr-2
ggml_tensor * resample_query_768 = nullptr;
ggml_tensor * resample_query_1024 = nullptr;

// lfm2 audio
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
Expand Down
Loading
Loading