Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0024aa7
mtmd : add Nemotron 3 Nano Omni support (parakeet)
danbev Apr 29, 2026
e557406
mtmd : generate rel pos tensor in graph instead of in conversion [no ci]
danbev Apr 30, 2026
a9929d4
mtmd : add clip_get_model to clip API [no ci]
danbev Apr 30, 2026
8e279f4
mtmd : read mel_filters and window into hparams
danbev Apr 30, 2026
ffd1b99
mtmd : use set_input_f32 lambda [no ci]
danbev May 1, 2026
8af100f
mtmd : add better asserts for mel_filters and hann window [no ci]
danbev May 5, 2026
b5a35e0
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 5, 2026
7ed9294
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 8, 2026
49658ba
mtmd : add missing size_t cast
danbev May 8, 2026
9a8398e
mtmd : change type of pad to size_t
danbev May 8, 2026
6ba52fc
mtmd : zero initialize samples_padded
danbev May 8, 2026
385b2d4
mtmd : remove unused ctx member from parakeet preprocessor
danbev May 8, 2026
cef7ff7
mtmd : make log_mel_spectrogram_parakeet_worker_thread private static
danbev May 11, 2026
681a199
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 12, 2026
44cb51f
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 15, 2026
0cd9e16
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 16, 2026
78e28f4
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 28, 2026
96b1326
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev Jun 1, 2026
656437b
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev Jun 4, 2026
79e1dba
Merge branch 'upstream/master' into nemotron-3-omni-mtmd-audio
danbev Jun 12, 2026
1fcd9db
mtmd : sync/update parakeet impl with latest whisper.cpp
danbev Jun 17, 2026
5b741d1
mtmd : add audio_conv_kernel_size to model conversion
danbev Jun 17, 2026
4f8882b
mtmd : cleanup [no ci]
danbev Jun 17, 2026
01a1f58
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev Jun 18, 2026
c1d465f
conversion : call super().filter_tensors [no ci]
danbev Jun 18, 2026
e49c091
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev Jun 18, 2026
3378340
do not discard result of super filter_tensors
CISC Jun 18, 2026
816d776
mtmd : use build_mm instead of ggml_mul_mat
danbev Jun 18, 2026
882c9b7
mtmd : use build_ffn
danbev Jun 18, 2026
79baf6c
mtmd : move and reuse get_vector lambda
danbev Jun 18, 2026
8835abe
mtmd : use build_inp_raw for parakeet
danbev Jun 22, 2026
602c218
mtmd : throw exception in get_scalar instead of assert
danbev Jun 22, 2026
2ab3beb
mtmd : fix std::min call
danbev Jun 22, 2026
1337d74
mtmd : use .c_str in throw clause in get_vector
danbev Jun 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 50 additions & 5 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4463,29 +4463,49 @@ def dequant_model(self):
return
super().dequant_model()

# Return the audio ("sound") sub-config for this model.
# Looks up "sound_config" in self.global_config with .get(), so a missing key
# yields None instead of raising (assumes global_config is dict-like).
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("sound_config")

# Write the multimodal (vision and, when configured, audio) metadata through
# self.gguf_writer.
# NOTE(review): this is a flattened diff view -- indentation is lost and
# removed/added lines are not marked, so read the statement order with care.
def set_gguf_parameters(self):
# Supply default normalization constants when the preprocessor config omits
# them (the values match the commonly used ImageNet mean/std).
if "image_mean" not in self.preprocessor_config:
self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
if "image_std" not in self.preprocessor_config:
self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]

# With an audio config present: flag both encoders, write the audio
# hyperparameters, and register separate audio / vision projector types.
if self.hparams_audio is not None:
self.has_vision_encoder = True
self.has_audio_encoder = True
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
# The audio layer-norm epsilon is hard-coded here, not read from the config.
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
self.gguf_writer.add_audio_subsampling_factor(self.hparams_audio["subsampling_factor"])
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.PARAKEET)
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
else:
# Vision-only model: a single projector type is written.
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)

super().set_gguf_parameters()
hparams = self.global_config
# NOTE(review): the next call repeats the add_clip_projector_type call from
# the else branch above; it is presumably the line this change deletes
# (moved into the else branch) -- confirm against the real diff.
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
self.gguf_writer.add_vision_use_gelu(True)
# The projector scale factor is the truncated inverse of downsample_ratio;
# with the 0.5 fallback this gives 2.
downsample_ratio = hparams.get("downsample_ratio", 0.5)
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))

# Decide whether a tensor must be stored as F32 regardless of the requested
# quantization; anything not matched below is delegated to the parent class.
# `name` / `new_name` are presumably the source and mapped tensor names.
def tensor_force_quant(self, name, new_name, bid, n_dims):
# Position-embedding tensors are always kept in F32.
if ".position_embd." in new_name or "pos_embed" in new_name:
return gguf.GGMLQuantizationType.F32
# Audio tensors: source name contains "sound_encoder" or the mapped name
# starts with "mm.a.".
# NOTE(review): indentation is lost in this view; the two checks below are
# presumably nested under this condition -- confirm.
if "sound_encoder" in name or new_name.startswith("mm.a."):
# Biases and norm tensors are forced to F32.
if "bias" in new_name or "norm" in new_name:
return gguf.GGMLQuantizationType.F32
# Convolution weights are forced to F32.
if "conv" in new_name and "weight" in new_name:
return gguf.GGMLQuantizationType.F32

return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "input_conditioner" in name:
return

if "language_model" in name:
return

# mtmd does not support video yet so skip tensors related to video.
if "radio_model.model.patch_generator.video_embedder" in name:
return
Expand Down Expand Up @@ -4517,8 +4537,33 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
n_embd = self.hparams["hidden_size"]
data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)

if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
yield from super().modify_tensors(data_torch, name, bid)
# num_batches is only used for training, not inference.
if "conv.norm" in name and "num_batches" in name:
return

if "depthwise_conv.weight" in name:
data_torch = data_torch.unsqueeze(-1)
data_torch = data_torch.permute(3, 1, 0, 2).contiguous()

if "pointwise_conv" in name and name.endswith(".weight"):
if len(data_torch.shape) == 3 and data_torch.shape[2] == 1:
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])

if "subsampling.layers" in name and name.endswith(".bias"):
if len(data_torch.shape) == 1:
data_torch = data_torch.reshape(1, -1, 1, 1)

if "pointwise_conv" in name and name.endswith(".bias"):
if len(data_torch.shape) == 1:
data_torch = data_torch.reshape(1, -1, 1, 1)

if name.startswith(("vision_model.radio_model.model.", "mlp1.", "sound_encoder.", "sound_projection.")):
for mapped_name, tensor in super().modify_tensors(data_torch, name, bid):
if name.startswith("sound_projection.") and mapped_name.startswith("mm.model.mlp."):
mapped_name = mapped_name.replace("mm.model.mlp.", "mm.a.mlp.")
yield (mapped_name, tensor)
else:
yield (name, data_torch)


@ModelBase.register("WavTokenizerDec")
Expand Down
14 changes: 14 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ class ClipAudio:
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
PROJECTION_DIM = "clip.audio.projection_dim"
BLOCK_COUNT = "clip.audio.block_count"
SUBSAMPLING_FACTOR = "clip.audio.subsampling_factor"

class Attention:
HEAD_COUNT = "clip.audio.attention.head_count"
Expand Down Expand Up @@ -854,6 +855,10 @@ class MODEL_TENSOR(IntEnum):
A_ENC_CONV_NORM = auto() # SSM conv
A_ENC_CONV_PW1 = auto()
A_ENC_CONV_PW2 = auto()
A_ENC_CONV_NORM_MEAN = auto() # parakeet
A_ENC_CONV_NORM_VAR = auto() # parakeet
A_ENC_MEL_FILTERS = auto() # parakeet
A_ENC_WINDOW = auto() # parakeet


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
Expand Down Expand Up @@ -1333,6 +1338,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
MODEL_TENSOR.A_ENC_CONV_NORM_MEAN: "a.blk.{bid}.conv_norm_mean",
MODEL_TENSOR.A_ENC_CONV_NORM_VAR: "a.blk.{bid}.conv_norm_var",
MODEL_TENSOR.A_ENC_MEL_FILTERS: "a.mel_filters",
MODEL_TENSOR.A_ENC_WINDOW: "a.window",
# NextN/MTP
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
Expand Down Expand Up @@ -1474,6 +1483,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_ENC_CONV_NORM,
MODEL_TENSOR.A_ENC_CONV_PW1,
MODEL_TENSOR.A_ENC_CONV_PW2,
MODEL_TENSOR.A_ENC_CONV_NORM_MEAN,
MODEL_TENSOR.A_ENC_CONV_NORM_VAR,
MODEL_TENSOR.A_ENC_MEL_FILTERS,
MODEL_TENSOR.A_ENC_WINDOW,
MODEL_TENSOR.A_MM_INP_PROJ,
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
MODEL_TENSOR.A_MM_EMBEDDING,
Expand Down Expand Up @@ -4158,6 +4171,7 @@ class VisionProjectorType:
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
HUNYUANVL = "hunyuanvl"
PARAKEET = "parakeet"


# Items here are (block size, type size)
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1260,6 +1260,9 @@ def add_audio_num_mel_bins(self, value: int) -> None:
# Store `value` as a uint32 under Keys.ClipAudio.Projector.STACK_FACTOR.
def add_audio_stack_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)

# Store `value` as a uint32 under Keys.ClipAudio.SUBSAMPLING_FACTOR.
def add_audio_subsampling_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.SUBSAMPLING_FACTOR, value)

# Store `values` as an array under Keys.xIELU.ALPHA_P.
def add_xielu_alpha_p(self, values: Sequence[float]):
self.add_array(Keys.xIELU.ALPHA_P, values)

Expand Down
40 changes: 40 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,7 @@ class TensorNameMap:
"conformer.pre_encode.conv.{bid}", # lfm2
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
"conformer.subsample_conv_projection.layer{bid}.conv", # gemma4
"sound_encoder.encoder.subsampling.layers.{bid}", # parakeet
),

MODEL_TENSOR.A_ENC_CONV1D_NORM: (
Expand Down Expand Up @@ -1912,20 +1913,23 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.q_proj", # parakeet
),

MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.k_proj", # parakeet
),

MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.v_proj", # parakeet
),

MODEL_TENSOR.A_ENC_ATTN_K_REL: (
Expand Down Expand Up @@ -1953,25 +1957,29 @@ class TensorNameMap:
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
"conformer.layers.{bid}.norm_self_att", # lfm2
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.norm_self_att", # parakeet
),

MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
"conformer.layers.{bid}.attention.post", # gemma3n
"conformer.layers.{bid}.self_attn.post", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.o_proj", # parakeet
),

MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
"conformer.layers.{bid}.norm_out", # lfm2
"conformer.layers.{bid}.attention.post_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.norm_out", # parakeet
),

MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
"sound_encoder.encoder.layers.{bid}.norm_feed_forward1", # parakeet
),

MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
Expand All @@ -1988,6 +1996,7 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward1.linear1", # parakeet
),

MODEL_TENSOR.A_ENC_FFN_GATE: (),
Expand All @@ -1997,24 +2006,28 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward1.linear2", # parakeet
),

MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward2.linear1", # parakeet
),

MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward2.linear2", # parakeet
),

MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
"sound_encoder.encoder.layers.{bid}.norm_feed_forward2", # parakeet
),

MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
Expand All @@ -2029,20 +2042,24 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_LINEAR_POS: (
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2
"conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
"sound_encoder.encoder.layers.{bid}.self_attn.relative_k_proj", # parakeet
),

MODEL_TENSOR.A_ENC_POS_BIAS_U: (
"conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2
"sound_encoder.encoder.layers.{bid}.self_attn.bias_u", # parakeet
),

MODEL_TENSOR.A_ENC_POS_BIAS_V: (
"conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2
"sound_encoder.encoder.layers.{bid}.self_attn.bias_v", # parakeet
),

MODEL_TENSOR.A_ENC_OUT: (
"conformer.pre_encode.out", # lfm2
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported)
"conformer.output_proj", # gemma4
"sound_encoder.encoder.subsampling.linear", # parakeet
),

# note: some tensors below have an "audio." pseudo-prefix, to prevent conflicts with vision tensors
Expand All @@ -2052,6 +2069,7 @@ class TensorNameMap:
"audio.multi_modal_projector.linear_{bid}", # ultravox, meralion
"audio_adapter.model.{bid}", # lfm2
"audio_tower.proj{bid}", # qwen3omni
"sound_projection.linear{bid}", # parakeet (linear1, linear2)
),

MODEL_TENSOR.A_MMPROJ_FC: (
Expand All @@ -2062,6 +2080,7 @@ class TensorNameMap:

MODEL_TENSOR.A_MM_NORM_PRE: (
"audio.multi_modal_projector.ln_pre", # ultravox
"sound_projection.norm", # parakeet
),

MODEL_TENSOR.A_MM_NORM_MID: (
Expand All @@ -2071,26 +2090,39 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_CONV_DW: (
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.depthwise_conv", # parakeet
),

MODEL_TENSOR.A_ENC_CONV_NORM: (
"conformer.layers.{bid}.conv.batch_norm", # lfm2
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.norm", # parakeet
),

MODEL_TENSOR.A_ENC_CONV_NORM_MEAN: (
"sound_encoder.encoder.layers.{bid}.conv.norm.running_mean", # parakeet
),

MODEL_TENSOR.A_ENC_CONV_NORM_VAR: (
"sound_encoder.encoder.layers.{bid}.conv.norm.running_var", # parakeet
),

MODEL_TENSOR.A_ENC_CONV_PW1: (
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.pointwise_conv1", # parakeet
),

MODEL_TENSOR.A_ENC_CONV_PW2: (
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.pointwise_conv2", # parakeet
),

MODEL_TENSOR.A_ENC_NORM_CONV: (
"conformer.layers.{bid}.norm_conv", # lfm2
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.norm_conv", # parakeet
),

MODEL_TENSOR.A_PER_DIM_K_SCALE: (
Expand All @@ -2101,6 +2133,14 @@ class TensorNameMap:
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4
),

MODEL_TENSOR.A_ENC_MEL_FILTERS: (
"sound_encoder.encoder.feature_extractor.featurizer.fb", # parakeet
),

MODEL_TENSOR.A_ENC_WINDOW: (
"sound_encoder.encoder.feature_extractor.featurizer.window", # parakeet
),

MODEL_TENSOR.A_MM_EMBEDDING: (
"model.embed_audio.embedding", # gemma3n
),
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ add_library(mtmd
models/mobilenetv5.cpp
models/youtuvl.cpp
models/yasa2.cpp
models/parakeet.cpp
)

set_target_properties(mtmd PROPERTIES
Expand Down
10 changes: 10 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"

#define KEY_AUDIO_SUBSAMPLING_FACTOR "clip.audio.subsampling_factor"


//
// tensor name constants
Expand Down Expand Up @@ -252,6 +254,12 @@
#define TN_YASA_STAGE_DOWN_CONV "v.stage.%d.down.conv.%s"
#define TN_YASA_STAGE_BLK "v.stage.%d.blk.%d.%s.%s"

// parakeet
#define TN_MEL_FILTERS "a.mel_filters"
#define TN_WINDOW "a.window"
#define TN_CONV_NORM_MEAN "%s.blk.%d.conv_norm_mean"
#define TN_CONV_NORM_VAR "%s.blk.%d.conv_norm_var"

// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

Expand Down Expand Up @@ -304,6 +312,7 @@ enum projector_type {
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_HUNYUANVL,
PROJECTOR_TYPE_PARAKEET,
PROJECTOR_TYPE_UNKNOWN,
};

Expand Down Expand Up @@ -351,6 +360,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
{ PROJECTOR_TYPE_PARAKEET, "parakeet"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
Expand Down
Loading
Loading