From 84da624b0221b85f9bfad12ccddfdd2e26c37f66 Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Mon, 1 Dec 2025 15:10:48 +0100
Subject: [PATCH 1/4] convert backbone to gguf

---
 convert_hf_to_gguf.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8ddb6d04cd9..f210e32b817 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -694,6 +694,9 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
         if "llm_config" in config:
             # rename for InternVL
             config["text_config"] = config["llm_config"]
+        if "lfm" in config:
+            # rename for LFM2-Audio
+            config["text_config"] = config["lfm"]
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
@@ -9616,12 +9619,12 @@ def set_gguf_parameters(self):
         self._add_feed_forward_length()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
-        if is_vision_tensor:
-            # skip vision tensors
+        if self._is_vision_tensor(name) or self._is_audio_tensor(name):
+            # skip multimodal tensors
             return []
 
-        name = name.replace("language_model.", "")
+        name = name.replace("language_model.", "")  # vision
+        name = name.replace("lfm.", "model.")       # audio
 
         # conv op requires 2d tensor
         if 'conv.conv' in name:
@@ -9629,6 +9632,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
+    def _is_vision_tensor(self, name: str) -> bool:
+        return "vision_tower" in name or "multi_modal_projector" in name
+
+    def _is_audio_tensor(self, name: str) -> bool:
+        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+
 
 @ModelBase.register("Lfm2MoeForCausalLM")
 class LFM2MoeModel(TextModel):

From 9f1d9e41c202fb781d56e0a45bbb26505cea77ea Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Tue, 2 Dec 2025 13:15:57 +0100
Subject: [PATCH 2/4] convert mmproj to gguf

---
 convert_hf_to_gguf.py          | 64 ++++++++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py      | 43 +++++++++++++++++++++++
 gguf-py/gguf/tensor_mapping.py | 59 +++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f210e32b817..6ae1f40e169 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9743,6 +9743,70 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
     return []  # skip other tensors
 
 
+@ModelBase.register("Lfm2AudioForConditionalGeneration")
+class LFM2AudioModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+    model_name = "Lfm2AudioEncoder"
+
+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("encoder")
+
+    def set_gguf_parameters(self):
+        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and 
".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # skip language model tensors + if name.startswith("lfm."): + return [] + + # for training only + if any(p in name for p in ["audio_loss_weight"]): + return [] + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return [] + + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + + if len(self._batch_norm_tensors[bid]) < 5: + return [] + + weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] + bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] + running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] + running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] + eps = 1e-5 # default value + + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + return [ + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), + ] + + return [(self.map_tensor_name(name), data_torch)] + + @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): model_arch = gguf.MODEL_ARCH.SMALLTHINKER diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2b8489c591b..564ae37835f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -687,6 +687,8 @@ class MODEL_TENSOR(IntEnum): V_TOK_EOI = auto() # cogvlm # audio (mtmd) A_ENC_EMBD_POS = auto() + A_ENC_EMBD_NORM = auto() + A_ENC_EMBD_TO_LOGITS = auto() A_ENC_CONV1D = auto() A_PRE_NORM = auto() A_POST_NORM = auto() @@ -697,8 +699,13 @@ class MODEL_TENSOR(IntEnum): A_ENC_OUTPUT = auto() A_ENC_OUTPUT_NORM = auto() A_ENC_FFN_UP = auto() + A_ENC_FFN_NORM = auto() A_ENC_FFN_GATE = auto() A_ENC_FFN_DOWN = auto() + A_ENC_FFN_UP_1 = auto() + A_ENC_FFN_NORM_1 = auto() + A_ENC_FFN_GATE_1 = auto() + A_ENC_FFN_DOWN_1 = auto() A_MMPROJ = auto() A_MMPROJ_FC = auto() A_MM_NORM_PRE = auto() @@ -710,6 +717,12 @@ class MODEL_TENSOR(IntEnum): NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() + # lfm2 audio + A_ENC_NORM_CONV = auto() + A_ENC_LINEAR_POS = auto() + A_ENC_POS_BIAS_U = auto() + A_ENC_POS_BIAS_V = auto() + A_ENC_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -1059,6 +1072,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_TOK_EOI: "v.eoi", # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", + MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm", + MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", MODEL_TENSOR.A_POST_NORM: "a.post_ln", @@ -1068,9 +1083,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1", MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out", MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2", + MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm", MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up", MODEL_TENSOR.A_ENC_FFN_GATE: 
"a.blk.{bid}.ffn_gate", MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down", + MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1", + MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1", + MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1", + MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1", MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}", MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc", MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre", @@ -1082,6 +1102,12 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", + # lfm2 + MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv", + MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos", + MODEL_TENSOR.A_ENC_POS_BIAS_U: "a.blk.{bid}.pos_bias_u", + MODEL_TENSOR.A_ENC_POS_BIAS_V: "a.blk.{bid}.pos_bias_v", + MODEL_TENSOR.A_ENC_OUT: "a.pre_encode.out", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1137,6 +1163,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_TOK_EOI, # audio MODEL_TENSOR.A_ENC_EMBD_POS, + MODEL_TENSOR.A_ENC_EMBD_NORM, + MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS, MODEL_TENSOR.A_ENC_CONV1D, MODEL_TENSOR.A_PRE_NORM, MODEL_TENSOR.A_POST_NORM, @@ -1146,13 +1174,27 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_INPUT_NORM, MODEL_TENSOR.A_ENC_OUTPUT, MODEL_TENSOR.A_ENC_OUTPUT_NORM, + MODEL_TENSOR.A_ENC_FFN_NORM, MODEL_TENSOR.A_ENC_FFN_UP, MODEL_TENSOR.A_ENC_FFN_GATE, MODEL_TENSOR.A_ENC_FFN_DOWN, + MODEL_TENSOR.A_ENC_FFN_NORM_1, + MODEL_TENSOR.A_ENC_FFN_UP_1, + MODEL_TENSOR.A_ENC_FFN_GATE_1, + MODEL_TENSOR.A_ENC_FFN_DOWN_1, MODEL_TENSOR.A_MMPROJ, MODEL_TENSOR.A_MMPROJ_FC, MODEL_TENSOR.A_MM_NORM_PRE, MODEL_TENSOR.A_MM_NORM_MID, + MODEL_TENSOR.CONVNEXT_DW, + MODEL_TENSOR.CONVNEXT_NORM, + MODEL_TENSOR.CONVNEXT_PW1, + MODEL_TENSOR.CONVNEXT_PW2, + MODEL_TENSOR.A_ENC_NORM_CONV, + MODEL_TENSOR.A_ENC_LINEAR_POS, + MODEL_TENSOR.A_ENC_POS_BIAS_U, + MODEL_TENSOR.A_ENC_POS_BIAS_V, + MODEL_TENSOR.A_ENC_OUT, ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, @@ -3327,6 +3369,7 @@ class VisionProjectorType: LIGHTONOCR = "lightonocr" COGVLM = "cogvlm" JANUS_PRO = "janus_pro" + LFM2A = "lfm2a" # audio # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a7b09739791..ce07128552a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1113,18 +1113,26 @@ class TensorNameMap: MODEL_TENSOR.CONVNEXT_DW: ( "backbone.convnext.{bid}.dwconv", # wavtokenizer + "conformer.layers.{bid}.conv.depthwise_conv", # lfm2 ), MODEL_TENSOR.CONVNEXT_NORM: ( "backbone.convnext.{bid}.norm", # wavtokenizer + "conformer.layers.{bid}.conv.batch_norm", #lfm2 ), MODEL_TENSOR.CONVNEXT_PW1: ( "backbone.convnext.{bid}.pwconv1", # wavtokenizer + "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2 ), MODEL_TENSOR.CONVNEXT_PW2: ( "backbone.convnext.{bid}.pwconv2", # wavtokenizer + "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2 + ), + + MODEL_TENSOR.A_ENC_NORM_CONV: ( + "conformer.layers.{bid}.norm_conv", # lfm2 ), MODEL_TENSOR.CONVNEXT_GAMMA: ( @@ -1502,10 +1510,20 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_EMBD_POS: ( "audio_tower.embed_positions", # ultravox + "audio_embedding.embedding", # lfm2 + ), + + MODEL_TENSOR.A_ENC_EMBD_NORM: ( + "audio_embedding.embedding_norm", # lfm2 + ), + + MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: ( + "audio_embedding.to_logits", # lfm2 ), MODEL_TENSOR.A_ENC_CONV1D: ( 
"audio_tower.conv{bid}", # ultravox + "conformer.pre_encode.conv.{bid}", # lfm2 ), MODEL_TENSOR.A_PRE_NORM: (), @@ -1517,36 +1535,76 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_ATTN_Q: ( "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox + "conformer.layers.{bid}.self_attn.linear_q", # lfm2 ), MODEL_TENSOR.A_ENC_ATTN_K: ( "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox + "conformer.layers.{bid}.self_attn.linear_k", # lfm2 ), MODEL_TENSOR.A_ENC_ATTN_V: ( "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox + "conformer.layers.{bid}.self_attn.linear_v", # lfm2 ), MODEL_TENSOR.A_ENC_INPUT_NORM: ( "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox + "conformer.layers.{bid}.norm_self_att", # lfm2 ), MODEL_TENSOR.A_ENC_OUTPUT: ( "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox + "conformer.layers.{bid}.self_attn.linear_out", # lfm2 ), MODEL_TENSOR.A_ENC_OUTPUT_NORM: ( "audio_tower.layers.{bid}.final_layer_norm", # ultravox + "conformer.layers.{bid}.norm_out", # lfm2 + ), + + MODEL_TENSOR.A_ENC_FFN_NORM: ( + "conformer.layers.{bid}.norm_feed_forward1", # lfm2 ), MODEL_TENSOR.A_ENC_FFN_UP: ( "audio_tower.layers.{bid}.fc1", # ultravox + "conformer.layers.{bid}.feed_forward1.linear1", # lfm2 ), MODEL_TENSOR.A_ENC_FFN_GATE: (), MODEL_TENSOR.A_ENC_FFN_DOWN: ( "audio_tower.layers.{bid}.fc2", # ultravox + "conformer.layers.{bid}.feed_forward1.linear2", # lfm2 + ), + + MODEL_TENSOR.A_ENC_FFN_UP_1: ( + "conformer.layers.{bid}.feed_forward2.linear1", # lfm2 + ), + + MODEL_TENSOR.A_ENC_FFN_DOWN_1: ( + "conformer.layers.{bid}.feed_forward2.linear2", # lfm2 + ), + + MODEL_TENSOR.A_ENC_FFN_NORM_1: ( + "conformer.layers.{bid}.norm_feed_forward2", # lfm2 + ), + + MODEL_TENSOR.A_ENC_LINEAR_POS: ( + "conformer.layers.{bid}.self_attn.linear_pos", # lfm2 + ), + + MODEL_TENSOR.A_ENC_POS_BIAS_U: ( + "conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2 + ), + + MODEL_TENSOR.A_ENC_POS_BIAS_V: ( + "conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2 + ), + + MODEL_TENSOR.A_ENC_OUT: ( + "conformer.pre_encode.out", # lfm2 ), # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors @@ -1554,6 +1612,7 @@ class TensorNameMap: MODEL_TENSOR.A_MMPROJ: ( "audio.multi_modal_projector.linear_{bid}", # ultravox + "audio_adapter.model.{bid}" # lfm2 ), MODEL_TENSOR.A_MMPROJ_FC: ( From b224f3905d5c2b361c6faeb94188f4a7dd940c4e Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Tue, 2 Dec 2025 14:17:47 +0100 Subject: [PATCH 3/4] ASR works --- common/arg.cpp | 2 +- tools/mtmd/clip-impl.h | 13 + tools/mtmd/clip.cpp | 395 +++++++++++++++++++- tools/mtmd/clip.h | 1 - tools/mtmd/mtmd-audio.cpp | 763 ++++++++++++-------------------------- tools/mtmd/mtmd-audio.h | 36 +- tools/mtmd/mtmd-cli.cpp | 11 +- tools/mtmd/mtmd.cpp | 52 ++- 8 files changed, 710 insertions(+), 563 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 52094e3f10a..c3763584a3f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1035,7 +1035,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.system_prompt = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index cd47865bf4a..c12ecfb8317 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -129,6 +129,17 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" +// lfm2 +#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" +#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s" +#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s" +#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s" +#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u" +#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v" +#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s" +#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s" + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -152,6 +163,7 @@ enum projector_type { PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, PROJECTOR_TYPE_LFM2, + PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, @@ -178,6 +190,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, { PROJECTOR_TYPE_LFM2, "lfm2"}, + { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_KIMIVL, "kimivl"}, { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"}, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index ea89259f92d..30cedab1457 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -267,6 +267,29 @@ struct clip_layer { ggml_tensor * deepstack_fc2_w = nullptr; ggml_tensor * deepstack_fc2_b = nullptr; + // lfm2 + ggml_tensor * ff_norm_w = nullptr; + ggml_tensor * ff_norm_b = nullptr; + ggml_tensor * ff_norm_1_w = nullptr; + ggml_tensor * ff_norm_1_b = nullptr; + ggml_tensor * ff_up_1_w = nullptr; + ggml_tensor * ff_up_1_b = nullptr; + ggml_tensor * ff_down_1_w = nullptr; + ggml_tensor * ff_down_1_b = nullptr; + ggml_tensor * pos_bias_u = nullptr; + ggml_tensor * pos_bias_v = nullptr; + ggml_tensor * norm_conv_w = nullptr; + ggml_tensor * norm_conv_b = nullptr; + ggml_tensor * linear_pos_w = nullptr; + ggml_tensor * conv_bn_w = nullptr; + ggml_tensor * conv_bn_b = nullptr; + ggml_tensor * conv_dw_w = nullptr; + ggml_tensor * conv_dw_b = nullptr; + ggml_tensor * conv_pw1_w = nullptr; + ggml_tensor * conv_pw1_b = nullptr; + ggml_tensor * conv_pw2_w = nullptr; + ggml_tensor * conv_pw2_b = nullptr; + bool has_deepstack() const { return deepstack_fc1_w != nullptr; } @@ -399,6 +422,20 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; + // lfm2 + ggml_tensor * pre_encode_conv_0_w = nullptr; + ggml_tensor * pre_encode_conv_0_b = nullptr; + ggml_tensor * pre_encode_conv_2_w = nullptr; + ggml_tensor * pre_encode_conv_2_b = nullptr; + ggml_tensor * pre_encode_conv_3_w = nullptr; + ggml_tensor * pre_encode_conv_3_b = nullptr; + ggml_tensor * pre_encode_conv_5_w = nullptr; + ggml_tensor * pre_encode_conv_5_b = nullptr; + ggml_tensor * pre_encode_conv_6_w = nullptr; + ggml_tensor * pre_encode_conv_6_b = nullptr; + ggml_tensor * pre_encode_out_w = nullptr; + ggml_tensor * pre_encode_out_b = nullptr; + bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == PROJECTOR_TYPE_VOXTRAL; @@ -1999,6 +2036,276 @@ struct clip_graph { return gf; } + ggml_cgraph * build_lfm2_audio() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1; + 
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd); + ggml_set_name(pos_emb, "pos_emb"); + ggml_set_input(pos_emb); + ggml_build_forward_expand(gf, pos_emb); + + ggml_tensor * inp = build_inp_raw(1); + cb(inp, "input", -1); + + auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + + // pre encode, conv subsampling + { + // layer.0 - conv2d + cur = ggml_conv_2d(ctx0, model.pre_encode_conv_0_w, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_0_b, 1, 1, cur->ne[2], 1)); + cb(cur, "conformer.pre_encode.conv.{}", 0); + + // layer.1 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // layer.2 conv2d dw + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_2_w, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_2_b, 1, 1, cur->ne[2], 1)); + cb(cur, "conformer.pre_encode.conv.{}", 2); + + // layer.3 conv2d + cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_3_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_3_b, 1, 1, cur->ne[2], 1)); + cb(cur, "conformer.pre_encode.conv.{}", 3); + + // layer.4 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // layer.5 conv2d dw + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_5_w, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_5_b, 1, 1, cur->ne[2], 1)); + cb(cur, "conformer.pre_encode.conv.{}", 5); + + // layer.6 conv2d + cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_6_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_6_b, 1, 1, cur->ne[2], 1)); + cb(cur, "conformer.pre_encode.conv.{}", 6); + + // layer.7 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // flatten channel and frequency axis + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); + + // calculate out + cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur); + cur = ggml_add(ctx0, cur, model.pre_encode_out_b); + cb(cur, "conformer.pre_encode.out", -1); + } + + // pos_emb + cb(pos_emb, "pos_emb", -1); + + for (int il = 0; il < hparams.n_layer; il++) { + auto & layer = model.layers[il]; + + auto * residual = cur; + + cb(cur, "layer.in", il); + + // feed_forward1 + cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_feed_forward1", il); + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + nullptr, nullptr, + layer.ff_down_w, layer.ff_down_b, + FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams + cb(cur, "conformer.layers.{}.feed_forward1.linear2", il); + + const auto fc_factor = 0.5f; // TODO(tarek): read from config + residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor)); + + + + // self-attention + { + cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_self_att", il); + + cb(cur, "conformer.layers.{}.self_attn.id", il); + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + cb(Qcur, "conformer.layers.{}.self_attn.linear_q", il); + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + cb(Kcur, "conformer.layers.{}.self_attn.linear_k", il); + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, 
cur);
+            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+            cb(Vcur, "conformer.layers.{}.self_attn.linear_v", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
+
+            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+
+            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+            Q_bias_u = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
+            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
+            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
+
+            auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
+            cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
+            p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
+
+            Q_bias_v = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
+            cb(Q_bias_v, "conformer.layers.{}.self_attn.id0", il);
+            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 2, 0, 3));
+            cb(p, "conformer.layers.{}.self_attn.id1", il);
+
+            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 0, 2, 3));
+            auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
+            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
+
+            // rel shift
+            {
+                const auto pos_len = matrix_bd->ne[0];
+                const auto q_len   = matrix_bd->ne[1];
+                const auto h       = matrix_bd->ne[2];
+                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
+                matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
+                            q_len, pos_len, h,
+                            matrix_bd->nb[1], matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
+            }
+
+            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
+                        matrix_ac->ne[0], matrix_bd->ne[1], matrix_bd->ne[2],
+                        matrix_bd->nb[1], matrix_bd->nb[2], 0));
+            auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
+            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
+            cb(scores, "conformer.layers.{}.self_attn.id0", il);
+
+            ggml_tensor * attn = ggml_soft_max(ctx0, scores);
+            // TODO(tarek): combine permutes
+            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 0, 2, 1, 3));
+            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 0, 2, 3));
+            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
+            // TODO(tarek): combine permutes
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 1, 0, 2, 3));
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+            x = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+
+            x = ggml_mul_mat(ctx0, layer.o_w, x);
+            ggml_tensor * out = ggml_add(ctx0, x, layer.o_b);
+            cb(out, "conformer.layers.{}.self_attn.linear_out", il);
+
+            cur = out;
+        }
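Editor's note: the `// rel shift` block above is the Transformer-XL relative-shift trick, done with pad/roll/reshape (the ggml version additionally works on transposed axes, hence the surrounding permutes). A minimal NumPy sketch of the canonical form, checked against a direct gather; shapes and names here are illustrative, not part of the patch:

```python
# Sketch: rel-shift. x[h, q, p] scores query q against relative-position
# index p (p = T-1 encodes offset 0); the shift regroups it so that entry
# (q, k) picks the score at relative offset k - q.
import numpy as np

def rel_shift(x):                                # x: (heads, T, 2*T - 1)
    h, T, P = x.shape
    x = np.pad(x, ((0, 0), (0, 0), (1, 0)))      # prepend one zero column
    x = x.reshape(h, P + 1, T)[:, 1:, :]         # drop the first "row"
    return x.reshape(h, T, P)[:, :, :T]          # keep the first T columns

h, T = 2, 5
x = np.random.rand(h, T, 2 * T - 1)
ref = np.array([[x[i, q, (T - 1) + np.arange(T) - q] for q in range(T)]
                for i in range(h)])
assert np.allclose(rel_shift(x), ref)
```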
+
+        residual = ggml_add(ctx0, residual, cur);
+        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_conv", il);
+
+        // conv
+        {
+            auto * x = cur;
+            auto * conv_pw1_w = ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
+            x = ggml_mul_mat(ctx0, conv_pw1_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw1_b);
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
+
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+
+            // torch.nn.functional.glu
+            // TODO(tarek): check if llama.cpp impl exists
+            {
+                int64_t d = x->ne[0] / 2;
+                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            }
+
+            // use ggml_ssm_conv for f32 precision
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x = ggml_cont(ctx0, x);
+            auto * conv_dw_w = ggml_reshape_2d(ctx0, layer.conv_dw_w, layer.conv_dw_w->ne[0], layer.conv_dw_w->ne[2]);
+            x = ggml_ssm_conv(ctx0, x, conv_dw_w);
+            x = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+
+            cb(x, "conformer.layers.{}.conv.depthwise_conv", il);
+
+            // TODO(tarek): fold into another op
+            {
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+                x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_bn_w), layer.conv_bn_b);
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+                cb(x, "conformer.layers.{}.conv.batch_norm", il);
+            }
+            x = ggml_silu(ctx0, x);
+
+            // pointwise_conv2
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            auto * conv_pw2_w = ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
+            x = ggml_mul_mat(ctx0, conv_pw2_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw2_b);
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            cb(x, "conformer.layers.{}.conv.pointwise_conv2", il);
+
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            cur = x;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+
+        cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
+
+        cur = build_ffn(cur,
+            layer.ff_up_1_w, layer.ff_up_1_b,
+            nullptr, nullptr,
+            layer.ff_down_1_w, layer.ff_down_1_b,
+            FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+        cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
+
+        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+        cb(residual, "conformer.layers.{}.conv.id", il);
+
+        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_out", il);
+    }
+
+    // audio adapter
+    {
+        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+        cb(cur, "audio_adapter.model.{}", 0);
+        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+        cur = ggml_add(ctx0, cur, model.mm_1_b);
+        cb(cur, "audio_adapter.model.{}", 1);
+        cur = ggml_gelu_erf(ctx0, cur);
+        cb(cur, "audio_adapter.model.{}", 2);
+        cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
+        cur = ggml_add(ctx0, cur, model.mm_3_b);
+        cb(cur, "audio_adapter.model.{}", 3);
+    }
+
+    cb(cur, "projected", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
+
 private:
     //
     // utility functions
@@ -2532,6 +2839,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             res = graph.build_cogvlm();
         } break;
+    case PROJECTOR_TYPE_LFM2A:
+        {
+            res = graph.build_lfm2_audio();
+        } break;
     default:
         {
             res = graph.build_llava();
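Editor's note: the `batch_norm` block above (marked "fold into another op") applies what patch 2 already folded at conversion time: the BatchNorm running statistics collapsed into one scale `a` and one shift `b`, so inference is a single mul/add. A quick PyTorch check of that folding; channel count and shapes are arbitrary test values, not model dimensions:

```python
# Sketch: fold BatchNorm1d running stats into (a, b) as the converter does,
# then verify it matches eval-mode BatchNorm exactly.
import torch

bn = torch.nn.BatchNorm1d(8).eval()
for t in (bn.weight.data, bn.bias.data, bn.running_mean, bn.running_var):
    t.uniform_(0.5, 1.5)

with torch.no_grad():
    a = bn.weight / torch.sqrt(bn.running_var + bn.eps)   # eps = 1e-5 default
    b = bn.bias - bn.running_mean * a

    x = torch.randn(2, 8, 16)                             # (batch, channels, time)
    torch.testing.assert_close(bn(x), x * a[None, :, None] + b[None, :, None])
```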
@@ -3248,6 +3559,58 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                 } break;
+            case PROJECTOR_TYPE_LFM2A:
+                {
+                    model.pre_encode_conv_0_w = get_tensor(string_format(TN_CONV1D, 0, "weight"));
+                    model.pre_encode_conv_0_b = get_tensor(string_format(TN_CONV1D, 0, "bias"));
+                    model.pre_encode_conv_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    model.pre_encode_conv_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    model.pre_encode_conv_3_w = get_tensor(string_format(TN_CONV1D, 3, "weight"));
+                    model.pre_encode_conv_3_b = get_tensor(string_format(TN_CONV1D, 3, "bias"));
+                    model.pre_encode_conv_5_w = get_tensor(string_format(TN_CONV1D, 5, "weight"));
+                    model.pre_encode_conv_5_b = get_tensor(string_format(TN_CONV1D, 5, "bias"));
+                    model.pre_encode_conv_6_w = get_tensor(string_format(TN_CONV1D, 6, "weight"));
+                    model.pre_encode_conv_6_b = get_tensor(string_format(TN_CONV1D, 6, "bias"));
+                    model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
+                    model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
+
+                    model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
+                    model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
+                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                    model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
+                    model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
+
+                    for (int il = 0; il < hparams.n_layer; ++il) {
+                        auto & layer = model.layers[il];
+
+                        layer.ff_norm_w   = get_tensor(string_format(TN_FFN_NORM,   prefix, il, "weight"));
+                        layer.ff_norm_b   = get_tensor(string_format(TN_FFN_NORM,   prefix, il, "bias"));
+                        layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
+                        layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
+                        layer.ff_up_1_w   = get_tensor(string_format(TN_FFN_UP_1,   prefix, il, "weight"));
+                        layer.ff_up_1_b   = get_tensor(string_format(TN_FFN_UP_1,   prefix, il, "bias"));
+                        layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
+                        layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
+
+                        layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
+                        layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
+
+                        layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
+                        layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
+
+                        layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
+
+                        layer.conv_bn_w  = get_tensor(string_format("convnext.%d.norm.%s", il, "weight"));
+                        layer.conv_bn_b  = get_tensor(string_format("convnext.%d.norm.%s", il, "bias"));
+                        layer.conv_dw_w  = get_tensor(string_format("convnext.%d.dw.%s",   il, "weight"));
+                        layer.conv_dw_b  = get_tensor(string_format("convnext.%d.dw.%s",   il, "bias"));
+                        layer.conv_pw1_w = get_tensor(string_format("convnext.%d.pw1.%s",  il, "weight"));
+                        layer.conv_pw1_b = get_tensor(string_format("convnext.%d.pw1.%s",  il, "bias"));
+                        layer.conv_pw2_w = get_tensor(string_format("convnext.%d.pw2.%s",  il, "weight"));
+                        layer.conv_pw2_b = get_tensor(string_format("convnext.%d.pw2.%s",  il, "bias"));
+                    }
+                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -4604,6 +4967,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             {
                 n_patches += 2; // for BOI and EOI token embeddings
             } break;
+        case PROJECTOR_TYPE_LFM2A:
+            return ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
         default:
             GGML_ABORT("unsupported projector type");
     }
@@ -4955,6 +5320,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 }
                 set_input_i32("pos_w", pos_data);
            } break;
+        case PROJECTOR_TYPE_LFM2A:
+            {
+                GGML_ASSERT(imgs.entries.size() == 1);
+                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
+
+                auto d_model = 512;
+                auto seq_len = n_frames * 2 - 1;
+                std::vector<float> pos_emb(d_model * seq_len);
+                auto half = d_model / 2;
+                std::vector<float> inv_freq(half);
+                for (int64_t i = 0; i < half; ++i) {
+                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
+                }
+                for (int64_t pos = 0; pos < seq_len; ++pos) {
+                    for (int64_t i = 0; i < half; ++i) {
+                        const float ang = (n_frames - pos - 1) * inv_freq[i];
+                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
+                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
+                    }
+                }
+                set_input_f32("pos_emb", pos_emb);
+            } break;
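Editor's note: `clip_n_output_tokens` and the `pos_emb` fill above encode two related facts: the three stride-2 convolutions in the pre-encoder cut the frame count roughly by 8, and the relative-position table needs `2*T - 1` rows for offsets `T-1 … -(T-1)`. A NumPy sketch of the same arithmetic (d_model 512 matches the hardcoded value above; the kernel-3 assumption is mine, inferred from the stride-2/pad-1 calls):

```python
# Sketch: token count after the subsampling convs (each stage is ceil(n/2)
# for kernel 3, stride 2, padding 1), and the interleaved sin/cos
# relative-position table mirroring the C++ loop.
import numpy as np

def n_output_tokens(n_frames: int) -> int:
    for _ in range(3):                 # conv.0, conv.2, conv.5 each halve time
        n_frames = (n_frames + 1) // 2
    return n_frames

def rel_pos_table(n_tokens: int, d_model: int = 512) -> np.ndarray:
    seq_len = 2 * n_tokens - 1         # relative offsets T-1 ... -(T-1)
    half = d_model // 2
    inv_freq = np.exp(-(np.log(10000.0) / d_model) * 2.0 * np.arange(half))
    rel = (n_tokens - 1) - np.arange(seq_len)   # same angle as the C++ loop
    ang = rel[:, None] * inv_freq[None, :]
    out = np.empty((seq_len, d_model), dtype=np.float32)
    out[:, 0::2] = np.sin(ang)         # even slots
    out[:, 1::2] = np.cos(ang)         # odd slots
    return out

assert rel_pos_table(n_output_tokens(3000)).shape == (2 * 375 - 1, 512)
```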
        default:
            GGML_ABORT("Unknown projector type");
    }
@@ -5045,6 +5432,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
+        case PROJECTOR_TYPE_LFM2A:
+            return ctx->model.position_embeddings->ne[0];
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -5083,12 +5472,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }
 
-bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
-        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
-        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
-}
-
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index e8aeb2066c6..fdaa61e0cf7 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -113,4 +113,3 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
-bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 4d053895cda..ef265ddd3e5 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -15,59 +15,41 @@
 #define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
 
 namespace whisper_preprocessor {
-
-#define SIN_COS_N_COUNT WHISPER_N_FFT
 namespace {
-struct whisper_global_cache {
-    // In FFT, we frequently use sine and cosine operations with the same values.
-    // We can use precalculated values to speed up the process.
- float sin_vals[SIN_COS_N_COUNT]; - float cos_vals[SIN_COS_N_COUNT]; - - // Hann window (Use cosf to eliminate difference) - // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html - // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 - float hann_window[WHISPER_N_FFT]; - - whisper_global_cache() { - fill_sin_cos_table(); - fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window); - } - void fill_sin_cos_table() { - for (int i = 0; i < SIN_COS_N_COUNT; i++) { - double theta = (2 * M_PI * i) / SIN_COS_N_COUNT; - sin_vals[i] = sinf(theta); - cos_vals[i] = cosf(theta); - } +void fill_sin_cos_table(float *sin_vals, float *cos_vals, int n) { + for (int i = 0; i < n; i++) { + double theta = (2 * M_PI * i) / n; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); } +} - void fill_hann_window(int length, bool periodic, float * output) { - int offset = -1; - if (periodic) { - offset = 0; - } - for (int i = 0; i < length; i++) { - output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); - } +void fill_hann_window(int length, bool periodic, float *output) { + int offset = -1; + if (periodic) { + offset = 0; } -} global_cache; + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} } // naive Discrete Fourier Transform // input is real-valued // output is complex-valued -static void dft(const float* in, int N, float* out) { - const int sin_cos_step = SIN_COS_N_COUNT / N; +static void dft(const float* in, int N, const float *sin_vals, const float *cos_vals, int n_sin_cos_vals, float* out) { + const int sin_cos_step = n_sin_cos_vals / N; for (int k = 0; k < N; k++) { float re = 0; float im = 0; for (int n = 0; n < N; n++) { - int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N - re += in[n]*global_cache.cos_vals[idx]; // cos(t) - im -= in[n]*global_cache.sin_vals[idx]; // sin(t) + int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N + re += in[n]*cos_vals[idx]; // cos(t) + im -= in[n]*sin_vals[idx]; // sin(t) } out[k*2 + 0] = re; @@ -79,7 +61,7 @@ static void dft(const float* in, int N, float* out) { // poor man's implementation - use something better // input is real-valued // output is complex-valued -static void fft(float* in, int N, float* out) { +static void fft(float* in, int N, const float *sin_vals, const float *cos_vals, int n_sin_cos_vals, float* out) { if (N == 1) { out[0] = in[0]; out[1] = 0; @@ -88,7 +70,7 @@ static void fft(float* in, int N, float* out) { const int half_N = N / 2; if (N - half_N*2 == 1) { - dft(in, N, out); + dft(in, N, sin_vals, cos_vals, n_sin_cos_vals, out); return; } @@ -97,20 +79,20 @@ static void fft(float* in, int N, float* out) { even[i]= in[2*i]; } float* even_fft = out + 2 * N; - fft(even, half_N, even_fft); + fft(even, half_N, sin_vals, cos_vals, n_sin_cos_vals, even_fft); float* odd = even; for (int i = 0; i < half_N; ++i) { odd[i] = in[2*i + 1]; } float* odd_fft = even_fft + N; - fft(odd, half_N, odd_fft); + fft(odd, half_N, sin_vals, cos_vals, n_sin_cos_vals, odd_fft); - const int sin_cos_step = SIN_COS_N_COUNT / N; + const int sin_cos_step = n_sin_cos_vals / N; for (int k = 0; k < half_N; k++) { int idx = k * sin_cos_step; // t = 2*M_PI*k/N - float re = global_cache.cos_vals[idx]; // cos(t) - float im = -global_cache.sin_vals[idx]; // sin(t) + float re = cos_vals[idx]; // cos(t) + float im = -sin_vals[idx]; // sin(t) float re_odd = odd_fft[2*k + 0]; float im_odd = odd_fft[2*k + 
1];
@@ -125,15 +107,16 @@
 static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
                                               int n_samples, int frame_size, int frame_step, int n_threads,
-                                              const whisper_filters & filters, whisper_mel & mel) {
+                                              const whisper_filter_params & filters, whisper_mel & mel) {
     std::vector<float> fft_in(frame_size * 2, 0.0);
     std::vector<float> fft_out(frame_size * 2 * 2 * 2);
 
-    int n_fft = filters.n_fft;
+    int n_fft_bins = filters.n_fft_bins;
     int i = ith;
 
     // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
-    WHISPER_ASSERT(n_fft == 1 + (frame_size / 2));
+    WHISPER_ASSERT(n_fft_bins == 1 + (frame_size / 2));
+    WHISPER_ASSERT(filters.sin_vals.size() == filters.cos_vals.size());
 
     // calculate FFT only when fft_in are not all zero
     for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
@@ -150,11 +133,11 @@ static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const
         }
 
         // FFT
-        fft(fft_in.data(), frame_size, fft_out.data());
+        fft(fft_in.data(), frame_size, filters.sin_vals.data(), filters.cos_vals.data(), filters.sin_vals.size(), fft_out.data());
 
         // Calculate modulus^2 of complex numbers
         // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
-        for (int j = 0; j < n_fft; j++) {
+        for (int j = 0; j < n_fft_bins; j++) {
             fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
         }
@@ -163,24 +146,24 @@ static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const
             double sum = 0.0;
 
             // unroll loop (suggested by GH user @lunixbochs)
             int k = 0;
-            for (k = 0; k < n_fft - 3; k += 4) {
+            for (k = 0; k < n_fft_bins - 3; k += 4) {
                 sum +=
-                        fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
-                        fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
-                        fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
-                        fft_out[k + 3] * filters.data[j * n_fft + k + 3];
+                        fft_out[k + 0] * filters.mel_filters[j * n_fft_bins + k + 0] +
+                        fft_out[k + 1] * filters.mel_filters[j * n_fft_bins + k + 1] +
+                        fft_out[k + 2] * filters.mel_filters[j * n_fft_bins + k + 2] +
+                        fft_out[k + 3] * filters.mel_filters[j * n_fft_bins + k + 3];
             }
 
             // handle n_fft remainder
-            for (; k < n_fft; k++) {
-                sum += fft_out[k] * filters.data[j * n_fft + k];
+            for (; k < n_fft_bins; k++) {
+                sum += fft_out[k] * filters.mel_filters[j * n_fft_bins + k];
             }
 
-            sum = log10(std::max(sum, 1e-10));
+            sum = filters.use_natural_log ? log(sum + 5.960464477539063e-08) : log10(std::max(sum, 1e-10));
 
             mel.data[j * mel.n_len + i] = sum;
         }
     }
 
     // Otherwise fft_out are all zero
-    double sum = log10(1e-10);
+    double sum = filters.use_natural_log ? log(5.960464477539063e-08) : log10(1e-10);
     for (; i < mel.n_len; i += n_threads) {
         for (int j = 0; j < mel.n_mel; j++) {
             mel.data[j * mel.n_len + i] = sum;
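Editor's note: one iteration of the worker above reduces to window → power spectrum → mel projection → log, where the natural-log branch uses the 2^-24 epsilon and the base-10 branch keeps whisper's 1e-10 floor. A NumPy sketch for cross-checking (`np.fft.rfft` stands in for the custom FFT; argument names are illustrative):

```python
# Sketch: one frame of the log-mel pipeline above. mel_filters is the
# (n_mel, n_fft/2 + 1) matrix built by gen_mel_filterbank_matrix below.
import numpy as np

def mel_frame(frame, hann, mel_filters, use_natural_log):
    spec = np.fft.rfft(frame * hann)            # n_fft/2 + 1 complex bins
    power = spec.real**2 + spec.imag**2         # modulus^2, as in the loop
    mel = mel_filters @ power
    if use_natural_log:
        return np.log(mel + 2.0**-24)           # eps = 5.960464477539063e-08
    return np.log10(np.maximum(mel, 1e-10))
```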
@@ -191,42 +174,66 @@ static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const
 // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
 static bool log_mel_spectrogram(
           const float * samples,
-          const int   n_samples,
-          const int   /*sample_rate*/,
-          const int   frame_size,
-          const int   frame_step,
-          const int   n_mel,
+          const int   n_samples_in,
           const int   n_threads,
-          const whisper_filters & filters,
+          const whisper_filter_params & filters,
           const bool   debug,
           whisper_mel & mel) {
     //const int64_t t_start_us = ggml_time_us();
 
+    mel.n_len_org = n_samples_in;
+    int n_samples = n_samples_in;
+
     // Hann window
-    WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
-    const float * hann = global_cache.hann_window;
+    const float * hann = filters.hann_window.data();
 
-    // Calculate the length of padding
-    int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
-    int64_t stage_2_pad = frame_size / 2;
+    const int frame_size = (filters.n_fft_bins - 1) * 2;
+    const int frame_step = filters.hop_length;
 
-    // Initialize a vector and copy data from C array to it.
+    // Padding
     std::vector<float> samples_padded;
-    samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
-    std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
+    if (filters.center_padding) {
+        const auto pad_amount = frame_size / 2;
+        samples_padded = std::vector<float>(n_samples + 2 * pad_amount, 0);
+        std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount);
+        samples = samples_padded.data();
+        n_samples = samples_padded.size();
+    } else {
+        // existing padding logic
+        int64_t stage_1_pad = filters.sample_rate * 30;
+        int64_t stage_2_pad = frame_size / 2;
+        samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
+        std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
+        // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
+        std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
+        // reflective pad 200 samples at the beginning of audio
+        std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
+    }
 
-    // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
-    std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
+    // preemphasis
+    if (filters.preemph != 0.0f) {
+        const int pad_amount = frame_size / 2;
+        const float preemph = filters.preemph;
+        float prev = samples_padded[pad_amount];
+        for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
+            float cur = samples_padded[i];
+            samples_padded[i] = cur - preemph * prev;
+            prev = cur;
+        }
+    }
+
+    // pad hann window if it's smaller than frame_size
+    std::vector<float> hann_window_padded;
+    if (filters.hann_window_size < frame_size) {
+        hann_window_padded.resize(frame_size);
+        const int padding = (frame_size - filters.hann_window_size) / 2;
+        std::copy(hann, hann + filters.hann_window_size, &hann_window_padded[padding]);
+        hann = hann_window_padded.data();
+    }
 
-    // reflective pad 200 samples at the beginning of audio
-
-    std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
-
-    mel.n_mel     = n_mel;
-    // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
-    // Calculate number of frames + remove the last frame
-    mel.n_len     = (samples_padded.size() - frame_size) / frame_step;
-    // Calculate semi-padded sample length to ensure compatibility
-    mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
+    mel.n_mel = filters.n_mel;
+    mel.n_len = (n_samples - frame_size) / frame_step + 1;
     mel.data.resize(mel.n_mel * mel.n_len);
 
     {
@@ -234,34 +241,62 @@ static bool log_mel_spectrogram(
         for (int iw = 0; iw < n_threads - 1; ++iw) {
             workers[iw] = std::thread(
                     log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
-                    n_samples + stage_2_pad, frame_size, frame_step, n_threads,
+                    n_samples, frame_size, frame_step, n_threads,
                     std::cref(filters), std::ref(mel));
         }
 
         // main thread
-        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
+        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, filters, mel);
 
         for (int iw = 0; iw < n_threads - 1; ++iw) {
             workers[iw].join();
        }
    }
 
-    // clamping and normalization
-    double mmax = -1e20;
-    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-        if (mel.data[i] > mmax) {
-            mmax = mel.data[i];
-        }
-    }
+    const int effective_n_len = n_samples_in / frame_step;
+    if (filters.normalize_per_feature) {
+        for (int i = 0; i < mel.n_mel; i++) {
+            double mean = 0;
+            for (int j = 0; j < effective_n_len; ++j) {
+                mean += mel.data[i * mel.n_len + j];
+            }
+            mean /= effective_n_len;
 
-    mmax -= 8.0;
+            double var = 0.0;
+            for (int j = 0; j < effective_n_len; ++j) {
+                const double value = mel.data[i * mel.n_len + j] - mean;
+                var += value * value;
+            }
+            var /= effective_n_len - 1; // unbiased
+            const double mstd = std::sqrt(var + 1e-5);
 
-    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-        if (mel.data[i] < mmax) {
-            mel.data[i] = mmax;
+            for (int j = 0; j < effective_n_len; ++j) {
+                auto &value = mel.data[i * mel.n_len + j];
+                value = (value - mean) / mstd;
+            }
+
+            // pad the rest with zeros
+            for (int j = effective_n_len; j < mel.n_len; ++j) {
+                mel.data[i * mel.n_len + j] = 0.0;
+            }
         }
+    } else {
+        // clamping and normalization
+        double mmax = -1e20;
+        for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
+            if (mel.data[i] > mmax) {
+                mmax = mel.data[i];
+            }
+        }
 
-        mel.data[i] = (mel.data[i] + 4.0)/4.0;
+        mmax -= 8.0;
+
+        for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
+            if (mel.data[i] < mmax) {
+                mel.data[i] = mmax;
+            }
+            mel.data[i] = (mel.data[i] + 4.0)/4.0;
+        }
     }
 
     // Dump log_mel_spectrogram
@@ -281,7 +316,7 @@ static bool log_mel_spectrogram(
 bool preprocess_audio(
     const float * samples,
     size_t n_samples,
-    const whisper_filters & filters,
+    const whisper_filter_params & filters,
     std::vector<whisper_mel> & output) {
 
     if (n_samples == 0) {
@@ -293,10 +328,6 @@ bool preprocess_audio(
     bool ok = log_mel_spectrogram(
                 samples,
                 n_samples,
-                COMMON_SAMPLE_RATE,
-                WHISPER_N_FFT,
-                WHISPER_HOP_LENGTH,
-                filters.n_mel,
                 4, // n_threads
                 filters,
                 false, // debug
@@ -305,6 +336,11 @@ bool preprocess_audio(
         return false;
     }
 
+    if (!filters.need_chunking) {
+        output.push_back(std::move(out_full));
+        return true;
+    }
+
     // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
     // we always expect the mel to have 3000 silent frames at the end
     // printf("n_len %d\n", out_full.n_len);
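Editor's note: the `normalize_per_feature` branch above standardizes each mel band over only the frames that cover real samples, with unbiased variance and a 1e-5 epsilon; the padded tail is zeroed afterwards. Equivalent NumPy, for cross-checking:

```python
# Sketch: per-feature normalization matching the loop above.
# mel: (n_mel, n_len); n_valid == effective_n_len == n_samples_in / frame_step.
import numpy as np

def normalize_per_feature(mel: np.ndarray, n_valid: int) -> np.ndarray:
    seg = mel[:, :n_valid]
    mean = seg.mean(axis=1, keepdims=True)
    mstd = np.sqrt(seg.var(axis=1, ddof=1, keepdims=True) + 1e-5)  # unbiased
    out = np.zeros_like(mel)                    # tail stays zero-padded
    out[:, :n_valid] = (seg - mean) / mstd
    return out
```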
+371,113 @@ bool preprocess_audio( } // namespace whisper_preprocessor +namespace whisper_precalc_filters { +namespace { +// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. +// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. +std::vector gen_mel_filterbank_matrix( + int n_mel, + int n_fft, + int sample_rate, // e.g. 16000 + float fmin = 0.0f, // e.g. 0.0 + float fmax = -1.0f, // e.g. sr/2; pass -1 for auto + bool slaney_area_norm = true, + float scale = 1.0f // optional extra scaling; use 1.0f/1000.0f to mimic your code +) { + GGML_ASSERT(n_mel > 0 && n_fft > 1); + if (fmax <= 0.0f) { + fmax = 0.5f * sample_rate; + } -// precalculated mel filter banks -// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function -// -// generated from python code: -// -// from numpy import load -// data = load('mel_filters.npz') -// lst = data.files -// for item in lst: -// print(item) -// print(data[item].shape) -// n_mel = data[item].shape[0] -// n_fft = data[item].shape[1] -// for i, row in enumerate(data[item]): -// for j, val in enumerate(row): -// val = val * 1000.0 -// if val != 0: -// print(f"data[{i*n_fft + j}] = {val:.6f};") + // Slaney scale (matches librosa default) + const double min_log_hz = 1000.0; + const double lin_slope = 3 / 200.; + const double min_log_mel = min_log_hz * lin_slope; + const double log_step = log(6.4) / 27.0; + auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double { + return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step; + }; + auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double { + return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step); + }; + + // infer N_fft from n_fft_bins + const double bin_hz_step = double(sample_rate) / double(n_fft); + + // mel grid: n_mel + 2 edges + const double m_lo = hz_to_mel(fmin); + const double m_hi = hz_to_mel(fmax); + std::vector mel_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1)); + } -namespace whisper_precalc_filters { + // convert to Hz + std::vector hz_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + hz_pts[i] = mel_to_hz(mel_pts[i]); + } -whisper_preprocessor::whisper_filters get_128_bins() { - whisper_preprocessor::whisper_filters filters; - filters.n_mel = 128; - filters.n_fft = 201; - std::vector data(filters.n_mel * filters.n_fft, 0.0f); - - data[1] = 12.37398665; - data[202] = 30.39256483; - data[404] = 24.74797331; - data[605] = 18.01857911; - data[807] = 37.12195903; - data[1008] = 5.64459199; - data[1009] = 6.72939420; - data[1210] = 36.03715822; - data[1412] = 19.10337992; - data[1613] = 23.66316877; - data[1815] = 31.47736564; - data[2016] = 11.28918398; - data[2017] = 1.08480197; - data[2218] = 41.68175161; - data[2420] = 13.45878839; - data[2621] = 29.30776216; - data[2823] = 25.83277412; - data[3024] = 16.93377644; - data[3226] = 38.20675984; - data[3427] = 4.55979025; - data[3428] = 7.81419594; - data[3629] = 34.95235741; - data[3831] = 20.18818259; - data[4032] = 22.57836796; - data[4234] = 32.56217018; - data[4435] = 10.20438317; - data[4436] = 2.16960395; - data[4637] = 40.59694707; - data[4839] = 14.54358920; - data[5040] = 28.22295949; - data[5242] = 26.91757679; - data[5443] = 15.84897563; - data[5645] = 39.29156065; - data[5846] = 3.47498828; - data[5847] = 8.89899861; - data[6048] = 
33.86755288;
-    [... several hundred further precomputed mel filterbank coefficients, data[6250] through data[25726], deleted by this patch ...]
-
-    for (auto & val : data) {
-        val /= 1000.0f;
+    const int n_fft_bins = n_fft / 2 + 1;
+
+    // filterbank
+    std::vector<float> out(n_mel * n_fft_bins, 0);
+    for (int m = 0; m < n_mel; ++m) {
+        const double f_left   = hz_pts[m];
+        const double f_center = hz_pts[m + 1];
+        const double f_right  = hz_pts[m + 2];
+
+        const double denom_l = std::max(1e-30, f_center - f_left);
+        const double denom_r = std::max(1e-30, f_right - f_center);
+        const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
+
+        for (int k = 0; k < n_fft_bins; ++k) {
+            const double f = k * bin_hz_step;
+            double w = 0.0;
+            if (f >= f_left && f <= f_center) {
+                w = (f - f_left) / denom_l;
+            } else if (f > f_center && f <= f_right) {
+                w = (f_right - f) / denom_r;
+            }
+            out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+        }
     }
-    filters.data = std::move(data);
+    return out;
+}
+}
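Note: the triangular weights above are built from hz_pts, the n_mel + 2 mel-spaced breakpoints (in Hz) that the generator works from. For reference, a minimal sketch of the conventional HTK-style spacing such breakpoints usually follow (illustrative only; hz_to_mel, mel_to_hz and make_hz_pts are hypothetical names, and the converter's actual mel variant may differ):

    #include <cmath>
    #include <vector>

    static double hz_to_mel(double hz)  { return 2595.0 * std::log10(1.0 + hz / 700.0); }
    static double mel_to_hz(double mel) { return 700.0 * (std::pow(10.0, mel / 2595.0) - 1.0); }

    // n_mel + 2 breakpoints, evenly spaced on the mel scale from 0 Hz to Nyquist
    static std::vector<double> make_hz_pts(int n_mel, int sample_rate) {
        std::vector<double> pts(n_mel + 2);
        const double mel_max = hz_to_mel(sample_rate / 2.0);
        for (int i = 0; i < n_mel + 2; ++i) {
            pts[i] = mel_to_hz(mel_max * i / (n_mel + 1));
        }
        return pts;
    }

Each filter m then rises linearly from pts[m] to pts[m + 1] and falls back to zero at pts[m + 2], which is exactly the shape computed by the two branches of the inner loop above.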
+whisper_preprocessor::whisper_filter_params get_whisper_params(
+        int32_t n_mel,
+        int32_t n_fft,
+        int32_t window_size,
+        int32_t hop_length,
+        int32_t sample_rate) {
+    whisper_preprocessor::whisper_filter_params filters;
+
+    filters.n_mel       = n_mel;
+    filters.n_fft_bins  = n_fft / 2 + 1;
+    filters.mel_filters = gen_mel_filterbank_matrix(n_mel, n_fft, sample_rate);
+    WHISPER_ASSERT(static_cast<int32_t>(filters.mel_filters.size()) == n_mel * filters.n_fft_bins);
+
+    filters.hann_window_size = window_size;
+    filters.hop_length       = hop_length;
+    filters.hann_window.resize(window_size);
+    whisper_preprocessor::fill_hann_window(window_size, true, filters.hann_window.data());
+
+    filters.sin_vals.resize(n_fft);
+    filters.cos_vals.resize(n_fft);
+    whisper_preprocessor::fill_sin_cos_table(filters.sin_vals.data(), filters.cos_vals.data(), n_fft);
+
+    filters.sample_rate = sample_rate;
+
+#if WHISPER_DEBUG
+    for (size_t i = 0; i < filters.mel_filters.size(); ++i) {
+        if (filters.mel_filters[i] != 0) {
+            printf("filters[%zu] = %f\n", i, filters.mel_filters[i] * 1000);
+        }
+    }
+#endif

     return filters;
 }
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
index b7b940affb5..9ec75a76cd6 100644
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -4,17 +4,9 @@

 #include <cstdint>
 #include <vector>
-#include <string>

 #define WHISPER_ASSERT GGML_ASSERT

-#define WHISPER_SAMPLE_RATE 16000
-#define WHISPER_N_FFT       400
-#define WHISPER_HOP_LENGTH  160
-#define WHISPER_CHUNK_SIZE  30
-
-#define COMMON_SAMPLE_RATE 16000
-
 namespace whisper_preprocessor {

 struct whisper_mel {
@@ -25,23 +17,39 @@ struct whisper_mel {
     std::vector<float> data;
 };

-struct whisper_filters {
+struct whisper_filter_params {
     int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
+    int32_t n_fft_bins;
+    int32_t hann_window_size;
+    int32_t hop_length;
+    int32_t sample_rate;
+    bool  center_padding        = false;
+    float preemph               = 0.f;
+    bool  use_natural_log       = false;
+    bool  normalize_per_feature = false;
+    bool  need_chunking         = true;
+
+    std::vector<float> mel_filters;
+    std::vector<float> hann_window;
+    std::vector<float> sin_vals;
+    std::vector<float> cos_vals;
 };

 bool preprocess_audio(
         const float * samples,
         size_t n_samples,
-        const whisper_filters & filters,
+        const whisper_filter_params & filters,
         std::vector<whisper_mel> & output);

 } // namespace whisper_preprocessor

 namespace whisper_precalc_filters {

-whisper_preprocessor::whisper_filters get_128_bins();
+whisper_preprocessor::whisper_filter_params get_whisper_params(
+        int32_t n_mel,
+        int32_t n_fft,
+        int32_t window_size,
+        int32_t hop_length,
+        int32_t sample_rate);

 } // namespace whisper_precalc_filters
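With the new header, callers build the preprocessing configuration instead of loading a baked-in 128-bin table. A sketch of a hypothetical caller, assuming the reconstructed signatures above and Whisper's usual configuration (128 mel bins, 400-point FFT and window, 160-sample hop, 16 kHz mono input):

    std::vector<float> samples; // fill with 16 kHz mono float PCM

    whisper_preprocessor::whisper_filter_params params =
        whisper_precalc_filters::get_whisper_params(128, 400, 400, 160, 16000);

    std::vector<whisper_preprocessor::whisper_mel> chunks;
    whisper_preprocessor::preprocess_audio(samples.data(), samples.size(), params, chunks);

The defaults in whisper_filter_params (no pre-emphasis, base-10 log mel energies, chunking enabled) are meant to reproduce the old precalculated-table behaviour; models that need something else override individual fields after the call.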
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index b5bbc6536b5..9c373a57011 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -210,7 +210,7 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
     return 0;
 }

-static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
+static std::string chat_add_and_format(mtmd_cli_context & ctx, const common_chat_msg & new_msg) {
     LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
             new_msg.role.c_str(), new_msg.content.c_str());
     auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
@@ -310,6 +310,15 @@ int main(int argc, char ** argv) {

     if (g_is_interrupted) return 130;

+    if (!params.system_prompt.empty()) {
+        common_chat_msg msg;
+        msg.role    = "system";
+        msg.content = params.system_prompt;
+        if (eval_message(ctx, msg)) {
+            return 1;
+        }
+    }
+
     if (is_single_turn) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index d06fa42e616..af9c705a639 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -152,7 +152,7 @@ struct mtmd_context {
     std::string sli_img_start_tmpl;

     // for whisper, we pre-calculate the mel filter bank
-    whisper_preprocessor::whisper_filters w_filters;
+    whisper_preprocessor::whisper_filter_params w_filters;

     // TODO @ngxson : add timings

@@ -317,24 +317,44 @@ struct mtmd_context {
         GGML_ASSERT(ctx_a != nullptr);
         projector_type proj = clip_get_projector_type(ctx_a);

-        if (clip_has_whisper_encoder(ctx_a)) {
-            // TODO @ngxson : check if model n_mel is 128 or 80
-            w_filters = whisper_precalc_filters::get_128_bins();
-        }
+        using whisper_precalc_filters::get_whisper_params;
+
+        switch (proj) {
+            case PROJECTOR_TYPE_ULTRAVOX:
+                {
+                    // [BEGIN_AUDIO] ... (embeddings) ...
+                    aud_beg = "[BEGIN_AUDIO]";
+                    // TODO @ngxson : check if model n_mel is 128 or 80
+                    w_filters = get_whisper_params(128, 400, 400, 160, mtmd_get_audio_bitrate(this));
+                } break;
+            case PROJECTOR_TYPE_QWEN2A:
+                {
+                    // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
+                    aud_beg = "<|audio_bos|>";
+                    aud_end = "<|audio_eos|>";
+                    w_filters = get_whisper_params(128, 400, 400, 160, mtmd_get_audio_bitrate(this));
+                } break;
+            case PROJECTOR_TYPE_VOXTRAL:
+                {
+                    w_filters = get_whisper_params(128, 400, 400, 160, mtmd_get_audio_bitrate(this));
+                } break;
+            case PROJECTOR_TYPE_LFM2A:
+                {
+                    w_filters = get_whisper_params(128, 512, 400, 160, mtmd_get_audio_bitrate(this));
+                    w_filters.preemph               = 0.97f;
+                    w_filters.use_natural_log       = true;
+                    w_filters.center_padding        = true;
+                    w_filters.normalize_per_feature = true;
+                    w_filters.need_chunking         = false;
+                } break;
+            default:
+                {
+                    // nothing to do
+                } break;
+        }

         LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
                 "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
-
-        if (proj == PROJECTOR_TYPE_QWEN2A) {
-            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
-            aud_beg = "<|audio_bos|>";
-            aud_end = "<|audio_eos|>";
-
-        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
-            // [BEGIN_AUDIO] ... (embeddings) ...
-            aud_beg = "[BEGIN_AUDIO]";
-
-        }
     }

     // get clip ctx based on chunk type
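Among the projector types above, only LFM2A departs from the Whisper-style defaults: a 512-point FFT over a 400-sample window, pre-emphasis, natural-log mel energies, per-feature normalization, and no 30-second chunking. As a reference for what the preemph field conventionally denotes, a first-order high-pass applied before the STFT (a sketch under that assumption; apply_preemphasis is a hypothetical name, and the actual filtering lives in mtmd-audio.cpp):

    #include <cstddef>
    #include <vector>

    // y[n] = x[n] - coeff * x[n-1], applied in place; iterate backwards so
    // x[n-1] is still the unfiltered sample when it is read
    static void apply_preemphasis(std::vector<float> & x, float coeff = 0.97f) {
        for (size_t n = x.size(); n-- > 1; ) {
            x[n] -= coeff * x[n - 1];
        }
    }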
From 6d54ddc6fd18955143a113463685d5dc3b79613f Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Tue, 2 Dec 2025 15:16:17 +0100
Subject: [PATCH 4/4] Refactor and enable cuda convs

---
 ggml/src/ggml-cuda/ssm-conv.cu | 34 ++++++-------
 tests/test-backend-ops.cpp     |  8 +--
 tools/mtmd/clip.cpp            | 89 ++++++++++++++--------------------
 tools/mtmd/clip.h              |  2 +-
 4 files changed, 56 insertions(+), 77 deletions(-)

diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
index 41979733601..6d5ea704c65 100644
--- a/ggml/src/ggml-cuda/ssm-conv.cu
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
@@ -102,31 +102,25 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
     const int threads = 128;
     GGML_ASSERT(nr % threads == 0);

-    if (n_t <= 32) {
-        const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
-        if (nc == 4) {
-            ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
-                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
-            ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+    auto launch_kernel = [&](auto NC) {
+        constexpr int kNC = decltype(NC)::value;
+        if (n_t <= 32) {
+            const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
+            ssm_conv_f32<threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+                                                                       dst, dst_nb0, dst_nb1, dst_nb2, n_t);
         } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
-        }
-    } else {
-        if (nc == 4) {
-            const int64_t split_n_t = 32;
-            dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
-            ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
-                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
             const int64_t split_n_t = 32;
             dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
-            ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
+            ssm_conv_long_token_f32<threads, kNC, split_n_t><<<blocks, threads, 0, stream>>>(
                 src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
         }
+    };
+
+    switch (nc) {
+        case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
+        case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
+        case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
+        default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now.");
     }
 }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9645d0b3909..81fc85a1509 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7179,11 +7179,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {

     test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));

-    for (int64_t d_conv : {3, 4}) {
+    for (int64_t d_conv : {3, 4, 9}) {
         for (int64_t d_inner: {1024, 1536, 2048}) {
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 4, 1}, {d_conv, d_inner, 1, 1}));
+            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
+            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {2 * d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
+            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 4, 1}, {d_conv, d_inner, 1, 1}));
         }
     }
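The refactor above uses a generic lambda plus std::integral_constant to turn the runtime kernel width nc into a compile-time template parameter, so each supported width gets its own specialized kernel without duplicating the launch logic. The idiom in isolation (a minimal host-side C++ sketch, not the actual CUDA code):

    #include <cstdio>
    #include <type_traits>

    template <int K>
    static void process(int n) {
        // K is a compile-time constant here: loops over K can be fully unrolled
        std::printf("K = %d, n = %d\n", K, n);
    }

    static void dispatch(int k, int n) {
        auto launch = [&](auto K) { process<decltype(K)::value>(n); };
        switch (k) {
            case 3: launch(std::integral_constant<int, 3>{}); break;
            case 4: launch(std::integral_constant<int, 4>{}); break;
            case 9: launch(std::integral_constant<int, 9>{}); break;
            default: std::printf("unsupported width %d\n", k);
        }
    }

The newly added case 9 presumably matches the depthwise convolution width used by the LFM2 audio conformer, which is why the test matrix in test-backend-ops.cpp now also covers d_conv = 9.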
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 30cedab1457..857d534fe23 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -281,8 +281,9 @@ struct clip_layer {
     ggml_tensor * norm_conv_w = nullptr;
     ggml_tensor * norm_conv_b = nullptr;
     ggml_tensor * linear_pos_w = nullptr;
-    ggml_tensor * conv_bn_w = nullptr;
-    ggml_tensor * conv_bn_b = nullptr;
+
+    ggml_tensor * conv_norm_w = nullptr;
+    ggml_tensor * conv_norm_b = nullptr;
     ggml_tensor * conv_dw_w = nullptr;
     ggml_tensor * conv_dw_b = nullptr;
     ggml_tensor * conv_pw1_w = nullptr;
@@ -423,16 +424,8 @@ struct clip_model {
     ggml_tensor * mm_eoi = nullptr;

     // lfm2
-    ggml_tensor * pre_encode_conv_0_w = nullptr;
-    ggml_tensor * pre_encode_conv_0_b = nullptr;
-    ggml_tensor * pre_encode_conv_2_w = nullptr;
-    ggml_tensor * pre_encode_conv_2_b = nullptr;
-    ggml_tensor * pre_encode_conv_3_w = nullptr;
-    ggml_tensor * pre_encode_conv_3_b = nullptr;
-    ggml_tensor * pre_encode_conv_5_w = nullptr;
-    ggml_tensor * pre_encode_conv_5_b = nullptr;
-    ggml_tensor * pre_encode_conv_6_w = nullptr;
-    ggml_tensor * pre_encode_conv_6_b = nullptr;
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {};
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {};
     ggml_tensor * pre_encode_out_w = nullptr;
     ggml_tensor * pre_encode_out_b = nullptr;
@@ -2039,7 +2032,7 @@ struct clip_graph {
     ggml_cgraph * build_lfm2_audio() {
         const int n_frames = img.nx;
         const int n_pos = n_frames / 2;
-        const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
+        const int n_pos_embd = clip_n_output_tokens(ctx, &img) * 2 - 1;

         GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

         ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
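The rewritten n_pos_embd line delegates the subsampled length to clip_n_output_tokens and keeps only the relative-position bookkeeping: for N output tokens, attention with relative positions needs one embedding per pairwise distance in [-(N-1), N-1], i.e. 2N-1 vectors. For example, 10 s of 16 kHz audio at a 10 ms hop gives:

    // n_frames   = 1000
    // n_tokens   = ((((1000 + 1) / 2) + 1) / 2 + 1) / 2 = 125  (three stride-2 stages below)
    // n_pos_embd = 2 * 125 - 1                          = 249  (distances -124 .. +124)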
@@ -2055,34 +2048,34 @@ struct clip_graph {
         // pre encode, conv subsampling
         {
             // layer.0 - conv2d
-            cur = ggml_conv_2d(ctx0, model.pre_encode_conv_0_w, cur, 2, 2, 1, 1, 1, 1);
-            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_0_b, 1, 1, cur->ne[2], 1));
+            cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
+            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[0], 1, 1, cur->ne[2], 1));
             cb(cur, "conformer.pre_encode.conv.{}", 0);

             // layer.1 - relu
             cur = ggml_relu_inplace(ctx0, cur);

             // layer.2 conv2d dw
-            cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_2_w, cur, 2, 2, 1, 1, 1, 1);
-            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_2_b, 1, 1, cur->ne[2], 1));
+            cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
+            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[2], 1, 1, cur->ne[2], 1));
             cb(cur, "conformer.pre_encode.conv.{}", 2);

             // layer.3 conv2d
-            cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_3_w, cur, 1, 1, 0, 0, 1, 1);
-            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_3_b, 1, 1, cur->ne[2], 1));
+            cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
+            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[3], 1, 1, cur->ne[2], 1));
             cb(cur, "conformer.pre_encode.conv.{}", 3);

             // layer.4 - relu
             cur = ggml_relu_inplace(ctx0, cur);

             // layer.5 conv2d dw
-            cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_5_w, cur, 2, 2, 1, 1, 1, 1);
-            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_5_b, 1, 1, cur->ne[2], 1));
+            cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
+            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[5], 1, 1, cur->ne[2], 1));
             cb(cur, "conformer.pre_encode.conv.{}", 5);

             // layer.6 conv2d
-            cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_6_w, cur, 1, 1, 0, 0, 1, 1);
-            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_6_b, 1, 1, cur->ne[2], 1));
+            cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
+            cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[6], 1, 1, cur->ne[2], 1));
             cb(cur, "conformer.pre_encode.conv.{}", 6);

             // layer.7 - relu
@@ -2102,7 +2095,7 @@ struct clip_graph {
         cb(pos_emb, "pos_emb", -1);

         for (int il = 0; il < hparams.n_layer; il++) {
-            auto & layer = model.layers[il];
+            const auto & layer = model.layers[il];

             auto * residual = cur;

@@ -2116,14 +2109,12 @@ struct clip_graph {
                     layer.ff_up_w, layer.ff_up_b,
                     nullptr, nullptr,
                     layer.ff_down_w, layer.ff_down_b,
-                    FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+                    FFN_SILU, il);
             cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);

-            const auto fc_factor = 0.5f; // TODO(tarek): read from config
+            const auto fc_factor = 0.5f;
             residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
-
-
             // self-attention
             {
                 cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
@@ -2223,8 +2214,7 @@ struct clip_graph {

             x = ggml_cont(ctx0, ggml_transpose(ctx0, x));

-            // torch.funtional.glu
-            // TODO(tarek): chekc if llama.cpp impl exists
+            // TODO: add support for torch.functional.glu
             {
                 int64_t d = x->ne[0] / 2;
                 ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
@@ -2244,10 +2234,9 @@ struct clip_graph {

             cb(x, "conformer.layers.{}.conv.depthwise_conv", il);

-            // TODO(tarek): fold into another op
             {
                 x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-                x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_bn_w), layer.conv_bn_b);
+                x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
                 x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
                 cb(x, "conformer.layers.{}.conv.batch_norm", il);
             }
@@ -3561,16 +3550,10 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_LFM2A:
                 {
-                    model.pre_encode_conv_0_w = get_tensor(string_format(TN_CONV1D, 0, "weight"));
-                    model.pre_encode_conv_0_b = get_tensor(string_format(TN_CONV1D, 0, "bias"));
-                    model.pre_encode_conv_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
-                    model.pre_encode_conv_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
-                    model.pre_encode_conv_3_w = get_tensor(string_format(TN_CONV1D, 3, "weight"));
-                    model.pre_encode_conv_3_b = get_tensor(string_format(TN_CONV1D, 3, "bias"));
-                    model.pre_encode_conv_5_w = get_tensor(string_format(TN_CONV1D, 5, "weight"));
-                    model.pre_encode_conv_5_b = get_tensor(string_format(TN_CONV1D, 5, "bias"));
-                    model.pre_encode_conv_6_w = get_tensor(string_format(TN_CONV1D, 6, "weight"));
-                    model.pre_encode_conv_6_b = get_tensor(string_format(TN_CONV1D, 6, "bias"));
+                    for (int i : {0, 2, 3, 5, 6}) {
+                        model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
+                        model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
+                    }

                     model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
                     model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
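The block tagged with the glu TODO in build_lfm2_audio above reproduces torch.nn.functional.glu by hand: split the channel dimension in half, pass the second half through a sigmoid, and multiply it into the first half. The same computation on a plain buffer (a standalone sketch; the graph code does this with ggml_view_2d over the two halves):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // glu over the first dimension: out[i] = x[i] * sigmoid(x[i + d]), d = size / 2
    static std::vector<float> glu(const std::vector<float> & x) {
        const size_t d = x.size() / 2;
        std::vector<float> out(d);
        for (size_t i = 0; i < d; ++i) {
            out[i] = x[i] * (1.0f / (1.0f + std::exp(-x[i + d])));
        }
        return out;
    }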
@@ -3601,14 +3584,14 @@ struct clip_model_loader {

                     layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));

-                    layer.conv_bn_w = get_tensor(string_format("convnext.%d.norm.%s", il, "weight"));
-                    layer.conv_bn_b = get_tensor(string_format("convnext.%d.norm.%s", il, "bias"));
-                    layer.conv_dw_w = get_tensor(string_format("convnext.%d.dw.%s", il, "weight"));
-                    layer.conv_dw_b = get_tensor(string_format("convnext.%d.dw.%s", il, "bias"));
-                    layer.conv_pw1_w = get_tensor(string_format("convnext.%d.pw1.%s", il, "weight"));
-                    layer.conv_pw1_b = get_tensor(string_format("convnext.%d.pw1.%s", il, "bias"));
-                    layer.conv_pw2_w = get_tensor(string_format("convnext.%d.pw2.%s", il, "weight"));
-                    layer.conv_pw2_b = get_tensor(string_format("convnext.%d.pw2.%s", il, "bias"));
+                    layer.conv_norm_w = get_tensor(string_format("convnext.%d.norm.%s", il, "weight"));
+                    layer.conv_norm_b = get_tensor(string_format("convnext.%d.norm.%s", il, "bias"));
+                    layer.conv_dw_w   = get_tensor(string_format("convnext.%d.dw.%s", il, "weight"));
+                    layer.conv_dw_b   = get_tensor(string_format("convnext.%d.dw.%s", il, "bias"));
+                    layer.conv_pw1_w  = get_tensor(string_format("convnext.%d.pw1.%s", il, "weight"));
+                    layer.conv_pw1_b  = get_tensor(string_format("convnext.%d.pw1.%s", il, "bias"));
+                    layer.conv_pw2_w  = get_tensor(string_format("convnext.%d.pw2.%s", il, "weight"));
+                    layer.conv_pw2_b  = get_tensor(string_format("convnext.%d.pw2.%s", il, "bias"));
                 }
             } break;
         default:
@@ -4853,7 +4836,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
     return 1;
 }

-int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+int clip_n_output_tokens(const struct clip_ctx * ctx, const struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;

     // for models with fixed size image, the input image is already pre-processed and resized to square
@@ -4968,7 +4951,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, const struct clip_image_f32 * img) {
                 n_patches += 2; // for BOI and EOI token embeddings
             } break;
         case PROJECTOR_TYPE_LFM2A:
-            return ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
+            {
+                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
+            } break;
         default:
             GGML_ABORT("unsupported projector type");
     }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index fdaa61e0cf7..6df8364b487 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -56,7 +56,7 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 // TODO: should be enum, not string
 const char * clip_patch_merge_type(const struct clip_ctx * ctx);

-int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens(const struct clip_ctx * ctx, const struct clip_image_f32 * img);

 // for M-RoPE, this will be the number of token positions in X and Y directions
 // for other models, X will be the total number of tokens and Y will be 1
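The LFM2A arm of clip_n_output_tokens is the same arithmetic as the pre-encode subsampling in build_lfm2_audio: three rounded stride-2 halvings of the mel frame count. Factored out for clarity (a sketch; lfm2a_n_output_tokens is a hypothetical name):

    #include <cassert>

    static int lfm2a_n_output_tokens(int n_frames) {
        int n = n_frames;
        n = (n + 1) / 2;  // pre_encode layer.0, stride 2
        n = (n + 1) / 2;  // pre_encode layer.2, stride 2
        n = (n + 1) / 2;  // pre_encode layer.5, stride 2
        return n;
    }

    int main() {
        // 30 s at 16 kHz with a 160-sample hop -> 3000 mel frames -> 375 tokens
        assert(lfm2a_n_output_tokens(3000) == ((((3000 + 1) / 2) + 1) / 2 + 1) / 2);
        return 0;
    }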