15 changes: 8 additions & 7 deletions common/arg.cpp
@@ -54,6 +54,7 @@ static std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CLI,
+LLAMA_EXAMPLE_LIQUID_AUDIO,
};

static std::string read_file(const std::string & fname) {
@@ -1206,7 +1207,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
-).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
+).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"--perf"},
{"--no-perf"},
@@ -2006,7 +2007,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image.emplace_back(item);
}
}
-).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
+).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -2482,7 +2483,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
-).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2614,14 +2615,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.hostname = value;
}
-).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
+).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_HOST"));
add_opt(common_arg(
{"--port"}, "PORT",
string_format("port to listen (default: %d)", params.port),
[](common_params & params, int value) {
params.port = value;
}
-).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_PORT"));
add_opt(common_arg(
{"--path"}, "PATH",
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
@@ -3256,7 +3257,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.vocoder.model.path = value;
}
-).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"--tts-use-guide-tokens"},
"Use guide tokens to improve TTS word recall",
@@ -3270,7 +3271,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.vocoder.speaker_file = value;
}
-).set_examples({LLAMA_EXAMPLE_TTS}));
+).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_LIQUID_AUDIO}));

add_opt(common_arg(
{"--diffusion-steps"}, "N",
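For reference, `set_examples` is what gates an option to specific example binaries, which is all these hunks change. A minimal sketch of how a new option could be gated to the liquid-audio example using the same pattern; the flag name and description are illustrative, not part of this PR:

```cpp
// hypothetical flag, shown only to illustrate the set_examples gating pattern
add_opt(common_arg(
    {"--audio-out"}, "FNAME",
    "file to write synthesized audio to (illustrative)",
    [](common_params & params, const std::string & value) {
        params.out_file = value; // reuses an existing common_params field
    }
).set_examples({LLAMA_EXAMPLE_LIQUID_AUDIO}));
```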
1 change: 1 addition & 0 deletions common/common.h
@@ -100,6 +100,7 @@ enum llama_example {
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
+LLAMA_EXAMPLE_LIQUID_AUDIO,

LLAMA_EXAMPLE_COUNT,
};
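The new enum value is what a binary passes to the argument parser so that only options gated to it (plus ungated ones) are accepted. A minimal sketch, assuming the `common_params_parse` entry point the other examples use:

```cpp
int main(int argc, char ** argv) {
    common_params params;
    // only options whose set_examples list includes LLAMA_EXAMPLE_LIQUID_AUDIO
    // (or that are ungated) will parse successfully here
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LIQUID_AUDIO)) {
        return 1;
    }
    // ... model setup and the audio loop would follow
    return 0;
}
```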
42 changes: 41 additions & 1 deletion convert_hf_to_gguf.py
@@ -9917,7 +9917,7 @@
def set_gguf_parameters(self):
# set num_key_value_heads only for attention layers
self.hparams["num_key_value_heads"] = [
-self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+self.hparams["num_key_value_heads"] if layer_type != "conv" else 0
for layer_type in self.hparams["layer_types"]
]

@@ -9948,6 +9948,46 @@
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])


@ModelBase.register("Lfm2Model")
class LFM2ColBertModel(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2
dense_tensor_name = "dense_2"

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith(self.dense_tensor_name):
name = "model." + name

return super().modify_tensors(data_torch, name, bid)

def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# dense tensor is stored in a separate safetensors file
from safetensors.torch import load_file
tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
assert tensors_file.is_file()
tensor = load_file(tensors_file)["linear.weight"]
self.gguf_writer.add_embedding_length_out(tensor.shape[0])
yield f"{self.dense_tensor_name}.weight", tensor.clone()
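
A quick standalone check of the layout `generate_extra_tensors` assumes: the ColBert dense head ships as `1_Dense/model.safetensors` with a `linear.weight` tensor whose first dimension becomes the output embedding length. The model path is a placeholder:

```python
from pathlib import Path
from safetensors.torch import load_file

model_dir = Path("/path/to/LFM2-ColBert-350M")  # placeholder
w = load_file(model_dir / "1_Dense" / "model.safetensors")["linear.weight"]
print(w.shape)  # (embedding_length_out, hidden_size), per the code above
```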


@ModelBase.register("Lfm25AudioTokenizer")
class LFM25AudioTokenizer(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
self.gguf_writer.add_embedding_length_out(self.hparams.get("output_size"))

Check failure on line 9979 in convert_hf_to_gguf.py (GitHub Actions / pyright type-check): Argument of type "Any | None" cannot be assigned to parameter "length" of type "int" in function "add_embedding_length_out". Type "Any | None" is not assignable to type "int"; "None" is not assignable to "int" (reportArgumentType).
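A minimal sketch of one way to satisfy the type checker, assuming `output_size` is required in this tokenizer's config (the guard and its message are illustrative):

```python
# sketch: narrow the Optional before the writer call
output_size = self.hparams.get("output_size")
assert output_size is not None, "Lfm25AudioTokenizer config must define output_size"
self.gguf_writer.add_embedding_length_out(int(output_size))
```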

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "istft.window" or name.startswith("emb.emb"):
return []

if name.startswith("lin"):
name = name.replace("lin", "dense_2_out")

return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Lfm2MoeForCausalLM")
class LFM2MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.LFM2MOE
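With the two architectures registered via `@ModelBase.register`, conversion goes through the script's normal entry point; a typical invocation, with placeholder paths (and assuming the usual `--outfile` flag):

```
python convert_hf_to_gguf.py /path/to/LFM2-ColBert-350M --outfile lfm2-colbert.gguf
```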
50 changes: 25 additions & 25 deletions examples/embedding/embedding.cpp
@@ -33,7 +33,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd_out, int embd_norm) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

// clear previous kv_cache values (irrelevant for embeddings)
@@ -65,8 +65,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
}

-float * out = output + embd_pos * n_embd;
-common_embd_normalize(embd, out, n_embd, embd_norm);
+float * out = output + embd_pos * n_embd_out;
+common_embd_normalize(embd, out, n_embd_out, embd_norm);
}
}

@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
}

// allocate output
-const int n_embd = llama_model_n_embd(model);
-std::vector<float> embeddings(n_embd_count * n_embd, 0);
+const int n_embd_out = llama_model_n_embd_out(model);
+std::vector<float> embeddings(n_embd_count * n_embd_out, 0);
float * emb = embeddings.data();

// break into batches
@@ -267,8 +267,8 @@

// encode if at capacity
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
-float * out = emb + e * n_embd;
-batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+float * out = emb + e * n_embd_out;
+batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0;
common_batch_clear(batch);
@@ -280,28 +280,28 @@
}

// final batch
-float * out = emb + e * n_embd;
-batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+float * out = emb + e * n_embd_out;
+batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);

if (params.embd_out.empty()) {
LOG("\n");

if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
LOG("embedding %d: ", j);
-for (int i = 0; i < std::min(3, n_embd); i++) {
+for (int i = 0; i < std::min(3, n_embd_out); i++) {
if (params.embd_normalize == 0) {
-LOG("%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd_out + i]);
} else {
-LOG("%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd_out + i]);
}
}
LOG(" ... ");
-for (int i = n_embd - 3; i < n_embd; i++) {
+for (int i = n_embd_out - 3; i < n_embd_out; i++) {
if (params.embd_normalize == 0) {
-LOG("%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd_out + i]);
} else {
-LOG("%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd_out + i]);
}
}
LOG("\n");
@@ -320,21 +320,21 @@
for (uint32_t i = 0; i < n_cls_out; i++) {
// NOTE: if you change this log - update the tests in ci/run.sh
if (n_cls_out == 1) {
-LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd_out]);
} else {
-LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd_out + i], cls_out_labels[i].c_str());
}
}
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) {
LOG("embedding %d: ", j);
-for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd_out) : n_embd_out); i++) {
if (params.embd_normalize == 0) {
-LOG("%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd_out + i]);
} else {
-LOG("%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd_out + i]);
}
}
LOG("\n");
@@ -350,7 +350,7 @@
LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
-float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
LOG("%6.2f ", sim);
}
LOG("%1.10s", prompts[i].c_str());
@@ -368,9 +368,9 @@
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
-LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd_out + i]);
i++;
-if (i < n_embd) LOG(","); else break;
+if (i < n_embd_out) LOG(","); else break;
}
LOG(notArray ? "]\n }" : "]");
j++;
@@ -383,7 +383,7 @@
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
LOG("%6.2f", sim);
j++;
if (j < n_embd_count) LOG(", "); else break;
@@ -397,7 +397,7 @@

if (notArray) LOG("\n}\n");
} else if (params.embd_out == "raw") {
-print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
+print_raw_embeddings(emb, n_embd_count, n_embd_out, model, pooling_type, params.embd_normalize);
}

LOG("\n");
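The mechanical `n_embd` to `n_embd_out` rename matters because a model with a projection head (such as the ColBert dense layer added here) emits embeddings whose size differs from the hidden state. A minimal sketch of the sizing pattern the file now follows; `llama_model_n_embd_out` is the accessor this PR relies on, and its declaration is not part of this excerpt:

```cpp
// size output buffers by the output embedding dimension, which can differ
// from the hidden size returned by llama_model_n_embd(model)
const int n_embd_out = llama_model_n_embd_out(model);
std::vector<float> embeddings((size_t) n_seq_count * n_embd_out, 0.0f); // n_seq_count illustrative
```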
14 changes: 7 additions & 7 deletions examples/model-conversion/logits.cpp
@@ -161,9 +161,9 @@ int main(int argc, char ** argv) {
std::vector<float> embd_out;

if (embedding_mode) {
-const int n_embd = llama_model_n_embd(model);
+const int n_embd_out = llama_model_n_embd_out(model);
const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
-const int n_embeddings = n_embd * n_embd_count;
+const int n_embeddings = n_embd_out * n_embd_count;
float * embeddings;
type = "-embeddings";

@@ -177,24 +177,24 @@
embeddings = llama_get_embeddings(ctx);
}

-printf("Embedding dimension: %d\n", n_embd);
+printf("Embedding dimension: %d\n", n_embd_out);
printf("\n");

// Print embeddings in the specified format
for (int j = 0; j < n_embd_count; j++) {
printf("embedding %d: ", j);

// Print first 3 values
-for (int i = 0; i < 3 && i < n_embd; i++) {
-printf("%9.6f ", embeddings[j * n_embd + i]);
+for (int i = 0; i < 3 && i < n_embd_out; i++) {
+printf("%9.6f ", embeddings[j * n_embd_out + i]);
}

printf(" ... ");

// Print last 3 values
-for (int i = n_embd - 3; i < n_embd; i++) {
+for (int i = n_embd_out - 3; i < n_embd_out; i++) {
if (i >= 0) {
-printf("%9.6f ", embeddings[j * n_embd + i]);
+printf("%9.6f ", embeddings[j * n_embd_out + i]);
}
}

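Note the two retrieval paths above: with pooling enabled the context exposes one pooled vector, otherwise one vector per token. A minimal sketch of that branch, assuming the pooled path uses `llama_get_embeddings_seq` as other examples do:

```cpp
float * embd = pooling_enabled
    ? llama_get_embeddings_seq(ctx, 0) // one pooled vector for sequence 0
    : llama_get_embeddings(ctx);       // batch.n_tokens vectors, contiguous
```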
20 changes: 10 additions & 10 deletions examples/retrieval/retrieval.cpp
@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);

// allocate output
-const int n_embd = llama_model_n_embd(model);
-std::vector<float> embeddings(n_chunks * n_embd, 0);
+const int n_embd_out = llama_model_n_embd_out(model);
+std::vector<float> embeddings(n_chunks * n_embd_out, 0);
float * emb = embeddings.data();

// break into batches
@@ -232,8 +232,8 @@

// encode if at capacity
if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
-float * out = emb + p * n_embd;
-batch_process(ctx, batch, out, s, n_embd);
+float * out = emb + p * n_embd_out;
+batch_process(ctx, batch, out, s, n_embd_out);
common_batch_clear(batch);
p += s;
s = 0;
@@ -245,12 +245,12 @@
}

// final batch
-float * out = emb + p * n_embd;
-batch_process(ctx, batch, out, s, n_embd);
+float * out = emb + p * n_embd_out;
+batch_process(ctx, batch, out, s, n_embd_out);

// save embeddings to chunks
for (int i = 0; i < n_chunks; i++) {
-chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
+chunks[i].embedding = std::vector<float>(emb + i * n_embd_out, emb + (i + 1) * n_embd_out);
// clear tokens as they are no longer needed
chunks[i].tokens.clear();
}
@@ -266,16 +266,16 @@

batch_add_seq(query_batch, query_tokens, 0);

-std::vector<float> query_emb(n_embd, 0);
-batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
+std::vector<float> query_emb(n_embd_out, 0);
+batch_process(ctx, query_batch, query_emb.data(), 1, n_embd_out);

common_batch_clear(query_batch);

// compute cosine similarities
{
std::vector<std::pair<int, float>> similarities;
for (int i = 0; i < n_chunks; i++) {
-float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd_out);
similarities.push_back(std::make_pair(i, sim));
}

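For readers tracing the ranking step: `common_embd_similarity_cos` is expected to compute the standard cosine similarity. A self-contained sketch of that formula (not the library's actual implementation, which may guard edge cases differently):

```cpp
#include <cmath>

// standard cosine similarity over n-dimensional embeddings; accumulate in
// double to reduce rounding error on long vectors
static float embd_similarity_cos_sketch(const float * a, const float * b, int n) {
    double dot = 0.0, norm_a = 0.0, norm_b = 0.0;
    for (int i = 0; i < n; i++) {
        dot    += (double) a[i] * (double) b[i];
        norm_a += (double) a[i] * (double) a[i];
        norm_b += (double) b[i] * (double) b[i];
    }
    if (norm_a == 0.0 || norm_b == 0.0) {
        return 0.0f; // convention chosen for this sketch
    }
    return (float) (dot / (std::sqrt(norm_a) * std::sqrt(norm_b)));
}
```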
2 changes: 2 additions & 0 deletions gguf-py/gguf/constants.py
@@ -104,6 +104,7 @@ class LLM:
VOCAB_SIZE = "{arch}.vocab_size"
CONTEXT_LENGTH = "{arch}.context_length"
EMBEDDING_LENGTH = "{arch}.embedding_length"
+EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out"
FEATURES_LENGTH = "{arch}.features_length"
BLOCK_COUNT = "{arch}.block_count"
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
@@ -3038,6 +3039,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.OUTPUT,
+MODEL_TENSOR.DENSE_2_OUT, # LFM2-ColBert-350M
],
MODEL_ARCH.LFM2MOE: [
MODEL_TENSOR.TOKEN_EMBD,
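The new `EMBEDDING_LENGTH_OUT` key pairs with the `add_embedding_length_out` calls made in convert_hf_to_gguf.py above. A plausible sketch of that `GGUFWriter` helper, modeled on the existing length writers; the body is an assumption, since only the key and the call sites appear in this diff:

```python
# hypothetical GGUFWriter method, modeled on add_embedding_length;
# writes the new {arch}.embedding_length_out key
def add_embedding_length_out(self, length: int) -> None:
    self.add_uint32(Keys.LLM.EMBEDDING_LENGTH_OUT.format(arch=self.arch), length)
```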