15 changes: 8 additions & 7 deletions common/arg.cpp
@@ -54,6 +54,7 @@ static std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CLI,
+LLAMA_EXAMPLE_LIQUID_AUDIO,
};

static std::string read_file(const std::string & fname) {
@@ -1206,7 +1207,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
-).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
+).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"--perf"},
{"--no-perf"},
@@ -2006,7 +2007,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image.emplace_back(item);
}
}
-).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
+).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -2482,7 +2483,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
-).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2614,14 +2615,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.hostname = value;
}
-).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
+).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_HOST"));
add_opt(common_arg(
{"--port"}, "PORT",
string_format("port to listen (default: %d)", params.port),
[](common_params & params, int value) {
params.port = value;
}
-).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_PORT"));
add_opt(common_arg(
{"--path"}, "PATH",
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
@@ -3256,7 +3257,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.vocoder.model.path = value;
}
-).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}));
add_opt(common_arg(
{"--tts-use-guide-tokens"},
"Use guide tokens to improve TTS word recall",
@@ -3270,7 +3271,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.vocoder.speaker_file = value;
}
-).set_examples({LLAMA_EXAMPLE_TTS}));
+).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_LIQUID_AUDIO}));

add_opt(common_arg(
{"--diffusion-steps"}, "N",
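For reference, `set_examples` is what gates an option to specific example binaries, which is all these hunks change. A minimal sketch of how a new option could be gated to the liquid-audio example using the same pattern; the flag name and description are illustrative, not part of this PR:

```cpp
// hypothetical flag, shown only to illustrate the set_examples gating pattern
add_opt(common_arg(
    {"--audio-out"}, "FNAME",
    "file to write synthesized audio to (illustrative)",
    [](common_params & params, const std::string & value) {
        params.out_file = value; // reuses an existing common_params field
    }
).set_examples({LLAMA_EXAMPLE_LIQUID_AUDIO}));
```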
1 change: 1 addition & 0 deletions common/common.h
@@ -100,6 +100,7 @@ enum llama_example {
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
+LLAMA_EXAMPLE_LIQUID_AUDIO,

LLAMA_EXAMPLE_COUNT,
};
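The new enum value is what a binary passes to the argument parser so that only options gated to it (plus ungated ones) are accepted. A minimal sketch, assuming the `common_params_parse` entry point the other examples use:

```cpp
int main(int argc, char ** argv) {
    common_params params;
    // only options whose set_examples list includes LLAMA_EXAMPLE_LIQUID_AUDIO
    // (or that are ungated) will parse successfully here
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LIQUID_AUDIO)) {
        return 1;
    }
    // ... model setup and the audio loop would follow
    return 0;
}
```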
42 changes: 41 additions & 1 deletion convert_hf_to_gguf.py
@@ -9917,7 +9917,7 @@
def set_gguf_parameters(self):
# set num_key_value_heads only for attention layers
self.hparams["num_key_value_heads"] = [
-self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+self.hparams["num_key_value_heads"] if layer_type != "conv" else 0
for layer_type in self.hparams["layer_types"]
]

@@ -9948,6 +9948,46 @@
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])


@ModelBase.register("Lfm2Model")
class LFM2ColBertModel(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2
dense_tensor_name = "dense_2"

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith(self.dense_tensor_name):
name = "model." + name

return super().modify_tensors(data_torch, name, bid)

def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# dense tensor is stored in a separate safetensors file
from safetensors.torch import load_file
tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
assert tensors_file.is_file()
tensor = load_file(tensors_file)["linear.weight"]
self.gguf_writer.add_embedding_length_out(tensor.shape[0])
yield f"{self.dense_tensor_name}.weight", tensor.clone()
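
A quick standalone check of the layout `generate_extra_tensors` assumes: the ColBert dense head ships as `1_Dense/model.safetensors` with a `linear.weight` tensor whose first dimension becomes the output embedding length. The model path is a placeholder:

```python
from pathlib import Path
from safetensors.torch import load_file

model_dir = Path("/path/to/LFM2-ColBert-350M")  # placeholder
w = load_file(model_dir / "1_Dense" / "model.safetensors")["linear.weight"]
print(w.shape)  # (embedding_length_out, hidden_size), per the code above
```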


@ModelBase.register("Lfm25AudioTokenizer")
class LFM25AudioTokenizer(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
self.gguf_writer.add_embedding_length_out(self.hparams.get("output_size"))

Check failure on line 9979 in convert_hf_to_gguf.py (GitHub Actions / pyright type-check): Argument of type "Any | None" cannot be assigned to parameter "length" of type "int" in function "add_embedding_length_out". Type "Any | None" is not assignable to type "int"; "None" is not assignable to "int" (reportArgumentType).
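A minimal sketch of one way to satisfy the type checker, assuming `output_size` is required in this tokenizer's config (the guard and its message are illustrative):

```python
# sketch: narrow the Optional before the writer call
output_size = self.hparams.get("output_size")
assert output_size is not None, "Lfm25AudioTokenizer config must define output_size"
self.gguf_writer.add_embedding_length_out(int(output_size))
```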

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "istft.window" or name.startswith("emb.emb"):
return []

if name.startswith("lin"):
name = name.replace("lin", "dense_2_out")

return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Lfm2MoeForCausalLM")
class LFM2MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.LFM2MOE
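With the two architectures registered via `@ModelBase.register`, conversion goes through the script's normal entry point; a typical invocation, with placeholder paths (and assuming the usual `--outfile` flag):

```
python convert_hf_to_gguf.py /path/to/LFM2-ColBert-350M --outfile lfm2-colbert.gguf
```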
50 changes: 25 additions & 25 deletions examples/embedding/embedding.cpp
@@ -33,7 +33,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd_out, int embd_norm) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

// clear previous kv_cache values (irrelevant for embeddings)
@@ -65,8 +65,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
}

-float * out = output + embd_pos * n_embd;
-common_embd_normalize(embd, out, n_embd, embd_norm);
+float * out = output + embd_pos * n_embd_out;
+common_embd_normalize(embd, out, n_embd_out, embd_norm);
}
}

@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
}

// allocate output
-const int n_embd = llama_model_n_embd(model);
-std::vector<float> embeddings(n_embd_count * n_embd, 0);
+const int n_embd_out = llama_model_n_embd_out(model);
+std::vector<float> embeddings(n_embd_count * n_embd_out, 0);
float * emb = embeddings.data();

// break into batches
@@ -267,8 +267,8 @@

// encode if at capacity
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
-float * out = emb + e * n_embd;
-batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+float * out = emb + e * n_embd_out;
+batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0;
common_batch_clear(batch);
@@ -280,28 +280,28 @@
}

// final batch
-float * out = emb + e * n_embd;
-batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+float * out = emb + e * n_embd_out;
+batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);

if (params.embd_out.empty()) {
LOG("\n");

if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
LOG("embedding %d: ", j);
-for (int i = 0; i < std::min(3, n_embd); i++) {
+for (int i = 0; i < std::min(3, n_embd_out); i++) {
if (params.embd_normalize == 0) {
-LOG("%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd_out + i]);
} else {
-LOG("%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd_out + i]);
}
}
LOG(" ... ");
-for (int i = n_embd - 3; i < n_embd; i++) {
+for (int i = n_embd_out - 3; i < n_embd_out; i++) {
if (params.embd_normalize == 0) {
-LOG("%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd_out + i]);
} else {
-LOG("%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd_out + i]);
}
}
LOG("\n");
@@ -320,21 +320,21 @@
for (uint32_t i = 0; i < n_cls_out; i++) {
// NOTE: if you change this log - update the tests in ci/run.sh
if (n_cls_out == 1) {
-LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd_out]);
} else {
-LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd_out + i], cls_out_labels[i].c_str());
}
}
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) {
LOG("embedding %d: ", j);
-for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd_out) : n_embd_out); i++) {
if (params.embd_normalize == 0) {
-LOG("%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd_out + i]);
} else {
-LOG("%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd_out + i]);
}
}
LOG("\n");
@@ -350,7 +350,7 @@
LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
-float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
LOG("%6.2f ", sim);
}
LOG("%1.10s", prompts[i].c_str());
@@ -368,9 +368,9 @@
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
-LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd_out + i]);
i++;
-if (i < n_embd) LOG(","); else break;
+if (i < n_embd_out) LOG(","); else break;
}
LOG(notArray ? "]\n }" : "]");
j++;
@@ -383,7 +383,7 @@
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
LOG("%6.2f", sim);
j++;
if (j < n_embd_count) LOG(", "); else break;
@@ -397,7 +397,7 @@

if (notArray) LOG("\n}\n");
} else if (params.embd_out == "raw") {
-print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
+print_raw_embeddings(emb, n_embd_count, n_embd_out, model, pooling_type, params.embd_normalize);
}

LOG("\n");
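The mechanical `n_embd` to `n_embd_out` rename matters because a model with a projection head (such as the ColBert dense layer added here) emits embeddings whose size differs from the hidden state. A minimal sketch of the sizing pattern the file now follows; `llama_model_n_embd_out` is the accessor this PR relies on, and its declaration is not part of this excerpt:

```cpp
// size output buffers by the output embedding dimension, which can differ
// from the hidden size returned by llama_model_n_embd(model)
const int n_embd_out = llama_model_n_embd_out(model);
std::vector<float> embeddings((size_t) n_seq_count * n_embd_out, 0.0f); // n_seq_count illustrative
```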
14 changes: 7 additions & 7 deletions examples/model-conversion/logits.cpp
@@ -161,9 +161,9 @@ int main(int argc, char ** argv) {
std::vector<float> embd_out;

if (embedding_mode) {
-const int n_embd = llama_model_n_embd(model);
+const int n_embd_out = llama_model_n_embd_out(model);
const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
-const int n_embeddings = n_embd * n_embd_count;
+const int n_embeddings = n_embd_out * n_embd_count;
float * embeddings;
type = "-embeddings";

@@ -177,24 +177,24 @@
embeddings = llama_get_embeddings(ctx);
}

-printf("Embedding dimension: %d\n", n_embd);
+printf("Embedding dimension: %d\n", n_embd_out);
printf("\n");

// Print embeddings in the specified format
for (int j = 0; j < n_embd_count; j++) {
printf("embedding %d: ", j);

// Print first 3 values
-for (int i = 0; i < 3 && i < n_embd; i++) {
-printf("%9.6f ", embeddings[j * n_embd + i]);
+for (int i = 0; i < 3 && i < n_embd_out; i++) {
+printf("%9.6f ", embeddings[j * n_embd_out + i]);
}

printf(" ... ");

// Print last 3 values
-for (int i = n_embd - 3; i < n_embd; i++) {
+for (int i = n_embd_out - 3; i < n_embd_out; i++) {
if (i >= 0) {
-printf("%9.6f ", embeddings[j * n_embd + i]);
+printf("%9.6f ", embeddings[j * n_embd_out + i]);
}
}

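Note the two retrieval paths above: with pooling enabled the context exposes one pooled vector, otherwise one vector per token. A minimal sketch of that branch, assuming the pooled path uses `llama_get_embeddings_seq` as other examples do:

```cpp
float * embd = pooling_enabled
    ? llama_get_embeddings_seq(ctx, 0) // one pooled vector for sequence 0
    : llama_get_embeddings(ctx);       // batch.n_tokens vectors, contiguous
```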
20 changes: 10 additions & 10 deletions examples/retrieval/retrieval.cpp
@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);

// allocate output
-const int n_embd = llama_model_n_embd(model);
-std::vector<float> embeddings(n_chunks * n_embd, 0);
+const int n_embd_out = llama_model_n_embd_out(model);
+std::vector<float> embeddings(n_chunks * n_embd_out, 0);
float * emb = embeddings.data();

// break into batches
@@ -232,8 +232,8 @@

// encode if at capacity
if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
-float * out = emb + p * n_embd;
-batch_process(ctx, batch, out, s, n_embd);
+float * out = emb + p * n_embd_out;
+batch_process(ctx, batch, out, s, n_embd_out);
common_batch_clear(batch);
p += s;
s = 0;
@@ -245,12 +245,12 @@
}

// final batch
-float * out = emb + p * n_embd;
-batch_process(ctx, batch, out, s, n_embd);
+float * out = emb + p * n_embd_out;
+batch_process(ctx, batch, out, s, n_embd_out);

// save embeddings to chunks
for (int i = 0; i < n_chunks; i++) {
-chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
+chunks[i].embedding = std::vector<float>(emb + i * n_embd_out, emb + (i + 1) * n_embd_out);
// clear tokens as they are no longer needed
chunks[i].tokens.clear();
}
@@ -266,16 +266,16 @@

batch_add_seq(query_batch, query_tokens, 0);

-std::vector<float> query_emb(n_embd, 0);
-batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
+std::vector<float> query_emb(n_embd_out, 0);
+batch_process(ctx, query_batch, query_emb.data(), 1, n_embd_out);

common_batch_clear(query_batch);

// compute cosine similarities
{
std::vector<std::pair<int, float>> similarities;
for (int i = 0; i < n_chunks; i++) {
-float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd_out);
similarities.push_back(std::make_pair(i, sim));
}

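For readers tracing the ranking step: `common_embd_similarity_cos` is expected to compute the standard cosine similarity. A self-contained sketch of that formula (not the library's actual implementation, which may guard edge cases differently):

```cpp
#include <cmath>

// standard cosine similarity over n-dimensional embeddings; accumulate in
// double to reduce rounding error on long vectors
static float embd_similarity_cos_sketch(const float * a, const float * b, int n) {
    double dot = 0.0, norm_a = 0.0, norm_b = 0.0;
    for (int i = 0; i < n; i++) {
        dot    += (double) a[i] * (double) b[i];
        norm_a += (double) a[i] * (double) a[i];
        norm_b += (double) b[i] * (double) b[i];
    }
    if (norm_a == 0.0 || norm_b == 0.0) {
        return 0.0f; // convention chosen for this sketch
    }
    return (float) (dot / (std::sqrt(norm_a) * std::sqrt(norm_b)));
}
```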
2 changes: 2 additions & 0 deletions gguf-py/gguf/constants.py
@@ -104,6 +104,7 @@ class LLM:
VOCAB_SIZE = "{arch}.vocab_size"
CONTEXT_LENGTH = "{arch}.context_length"
EMBEDDING_LENGTH = "{arch}.embedding_length"
+EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out"
FEATURES_LENGTH = "{arch}.features_length"
BLOCK_COUNT = "{arch}.block_count"
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
@@ -3038,6 +3039,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.OUTPUT,
+MODEL_TENSOR.DENSE_2_OUT, # LFM2-ColBert-350M
],
MODEL_ARCH.LFM2MOE: [
MODEL_TENSOR.TOKEN_EMBD,
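The new `EMBEDDING_LENGTH_OUT` key pairs with the `add_embedding_length_out` calls made in convert_hf_to_gguf.py above. A plausible sketch of that `GGUFWriter` helper, modeled on the existing length writers; the body is an assumption, since only the key and the call sites appear in this diff:

```python
# hypothetical GGUFWriter method, modeled on add_embedding_length;
# writes the new {arch}.embedding_length_out key
def add_embedding_length_out(self, length: int) -> None:
    self.add_uint32(Keys.LLM.EMBEDDING_LENGTH_OUT.format(arch=self.arch), length)
```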