Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/test-mtmd-c-api.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ int main(void) {
const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
// get position of the last token, which should be (nx - 1, ny - 1)
struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, n_tokens - 1);
struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, 0, n_tokens - 1);
size_t nx = pos.x + 1;
size_t ny = pos.y + 1;
const char * id = mtmd_image_tokens_get_id(image_tokens);
Expand Down
20 changes: 10 additions & 10 deletions tools/mtmd/mtmd-helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,10 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
return n_pos;
}

void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, mtmd_decoder_pos * out_pos) {
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, llama_pos pos_0, mtmd_decoder_pos * out_pos) {
size_t n_tokens = mtmd_image_tokens_get_n_tokens(chunks);
for (size_t i = 0; i < n_tokens; i++) {
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, i);
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, pos_0, i);
}
}

Expand Down Expand Up @@ -163,15 +163,15 @@ struct decode_embd_batch {
}

// M-RoPE for image
void set_position_mrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
void set_position_mrope_2d(const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens);
seq_id_0[0] = seq_id;
for (int32_t i = 0; i < batch.n_tokens; i++) {
pos[i ] = pos_0 + rel_pos[i].t;
pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].y;
pos[i + batch.n_tokens * 2] = pos_0 + rel_pos[i].x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i ] = rel_pos[i].t;
pos[i + batch.n_tokens ] = rel_pos[i].y;
pos[i + batch.n_tokens * 2] = rel_pos[i].x;
pos[i + batch.n_tokens * 3] = rel_pos[i].z;
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
Expand All @@ -188,7 +188,7 @@ struct decode_embd_batch {
pos[i ] = pos_0 + i;
pos[i + batch.n_tokens ] = pos_0 + i;
pos[i + batch.n_tokens * 2] = pos_0 + i;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i + batch.n_tokens * 3] = pos_0 + i;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change ok?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it should not change anything because mrope_sections is always configured such that last dim = 0, so backend will always skip the positional data being set here.

}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
Expand Down Expand Up @@ -268,8 +268,8 @@ int32_t mtmd_helper_decode_image_chunk(
}
const auto n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
std::vector<mtmd_decoder_pos> rel_pos(n_tokens);
mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data());
batch_embd.set_position_mrope_2d(n_past, rel_pos, seq_id);
mtmd_helper_image_get_decoder_pos(image_tokens, n_past, rel_pos.data());
batch_embd.set_position_mrope_2d(rel_pos, seq_id);
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {
Expand Down
2 changes: 1 addition & 1 deletion tools/mtmd/mtmd-helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);

// helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
// out_pos must have length == mtmd_helper_get_n_tokens(image)
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos);
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);

// helper function that automatically:
// 1. run llama_decode() on text chunks
Expand Down
11 changes: 7 additions & 4 deletions tools/mtmd/mtmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1246,11 +1246,14 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}

mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) {
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i) {
mtmd_decoder_pos pos;
pos.t = 0;
pos.x = i % image_tokens->nx;
pos.y = i / image_tokens->nx;
// M-RoPE logic
// TODO: support other types of position encoding if needed
pos.t = pos_0;
pos.x = pos_0 + (i % image_tokens->nx);
pos.y = pos_0 + (i / image_tokens->nx);
pos.z = 0; // unused for now
return pos;
}

Expand Down
4 changes: 3 additions & 1 deletion tools/mtmd/mtmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,13 @@ struct mtmd_decoder_pos {
uint32_t t;
uint32_t x;
uint32_t y;
uint32_t z; // unused for now, reserved for future use
};
// get position for decoder attention, to be used by M-RoPE models
// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
// pos_0 is the absolute position of the first token
// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i);
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);

// tokenize an input text prompt and a list of bitmaps (images/audio)
// the prompt must have the input image marker (default: "<__media__>") in it
Expand Down
Loading