Skip to content
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
924bbab
mtmd: add "placeholder bitmap" for counting tokens w/o preprocessing
ngxson May 30, 2026
064c2d7
fast path skip preproc for placeholder
ngxson May 30, 2026
d1a098d
fix build
ngxson May 30, 2026
58171a6
correct the api
ngxson May 30, 2026
f1503cf
add server endpoint + tests
ngxson May 30, 2026
aec9eff
add object name
ngxson May 30, 2026
035d72c
update docs
ngxson May 30, 2026
3cb2d8c
add proxy handling
ngxson May 30, 2026
447e418
fix build
ngxson May 30, 2026
8f67dfb
fix audio input path
ngxson May 30, 2026
8351aaf
use is_placeholder in process_mtmd_prompt()
ngxson May 30, 2026
1945165
nits
ngxson May 30, 2026
c72ef5c
nits (2)
ngxson May 30, 2026
53e3e88
docs: clarify chat/completions/input_tokens is not official
ngxson Jun 1, 2026
c8d6a00
mtmd: enable non-causal vision for gemma 4 unified (#24082)
ngxson Jun 3, 2026
166fe29
qwen35: use post-norm hidden state for MTP (#24025)
am17an Jun 3, 2026
94a220c
mtmd: fix Gemma 4 unified FPE (#24088)
abetlen Jun 3, 2026
f478f1b
sycl : Improve SYCL doc (#23025)
malsbat Jun 4, 2026
3c7450c
ggml-cpu: extend RVV quantization vec dot to higher VLENs (#22754)
rehan-10xengineer Jun 4, 2026
e8c5489
ggml-webgpu: FlashAttention refactor + standardize quantization suppo…
reeselevine Jun 4, 2026
3d19986
metal : reduce rset heartbeat from 500ms -> 5ms (#24074)
ggerganov Jun 4, 2026
65ef50a
tests : refactor test-save-load-state to accept token input (#24073)
ggerganov Jun 4, 2026
6ddc943
readme : add status badges (#24104)
ggerganov Jun 4, 2026
e3ba22d
fix(mtmd): handle Gemma 4 audio projector embedding size (#24091)
abetlen Jun 4, 2026
7ac5a42
cmake: skip cvector-generator and export-lora when CPU backend is dis…
arichiardi Jun 4, 2026
0066404
server : add header to tools/server/server-http.h (#24089)
abawany Jun 4, 2026
4d74287
build : use umbrella Headers directory for XCFramework module map (#2…
gmarzjr Jun 4, 2026
4586479
webui: fix tool selector toggle/counter, key tools by stable identity…
ServeurpersoCom Jun 4, 2026
a121232
agents: refactor, include more guidelines (#24111)
ngxson Jun 4, 2026
6f3a9f3
server: avoid unnecessary checkpoint restore when new tokens are pres…
Abioy Jun 4, 2026
4c51309
ggml: vectorize ggml_vec_dot_q4_1_q8_1 with WASM SIMD128 (#22209)
sirohikartik Jun 4, 2026
e802356
convert: Fix Gemma 4 Unified conversion (#24118)
pcuenca Jun 4, 2026
0dbfa66
return filter to save memory (#24125)
forforever73 Jun 4, 2026
5269770
ui: added single line reasoning preview (#23601)
gugugiyu Jun 4, 2026
21444c8
ui: Fixed packages (#24119)
allozaur Jun 4, 2026
e7bcf1c
Move duplicated imatrix code into single common imatrix-loader.cpp (#…
bartowski1182 Jun 4, 2026
42b2d60
webui: [a11y] fix keyboard navigation issues in chat interface and si…
vignesh191 Jun 4, 2026
260862b
arg: fix double mtp downloads (#24128)
ngxson Jun 4, 2026
7c158fb
server : disable on-device spec checkpoints (#24108)
ggerganov Jun 4, 2026
7fe2ae4
sycl : port multi-column MMVQ from CUDA backend (#21845)
masonmilby Jun 5, 2026
46fa662
ci : build-msys job slimming [no ci] (#24157)
danbev Jun 5, 2026
2154a0f
CUDA: enroll mul_mat_vec_q_moe into pdl (#24087)
ORippler Jun 5, 2026
3ecfb15
kleidiai : dynamic chunck-based scheduling for hybrid execution (#23819)
chaxu01 Jun 5, 2026
7acb4e8
hparams : refactor `hparams.n_layer` (#24060)
ggerganov Jun 5, 2026
59917d3
minor : fix lint issues (#24165)
ggerganov Jun 5, 2026
ad1b88c
docs: Update quantization readme (#24133)
pcuenca Jun 5, 2026
cc7bef3
ui: add ignore-scripts=true to npmrc (#24149)
ngxson Jun 5, 2026
9c955c4
Fix link to available UI settings (#24169)
wariuccio Jun 5, 2026
2016bf2
ui: run npm install when package-lock.json is newer than node_modules…
ServeurpersoCom Jun 5, 2026
96fbe00
model : fix llama_model::n_gpu_layers() (#24188)
ggerganov Jun 5, 2026
86591c7
cli: fix model params not propagated (#23893)
therealkenc Jun 5, 2026
6effcec
TP: round up granularity to 128 (#24180)
JohannesGaessler Jun 5, 2026
64086f2
model, mtmd: Granite4 Vision (#23545)
gabe-l-hart Jun 5, 2026
c4a278d
model: fix build failed (#24193)
ngxson Jun 5, 2026
acca080
Merge branch 'master' into xsn/mtmd_placeholder_chunks
ngxson Jun 5, 2026
5b0cfdf
fix merge problem
ngxson Jun 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 138 additions & 7 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "gguf.h"
#include "clip.h"

#include <array>
#include <climits>
#include <cstdarg>
#include <cinttypes>
Expand Down Expand Up @@ -413,24 +414,154 @@ static projector_type clip_projector_type_from_string(const std::string & str) {

// RGB uint8 image
struct clip_image_u8 {
    // Returns the image dimensions as { nx, ny } (width, height).
    clip_image_size get_size() const {
        return { nx, ny };
    }

    // Sets the dimensions and (re)sizes the pixel storage.
    // - is_placeholder == true : the storage is emptied, the image only carries its size
    // - is_placeholder == false: the storage is resized to nx * ny * 3 bytes
    void set_size(clip_image_size size, bool is_placeholder) {
        nx = size.width;
        ny = size.height;
        if (is_placeholder) {
            buf.clear();
        } else {
            buf.resize(n_elements());
        }
    }

    // Replaces the pixel storage with a copy of new_buf.
    // NOTE: the size of new_buf is not checked against nx * ny * 3, and copying an
    // empty vector makes is_placeholder() return true.
    void cpy_buf(const std::vector<uint8_t> & new_buf) {
        buf = new_buf;
    }

    // Read-only access to the raw pixel storage.
    // Throws std::runtime_error when the image is a placeholder (no storage).
    const std::vector<uint8_t> & get_ro_buf() const {
        if (is_placeholder()) {
            throw std::runtime_error("this clip_image_u8 is a placeholder");
        }
        return buf;
    }

    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation

    // An image is considered a placeholder when it holds no pixel storage.
    bool is_placeholder() const {
        return buf.empty();
    }

    // Returns the { R, G, B } triplet at (x, y).
    // No bounds checking is performed; the caller must pass 0 <= x < nx and 0 <= y < ny.
    std::array<uint8_t, 3> get_pixel(int x, int y) const {
        if (is_placeholder()) {
            // return a dummy value, so that legacy code can still process image without errors
            return { 0, 0, 0 };
        }
        const size_t idx = pixel_offset(x, y);
        return { buf[idx], buf[idx + 1], buf[idx + 2] };
    }

    // Writes the { R, G, B } triplet at (x, y); does nothing on a placeholder.
    // No bounds checking is performed; the caller must pass 0 <= x < nx and 0 <= y < ny.
    void set_pixel(int x, int y, const std::array<uint8_t, 3> & rgb) {
        if (is_placeholder()) {
            return; // no-op
        }
        const size_t idx = pixel_offset(x, y);
        buf[idx]     = rgb[0];
        buf[idx + 1] = rgb[1];
        buf[idx + 2] = rgb[2];
    }

    // Number of pixels implied by the dimensions (independent of the storage).
    size_t n_pixels() const {
        return (size_t) nx * (size_t) ny;
    }

    // Number of uint8 values implied by the dimensions (3 per pixel).
    size_t n_elements() const {
        return n_pixels() * 3;
    }

private:
    // Offset of the first channel of pixel (x, y) in buf.
    // Computed in size_t: the previous int arithmetic could overflow on large images.
    size_t pixel_offset(int x, int y) const {
        return ((size_t) y * (size_t) nx + (size_t) x) * 3;
    }

    std::vector<uint8_t> buf;
    int nx = 0;
    int ny = 0;
};

// For images, buf.size() == nx*ny*3
// Memory layout: RGBRGBRGB...
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
struct clip_image_f32 {
    // marks the global view in e.g., DeepSeek-OCR Models
    bool add_viewsep = false;

    // Returns the dimensions as { nx, ny }.
    clip_image_size get_size() const {
        return { nx_, ny_ };
    }

    int nx() const { return nx_; }
    int ny() const { return ny_; }

    // Sets the dimensions and (re)sizes the storage.
    // - is_placeholder == true: the storage is emptied, only the size is kept
    // - is_audio == true       : one value per (x, y), i.e. nx * ny floats
    // - otherwise              : three values per pixel, i.e. nx * ny * 3 floats
    void set_size(clip_image_size size, bool is_placeholder, bool is_audio) {
        nx_ = size.width;
        ny_ = size.height;
        if (is_placeholder) {
            buf.clear();
            return;
        }
        buf.resize(is_audio ? n_pixels() : n_elements());
    }

    // Replaces the storage with a copy of new_buf.
    // NOTE: the size of new_buf is not checked against the current dimensions, and
    // copying an empty vector makes is_placeholder() return true.
    void cpy_buf(const std::vector<float> & new_buf) {
        buf = new_buf;
    }

    // Takes the dimensions of img and converts its pixels to floats by dividing by 255.
    // A placeholder input produces a placeholder output (size only, no storage).
    void from_u8(const clip_image_u8 & img) {
        auto size = img.get_size();
        nx_ = size.width;
        ny_ = size.height;
        if (img.is_placeholder()) {
            buf.clear();
            return; // no-op
        }
        const size_t n = img.n_elements();
        buf.resize(n);
        const auto & u8_buf = img.get_ro_buf();
        for (size_t i = 0; i < n; ++i) {
            buf[i] = (float) u8_buf[i] / 255.0f;
        }
    }

    // Number of (x, y) positions implied by the dimensions (independent of the storage).
    size_t n_pixels() const {
        return (size_t) nx_ * (size_t) ny_;
    }

    // Number of floats of a 3-channel image with the current dimensions.
    // NOTE: this is NOT the storage size of a buffer created with is_audio == true.
    size_t n_elements() const {
        return n_pixels() * 3;
    }

    // Applies (value - mean[c]) / std[c] to each of the 3 channels of every pixel.
    // Does nothing on a placeholder.
    void normalize(const float mean[3], const float std[3]) {
        if (is_placeholder()) {
            return; // no-op
        }
        // Never touch more triplets than the storage holds: a single-channel (audio)
        // buffer or a mismatched cpy_buf() holds fewer than nx * ny * 3 floats, and
        // indexing by n_pixels() alone would write out of bounds.
        const size_t n_stored = buf.size() / 3;
        const size_t n_px     = n_pixels() < n_stored ? n_pixels() : n_stored;
        for (size_t i = 0; i < n_px; ++i) {
            float * px = &buf[i * 3];
            px[0] = (px[0] - mean[0]) / std[0];
            px[1] = (px[1] - mean[1]) / std[1];
            px[2] = (px[2] - mean[2]) / std[2];
        }
    }

    // Read-only access to the raw storage.
    // Throws std::runtime_error when the image is a placeholder (no storage).
    const std::vector<float> & get_ro_buf() const {
        if (is_placeholder()) {
            throw std::runtime_error("this clip_image_f32 is a placeholder");
        }
        return buf;
    }

    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern

    // An image is considered a placeholder when it holds no storage.
    bool is_placeholder() const {
        return buf.empty();
    }

private:
    std::vector<float> buf;
    int nx_ = 0;
    int ny_ = 0;
};

//
Expand Down
Loading
Loading