
Commit 45c40ee

ngxson authored and ggerganov committed
llama-quant: add support for mmproj (ggml-org#16592)
* llama-quant: add support for mmproj

* Update src/llama.cpp

Co-authored-by: Georgi Gerganov <[email protected]>

* check prefix instead

* small fix

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 54bb97e commit 45c40ee

File tree

5 files changed, +19 -2 lines changed


src/llama-arch.cpp

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,   "clip"   }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,  "llama"  },
     { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI,   "deci"   },
@@ -276,6 +277,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
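Registering the new architecture only takes a name entry plus an empty tensor-name table, since llama-quantize copies and quantizes tensors by name and never builds a compute graph for the projector. As a rough standalone sketch (not the llama.cpp source; the arch_name() helper is made up for illustration), an enum-to-name registry of this kind is queried like so:

#include <cstdio>
#include <map>

enum llm_arch { LLM_ARCH_CLIP, LLM_ARCH_LLAMA, LLM_ARCH_UNKNOWN };

// mirrors the registry pattern in the diff above: enum key -> printable name
static const std::map<llm_arch, const char *> ARCH_NAMES = {
    { LLM_ARCH_CLIP,  "clip"  }, // dummy entry, only meaningful to the quantizer
    { LLM_ARCH_LLAMA, "llama" },
};

// hypothetical helper: look up a name, falling back for unknown architectures
static const char * arch_name(llm_arch arch) {
    auto it = ARCH_NAMES.find(arch);
    return it == ARCH_NAMES.end() ? "(unknown)" : it->second;
}

int main() {
    std::printf("%s\n", arch_name(LLM_ARCH_CLIP)); // prints "clip"
}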

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,

src/llama-model.cpp

Lines changed: 3 additions & 1 deletion
@@ -567,7 +567,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20199,6 +20200,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
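Two small behaviours follow from this: hparam loading stops early for CLIP (only tensors are needed for quantization), and CLIP joins the group of architectures that report no RoPE. A minimal standalone sketch of the grouped-case pattern (not the llama.cpp source; rope_type_for() and the enums are simplified stand-ins):

#include <cstdio>

enum llm_arch      { LLM_ARCH_CLIP, LLM_ARCH_GPT2, LLM_ARCH_LLAMA };
enum llm_rope_type { ROPE_TYPE_NONE, ROPE_TYPE_NORM };

// grouped case labels share one return value, as in the switch above
static llm_rope_type rope_type_for(llm_arch arch) {
    switch (arch) {
        // these architectures do not use RoPE
        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
            return ROPE_TYPE_NONE;
        default:
            return ROPE_TYPE_NORM;
    }
}

int main() {
    std::printf("%d\n", rope_type_for(LLM_ARCH_CLIP)); // prints 0 (no RoPE)
}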

src/llama-quant.cpp

Lines changed: 7 additions & 1 deletion
@@ -704,6 +704,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,12 +718,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -884,6 +887,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
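The detection is purely name-based: a tensor whose name starts with the "mm." prefix marks the file as a multimodal projector, and position-embedding tensors are excluded from quantization. A minimal standalone sketch of the two string checks (the tensor names below are invented for illustration):

#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> names = {
        "mm.input_projection.weight",   // projector tensor -> marks the file as mmproj
        "v.position_embd.weight",       // kept in full precision
        "blk.0.attn_q.weight",          // regular LLM tensor
    };

    bool is_clip_model = false;
    for (const auto & name : names) {
        is_clip_model |= name.rfind("mm.", 0) == 0;          // starts with "mm."
        bool quantize = name.find(".position_embd.") == std::string::npos; // does not contain
        std::printf("%-30s quantize=%d\n", name.c_str(), quantize ? 1 : 0);
    }
    std::printf("is_clip_model=%d\n", is_clip_model ? 1 : 0);
}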

src/llama.cpp

Lines changed: 3 additions & 0 deletions
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {
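Finally, a projector-only file is rejected up front when passed as the main model. A tiny standalone sketch of this validate-then-throw pattern (not the llama.cpp source; load_main_model() and its argument are hypothetical):

#include <cstdio>
#include <stdexcept>

enum llm_arch { LLM_ARCH_CLIP, LLM_ARCH_LLAMA };

// hypothetical loader: reject projector-only architectures early,
// before a more confusing failure can happen later in loading
static void load_main_model(llm_arch arch) {
    if (arch == LLM_ARCH_CLIP) {
        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
    }
    // ... continue loading vocab, hparams, tensors ...
}

int main() {
    try {
        load_main_model(LLM_ARCH_CLIP);
    } catch (const std::exception & e) {
        std::fprintf(stderr, "error: %s\n", e.what()); // prints the guard message
    }
}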
