convert_hf_to_gguf.py (+110 -0)

@@ -9493,6 +9493,116 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

        return []  # skip other tensors


@ModelBase.register("JanusForConditionalGeneration")
class JanusProModel(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision and aligner tensors (handled by `JanusProVisionModel`)
        # as well as the generation-only tensors, which are not converted
        skip_prefixes = (
            'model.vision_model.',
            'model.aligner.',
            'model.vqmodel.',
            'model.generation_embeddings.',
            'model.generation_aligner.',
            'model.generation_head.',
        )
        if name.startswith(skip_prefixes):
            return []

        if name.startswith('model.language_model.'):
            name = name.replace('model.language_model.', 'model.')
        elif name.startswith('language_model.'):
            name = name.replace('language_model.', '')

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("JanusForConditionalGeneration")
class JanusProVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        if "intermediate_size" not in self.hparams_vision:
            mlp_ratio = self.hparams_vision.get("mlp_ratio")
            hidden_size = self.hparams_vision.get("hidden_size")
            if mlp_ratio is not None and hidden_size is not None:
                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)

        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
        if hidden_act == "gelu":
            self.gguf_writer.add_vision_use_gelu(True)
        elif hidden_act == "silu":
            self.gguf_writer.add_vision_use_silu(True)

    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
        """Map aligner tensors to projector format"""
        suffix = ".bias" if name.endswith(".bias") else ".weight"

        if name.startswith("model.aligner."):
            local_name = name[len("model.aligner."):]
        elif name.startswith("aligner."):
            local_name = name[len("aligner."):]
        else:
            raise ValueError(f"Unsupported Janus aligner prefix: {name}")

        if local_name.startswith("fc1."):
            mm_index = 0
        elif local_name.startswith("hidden_layers."):
            parts = local_name.split(".", 2)
            if len(parts) < 3:
                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
            mm_index = int(parts[1]) + 1
        else:
            raise ValueError(f"Unsupported Janus aligner tensor: {name}")

        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
        return [(tensor_name, data_torch)]

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Skip language model tensors as they will be handled by `JanusProModel`
        if name.startswith(('model.language_model.', 'language_model.')):
            return []

        # Skip generation-related components
        skip_generation_prefixes = (
            'model.vqmodel.',
            'vqmodel.',
            'model.generation_embeddings.',
            'generation_embeddings.',
            'model.generation_aligner.',
            'generation_aligner.',
            'model.generation_head.',
            'generation_head.',
        )
        if name.startswith(skip_generation_prefixes):
            return []

        # Handle aligner tensors
        if name.startswith(('model.aligner.', 'aligner.')):
            return list(self._map_aligner_tensor(data_torch, name))

        # Handle vision tensors
        if name.startswith(('model.vision_model.', 'vision_model.')):
            return [(self.map_tensor_name(name), data_torch)]

        return []


###### CONVERSION LOGIC ######


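Note on the aligner mapping above: `fc1` lands on projector slot 0 and `hidden_layers.N` on slot N+1, so the Janus aligner reuses the `mm.{index}` layout of existing MLP projectors (the same names `TN_LLAVA_PROJ` reads back in clip.cpp below). The `__init__` additionally derives `intermediate_size` as `round(hidden_size * mlp_ratio)` when the vision config only ships `mlp_ratio`. A minimal standalone sketch of the index logic, assuming the `mm.{index}` serialization; the converter itself goes through `self.format_tensor_name(...)` rather than hard-coding the string:

# Standalone sketch of the fc1 / hidden_layers indexing above.
def janus_aligner_gguf_name(hf_name: str) -> str:
    suffix = ".bias" if hf_name.endswith(".bias") else ".weight"
    local = hf_name.removeprefix("model.aligner.").removeprefix("aligner.")
    if local.startswith("fc1."):
        index = 0                             # fc1 -> mm.0
    elif local.startswith("hidden_layers."):
        index = int(local.split(".")[1]) + 1  # hidden_layers.N -> mm.N+1
    else:
        raise ValueError(f"unsupported aligner tensor: {hf_name}")
    return f"mm.{index}{suffix}"

assert janus_aligner_gguf_name("model.aligner.fc1.weight") == "mm.0.weight"
assert janus_aligner_gguf_name("aligner.hidden_layers.0.bias") == "mm.1.bias"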
gguf-py/gguf/tensor_mapping.py (+4 -2)

@@ -1154,6 +1154,7 @@ class TensorNameMap:
"model.mm_projector.mlp.mlp.{bid}",
"vision_model.vision_adapter.mlp.fc{bid}", # llama 4
"mlp1.{bid}", # InternVL
"model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
),

MODEL_TENSOR.V_MMPROJ_PEG: (
@@ -1170,7 +1171,7 @@
"vision_tower.vision_model.embeddings.patch_embedding",
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"model.vision_model.embeddings.patch_embedding", # SmolVLM, Janus Pro
"vision_tower.patch_conv", # pixtral-hf
"vision_encoder.patch_conv", # pixtral
"vision_model.patch_embedding.linear", # llama 4
@@ -1182,7 +1183,7 @@
"vision_tower.vision_model.embeddings.position_embedding",
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
"model.vision_model.embeddings.position_embedding", # SmolVLM, Janus Pro
"vision_model.positional_embedding_vlm", # llama 4
"vision_tower.patch_embed.pos_emb", # kimi-vl
),
@@ -1252,6 +1253,7 @@
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
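The `{bid}` in the new entries is the block index that `TensorNameMap` substitutes before lookup. A rough sketch of how the Janus Pro attention-output name resolves through such a pattern (illustrative only; the real class pre-expands each pattern per block into a dictionary rather than matching lazily):

import re

# Illustrative pattern resolution; TensorNameMap pre-expands instead.
def match_block_pattern(hf_name: str, pattern: str) -> int | None:
    regex = re.escape(pattern).replace(r"\{bid\}", r"(\d+)")
    m = re.fullmatch(regex, hf_name)
    return int(m.group(1)) if m else None

bid = match_block_pattern(
    "model.vision_model.encoder.layers.7.self_attn.projection_layer",
    "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer",
)
assert bid == 7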
tools/mtmd/clip-impl.h (+2 -0)

@@ -140,6 +140,7 @@ enum projector_type {
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_UNKNOWN,
};

@@ -163,6 +164,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
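The "janus_pro" string here must match what the converter writes through `gguf.VisionProjectorType.JANUS_PRO`, because `clip_projector_type_from_string` resolves the projector by reverse lookup over `PROJECTOR_TYPE_NAMES`. A Python stand-in for that round-trip:

# Stand-in for the reverse lookup in clip_projector_type_from_string:
# the metadata string written at conversion time must round-trip to the
# enum at load time, otherwise the projector is treated as UNKNOWN.
PROJECTOR_TYPE_NAMES = {"PROJECTOR_TYPE_JANUS_PRO": "janus_pro"}  # excerpt

def projector_type_from_string(s: str) -> str:
    for proj, name in PROJECTOR_TYPE_NAMES.items():
        if name == s:
            return proj
    return "PROJECTOR_TYPE_UNKNOWN"

assert projector_type_from_string("janus_pro") == "PROJECTOR_TYPE_JANUS_PRO"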
tools/mtmd/clip.cpp (+73 -0)

@@ -1509,6 +1509,45 @@ struct clip_graph {
        return gf;
    }

    ggml_cgraph * build_janus_pro() {
        GGML_ASSERT(model.class_embedding == nullptr); // No CLS token

        ggml_tensor * inp = build_inp();

        const int n_pos = n_patches;
        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
        ggml_set_name(positions, "positions");
        ggml_set_input(positions);

        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

        ggml_tensor * cur = build_vit(
            inp, n_patches,
            NORM_TYPE_NORMAL,
            hparams.ffn_op,
            learned_pos_embd,
            nullptr);

        cur = ggml_mul_mat(ctx0, model.mm_0_w, cur);
        if (model.mm_0_b) {
            cur = ggml_add(ctx0, cur, model.mm_0_b);
        }
        cb(cur, "aligner_0", -1);

        cur = ggml_gelu(ctx0, cur);

        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
        if (model.mm_1_b) {
            cur = ggml_add(ctx0, cur, model.mm_1_b);
        }
        cb(cur, "aligner_1", -1);

        // build the graph
        ggml_build_forward_expand(gf, cur);

        return gf;
    }

    // whisper encoder with custom projector
    ggml_cgraph * build_whisper_enc() {
        const int n_frames = img.nx;
@@ -2126,6 +2165,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                res = graph.build_kimivl();
            } break;
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                res = graph.build_janus_pro();
            } break;
        default:
            {
                res = graph.build_llava();
@@ -2442,6 +2485,14 @@ struct clip_model_loader {
                    hparams.ffn_op = FFN_GELU_ERF;
                    log_ffn_op = "gelu_erf"; // temporary solution for logging
                } break;
            case PROJECTOR_TYPE_JANUS_PRO:
                {
                    // Janus Pro uses mean = std = [0.5, 0.5, 0.5]
                    // ref: https://huggingface.co/deepseek-community/Janus-Pro-1B/blob/main/preprocessor_config.json
                    // ref: https://huggingface.co/deepseek-community/Janus-Pro-7B/blob/main/preprocessor_config.json
                    hparams.image_mean[0] = hparams.image_mean[1] = hparams.image_mean[2] = 0.5f;
                    hparams.image_std[0]  = hparams.image_std[1]  = hparams.image_std[2]  = 0.5f;
                } break;
            default:
                break;
        }
@@ -2777,6 +2828,13 @@ struct clip_model_loader {
                    model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
                    model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
                } break;
            case PROJECTOR_TYPE_JANUS_PRO:
                {
                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
                } break;
            default:
                GGML_ASSERT(false && "unknown projector type");
        }
@@ -3637,6 +3695,17 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
        res_imgs->entries.push_back(std::move(img_f32));
        return true;

    } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
        // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
        const std::array<uint8_t, 3> pad_color = {127, 127, 127};
        clip_image_u8 resized_image;
        int sz = params.image_size; // 384
        image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}, pad_color);
        clip_image_f32_ptr img_f32(clip_image_f32_init());
        normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
        res_imgs->entries.push_back(std::move(img_f32));
        return true;

    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
            || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
    ) {
@@ -3817,6 +3886,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
    switch (proj) {
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // do nothing
            } break;
@@ -4286,6 +4356,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("pos_w", pos_data);
            } break;
        case PROJECTOR_TYPE_GLM_EDGE:
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // llava and other models
                std::vector<int32_t> positions(n_pos);
@@ -4427,6 +4498,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_JANUS_PRO:
            return ctx->model.mm_1_w->ne[1];
        default:
            GGML_ABORT("Unknown projector type");
    }
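End-to-end, the runtime additions are small: pad/resize to a 384x384 square with gray(127), normalize with mean = std = 0.5 (mapping pixels from [0, 1] to [-1, 1]), run the ViT, then project through the two-layer aligner with a GELU in between. A numpy sketch of that numeric path, with toy dimensions (576 patches assumes a 16-pixel patch, (384/16)^2; the embedding sizes are placeholders, not the real model's):

import numpy as np

rng = np.random.default_rng(0)

# Normalization: mean = std = 0.5 maps [0, 1] pixels to [-1, 1],
# matching the hparams set for PROJECTOR_TYPE_JANUS_PRO above.
pixels = rng.random((384, 384, 3), dtype=np.float32)  # after pad + resize
normed = (pixels - 0.5) / 0.5
assert -1.0 <= normed.min() and normed.max() <= 1.0

# Aligner: mm_0 -> GELU -> mm_1, as in build_janus_pro().
n_patches, n_embd_vit, n_embd_llm = 576, 64, 128  # toy dims
vit_out = rng.standard_normal((n_patches, n_embd_vit), dtype=np.float32)
mm_0_w  = rng.standard_normal((n_embd_llm, n_embd_vit), dtype=np.float32)
mm_1_w  = rng.standard_normal((n_embd_llm, n_embd_llm), dtype=np.float32)

def gelu(x):  # tanh approximation, close to what ggml_gelu computes
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

cur = gelu(vit_out @ mm_0_w.T)  # "aligner_0" + activation
cur = cur @ mm_1_w.T            # "aligner_1"
assert cur.shape == (n_patches, n_embd_llm)  # one LLM-width embedding per patch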