diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index a38a88ce3fcb..855364b5d944 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -674,6 +674,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
 | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
 | `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
+| `GlmOcrForConditionalGeneration` | GLM-OCR | T + IE+ | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
 | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + IE+ | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 60e1c9b07e92..91d407b7c218 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -566,6 +566,42 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# GLM-OCR
+def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "zai-org/GLM-OCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        mm_processor_kwargs={
+            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+        enforce_eager=True,
+    )
+
+    if modality == "image":
+        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    elif modality == "video":
+        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    prompts = [
+        (
+            "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
+            f"{placeholder}"
+            f"{question}<|assistant|>\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # H2OVL-Mississippi
 def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1962,6 +1998,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     "glm4_1v": run_glm4_1v,
     "glm4_5v": run_glm4_5v,
     "glm4_5v_fp8": run_glm4_5v_fp8,
+    "glm_ocr": run_glm_ocr,
     "h2ovl_chat": run_h2ovl,
     "hunyuan_vl": run_hunyuan_vl,
     "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
@@ -2013,6 +2050,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 MODELS_NEED_VIDEO_METADATA = [
     "glm4_1v",
+    "glm_ocr",
     "glm4_5v",
     "glm4_5v_fp8",
     "molmo2",
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index effbafa50af0..4dab4b7d93be 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -458,6 +458,20 @@
         ],
         marks=[large_gpu_mark(min_gb=32)],
     ),
+    "glm_ocr": VLMTestInfo(
+        models=["zai-org/GLM-OCR"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
+        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
+        max_model_len=2048,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
+        num_logprobs=10,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        auto_cls=AutoModelForImageTextToText,
+        marks=[large_gpu_mark(min_gb=32)],
+    ),
     "h2ovl": VLMTestInfo(
         models=[
             "h2oai/h2ovl-mississippi-800m",
diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py
index 8f141746e249..9310f52dfd3e 100644
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -91,6 +91,19 @@
         "use_processor": True,
         "question": "What is the content of each image?",
     },
+    "glm_ocr": {
+        "model_name": "zai-org/GLM-OCR",
+        "interface": "llm_generate",
+        "max_model_len": 131072,
+        "max_num_seqs": 2,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 256,
+            "stop_token_ids": None,
+        },
+        "use_processor": True,
+        "question": "Text Recognition:",
+    },
     "keye_vl": {
         "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
         "interface": "llm_generate",
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 308784564d93..a6e580cb885c 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -122,6 +122,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     "ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
     "glm4v": glm4_1v_patch_mm_data,
     "glm4v_moe": glm4_1v_patch_mm_data,
+    "glm_ocr": glm4_1v_patch_mm_data,
     "glmasr": glmasr_patch_mm_data,
     "molmo2": qwen3_vl_patch_mm_data,
     "qwen3_vl": qwen3_vl_patch_mm_data,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 5c6db71b1893..24d96cfda8a7 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -274,7 +274,6 @@ def check_available_online(
     "Glm4MoeLiteForCausalLM": _HfExamplesInfo(
         "zai-org/GLM-4.7-Flash",
         min_transformers_version="5.0.0.dev",
-        is_available_online=False,
     ),
     "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
     "GPTBigCodeForCausalLM": _HfExamplesInfo(
@@ -707,6 +706,11 @@ def check_available_online(
     ),
     "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),
     "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V"),
+    "GlmOcrForConditionalGeneration": _HfExamplesInfo(
+        "zai-org/GLM-OCR",
+        is_available_online=False,
+        min_transformers_version="5.0.0.dev",
+    ),
     "H2OVLChatModel": _HfExamplesInfo(
         "h2oai/h2ovl-mississippi-800m",
         trust_remote_code=True,
@@ -1053,7 +1057,13 @@ def check_available_online(
     "Glm4MoeLiteMTPModel": _HfExamplesInfo(
         "zai-org/GLM-4.7-Flash",
         speculative_model="zai-org/GLM-4.7-Flash",
+        min_transformers_version="5.0.0.dev",
+    ),
+    "GlmOcrMTPModel": _HfExamplesInfo(
+        "zai-org/GLM-OCR",
+        speculative_model="zai-org/GLM-OCR",
         is_available_online=False,
+        min_transformers_version="5.0.0.dev",
     ),
     "LongCatFlashMTPModel": _HfExamplesInfo(
         "meituan-longcat/LongCat-Flash-Chat",
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 8f34dadae9c0..64a3e4608689 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -34,6 +34,7 @@
     "mimo_mtp",
     "glm4_moe_mtp",
     "glm4_moe_lite_mtp",
+    "glm_ocr_mtp",
     "ernie_mtp",
    "exaone_moe_mtp",
     "qwen3_next_mtp",
@@ -221,6 +222,17 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
             }
         )
 
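+    # GLM-OCR carries its MTP (next-token-prediction) weights inside the
+    # target checkpoint, so the draft config keeps zero dense layers and
+    # exposes num_nextn_predict_layers as n_predict.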
+    if hf_config.architectures[0] == "GlmOcrForConditionalGeneration":
+        hf_config.model_type = "glm_ocr_mtp"
+        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+        hf_config.update(
+            {
+                "num_hidden_layers": 0,
+                "n_predict": n_predict,
+                "architectures": ["GlmOcrMTPModel"],
+            }
+        )
+
     if hf_config.model_type == "ernie4_5_moe":
         hf_config.model_type = "ernie_mtp"
     if hf_config.model_type == "ernie_mtp":
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 06da2a8b3498..f6fdbc3d4c05 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -39,13 +39,22 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .llama import LlamaMLP as Glm4MLP
 from .llama import LlamaModel
-from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    maybe_prefix,
+)
 
 
 class Glm4Attention(nn.Module):
@@ -78,7 +87,15 @@ def __init__(
         # Number of KV heads is less than TP size, so we replicate
         # the KV heads across multiple tensor parallel GPUs.
         assert tp_size % self.total_num_kv_heads == 0
-        config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
+
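+        # Respect a partial_rotary_factor the checkpoint already provides in
+        # rope_parameters; otherwise fall back to GLM-4's default of 0.5.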
+        rope_params = getattr(config, "rope_parameters", None)
+        if isinstance(rope_params, dict) and "partial_rotary_factor" in rope_params:
+            config.rope_parameters.setdefault(
+                "partial_rotary_factor", rope_params["partial_rotary_factor"]
+            )
+        else:
+            config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
+
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
         self.head_dim = head_dim or hidden_size // self.total_num_heads
         self.q_size = self.num_heads * self.head_dim
@@ -220,6 +237,73 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             vllm_config=vllm_config, prefix=prefix, layer_type=Glm4DecoderLayer
         )
 
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None:
+                continue
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
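+                # (vLLM recomputes rotary cos/sin caches at runtime, so the
+                # checkpoint copies carry no information we need.)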
+                continue
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            if "scale" in name or "zero_point" in name:
+                # Remapping the name of FP8 kv-scale or zero point.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
@@ -293,3 +377,16 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: Glm4Config, weight_name: str
+) -> int | None:
+    if hasattr(config, "num_nextn_predict_layers") and (
+        config.num_nextn_predict_layers > 0
+    ):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if f"layers.{layer_idx + i}." in weight_name:
+                return layer_idx + i
+    return None
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 5db7a18f69a2..1c857bb4c25e 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -24,7 +24,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only GLM-4V model compatible with HuggingFace weights."""
+"""Inference-only GLM-4.1V, GLM-4.6V-Flash, and AutoGLM-Phone-9B models,
+compatible with HuggingFace weights."""
 
 import itertools
 import math
@@ -1418,7 +1419,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             prefix=maybe_prefix(prefix, "visual"),
         )
 
-        if config.model_type == "glm4v":
+        if config.model_type in ("glm4v", "glm_ocr"):
             architectures = ["Glm4ForCausalLM"]
         elif config.model_type == "glm4v_moe":
             architectures = ["Glm4MoeForCausalLM"]
diff --git a/vllm/model_executor/models/glm_ocr.py b/vllm/model_executor/models/glm_ocr.py
new file mode 100644
index 000000000000..ab80b679b048
--- /dev/null
+++ b/vllm/model_executor/models/glm_ocr.py
@@ -0,0 +1,389 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/Glm4v/modeling_Glm4v.py
+# Copyright 2026 The ZhipuAI Team.
+# Copyright 2026 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GLM-OCR model compatible with HuggingFace weights."""
+
+from collections.abc import Callable
+from functools import partial
+from typing import TYPE_CHECKING
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+
+if TYPE_CHECKING:
+    from transformers.models.glm_ocr.configuration_glm_ocr import GlmOcrVisionConfig
+
+from vllm.config import VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
+from vllm.model_executor.layers.conv import Conv2dLayer
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.rotary_embedding.common import (
+    ApplyRotaryEmb,
+)
+from vllm.model_executor.models.glm4_1v import (
+    Glm4vDummyInputsBuilder,
+    Glm4vForConditionalGeneration,
+    Glm4vMultiModalProcessor,
+    Glm4vPatchMerger,
+    Glm4vProcessingInfo,
+    Glm4vVisionBlock,
+    Glm4vVisionMLP,
+    Glm4vVisionPatchEmbed,
+    Glm4vVisionTransformer,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from .utils import (
+    maybe_prefix,
+)
+from .vision import (
+    get_vit_attn_backend,
+    is_vit_use_data_parallel,
+)
+
+logger = init_logger(__name__)
+
+
+class GlmOcrVisionMLP(Glm4vVisionMLP):
+    pass
+
+
+class GlmOcrVisionAttention(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # Per attention head and per partition values.
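+        # Under ViT data parallelism each rank keeps every head, so TP size
+        # and rank are forced to 1 and 0; otherwise the heads are split
+        # across tensor-parallel ranks.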
+        use_data_parallel = is_vit_use_data_parallel()
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = (
+            0 if use_data_parallel else parallel_state.get_tensor_model_parallel_rank()
+        )
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, self.tp_size
+        )
+
+        self.head_dim = embed_dim // num_heads
+
+        self.q_norm = RMSNorm(self.head_dim, eps=1e-5)
+        self.k_norm = RMSNorm(self.head_dim, eps=1e-5)
+
+        self.qkv = QKVParallelLinear(
+            hidden_size=embed_dim,
+            head_size=self.hidden_size_per_attention_head,
+            total_num_heads=num_heads,
+            total_num_kv_heads=num_heads,
+            bias=True,
+            quant_config=quant_config,
+            # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg
+            prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
+            disable_tp=use_data_parallel,
+        )
+        self.proj = RowParallelLinear(
+            input_size=projection_size,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+            bias=True,
+            disable_tp=use_data_parallel,
+        )
+
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_attention_heads_per_partition,
+            head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
+        )
+        self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)
+
+    def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
+        # [s, b, 3 * head * head_dim]
+        seq_len, bs, _ = qkv.shape
+
+        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
+        q, k, v = qkv.chunk(3, dim=2)
+
+        # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
+        new_shape = (
+            seq_len,
+            bs,
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        )
+        q, k, v = (x.view(*new_shape) for x in (q, k, v))
+        return q, k, v
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+    ) -> torch.Tensor:
+        # [s, b, c] --> [s, b, head * 3 * head_dim]
+        x, _ = self.qkv(x)
+
+        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim]
+        q, k, v = self.split_qkv(x)
+
+        # RMSNorm on q, k
+        q_shape, k_shape = q.shape, k.shape
+        q = self.q_norm(q.reshape(-1, self.head_dim)).view(q_shape)
+        k = self.k_norm(k.reshape(-1, self.head_dim)).view(k_shape)
+
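+        # Move to batch-first layout, then rotate q and k in a single call by
+        # stacking them along the batch axis.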
+        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v))
+        if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None:
+            # [2 * b, s, heads, head_dim]
+            qk_concat = torch.cat([q, k], dim=0)
+            qk_rotated = self.apply_rotary_emb(
+                qk_concat,
+                rotary_pos_emb_cos,
+                rotary_pos_emb_sin,
+            )
+            q, k = torch.chunk(qk_rotated, 2, dim=0)
+
+        context_layer = self.attn(
+            query=q,
+            key=k,
+            value=v,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        context_layer = rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
+
+        output, _ = self.proj(context_layer)
+        return output
+
+
+class GlmOcrVisionBlock(Glm4vVisionBlock):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_hidden_dim: int,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            dim,
+            num_heads,
+            mlp_hidden_dim,
+            norm_layer,
+            quant_config,
+            prefix,
+        )
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        self.attn = GlmOcrVisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.mlp = GlmOcrVisionMLP(
+            dim,
+            mlp_hidden_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+
+class GlmOcrVisionPatchEmbed(Glm4vVisionPatchEmbed):
+    pass
+
+
+class GlmOcrPatchMerger(Glm4vPatchMerger):
+    pass
+
+
+class GlmOcrVisionTransformer(Glm4vVisionTransformer):
+    def __init__(
+        self,
+        vision_config: GlmOcrVisionConfig,
+        norm_eps: float = 1e-5,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(vision_config, norm_eps, quant_config, prefix)
+
+        del self.post_conv_layernorm
+        del self.embeddings
+
+        patch_size = vision_config.patch_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        in_channels = vision_config.in_channels
+        depth = vision_config.depth
+        self.hidden_size = vision_config.hidden_size
+        self.num_heads = vision_config.num_heads
+
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.out_hidden_size = vision_config.out_hidden_size
+
+        self.patch_embed = Glm4vVisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_channels=in_channels,
+            hidden_size=self.hidden_size,
+        )
+
+        norm_layer = partial(RMSNorm, eps=norm_eps)
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = get_rope(
+            head_size=head_dim,
+            max_position=8192,
+            is_neox_style=True,
+            rope_parameters={"partial_rotary_factor": 0.5},
+        )
+        self.blocks = nn.ModuleList(
+            [
+                GlmOcrVisionBlock(
+                    dim=self.hidden_size,
+                    num_heads=self.num_heads,
+                    mlp_hidden_dim=vision_config.intermediate_size,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                )
+                for layer_idx in range(depth)
+            ]
+        )
+        self.merger = GlmOcrPatchMerger(
+            d_model=vision_config.out_hidden_size,
+            context_dim=vision_config.out_hidden_size * vision_config.in_channels,
+            quant_config=quant_config,
+            bias=False,
+            prefix=f"{prefix}.merger",
+        )
+
+        self.downsample = Conv2dLayer(
+            in_channels=vision_config.hidden_size,
+            out_channels=vision_config.out_hidden_size,
+            kernel_size=vision_config.spatial_merge_size,
+            stride=vision_config.spatial_merge_size,
+        )
+        self.post_layernorm = RMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
+
+        self.attn_backend = get_vit_attn_backend(
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor | list[list[int]],
+    ) -> torch.Tensor:
+        if isinstance(grid_thw, list):
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
+
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb_cos, rotary_pos_emb_sin, image_type_ids = self.rot_pos_emb(
+            grid_thw
+        )
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
+
+        # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
+        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
+
+        # transformer blocks
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(
+                x,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb_cos=rotary_pos_emb_cos,
+                rotary_pos_emb_sin=rotary_pos_emb_sin,
+                max_seqlen=max_seqlen,
+            )
+
+        # adapter
+        x = self.post_layernorm(x)
+
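+        # Regroup tokens into spatial_merge_size x spatial_merge_size windows,
+        # downsample each window with a strided conv to the LLM hidden size,
+        # then refine the merged tokens through the patch merger.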
+        x = x.view(-1, self.spatial_merge_size, self.spatial_merge_size, x.shape[-1])
+        x = x.permute(0, 3, 1, 2)
+        x = self.downsample(x).view(-1, self.out_hidden_size)
+        x = self.merger(x)
+
+        return x
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Glm4vMultiModalProcessor,
+    info=Glm4vProcessingInfo,
+    dummy_inputs=Glm4vDummyInputsBuilder,
+)
+class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
+        with self._mark_tower_model(vllm_config, {"image", "video"}):
+            self.visual = GlmOcrVisionTransformer(
+                config.vision_config,
+                norm_eps=getattr(config, "rms_norm_eps", 1e-5),
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "visual"),
+            )
diff --git a/vllm/model_executor/models/glm_ocr_mtp.py b/vllm/model_executor/models/glm_ocr_mtp.py
new file mode 100644
index 000000000000..34e602bb6695
--- /dev/null
+++ b/vllm/model_executor/models/glm_ocr_mtp.py
@@ -0,0 +1,285 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2026 The ZhipuAI Team.
+# Copyright 2026 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GLM-OCR MTP model compatible with HuggingFace weights.""" + +from collections.abc import Iterable + +import torch +import torch.nn as nn + +from vllm.config import VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors + +from .glm4 import Glm4DecoderLayer, get_spec_layer_idx_from_weight_name +from .glm4_moe_lite_mtp import ( + Glm4MoeLiteMultiTokenPredictor, + SharedHead, +) +from .interfaces import SupportsPP +from .utils import ( + is_pp_missing_parameter, + maybe_prefix, +) + + +class GlmOcrMultiTokenPredictorLayer(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + + config = vllm_config.speculative_config.draft_model_config.hf_config.text_config + self.config = config + quant_config = vllm_config.quant_config + + self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False) + + self.device = current_platform.device_type + self.shared_head = SharedHead( + config=config, prefix=prefix, quant_config=quant_config + ) + self.mtp_block = Glm4DecoderLayer( + vllm_config=vllm_config, prefix=prefix, config=self.config + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: torch.Tensor | None = None, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds[positions[0] == 0] = 0 + + inputs_embeds = self.enorm(inputs_embeds) + previous_hidden_states = self.hnorm(previous_hidden_states) + + hidden_states = self.eh_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1) + ) + + hidden_states, residual = self.mtp_block( + positions=positions, hidden_states=hidden_states, residual=None + ) + hidden_states = residual + hidden_states + return hidden_states + + +class GlmOcrMultiTokenPredictor(Glm4MoeLiteMultiTokenPredictor): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + config = vllm_config.model_config.hf_config.text_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + self.layers = torch.nn.ModuleDict( + { + str(idx): GlmOcrMultiTokenPredictorLayer( + vllm_config=vllm_config, + prefix=f"{prefix}.layers.{idx}", + ) + for idx in range( + self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers, + ) + } + ) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + + +class GlmOcrMTP(nn.Module, SupportsPP): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config.text_config + quant_config = vllm_config.quant_config + self.quant_config = quant_config + self.model = GlmOcrMultiTokenPredictor( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + self.expert_weights = [] + 
+        self.layers = torch.nn.ModuleDict(
+            {
+                str(idx): GlmOcrMultiTokenPredictorLayer(
+                    vllm_config=vllm_config,
+                    prefix=f"{prefix}.layers.{idx}",
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+
+class GlmOcrMTP(nn.Module, SupportsPP):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.config = vllm_config.model_config.hf_config.text_config
+        quant_config = vllm_config.quant_config
+        self.quant_config = quant_config
+        self.model = GlmOcrMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+        self.expert_weights = []
+
+        self.num_layers = self.config.num_nextn_predict_layers
+        for layer in self.model.layers.values():
+            assert isinstance(layer, GlmOcrMultiTokenPredictorLayer)
+            layer = layer.mtp_block
+            assert isinstance(layer, Glm4DecoderLayer)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor | None:
+        return self.model.compute_logits(hidden_states, spec_step_idx)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
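+            # lm_head and embed_tokens sit at the checkpoint's top level;
+            # route them to the MTP layer's shared head and the shared
+            # embedding before the generic name rewriting below.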
+            if name == "lm_head.weight":
+                spec_layer = self.model.mtp_start_layer_idx
+                name = f"model.layers.{spec_layer}.shared_head.head.weight"
+            elif name == "model.embed_tokens.weight":
+                spec_layer = self.model.mtp_start_layer_idx
+            else:
+                spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+                if spec_layer is None:
+                    continue
+
+            name = self._rewrite_spec_layer_name(spec_layer, name)
+
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            if "scale" in name or "zero_point" in name:
+                # Remapping the name of FP8 kv-scale or zero point.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Some checkpoints include weight scale tensors for the
+                # LM head even when the quantized head isn't built. Skip
+                # them if the model does not expose a matching parameter
+                # to avoid KeyError during load.
+                if name.endswith(".weight_scale") and name not in params_dict:
+                    continue
+
+                # According to the DeepSeek-V3 technical report, MTP modules
+                # share the embedding layer, so these weights are loaded only
+                # once, for the first MTP layer.
+                if (
+                    spec_layer != self.model.mtp_start_layer_idx
+                    and ".layers" not in name
+                ):
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
+        """
+        Rewrite the weight name to match the format of the original model.
+        Add .mtp_block for modules in the transformer layer block of the spec
+        layer, and rename shared layer weights to be top level.
+        """
+        name = name.replace("model.language_model.layers", "model.layers")
+
+        spec_layer_weight_names = [
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
+        ]
+        shared_weight_names = ["embed_tokens"]
+        spec_layer_weight = False
+        shared_weight = False
+        for weight_name in spec_layer_weight_names:
+            if weight_name in name:
+                spec_layer_weight = True
+                if weight_name in shared_weight_names:
+                    shared_weight = True
+                break
+        if not spec_layer_weight:
+            # Treat the remaining weights as weights of the transformer layer
+            # block.
+            name = name.replace(
+                f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
+            )
+        elif shared_weight:
+            # Treat shared weights as top-level weights.
+            name = name.replace(f"model.layers.{spec_layer}.", "model.")
+        return name
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 25b6e40251ef..cd029717ea04 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -319,8 +319,9 @@
     ),
     "GlmAsrForConditionalGeneration": ("glmasr", "GlmAsrForConditionalGeneration"),
     "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
-    "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),  # noqa: E501
-    "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),  # noqa: E501
+    "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),
+    "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),
+    "GlmOcrForConditionalGeneration": ("glm_ocr", "GlmOcrForConditionalGeneration"),  # noqa: E501
     "GraniteSpeechForConditionalGeneration": (
         "granite_speech",
         "GraniteSpeechForConditionalGeneration",
@@ -472,6 +473,7 @@
     "LongCatFlashMTPModel": ("longcat_flash_mtp", "LongCatFlashMTP"),
     "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
     "Glm4MoeLiteMTPModel": ("glm4_moe_lite_mtp", "Glm4MoeLiteMTP"),
+    "GlmOcrMTPModel": ("glm_ocr_mtp", "GlmOcrMTP"),
     "MedusaModel": ("medusa", "Medusa"),
     "OpenPanguMTPModel": ("openpangu_mtp", "OpenPanguMTP"),
     "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"),
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index d569c99ca214..caeca66d0d30 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -398,6 +398,7 @@ def get_num_hidden_layers(self) -> int:
     "qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor,
     "mimo_mtp": MimoMTPModelArchConfigConvertor,
     "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor,
+    "glm_ocr_mtp": GLM4MoeMTPModelArchConfigConvertor,
     "ernie_mtp": ErnieMTPModelArchConfigConvertor,
     "pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor,
     "longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 4dff3fe706cc..6f0cae02340a 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -403,7 +403,7 @@ def propose(
             return draft_token_ids.view(-1, 1)
 
         if self.uses_mrope:
-            positions = self.positions[:, last_token_indices]
+            positions = self.mrope_positions[:, last_token_indices]
         else:
            positions = self.positions[last_token_indices]
         if self.method in (
@@ -1126,6 +1126,7 @@ def load_model(self, target_model: nn.Module) -> None:
             "Qwen2_5_VLForConditionalGeneration",
             "Qwen3VLForConditionalGeneration",
             "Qwen3VLMoeForConditionalGeneration",
+            "GlmOcrForConditionalGeneration",
         ]:
             self.model.config.image_token_index = target_model.config.image_token_id
         elif self.get_model_name(target_model) == "PixtralForConditionalGeneration":
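
Usage sketch: end-to-end offline OCR inference against the new model entry.
This is a minimal example, assuming the `zai-org/GLM-OCR` checkpoint id, the
prompt tokens from `run_glm_ocr`, and the "Text Recognition:" query used in
the ViT backend test above; the image path is a placeholder.

    from PIL import Image

    from vllm import LLM, SamplingParams

    # Engine settings mirror the run_glm_ocr example above.
    llm = LLM(model="zai-org/GLM-OCR", max_model_len=4096, max_num_seqs=2)

    # Single-image prompt using GLM's [gMASK] template and image placeholders.
    prompt = (
        "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
        "<|begin_of_image|><|image|><|end_of_image|>"
        "Text Recognition:<|assistant|>\n"
    )

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": Image.open("page.png")},
        },
        SamplingParams(temperature=0.0, max_tokens=256),
    )
    print(outputs[0].outputs[0].text)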