From fff7e08f21cf01e96d5c1a5e8a196a3c66b254a7 Mon Sep 17 00:00:00 2001 From: Christina Date: Mon, 16 Mar 2026 13:02:57 -0500 Subject: [PATCH 1/3] fix(gguf): Consolidate Gemma2/3 GGUF fixes for correctness on Blackwell This PR consolidates four related GGUF bug fixes for Gemma2 and Gemma3 models, plus a style improvement from reviewer feedback. **1. Add quant_config to embedding layer (PR #30424)** Pass quant_config to VocabParallelEmbedding in Gemma2Model so that GGUFEmbeddingMethod is selected instead of UnquantizedEmbeddingMethod. Without this, quantized bytes are read as raw floats producing gibberish. **2. Fix EOS token extraction for HF blob paths (PR #30434)** GGUF files served from HuggingFace Hub use blob paths that don't match the expected filename pattern. Extract EOS token ID directly from GGUF metadata as a reliable fallback. **3. Skip missing parameters during GGUF weight loading (PR #30699)** Gemma2 GGUF files may lack certain weight keys (e.g. embed_tokens.qweight_type). Skip missing parameters gracefully instead of raising KeyError. **4. Use RMSNorm instead of GemmaRMSNorm for GGUF (PR #31464)** GGUF files store RMSNorm weights with +1 baked in (llama.cpp convention). GemmaRMSNorm adds 1 in its forward pass, causing double addition. Select plain RMSNorm at construction time for GGUF models. Applied to all norm layers in Gemma2 and Gemma3 (including q_norm/k_norm). **Style: compact rms_norm_kwargs pattern (reviewer feedback)** Use rms_norm_kwargs dict to avoid repeating constructor arguments, per hmellor's review on PR #31464. Tested on RTX 5090 (Blackwell, SM 120) with gemma2-2b-gguf and gemma3-1b. Supersedes PRs #30424, #30434, #30699, #31464. 
Signed-off-by: Christina --- vllm/model_executor/models/gemma2.py | 31 +++++++++++++-------- vllm/model_executor/models/gemma3.py | 40 ++++++++++++--------------- vllm/tokenizers/hf.py | 26 +++++++++++++++++ vllm/transformers_utils/gguf_utils.py | 40 +++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 425ecc65195a..aee64af5b8dc 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -29,7 +29,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.layernorm import GemmaRMSNorm +from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -214,16 +214,15 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.mlp", ) - self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = GemmaRMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - self.pre_feedforward_layernorm = GemmaRMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - self.post_feedforward_layernorm = GemmaRMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) + # GGUF stores RMSNorm weights with +1 baked in (llama.cpp convention). + # GemmaRMSNorm adds 1 in its forward pass, so use plain RMSNorm for GGUF. 
+ quant_name = quant_config.get_name() if quant_config else None + rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + rms_norm_kwargs = dict(hidden_size=config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = rms_norm_cls(**rms_norm_kwargs) + self.post_attention_layernorm = rms_norm_cls(**rms_norm_kwargs) + self.pre_feedforward_layernorm = rms_norm_cls(**rms_norm_kwargs) + self.post_feedforward_layernorm = rms_norm_cls(**rms_norm_kwargs) def forward( self, @@ -263,6 +262,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, @@ -271,7 +272,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ), prefix=f"{prefix}.layers", ) - self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + quant_name = quant_config.get_name() if quant_config else None + rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + self.norm = rms_norm_cls(config.hidden_size, eps=config.rms_norm_eps) # Normalize the embedding by sqrt(hidden_size) # The normalizer's data type should be downcasted to the model's @@ -361,6 +364,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: continue if is_pp_missing_parameter(name, self): continue + # Skip parameters not in the model (e.g., GGUF quantization + # metadata like qweight_type for embeddings) + if name not in params_dict: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index b2352a3c9268..2c8c936aee2a 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -31,7 +31,7 @@ 
Attention, EncoderOnlyAttention, ) -from vllm.model_executor.layers.layernorm import GemmaRMSNorm +from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -156,8 +156,12 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self.q_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) + # GGUF stores RMSNorm weights with +1 baked in (llama.cpp convention). + # GemmaRMSNorm adds 1 in its forward pass, so use plain RMSNorm for GGUF. + quant_name = quant_config.get_name() if quant_config else None + rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + self.q_norm = rms_norm_cls(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = rms_norm_cls(self.head_dim, eps=config.rms_norm_eps) layer_idx = extract_layer_index(prefix) layer_type = config.layer_types[layer_idx] @@ -261,16 +265,13 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.mlp", ) - self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = GemmaRMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - self.pre_feedforward_layernorm = GemmaRMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - self.post_feedforward_layernorm = GemmaRMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) + quant_name = quant_config.get_name() if quant_config else None + rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + rms_norm_kwargs = dict(hidden_size=config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = rms_norm_cls(**rms_norm_kwargs) + self.post_attention_layernorm = rms_norm_cls(**rms_norm_kwargs) + self.pre_feedforward_layernorm = rms_norm_cls(**rms_norm_kwargs) + self.post_feedforward_layernorm = rms_norm_cls(**rms_norm_kwargs) def forward( self, @@ -322,7 +323,9 @@ def __init__(self, *, vllm_config: VllmConfig, 
prefix: str = ""): ), prefix=f"{prefix}.layers", ) - self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + quant_name = quant_config.get_name() if quant_config else None + rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + self.norm = rms_norm_cls(config.hidden_size, eps=config.rms_norm_eps) # Normalize the embedding by sqrt(hidden_size) # The normalizer's data type should be downcasted to the model's @@ -383,15 +386,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - # Revert +1 during llama.cpp conversion - # see: https://github.com/ggml-org/llama.cpp/blob/be7c3034108473beda214fd1d7c98fd6a7a3bdf5/convert_hf_to_gguf.py#L3397-L3400 - if ( - self.quant_config - and self.quant_config.get_name() == "gguf" - and name.endswith("norm.weight") - ): - loaded_weight -= 1 - if self.quant_config is not None and ( scale_name := self.quant_config.get_cache_scale(name) ): diff --git a/vllm/tokenizers/hf.py b/vllm/tokenizers/hf.py index 85c812398529..6fb55dabfb62 100644 --- a/vllm/tokenizers/hf.py +++ b/vllm/tokenizers/hf.py @@ -7,12 +7,16 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from vllm.logger import init_logger from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config +from vllm.transformers_utils.gguf_utils import extract_eos_token_id_from_gguf from .protocol import TokenizerLike HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast +logger = init_logger(__name__) + def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer: """ @@ -81,6 +85,9 @@ def from_pretrained( download_dir: str | None = None, **kwargs, ) -> HfTokenizer: + # Save gguf_file before AutoTokenizer.from_pretrained() pops it from kwargs + gguf_file = kwargs.get("gguf_file") + try: tokenizer = AutoTokenizer.from_pretrained( 
path_or_repo_id, @@ -122,4 +129,23 @@ def from_pretrained( } tokenizer.add_special_tokens(special_tokens_map) + # Patch EOS token ID from GGUF metadata if available + # GGUF files may have a different EOS token ID than HF tokenizer config + # (e.g., Gemma uses ID 106 as EOS, but HF reports ID 1) + # Note: gguf_file was saved above before + # AutoTokenizer.from_pretrained() popped it + if gguf_file: + gguf_path = Path(path_or_repo_id) / gguf_file + gguf_eos_id = extract_eos_token_id_from_gguf(str(gguf_path)) + if gguf_eos_id is not None: + hf_eos_id = tokenizer.eos_token_id + if hf_eos_id != gguf_eos_id: + logger.info( + "Patching tokenizer eos_token_id from %d to %d " + "(using GGUF metadata)", + hf_eos_id, + gguf_eos_id, + ) + tokenizer.eos_token_id = gguf_eos_id + return get_cached_tokenizer(tokenizer) diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py index 3faa5ee60e9f..321011628678 100644 --- a/vllm/transformers_utils/gguf_utils.py +++ b/vllm/transformers_utils/gguf_utils.py @@ -220,6 +220,46 @@ def extract_vision_config_from_gguf(mmproj_path: str) -> "SiglipVisionConfig | N return config +def extract_eos_token_id_from_gguf(model: str) -> int | None: + """Extract EOS token ID from GGUF metadata. + + GGUF files store the EOS token ID in tokenizer.ggml.eos_token_id field. + This may differ from HuggingFace's tokenizer config (e.g., Gemma models + use token ID 106 as EOS in GGUF, but HF tokenizer reports + token ID 1). + + Args: + model: Path to GGUF model file + + Returns: + EOS token ID from GGUF metadata, or None if not found + """ + # Note: We don't check for .gguf extension here because HuggingFace Hub + # stores GGUF files as blob hashes without extensions. The caller is + # responsible for ensuring this is a valid GGUF file (via check_gguf_file). 
+ try: + model_path = Path(model) + if not model_path.is_file(): + return None + + reader = gguf.GGUFReader(str(model_path)) + + eos_field = reader.get_field(Keys.Tokenizer.EOS_ID) + if eos_field is not None: + eos_token_id = int(eos_field.parts[-1][0]) + logger.debug( + "Extracted eos_token_id=%d from GGUF metadata", + eos_token_id, + ) + return eos_token_id + + return None + + except Exception as e: + logger.debug("Error extracting EOS token ID from GGUF: %s", e) + return None + + def maybe_patch_hf_config_from_gguf( model: str, hf_config: PretrainedConfig, From c7a1a304d86faea7d85f73e6d5cbf067eed06724 Mon Sep 17 00:00:00 2001 From: Christina Date: Wed, 25 Mar 2026 11:50:45 -0500 Subject: [PATCH 2/3] refactor: Extract _get_norm_cls helper to reduce duplication Extract the repeated norm class selection logic into a module-level _get_norm_cls() helper in both gemma2.py and gemma3.py, as requested in review. Signed-off-by: kitaekatt Signed-off-by: Christina --- vllm/model_executor/models/gemma2.py | 19 ++++++++++++------- vllm/model_executor/models/gemma3.py | 22 +++++++++++++--------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index aee64af5b8dc..e1034d26f436 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -58,6 +58,14 @@ logger = init_logger(__name__) +def _get_norm_cls( + quant_config: QuantizationConfig | None, +) -> type[nn.Module]: + """Return RMSNorm for GGUF (weights have +1 baked in), else GemmaRMSNorm.""" + quant_name = quant_config.get_name() if quant_config else None + return RMSNorm if quant_name == "gguf" else GemmaRMSNorm + + class Gemma2MLP(nn.Module): def __init__( self, @@ -214,10 +222,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.mlp", ) - # GGUF stores RMSNorm weights with +1 baked in (llama.cpp convention). - # GemmaRMSNorm adds 1 in its forward pass, so use plain RMSNorm for GGUF. 
- quant_name = quant_config.get_name() if quant_config else None - rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + rms_norm_cls = _get_norm_cls(quant_config) rms_norm_kwargs = dict(hidden_size=config.hidden_size, eps=config.rms_norm_eps) self.input_layernorm = rms_norm_cls(**rms_norm_kwargs) self.post_attention_layernorm = rms_norm_cls(**rms_norm_kwargs) @@ -272,9 +277,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ), prefix=f"{prefix}.layers", ) - quant_name = quant_config.get_name() if quant_config else None - rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm - self.norm = rms_norm_cls(config.hidden_size, eps=config.rms_norm_eps) + self.norm = _get_norm_cls(quant_config)( + config.hidden_size, eps=config.rms_norm_eps + ) # Normalize the embedding by sqrt(hidden_size) # The normalizer's data type should be downcasted to the model's diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 2c8c936aee2a..9ad6b970b826 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -64,6 +64,14 @@ logger = init_logger(__name__) +def _get_norm_cls( + quant_config: QuantizationConfig | None, +) -> type[nn.Module]: + """Return RMSNorm for GGUF (weights have +1 baked in), else GemmaRMSNorm.""" + quant_name = quant_config.get_name() if quant_config else None + return RMSNorm if quant_name == "gguf" else GemmaRMSNorm + + class Gemma3MLP(nn.Module): def __init__( self, @@ -156,10 +164,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - # GGUF stores RMSNorm weights with +1 baked in (llama.cpp convention). - # GemmaRMSNorm adds 1 in its forward pass, so use plain RMSNorm for GGUF. 
- quant_name = quant_config.get_name() if quant_config else None - rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + rms_norm_cls = _get_norm_cls(quant_config) self.q_norm = rms_norm_cls(self.head_dim, eps=config.rms_norm_eps) self.k_norm = rms_norm_cls(self.head_dim, eps=config.rms_norm_eps) @@ -265,8 +270,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.mlp", ) - quant_name = quant_config.get_name() if quant_config else None - rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm + rms_norm_cls = _get_norm_cls(quant_config) rms_norm_kwargs = dict(hidden_size=config.hidden_size, eps=config.rms_norm_eps) self.input_layernorm = rms_norm_cls(**rms_norm_kwargs) self.post_attention_layernorm = rms_norm_cls(**rms_norm_kwargs) @@ -323,9 +327,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ), prefix=f"{prefix}.layers", ) - quant_name = quant_config.get_name() if quant_config else None - rms_norm_cls = RMSNorm if quant_name == "gguf" else GemmaRMSNorm - self.norm = rms_norm_cls(config.hidden_size, eps=config.rms_norm_eps) + self.norm = _get_norm_cls(quant_config)( + config.hidden_size, eps=config.rms_norm_eps + ) # Normalize the embedding by sqrt(hidden_size) # The normalizer's data type should be downcasted to the model's From 86fa0400298680138ce4dcab5af8b410cd0d39b9 Mon Sep 17 00:00:00 2001 From: Christina Date: Sat, 18 Apr 2026 11:58:43 -0500 Subject: [PATCH 3/3] refactor(gguf): Extract maybe_patch_gguf_tokenizer helper Addresses review feedback on #37220 from @Isotr0py: - Move GGUF EOS-token patching out of vllm/tokenizers/hf.py into a dedicated maybe_patch_gguf_tokenizer() helper in gguf_utils.py, mirroring the existing maybe_patch_hf_config_from_gguf naming. - Resolve HuggingFace repo ids (not just local directory paths) via hf_hub_download when the file is not found at a local path. Behavior is unchanged for local-directory callers. 
Signed-off-by: Christina --- vllm/tokenizers/hf.py | 24 +++-------- vllm/transformers_utils/gguf_utils.py | 57 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/vllm/tokenizers/hf.py b/vllm/tokenizers/hf.py index 6fb55dabfb62..7478879b620d 100644 --- a/vllm/tokenizers/hf.py +++ b/vllm/tokenizers/hf.py @@ -9,7 +9,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config -from vllm.transformers_utils.gguf_utils import extract_eos_token_id_from_gguf +from vllm.transformers_utils.gguf_utils import maybe_patch_gguf_tokenizer from .protocol import TokenizerLike @@ -129,23 +129,9 @@ def from_pretrained( } tokenizer.add_special_tokens(special_tokens_map) - # Patch EOS token ID from GGUF metadata if available - # GGUF files may have a different EOS token ID than HF tokenizer config - # (e.g., Gemma uses ID 106 as EOS, but HF reports ID 1) - # Note: gguf_file was saved above before - # AutoTokenizer.from_pretrained() popped it - if gguf_file: - gguf_path = Path(path_or_repo_id) / gguf_file - gguf_eos_id = extract_eos_token_id_from_gguf(str(gguf_path)) - if gguf_eos_id is not None: - hf_eos_id = tokenizer.eos_token_id - if hf_eos_id != gguf_eos_id: - logger.info( - "Patching tokenizer eos_token_id from %d to %d " - "(using GGUF metadata)", - hf_eos_id, - gguf_eos_id, - ) - tokenizer.eos_token_id = gguf_eos_id + # Patch tokenizer EOS from GGUF metadata when applicable + # (gguf_file was saved above before AutoTokenizer.from_pretrained() + # popped it from kwargs). 
+ maybe_patch_gguf_tokenizer(tokenizer, path_or_repo_id, gguf_file) return get_cached_tokenizer(tokenizer) diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py index 621f0b4d2e31..a5d65deb267b 100644 --- a/vllm/transformers_utils/gguf_utils.py +++ b/vllm/transformers_utils/gguf_utils.py @@ -295,6 +295,63 @@ def extract_eos_token_id_from_gguf(model: str) -> int | None: return None +def maybe_patch_gguf_tokenizer( + tokenizer, + path_or_repo_id: str | Path, + gguf_file: str | None, +) -> None: + """Patch ``tokenizer.eos_token_id`` from GGUF metadata when available. + + GGUF files may store a different EOS token ID than the HuggingFace + tokenizer config (e.g., Gemma uses ``<end_of_turn>`` ID 106 as EOS in + GGUF, but HF reports ``<eos>`` ID 1). This helper mutates + ``tokenizer.eos_token_id`` in place to match the GGUF metadata. + + Accepts either a local directory path or a HuggingFace repo id for + ``path_or_repo_id``; remote repo ids resolve to a local file via + ``hf_hub_download``. + + Safe to call unconditionally — this is a no-op when ``gguf_file`` is + falsy, the file cannot be resolved, or the GGUF metadata has no EOS + token id. 
+ """ + if not gguf_file: + return + + candidate = Path(path_or_repo_id) / gguf_file + if candidate.is_file(): + gguf_path = candidate + else: + # Treat path_or_repo_id as a HuggingFace repo id + from huggingface_hub import hf_hub_download + + try: + gguf_path = Path( + hf_hub_download(repo_id=str(path_or_repo_id), filename=gguf_file) + ) + except Exception as e: + logger.debug( + "Could not resolve GGUF file %s from %s: %s", + gguf_file, + path_or_repo_id, + e, + ) + return + + gguf_eos_id = extract_eos_token_id_from_gguf(str(gguf_path)) + if gguf_eos_id is None: + return + + hf_eos_id = tokenizer.eos_token_id + if hf_eos_id != gguf_eos_id: + logger.info( + "Patching tokenizer eos_token_id from %d to %d (using GGUF metadata)", + hf_eos_id, + gguf_eos_id, + ) + tokenizer.eos_token_id = gguf_eos_id + + def maybe_patch_hf_config_from_gguf( model: str, hf_config: PretrainedConfig,