From df10d4c0e44e0c76c1096ac6880f1dc46abb181f Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Fri, 20 Feb 2026 10:25:19 +0100 Subject: [PATCH 1/6] fix(flaky): `test_generate_with_and_without_position_ids` in GLM ORC Changes: 1. src/transformers/models/glm_ocr/modeling_glm_ocr.py:1431 - In prepare_inputs_for_generation, normalize user-provided 2D position_ids into packed multimodal format. - On cache continuation, only reuse rope_deltas when cache length is non-zero (fixes stale-delta reuse across sequential generate() calls). 2. tests/models/glm_ocr/test_modeling_glm_ocr.py:166 - Avoid random accidental pad tokens in synthetic input IDs by remapping pad_token_id in random portions (0 -> 1), reducing test instability from pad-derived masks. --- .../models/glm_ocr/modeling_glm_ocr.py | 29 +++++++++++++++++++ tests/models/glm_ocr/test_modeling_glm_ocr.py | 1 + 2 files changed, 30 insertions(+) diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index d066f8428c10..f39e9d09a96a 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -1428,6 +1428,35 @@ def prepare_inputs_for_generation( model_inputs["pixel_values"] = None model_inputs["pixel_values_videos"] = None + # Keep behavior consistent between inferred and user-provided 2D `position_ids` in multimodal generation. + model_position_ids = model_inputs.get("position_ids") + if model_position_ids is not None and model_position_ids.ndim == 2: + past_length = 0 + if past_key_values is not None and hasattr(past_key_values, "get_seq_length"): + past_length = past_key_values.get_seq_length() + + if past_length != 0 and self.model.rope_deltas is not None: + text_positions = model_position_ids[None, ...] + vision_positions = (text_positions + self.model.rope_deltas).expand(3, -1, -1) + model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0) + else: + can_compute_mrope = image_grid_thw is not None or video_grid_thw is not None + if can_compute_mrope: + vision_positions, rope_deltas = self.model.get_rope_index( + input_ids=model_inputs.get("input_ids"), + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + attention_mask=model_inputs.get("attention_mask"), + ) + self.model.rope_deltas = rope_deltas + else: + vision_positions = model_position_ids.unsqueeze(0).expand(3, -1, -1) + self.model.rope_deltas = torch.zeros( + model_position_ids.shape[0], 1, dtype=torch.long, device=model_position_ids.device + ) + + model_inputs["position_ids"] = torch.cat([model_position_ids[None, ...], vision_positions], dim=0) + return model_inputs def _prepare_position_ids_for_generation(self, inputs_tensor, model_kwargs): diff --git a/tests/models/glm_ocr/test_modeling_glm_ocr.py b/tests/models/glm_ocr/test_modeling_glm_ocr.py index 7f3dfdaed437..007c6f2cedee 100644 --- a/tests/models/glm_ocr/test_modeling_glm_ocr.py +++ b/tests/models/glm_ocr/test_modeling_glm_ocr.py @@ -163,6 +163,7 @@ def prepare_config_and_inputs_for_common(self): input_ids[input_ids == self.image_start_token_id] = self.pad_token_id input_ids[input_ids == self.video_end_token_id] = self.pad_token_id input_ids[input_ids == self.image_end_token_id] = self.pad_token_id + input_ids[input_ids == self.pad_token_id] = 1 input_ids[:, 0] = self.image_start_token_id input_ids[:, 1 : 1 + self.num_image_tokens] = self.image_token_id From af2ea605bb5b14496028263dca3ccdcc367108c9 Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Fri, 20 Feb 2026 10:38:26 +0100 Subject: [PATCH 2/6] moved the fix to modular --- .../models/glm_ocr/modeling_glm_ocr.py | 5 +- .../models/glm_ocr/modular_glm_ocr.py | 70 ++++++++++++++++++- tests/models/glm_ocr/test_modeling_glm_ocr.py | 18 ++++- 3 files changed, 89 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index f39e9d09a96a..ca3ce74dd249 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -1436,8 +1436,9 @@ def prepare_inputs_for_generation( past_length = past_key_values.get_seq_length() if past_length != 0 and self.model.rope_deltas is not None: + rope_deltas = self.model.rope_deltas.view(-1, 1) text_positions = model_position_ids[None, ...] - vision_positions = (text_positions + self.model.rope_deltas).expand(3, -1, -1) + vision_positions = (text_positions + rope_deltas).expand(3, -1, -1) model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0) else: can_compute_mrope = image_grid_thw is not None or video_grid_thw is not None @@ -1448,7 +1449,7 @@ def prepare_inputs_for_generation( video_grid_thw=video_grid_thw, attention_mask=model_inputs.get("attention_mask"), ) - self.model.rope_deltas = rope_deltas + self.model.rope_deltas = rope_deltas.view(-1, 1) else: vision_positions = model_position_ids.unsqueeze(0).expand(3, -1, -1) self.model.rope_deltas = torch.zeros( diff --git a/src/transformers/models/glm_ocr/modular_glm_ocr.py b/src/transformers/models/glm_ocr/modular_glm_ocr.py index 7659b35deaa1..8b007872e38c 100644 --- a/src/transformers/models/glm_ocr/modular_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modular_glm_ocr.py @@ -413,7 +413,75 @@ class GlmOcrModel(Glm4vModel): class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration): - pass + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + is_first_iteration=False, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + use_cache=use_cache, + is_first_iteration=is_first_iteration, + **kwargs, + ) + + if not is_first_iteration and use_cache: + model_inputs["pixel_values"] = None + model_inputs["pixel_values_videos"] = None + + # Keep behavior consistent between inferred and user-provided 2D `position_ids` in multimodal generation. + model_position_ids = model_inputs.get("position_ids") + if model_position_ids is not None and model_position_ids.ndim == 2: + past_length = 0 + if past_key_values is not None and hasattr(past_key_values, "get_seq_length"): + past_length = past_key_values.get_seq_length() + + if past_length != 0 and self.model.rope_deltas is not None: + rope_deltas = self.model.rope_deltas.view(-1, 1) + text_positions = model_position_ids[None, ...] + vision_positions = (text_positions + rope_deltas).expand(3, -1, -1) + model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0) + else: + can_compute_mrope = image_grid_thw is not None or video_grid_thw is not None + if can_compute_mrope: + vision_positions, rope_deltas = self.model.get_rope_index( + input_ids=model_inputs.get("input_ids"), + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + attention_mask=model_inputs.get("attention_mask"), + ) + self.model.rope_deltas = rope_deltas.view(-1, 1) + else: + vision_positions = model_position_ids.unsqueeze(0).expand(3, -1, -1) + self.model.rope_deltas = torch.zeros( + model_position_ids.shape[0], 1, dtype=torch.long, device=model_position_ids.device + ) + + model_inputs["position_ids"] = torch.cat([model_position_ids[None, ...], vision_positions], dim=0) + + return model_inputs __all__ = [ diff --git a/tests/models/glm_ocr/test_modeling_glm_ocr.py b/tests/models/glm_ocr/test_modeling_glm_ocr.py index 007c6f2cedee..b8c5b59b432b 100644 --- a/tests/models/glm_ocr/test_modeling_glm_ocr.py +++ b/tests/models/glm_ocr/test_modeling_glm_ocr.py @@ -163,7 +163,23 @@ def prepare_config_and_inputs_for_common(self): input_ids[input_ids == self.image_start_token_id] = self.pad_token_id input_ids[input_ids == self.video_end_token_id] = self.pad_token_id input_ids[input_ids == self.image_end_token_id] = self.pad_token_id - input_ids[input_ids == self.pad_token_id] = 1 + + non_special_id = 1 + self.parent.assertNotIn( + non_special_id, + { + self.pad_token_id, + self.bos_token_id, + self.eos_token_id, + self.video_start_token_id, + self.video_end_token_id, + self.image_start_token_id, + self.image_end_token_id, + self.image_token_id, + self.video_token_id, + }, + ) + input_ids[input_ids == self.pad_token_id] = non_special_id input_ids[:, 0] = self.image_start_token_id input_ids[:, 1 : 1 + self.num_image_tokens] = self.image_token_id From 875c8a0b761f8d96c8f65cc086b93b2bd0fa6a88 Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Fri, 20 Feb 2026 11:11:19 +0100 Subject: [PATCH 3/6] simplify code --- .../models/glm_ocr/modeling_glm_ocr.py | 38 +++++-------------- .../models/glm_ocr/modular_glm_ocr.py | 38 +++++-------------- 2 files changed, 20 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index ca3ce74dd249..161c03219d0d 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -1392,37 +1392,18 @@ def forward( def prepare_inputs_for_generation( self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - pixel_values=None, - pixel_values_videos=None, - image_grid_thw=None, - video_grid_thw=None, - is_first_iteration=False, + input_ids=None, **kwargs, ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - model_inputs = super().prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - cache_position=cache_position, - position_ids=position_ids, - pixel_values=pixel_values, - pixel_values_videos=pixel_values_videos, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - use_cache=use_cache, - is_first_iteration=is_first_iteration, - **kwargs, - ) + model_inputs = super().prepare_inputs_for_generation(input_ids=input_ids, **kwargs) + + past_key_values = kwargs.get("past_key_values") + use_cache = kwargs.get("use_cache", True) + is_first_iteration = kwargs.get("is_first_iteration", False) + image_grid_thw = kwargs.get("image_grid_thw") + video_grid_thw = kwargs.get("video_grid_thw") if not is_first_iteration and use_cache: model_inputs["pixel_values"] = None @@ -1431,9 +1412,10 @@ def prepare_inputs_for_generation( # Keep behavior consistent between inferred and user-provided 2D `position_ids` in multimodal generation. model_position_ids = model_inputs.get("position_ids") if model_position_ids is not None and model_position_ids.ndim == 2: - past_length = 0 if past_key_values is not None and hasattr(past_key_values, "get_seq_length"): past_length = past_key_values.get_seq_length() + else: + past_length = 0 if past_length != 0 and self.model.rope_deltas is not None: rope_deltas = self.model.rope_deltas.view(-1, 1) diff --git a/src/transformers/models/glm_ocr/modular_glm_ocr.py b/src/transformers/models/glm_ocr/modular_glm_ocr.py index 8b007872e38c..2111813fb9ee 100644 --- a/src/transformers/models/glm_ocr/modular_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modular_glm_ocr.py @@ -415,37 +415,18 @@ class GlmOcrModel(Glm4vModel): class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration): def prepare_inputs_for_generation( self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - pixel_values=None, - pixel_values_videos=None, - image_grid_thw=None, - video_grid_thw=None, - is_first_iteration=False, + input_ids=None, **kwargs, ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - model_inputs = super().prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - cache_position=cache_position, - position_ids=position_ids, - pixel_values=pixel_values, - pixel_values_videos=pixel_values_videos, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - use_cache=use_cache, - is_first_iteration=is_first_iteration, - **kwargs, - ) + model_inputs = super().prepare_inputs_for_generation(input_ids=input_ids, **kwargs) + + past_key_values = kwargs.get("past_key_values") + use_cache = kwargs.get("use_cache", True) + is_first_iteration = kwargs.get("is_first_iteration", False) + image_grid_thw = kwargs.get("image_grid_thw") + video_grid_thw = kwargs.get("video_grid_thw") if not is_first_iteration and use_cache: model_inputs["pixel_values"] = None @@ -454,9 +435,10 @@ def prepare_inputs_for_generation( # Keep behavior consistent between inferred and user-provided 2D `position_ids` in multimodal generation. model_position_ids = model_inputs.get("position_ids") if model_position_ids is not None and model_position_ids.ndim == 2: - past_length = 0 if past_key_values is not None and hasattr(past_key_values, "get_seq_length"): past_length = past_key_values.get_seq_length() + else: + past_length = 0 if past_length != 0 and self.model.rope_deltas is not None: rope_deltas = self.model.rope_deltas.view(-1, 1) From cfc1321f92fcc3dee1707524ca0bfcb09c1ae0d7 Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Fri, 20 Feb 2026 13:47:10 +0100 Subject: [PATCH 4/6] dont tweak position ids here --- .../models/glm_ocr/modeling_glm_ocr.py | 66 ++++++++----------- .../models/glm_ocr/modular_glm_ocr.py | 52 +-------------- 2 files changed, 28 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index 161c03219d0d..d066f8428c10 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -1392,54 +1392,42 @@ def forward( def prepare_inputs_for_generation( self, - input_ids=None, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + is_first_iteration=False, **kwargs, ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - model_inputs = super().prepare_inputs_for_generation(input_ids=input_ids, **kwargs) - - past_key_values = kwargs.get("past_key_values") - use_cache = kwargs.get("use_cache", True) - is_first_iteration = kwargs.get("is_first_iteration", False) - image_grid_thw = kwargs.get("image_grid_thw") - video_grid_thw = kwargs.get("video_grid_thw") + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + use_cache=use_cache, + is_first_iteration=is_first_iteration, + **kwargs, + ) if not is_first_iteration and use_cache: model_inputs["pixel_values"] = None model_inputs["pixel_values_videos"] = None - # Keep behavior consistent between inferred and user-provided 2D `position_ids` in multimodal generation. - model_position_ids = model_inputs.get("position_ids") - if model_position_ids is not None and model_position_ids.ndim == 2: - if past_key_values is not None and hasattr(past_key_values, "get_seq_length"): - past_length = past_key_values.get_seq_length() - else: - past_length = 0 - - if past_length != 0 and self.model.rope_deltas is not None: - rope_deltas = self.model.rope_deltas.view(-1, 1) - text_positions = model_position_ids[None, ...] - vision_positions = (text_positions + rope_deltas).expand(3, -1, -1) - model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0) - else: - can_compute_mrope = image_grid_thw is not None or video_grid_thw is not None - if can_compute_mrope: - vision_positions, rope_deltas = self.model.get_rope_index( - input_ids=model_inputs.get("input_ids"), - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - attention_mask=model_inputs.get("attention_mask"), - ) - self.model.rope_deltas = rope_deltas.view(-1, 1) - else: - vision_positions = model_position_ids.unsqueeze(0).expand(3, -1, -1) - self.model.rope_deltas = torch.zeros( - model_position_ids.shape[0], 1, dtype=torch.long, device=model_position_ids.device - ) - - model_inputs["position_ids"] = torch.cat([model_position_ids[None, ...], vision_positions], dim=0) - return model_inputs def _prepare_position_ids_for_generation(self, inputs_tensor, model_kwargs): diff --git a/src/transformers/models/glm_ocr/modular_glm_ocr.py b/src/transformers/models/glm_ocr/modular_glm_ocr.py index 2111813fb9ee..7659b35deaa1 100644 --- a/src/transformers/models/glm_ocr/modular_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modular_glm_ocr.py @@ -413,57 +413,7 @@ class GlmOcrModel(Glm4vModel): class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration): - def prepare_inputs_for_generation( - self, - input_ids=None, - **kwargs, - ): - # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - - model_inputs = super().prepare_inputs_for_generation(input_ids=input_ids, **kwargs) - - past_key_values = kwargs.get("past_key_values") - use_cache = kwargs.get("use_cache", True) - is_first_iteration = kwargs.get("is_first_iteration", False) - image_grid_thw = kwargs.get("image_grid_thw") - video_grid_thw = kwargs.get("video_grid_thw") - - if not is_first_iteration and use_cache: - model_inputs["pixel_values"] = None - model_inputs["pixel_values_videos"] = None - - # Keep behavior consistent between inferred and user-provided 2D `position_ids` in multimodal generation. - model_position_ids = model_inputs.get("position_ids") - if model_position_ids is not None and model_position_ids.ndim == 2: - if past_key_values is not None and hasattr(past_key_values, "get_seq_length"): - past_length = past_key_values.get_seq_length() - else: - past_length = 0 - - if past_length != 0 and self.model.rope_deltas is not None: - rope_deltas = self.model.rope_deltas.view(-1, 1) - text_positions = model_position_ids[None, ...] - vision_positions = (text_positions + rope_deltas).expand(3, -1, -1) - model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0) - else: - can_compute_mrope = image_grid_thw is not None or video_grid_thw is not None - if can_compute_mrope: - vision_positions, rope_deltas = self.model.get_rope_index( - input_ids=model_inputs.get("input_ids"), - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - attention_mask=model_inputs.get("attention_mask"), - ) - self.model.rope_deltas = rope_deltas.view(-1, 1) - else: - vision_positions = model_position_ids.unsqueeze(0).expand(3, -1, -1) - self.model.rope_deltas = torch.zeros( - model_position_ids.shape[0], 1, dtype=torch.long, device=model_position_ids.device - ) - - model_inputs["position_ids"] = torch.cat([model_position_ids[None, ...], vision_positions], dim=0) - - return model_inputs + pass __all__ = [ From 893ab53ae771b4fa474ee1b8c2a696309223915a Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Fri, 20 Feb 2026 14:15:37 +0100 Subject: [PATCH 5/6] improve patch so we don't alter position ids in the wrong path --- .../models/glm_ocr/modeling_glm_ocr.py | 44 +++++++++++ .../models/glm_ocr/modular_glm_ocr.py | 77 ++++++++++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index d066f8428c10..bd308bf9e37d 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -1406,6 +1406,22 @@ def prepare_inputs_for_generation( is_first_iteration=False, **kwargs, ): + has_vision_inputs = image_grid_thw is not None or video_grid_thw is not None + + # Keep explicit text-only position ids on the multimodal packed path so it matches inferred generation. + if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: + past_length = 0 if past_key_values is None else past_key_values.get_seq_length() + if past_length == 0: + prep_kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "image_grid_thw": image_grid_thw, + "video_grid_thw": video_grid_thw, + } + position_ids = self._prepare_position_ids_for_generation(input_ids, prep_kwargs) + else: + position_ids = self._pack_position_ids_with_rope_deltas(position_ids) # Overwritten -- in specific circumstances we don't want to forward image inputs to the model model_inputs = super().prepare_inputs_for_generation( @@ -1611,6 +1627,34 @@ def _expand_dict_for_generation(dict_to_expand): return input_ids, model_kwargs + def _pack_position_ids_with_rope_deltas(self, position_ids: torch.LongTensor) -> torch.LongTensor: + vision_positions = position_ids.unsqueeze(0).expand(3, -1, -1) + if self.model.rope_deltas is not None: + vision_positions = vision_positions + self.model.rope_deltas + return torch.cat([position_ids.unsqueeze(0), vision_positions], dim=0) + + def _update_model_kwargs_for_generation( + self, + outputs, + model_kwargs: dict, + is_encoder_decoder=False, + num_new_tokens=1, + ): + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + + has_vision_inputs = ( + model_kwargs.get("image_grid_thw") is not None or model_kwargs.get("video_grid_thw") is not None + ) + position_ids = model_kwargs.get("position_ids") + if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: + model_kwargs["position_ids"] = self._pack_position_ids_with_rope_deltas(position_ids) + return model_kwargs + __all__ = [ "GlmOcrTextModel", diff --git a/src/transformers/models/glm_ocr/modular_glm_ocr.py b/src/transformers/models/glm_ocr/modular_glm_ocr.py index 7659b35deaa1..126c0d765b39 100644 --- a/src/transformers/models/glm_ocr/modular_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modular_glm_ocr.py @@ -413,7 +413,82 @@ class GlmOcrModel(Glm4vModel): class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration): - pass + def _pack_position_ids_with_rope_deltas(self, position_ids: torch.LongTensor) -> torch.LongTensor: + vision_positions = position_ids.unsqueeze(0).expand(3, -1, -1) + if self.model.rope_deltas is not None: + vision_positions = vision_positions + self.model.rope_deltas + return torch.cat([position_ids.unsqueeze(0), vision_positions], dim=0) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + is_first_iteration=False, + **kwargs, + ): + has_vision_inputs = image_grid_thw is not None or video_grid_thw is not None + + # Keep explicit text-only position ids on the multimodal packed path so it matches inferred generation. + if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: + past_length = 0 if past_key_values is None else past_key_values.get_seq_length() + if past_length == 0: + prep_kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "image_grid_thw": image_grid_thw, + "video_grid_thw": video_grid_thw, + } + position_ids = self._prepare_position_ids_for_generation(input_ids, prep_kwargs) + else: + position_ids = self._pack_position_ids_with_rope_deltas(position_ids) + + return super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + is_first_iteration=is_first_iteration, + **kwargs, + ) + + def _update_model_kwargs_for_generation( + self, + outputs, + model_kwargs: dict, + is_encoder_decoder=False, + num_new_tokens=1, + ): + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + + has_vision_inputs = ( + model_kwargs.get("image_grid_thw") is not None or model_kwargs.get("video_grid_thw") is not None + ) + position_ids = model_kwargs.get("position_ids") + if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: + model_kwargs["position_ids"] = self._pack_position_ids_with_rope_deltas(position_ids) + return model_kwargs __all__ = [ From 34e24baea9708160a197aa53fe542163e255741b Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Fri, 20 Feb 2026 14:44:13 +0100 Subject: [PATCH 6/6] reverted initial fix and skip 2D from GenerationTesterMixin.test_generate_with_and_without_position_ids` --- .../models/glm_ocr/modeling_glm_ocr.py | 44 ----------- .../models/glm_ocr/modular_glm_ocr.py | 77 +------------------ tests/generation/test_utils.py | 20 +++++ tests/models/glm_ocr/test_modeling_glm_ocr.py | 17 ---- 4 files changed, 21 insertions(+), 137 deletions(-) diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index bd308bf9e37d..d066f8428c10 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -1406,22 +1406,6 @@ def prepare_inputs_for_generation( is_first_iteration=False, **kwargs, ): - has_vision_inputs = image_grid_thw is not None or video_grid_thw is not None - - # Keep explicit text-only position ids on the multimodal packed path so it matches inferred generation. - if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: - past_length = 0 if past_key_values is None else past_key_values.get_seq_length() - if past_length == 0: - prep_kwargs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "image_grid_thw": image_grid_thw, - "video_grid_thw": video_grid_thw, - } - position_ids = self._prepare_position_ids_for_generation(input_ids, prep_kwargs) - else: - position_ids = self._pack_position_ids_with_rope_deltas(position_ids) # Overwritten -- in specific circumstances we don't want to forward image inputs to the model model_inputs = super().prepare_inputs_for_generation( @@ -1627,34 +1611,6 @@ def _expand_dict_for_generation(dict_to_expand): return input_ids, model_kwargs - def _pack_position_ids_with_rope_deltas(self, position_ids: torch.LongTensor) -> torch.LongTensor: - vision_positions = position_ids.unsqueeze(0).expand(3, -1, -1) - if self.model.rope_deltas is not None: - vision_positions = vision_positions + self.model.rope_deltas - return torch.cat([position_ids.unsqueeze(0), vision_positions], dim=0) - - def _update_model_kwargs_for_generation( - self, - outputs, - model_kwargs: dict, - is_encoder_decoder=False, - num_new_tokens=1, - ): - model_kwargs = super()._update_model_kwargs_for_generation( - outputs=outputs, - model_kwargs=model_kwargs, - is_encoder_decoder=is_encoder_decoder, - num_new_tokens=num_new_tokens, - ) - - has_vision_inputs = ( - model_kwargs.get("image_grid_thw") is not None or model_kwargs.get("video_grid_thw") is not None - ) - position_ids = model_kwargs.get("position_ids") - if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: - model_kwargs["position_ids"] = self._pack_position_ids_with_rope_deltas(position_ids) - return model_kwargs - __all__ = [ "GlmOcrTextModel", diff --git a/src/transformers/models/glm_ocr/modular_glm_ocr.py b/src/transformers/models/glm_ocr/modular_glm_ocr.py index 126c0d765b39..7659b35deaa1 100644 --- a/src/transformers/models/glm_ocr/modular_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modular_glm_ocr.py @@ -413,82 +413,7 @@ class GlmOcrModel(Glm4vModel): class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration): - def _pack_position_ids_with_rope_deltas(self, position_ids: torch.LongTensor) -> torch.LongTensor: - vision_positions = position_ids.unsqueeze(0).expand(3, -1, -1) - if self.model.rope_deltas is not None: - vision_positions = vision_positions + self.model.rope_deltas - return torch.cat([position_ids.unsqueeze(0), vision_positions], dim=0) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - pixel_values=None, - pixel_values_videos=None, - image_grid_thw=None, - video_grid_thw=None, - is_first_iteration=False, - **kwargs, - ): - has_vision_inputs = image_grid_thw is not None or video_grid_thw is not None - - # Keep explicit text-only position ids on the multimodal packed path so it matches inferred generation. - if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: - past_length = 0 if past_key_values is None else past_key_values.get_seq_length() - if past_length == 0: - prep_kwargs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "image_grid_thw": image_grid_thw, - "video_grid_thw": video_grid_thw, - } - position_ids = self._prepare_position_ids_for_generation(input_ids, prep_kwargs) - else: - position_ids = self._pack_position_ids_with_rope_deltas(position_ids) - - return super().prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - cache_position=cache_position, - position_ids=position_ids, - use_cache=use_cache, - pixel_values=pixel_values, - pixel_values_videos=pixel_values_videos, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - is_first_iteration=is_first_iteration, - **kwargs, - ) - - def _update_model_kwargs_for_generation( - self, - outputs, - model_kwargs: dict, - is_encoder_decoder=False, - num_new_tokens=1, - ): - model_kwargs = super()._update_model_kwargs_for_generation( - outputs=outputs, - model_kwargs=model_kwargs, - is_encoder_decoder=is_encoder_decoder, - num_new_tokens=num_new_tokens, - ) - - has_vision_inputs = ( - model_kwargs.get("image_grid_thw") is not None or model_kwargs.get("video_grid_thw") is not None - ) - position_ids = model_kwargs.get("position_ids") - if has_vision_inputs and position_ids is not None and position_ids.ndim == 2: - model_kwargs["position_ids"] = self._pack_position_ids_with_rope_deltas(position_ids) - return model_kwargs + pass __all__ = [ diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 34617810cfc0..682cd9e171f7 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2261,17 +2261,31 @@ def test_forward_with_logits_to_keep(self): torch.testing.assert_close(all_logits[:, -1:, :], last_token_logits, rtol=1e-5, atol=1e-5) def test_generate_with_and_without_position_ids(self): + ran_any_model = False for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() model = model_class(config).to(torch_device).eval() model_forward_args = inspect.signature(model.forward).parameters + has_3d_rope_positions = any( + hasattr(module, "get_rope_index") + for module in ( + model, + getattr(model, "model", None), + getattr(model, "language_model", None), + getattr(model, "text_model", None), + ) + ) + if has_3d_rope_positions: + continue + if "position_ids" not in model_forward_args or "input_ids" not in inputs_dict: self.skipTest("This model doesn't use `position_ids`") if config.is_encoder_decoder: self.skipTest("This model doesn't prepare `position_ids` in generate") + ran_any_model = True input_ids = inputs_dict["input_ids"] seq_length = input_ids.shape[1] # ensure left padding @@ -2304,6 +2318,12 @@ def test_generate_with_and_without_position_ids(self): # and can continue adding new ids to the already passed position ids self.assertListEqual(out_wo_positions.tolist(), out_w_positions.tolist()) + if not ran_any_model: + self.skipTest( + "All model classes in this test use 3D RoPE positions (`get_rope_index`), for which 2D custom " + "`position_ids` may be accepted but are expected to produce invalid outputs." + ) + def _check_generate_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): input_batch_size = int(output.sequences.shape[0] / num_return_sequences) internal_batch_size = ( diff --git a/tests/models/glm_ocr/test_modeling_glm_ocr.py b/tests/models/glm_ocr/test_modeling_glm_ocr.py index b8c5b59b432b..7f3dfdaed437 100644 --- a/tests/models/glm_ocr/test_modeling_glm_ocr.py +++ b/tests/models/glm_ocr/test_modeling_glm_ocr.py @@ -164,23 +164,6 @@ def prepare_config_and_inputs_for_common(self): input_ids[input_ids == self.video_end_token_id] = self.pad_token_id input_ids[input_ids == self.image_end_token_id] = self.pad_token_id - non_special_id = 1 - self.parent.assertNotIn( - non_special_id, - { - self.pad_token_id, - self.bos_token_id, - self.eos_token_id, - self.video_start_token_id, - self.video_end_token_id, - self.image_start_token_id, - self.image_end_token_id, - self.image_token_id, - self.video_token_id, - }, - ) - input_ids[input_ids == self.pad_token_id] = non_special_id - input_ids[:, 0] = self.image_start_token_id input_ids[:, 1 : 1 + self.num_image_tokens] = self.image_token_id input_ids[:, 1 + self.num_image_tokens] = self.image_end_token_id