From 538453a81dcfd9cb5770dc9a79c7e684a244281b Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Tue, 10 Mar 2026 20:57:07 +0000 Subject: [PATCH] Convert image tokens to BPE tokens before embedding --- src/transformers/models/chameleon/modeling_chameleon.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index fac0ef50a382..98792536d269 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -899,8 +899,10 @@ def get_image_features( pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. """ + batch_size = pixel_values.shape[0] vqmodel_outputs: ChameleonVQVAEModelOutput = self.vqmodel.encode(pixel_values, return_dict=True, **kwargs) - vqmodel_outputs.pooler_output = self.get_input_embeddings()(vqmodel_outputs.image_tokens) + bpe_tokens = self.vocabulary_mapping.convert_img2bpe(vqmodel_outputs.image_tokens).view(batch_size, -1) + vqmodel_outputs.pooler_output = self.get_input_embeddings()(bpe_tokens) return vqmodel_outputs def get_placeholder_mask(