Skip to content

Commit a98f6a1

Browse files
authored
LayoutXLMProcessor: ensure 1-to-1 mapping between samples and images, and add test for it (#18774)
1 parent 220da3b commit a98f6a1

File tree

2 files changed

+37
-0
lines changed

2 files changed

+37
-0
lines changed

src/transformers/models/layoutxlm/processing_layoutxlm.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ def __call__(
8989
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
9090
)
9191

92+
if return_overflowing_tokens is True and return_offsets_mapping is False:
93+
raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
94+
9295
# first, apply the feature extractor
9396
features = self.feature_extractor(images=images, return_tensors=return_tensors)
9497

tests/models/layoutxlm/test_processor_layoutxlm.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,40 @@ def test_save_load_pretrained_additional_features(self):
126126
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
127127
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
128128

129+
@slow
def test_overflowing_tokens(self):
    # A sample longer than max_length is broken down into multiple sequences.
    # Check that the images still map 1-to-1 onto the resulting input_ids,
    # i.e. that both have the same length.

    from datasets import load_dataset

    # Fixtures: the FUNSD dataset plus a processor created with apply_ocr=False,
    # so the words and boxes passed below come from the dataset itself.
    funsd = load_dataset("nielsr/funsd")
    processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)

    def encode_batch(batch):
        # Open every page image and convert it to RGB before encoding.
        rgb_images = []
        for image_path in batch["image_path"]:
            rgb_images.append(Image.open(image_path).convert("RGB"))

        # Overflowing tokens are requested together with the offsets mapping;
        # sequences are padded/truncated to 512 tokens with a stride of 50.
        return processor(
            rgb_images,
            batch["words"],
            boxes=batch["bboxes"],
            word_labels=batch["ner_tags"],
            max_length=512,
            padding="max_length",
            truncation=True,
            return_overflowing_tokens=True,
            stride=50,
            return_offsets_mapping=True,
            return_tensors="pt",
        )

    encoded_train = encode_batch(funsd["train"])

    # One image entry per encoded sequence.
    self.assertEqual(len(encoded_train["image"]), len(encoded_train["input_ids"]))
162+
129163

130164
# different use cases tests
131165
@require_sentencepiece

0 commit comments

Comments
 (0)