LayoutXLMProcessor: ensure 1-to-1 mapping between samples and images, and add test for it (#18774)

anthony2261 · web-flow · commit a98f6a1da012 · 2022-08-30T14:43:14.000+02:00
diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -89,6 +89,9 @@ def __call__(
                 "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
             )
 
+        if return_overflowing_tokens is True and return_offsets_mapping is False:
+            raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
+
         # first, apply the feature extractor
         features = self.feature_extractor(images=images, return_tensors=return_tensors)
 
diff --git a/tests/models/layoutxlm/test_processor_layoutxlm.py b/tests/models/layoutxlm/test_processor_layoutxlm.py
@@ -126,6 +126,40 @@ def test_save_load_pretrained_additional_features(self):
         self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
         self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
 
+    @slow
+    def test_overflowing_tokens(self):
+        # When tokens overflow (sequences that are too long are split into multiple sequences), test that there is still a 1-to-1 mapping between the images and the input_ids.
+
+        from datasets import load_dataset
+
+        # set up
+        datasets = load_dataset("nielsr/funsd")
+        processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
+
+        def preprocess_data(examples):
+            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
+            words = examples["words"]
+            boxes = examples["bboxes"]
+            word_labels = examples["ner_tags"]
+            encoded_inputs = processor(
+                images,
+                words,
+                boxes=boxes,
+                word_labels=word_labels,
+                max_length=512,
+                padding="max_length",
+                truncation=True,
+                return_overflowing_tokens=True,
+                stride=50,
+                return_offsets_mapping=True,
+                return_tensors="pt",
+            )
+            return encoded_inputs
+
+        train_data = preprocess_data(datasets["train"])
+
+        self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
+
 
 # different use cases tests
 @require_sentencepiece

Original file line number	Diff line number	Diff line change
`@@ -89,6 +89,9 @@ def __call__(`
`89`	`89`	`"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."`
`90`	`90`	`)`
`91`	`91`
	`92`	`+ if return_overflowing_tokens is True and return_offsets_mapping is False:`
	`93`	`+ raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")`
	`94`	`+`
`92`	`95`	`# first, apply the feature extractor`
`93`	`96`	`features = self.feature_extractor(images=images, return_tensors=return_tensors)`
`94`	`97`