huggingface · sgugger · Apr 3, 2023 · Feb 19, 2023 · NielsRogge · Apr 3, 2023
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -508,7 +508,6 @@ def encode_plus(
             **kwargs,
         )
 
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._batch_encode_plus with LayoutLMv2->LayoutLMv3
     def _batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[
@@ -640,18 +639,23 @@ def _batch_encode_plus(
                 else:
                     original_index = batch_index
                 labels_example = []
+                previous_token_empty = False
                 for id, offset, word_id in zip(
                     sanitized_tokens["input_ids"][batch_index],
                     sanitized_tokens["offset_mapping"][batch_index],
                     sanitized_encodings[batch_index].word_ids,
                 ):
                     if word_id is not None:
                         if self.only_label_first_subword:
-                            if offset[0] == 0:
+                            if offset[0] == 0 and not previous_token_empty:
                                 # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                                 labels_example.append(word_labels[original_index][word_id])
                             else:
                                 labels_example.append(self.pad_token_label)
+                            if self.decode(id) == "":
+                                previous_token_empty = True
+                            else:
+                                previous_token_empty = False
                         else:
                             labels_example.append(word_labels[original_index][word_id])
                     else:

diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -2277,35 +2277,35 @@ def test_compare_prepare_for_model(self):
 
     @slow
     def test_only_label_first_subword(self):
-        words = ["hello", "niels"]
+        words = ["hello", "niels", "0000000000000000"]
         boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
-        word_labels = [0, 1]
+        word_labels = [0, 1, 2]
 
         # test slow tokenizer
         tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
         encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
 
         tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
             "microsoft/layoutlmv3-base",
             only_label_first_subword=False,
             add_visual_labels=False,
         )
         encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
 
         # test fast tokenizer
         tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
 
-        tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
+        tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained(
             "microsoft/layoutlmv3-base",
             only_label_first_subword=False,
             add_visual_labels=False,
         )
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
 
     @slow
     def test_layoutlmv3_integration_test(self):