diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py index 4bd3e91480c0..3d7445e44931 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py @@ -508,7 +508,6 @@ def encode_plus( **kwargs, ) - # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._batch_encode_plus with LayoutLMv2->LayoutLMv3 def _batch_encode_plus( self, batch_text_or_text_pairs: Union[ @@ -640,6 +639,7 @@ def _batch_encode_plus( else: original_index = batch_index labels_example = [] + previous_token_empty = False for id, offset, word_id in zip( sanitized_tokens["input_ids"][batch_index], sanitized_tokens["offset_mapping"][batch_index], @@ -647,11 +647,15 @@ def _batch_encode_plus( ): if word_id is not None: if self.only_label_first_subword: - if offset[0] == 0: + if offset[0] == 0 and not previous_token_empty: # Use the real label id for the first token of the word, and padding ids for the remaining tokens labels_example.append(word_labels[original_index][word_id]) else: labels_example.append(self.pad_token_label) + if offset == (0, 0): + previous_token_empty = True + else: + previous_token_empty = False else: labels_example.append(word_labels[original_index][word_id]) else: diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 884f87680353..63d86f280cc0 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -2277,14 +2277,14 @@ def test_compare_prepare_for_model(self): @slow def test_only_label_first_subword(self): - words = ["hello", "niels"] + words = ["hello", "niels", "0000000000000000"] boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] - word_labels = [0, 1] + word_labels = [0, 1, 2] # test slow tokenizer tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False) encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100]) + self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100]) tokenizer_p = LayoutLMv3Tokenizer.from_pretrained( "microsoft/layoutlmv3-base", @@ -2292,12 +2292,12 @@ def test_only_label_first_subword(self): add_visual_labels=False, ) encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100]) + self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100]) # test fast tokenizer tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False) encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100]) + self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100]) tokenizer_r = LayoutLMv3Tokenizer.from_pretrained( "microsoft/layoutlmv3-base", @@ -2305,7 +2305,7 @@ def test_only_label_first_subword(self): add_visual_labels=False, ) encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100]) + self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100]) @slow def test_layoutlmv3_integration_test(self):