Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,6 @@ def encode_plus(
**kwargs,
)

# Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._batch_encode_plus with LayoutLMv2->LayoutLMv3
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
Expand Down Expand Up @@ -640,18 +639,23 @@ def _batch_encode_plus(
else:
original_index = batch_index
labels_example = []
previous_token_empty = False
for id, offset, word_id in zip(
sanitized_tokens["input_ids"][batch_index],
sanitized_tokens["offset_mapping"][batch_index],
sanitized_encodings[batch_index].word_ids,
):
if word_id is not None:
if self.only_label_first_subword:
if offset[0] == 0:
if offset[0] == 0 and not previous_token_empty:
Comment thread
thibaultdouzon marked this conversation as resolved.
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
labels_example.append(word_labels[original_index][word_id])
else:
labels_example.append(self.pad_token_label)
if offset == (0, 0):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if offset == (0, 0):
if self.decode(id) == "":

I'm not sure offset == (0, 0) is the right way to check this; maybe this is a safer option.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can confirm this works when testing on a new model (UDOP)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I believe it is equivalent. Although it is slower than simply comparing a tuple to (0, 0), it is probably safer and more resilient to future changes.
It works because special tokens are already checked on line 648 with word_id is not None. Thus an offset of (0, 0) cannot belong to a special token at this position in the code.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just checked with LayoutLMv3TokenizerFast: tok.decode(1437) == " " (a single space, not an empty string), where 1437 is the id of the "Ġ" token.

@NielsRogge NielsRogge Apr 3, 2023

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, interesting. When working on UdopTokenizerFast (UDOP is a new model for which I'll open a PR soon), I had to use self.decode(id) == "" because it didn't work with offset == (0, 0). UDOP has the same vocabulary as T5, which is based on SentencePiece.

When testing out the following with offset == (0,0) from this branch:

from transformers import UdopTokenizerFast

words = ['a', 'weirdly', 'test', 'hello']
boxes = [[1,2,3,4] for _ in range(len(words))]
labels = [1, 2, 3, 4]

tokenizer = UdopTokenizerFast.from_pretrained("nielsr/udop-large")

encoding = tokenizer(words, boxes=boxes, word_labels=labels)

for id, label in zip(encoding.input_ids, encoding.labels):
    print(tokenizer.decode([id]), label)

it gives me:

 1
a -100
weird 2
ly -100
test 3
hello 4
</s> -100

(I used "weirdly" just to make sure we get multiple tokens.) Interestingly, it splits the word "a" into 2 tokens: an empty token and "a". I also printed (id, offset, word_id):

 (0, 1) 0
a (0, 1) 0
weird (0, 5) 1
ly (5, 7) 1
test (0, 4) 2
hello (0, 5) 3

and in this case the empty token has offset (0, 1), which explains why offset == (0,0) didn't work.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably not related to the current issue, but your tokenizer produces a weird (pun intended) offset mapping.

 (0, 1)
a (0, 1)
weird (0, 5)
ly (5, 7)

We should be able to derive the word length from the offset_mapping, i.e. "weirdly" is of length (5-0) + (7-5) = 7. But this no longer holds when the empty token is not assigned a (0, 0) offset: for "a", the two tokens give (1-0) + (1-0) = 2 instead of 1.

previous_token_empty = True
else:
previous_token_empty = False
else:
labels_example.append(word_labels[original_index][word_id])
else:
Expand Down
12 changes: 6 additions & 6 deletions tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2277,35 +2277,35 @@ def test_compare_prepare_for_model(self):

@slow
def test_only_label_first_subword(self):
words = ["hello", "niels"]
words = ["hello", "niels", "0000000000000000"]
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
word_labels = [0, 1]
word_labels = [0, 1, 2]

# test slow tokenizer
tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])

tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
"microsoft/layoutlmv3-base",
only_label_first_subword=False,
add_visual_labels=False,
)
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])

# test fast tokenizer
tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])

tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
"microsoft/layoutlmv3-base",
only_label_first_subword=False,
add_visual_labels=False,
)
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])

@slow
def test_layoutlmv3_integration_test(self):
Expand Down