From b8e48ed5a9984abdfe865e2cbded009235455a3f Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Tue, 7 May 2024 11:42:08 +0800
Subject: [PATCH] fix llama3 eot.

---
 paddlenlp/transformers/llama/tokenizer.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py
index 6f19530c05cb..46c16c58b427 100644
--- a/paddlenlp/transformers/llama/tokenizer.py
+++ b/paddlenlp/transformers/llama/tokenizer.py
@@ -295,11 +295,12 @@ def _pad(
 ENDOFTEXT = "<|end_of_text|>"
 IMSTART = "<|start_header_id|>"
 IMEND = "<|end_header_id|>"
+EOTID = "<|eot_id|>"
 
 # as the default behavior is changed to allow special tokens in
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
-EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(250)))
-SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND) + EXTRAS[4:]
+EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(251)))
+SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]
 
 tiktoken = None
 
@@ -354,9 +355,11 @@ def __init__(
 
         self.tokenizer = enc  # type: tiktoken.Encoding
 
+        self.bod_id = self.special_tokens[BEGINOFTEXT]
         self.eod_id = self.special_tokens[ENDOFTEXT]
         self.start_header_id = self.special_tokens[IMSTART]
         self.end_header_id = self.special_tokens[IMEND]
+        self.eot_id = self.special_tokens[EOTID]
 
         if "pad_token_id" in kwargs:
             self.pad_token_id = kwargs["pad_token_id"]
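
The sketch below (not part of the patch) illustrates why the new SPECIAL_TOKENS ordering matters: if the special tokens are assigned ids consecutively after the base BPE vocabulary, this ordering places `<|eot_id|>` at 128009, matching Meta's Llama 3 layout. The `BASE_VOCAB_SIZE = 128000` constant and the simple `base + index` mapping are assumptions standing in for the tokenizer's actual construction of `self.special_tokens`.

```python
# Standalone sketch: reproduce the special-token id layout implied by the
# patched SPECIAL_TOKENS tuple, assuming a 128000-entry base BPE vocabulary
# (the size used by Meta's Llama 3 release).
BEGINOFTEXT = "<|begin_of_text|>"
ENDOFTEXT = "<|end_of_text|>"
IMSTART = "<|start_header_id|>"
IMEND = "<|end_header_id|>"
EOTID = "<|eot_id|>"
EXTRAS = tuple(f"<|reserved_special_token_{i}|>" for i in range(251))
SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]

BASE_VOCAB_SIZE = 128000  # assumed size of the regular BPE vocabulary
special_tokens = {token: BASE_VOCAB_SIZE + i for i, token in enumerate(SPECIAL_TOKENS)}

assert special_tokens[BEGINOFTEXT] == 128000
assert special_tokens[ENDOFTEXT] == 128001
assert special_tokens[IMSTART] == 128006   # <|start_header_id|>
assert special_tokens[IMEND] == 128007     # <|end_header_id|>
assert special_tokens[EOTID] == 128009     # <|eot_id|>, previously unmapped
assert BASE_VOCAB_SIZE + len(SPECIAL_TOKENS) == 128256  # full Llama 3 vocab size
```

In Llama 3's chat template, `<|eot_id|>` terminates each message turn, so generation loops typically stop on either `eod_id` or the newly exposed `eot_id`; without this patch the token could not be resolved through the tokenizer's special-token table.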