From 7a221ccb0badd60f1066d673752f618cce26b064 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Mon, 2 Dec 2024 20:38:54 +0800
Subject: [PATCH] 【Hackathon 7th No.43】TokenizerFast for Qwen2 (#9532)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add qwen2 tokenizer fast
---
 paddlenlp/transformers/auto/tokenizer.py    |   2 +-
 .../transformers/convert_slow_tokenizer.py  |  50 ++++++-
 paddlenlp/transformers/qwen2/__init__.py    |   1 +
 .../transformers/qwen2/tokenizer_fast.py    | 131 ++++++++++++++++++
 .../transformers/tokenizer_utils_base.py    |   6 +
 tests/transformers/qwen2/test_tokenizer.py  |   3 +-
 6 files changed, 190 insertions(+), 3 deletions(-)
 create mode 100644 paddlenlp/transformers/qwen2/tokenizer_fast.py

diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py
index 51712fb1dc25..aca12e4aa0f5 100644
--- a/paddlenlp/transformers/auto/tokenizer.py
+++ b/paddlenlp/transformers/auto/tokenizer.py
@@ -124,7 +124,7 @@
         ("ernie_vil", "ErnieViLTokenizer"),
         ("glm", "GLMGPT2Tokenizer"),
         ("qwen", "QWenTokenizer"),
-        ("qwen2", "Qwen2Tokenizer"),
+        ("qwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
         ("yuan", "YuanTokenizer"),
     ]
 )
diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py
index da36ec04d270..56fb60057201 100644
--- a/paddlenlp/transformers/convert_slow_tokenizer.py
+++ b/paddlenlp/transformers/convert_slow_tokenizer.py
@@ -442,7 +442,55 @@ def pre_tokenizer(self, replacement, add_prefix_space):
         return None
 
 
-SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}
+class Qwen2Converter(Converter):
+    def converted(self, vocab: Dict[str, int] = None, merges: List[Tuple[str, str]] = None) -> Tokenizer:
+        if not vocab:
+            vocab = self.original_tokenizer.encoder
+        if not merges:
+            merges = list(self.original_tokenizer.bpe_ranks.keys())
+
+        tokenizer = Tokenizer(
+            BPE(
+                vocab=vocab,
+                merges=merges,
+                dropout=None,
+                unk_token=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+                byte_fallback=False,
+            )
+        )
+
+        tokenizer.normalizer = normalizers.NFC()
+
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(
+                    Regex(
+                        r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+                    ),
+                    behavior="isolated",
+                    invert=False,
+                ),
+                pre_tokenizers.ByteLevel(
+                    add_prefix_space=getattr(self.original_tokenizer, "add_prefix_space", False),
+                    use_regex=False,
+                ),
+            ]
+        )
+
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+        return tokenizer
+
+
+SLOW_TO_FAST_CONVERTERS = {
+    "LlamaTokenizer": LlamaConverter,
+    "BertTokenizer": BertConverter,
+    "Qwen2Tokenizer": Qwen2Converter,
+}
 
 
 def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
diff --git a/paddlenlp/transformers/qwen2/__init__.py b/paddlenlp/transformers/qwen2/__init__.py
index b79ab9e6bdcc..ed41597aa254 100644
--- a/paddlenlp/transformers/qwen2/__init__.py
+++ b/paddlenlp/transformers/qwen2/__init__.py
@@ -17,3 +17,4 @@
 from .modeling import *
 from .modeling_pp import *
 from .tokenizer import *
+from .tokenizer_fast import *
diff --git a/paddlenlp/transformers/qwen2/tokenizer_fast.py b/paddlenlp/transformers/qwen2/tokenizer_fast.py
new file mode 100644
index 000000000000..dfff8b596d4b
--- /dev/null
+++ b/paddlenlp/transformers/qwen2/tokenizer_fast.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Qwen2."""
+
+from typing import Optional, Tuple
+
+from ..tokenizer_utils import AddedToken
+from ..tokenizer_utils_fast import PretrainedTokenizerFast
+from .tokenizer import Qwen2Tokenizer
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    "tokenizer_file": "tokenizer.json",
+}
+
+
+MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
+
+
+class Qwen2TokenizerFast(PretrainedTokenizerFast):
+    """
+    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word is
+    encoded differently depending on whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from paddlenlp.transformers import Qwen2TokenizerFast
+
+    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
+    >>> tokenizer("Hello world")["input_ids"]
+    [9707, 1879]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [21927, 1879]
+    ```
+    This is expected.
+
+    This tokenizer inherits from [`PretrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to a [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead. Not applicable to this tokenizer.
+        bos_token (`str`, *optional*):
+            The beginning of sequence token. Not applicable for this tokenizer.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + resource_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = Qwen2Tokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + unk_token="<|endoftext|>", + bos_token=None, + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", + **kwargs, + ): + # We need to at least pass vocab_file and merges_file to base class + # in case a slow tokenizer needs to be initialized; other can be + # configured through files. + # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token + + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(pad_token, str) + else pad_token + ) + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + + # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 98edc25ac1e2..cf1d3391b5c1 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1534,6 +1534,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, } + if hasattr(cls, "vocab_files_names") and len(cls.resource_files_names) == 0: + cls.resource_files_names = copy.deepcopy(cls.vocab_files_names) + logger.error( + "The attribute 'vocab_files_names' is deprecated. Please use 'resource_files_names' instead.", + DeprecationWarning, + ) vocab_files_target = {**cls.resource_files_names, **additional_files_names} # From HF Hub or AI Studio if from_hf_hub or from_aistudio: diff --git a/tests/transformers/qwen2/test_tokenizer.py b/tests/transformers/qwen2/test_tokenizer.py index 4339209eb2c0..4b81b0901f8d 100644 --- a/tests/transformers/qwen2/test_tokenizer.py +++ b/tests/transformers/qwen2/test_tokenizer.py @@ -18,7 +18,7 @@ import os import unittest -from paddlenlp.transformers import Qwen2Tokenizer +from paddlenlp.transformers import Qwen2Tokenizer, Qwen2TokenizerFast from paddlenlp.transformers.qwen2.tokenizer import VOCAB_FILES_NAMES, bytes_to_unicode from tests.transformers.test_tokenizer_common import TokenizerTesterMixin @@ -26,6 +26,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_id = "__internal_testing__/tiny-random-qwen2" tokenizer_class = Qwen2Tokenizer + rust_tokenizer_class = Qwen2TokenizerFast test_slow_tokenizer = True space_between_special_tokens = False from_pretrained_kwargs = None