diff --git a/README_cn.md b/README_cn.md index 1ecb023d1d6b..f6780f8d2e83 100644 --- a/README_cn.md +++ b/README_cn.md @@ -30,6 +30,9 @@ **PaddleNLP**是一款**简单易用**且**功能强大**的自然语言处理开发库。聚合业界**优质预训练模型**并提供**开箱即用**的开发体验,覆盖NLP多场景的模型库搭配**产业实践范例**可满足开发者**灵活定制**的需求。 ## News 📢 + +* 🔥 **2022.11.12 PaddleNLP新增AutoPrompt自动化提示功能,登顶FewCLUE小样本学习榜单!** + * 🥇 PaddleNLP 团队开源了 **AutoPrompt** 方案,基于开源的文心 ERNIE 预训练语言模型 ,结合了领域预训练和自动化提示学习技术,以291M 参数量的模型在小样本权威学习榜单 FewCLUE 排名第一,[详见](https://mp.weixin.qq.com/s/_JPiAzFA1f0BZ0igdv-EKA)。 * 🔥 **2022.10.27 发布 [PaddleNLP v2.4.2](https://github.com/PaddlePaddle/PaddleNLP/releases/tag/v2.4.2)** * NLG能力扩充:新增📄[**基于Pegasus的中文文本摘要方案**](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/text_summarization/pegasus),效果领先;新增❓[**问题生成解决方案**](./examples/question_generation),提供基于业界领先模型UNIMO-Text和大规模多领域问题生成数据集训练的通用问题生成预训练模型。均支持Taskflow一键调用,支持FasterGeneration高性能推理,训练推理部署全流程打通。 * 发布 🖼[**PPDiffusers**](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers):支持跨模态(如图像与语音)训练和推理的扩散模型(Diffusion Model)工具箱,可快速体验、二次开发 **Stable Diffusion**,持续支持更多模型。 @@ -234,7 +237,7 @@ PaddleNLP针对信息抽取、语义检索、智能问答、情感分析等高 ### 高性能分布式训练与推理 -#### ⚡ FasterTokenizer:高性能文本处理库 +#### ⚡ FastTokenizer:高性能文本处理库
@@ -244,7 +247,7 @@ PaddleNLP针对信息抽取、语义检索、智能问答、情感分析等高 AutoTokenizer.from_pretrained("ernie-3.0-medium-zh", use_faster=True) ``` -为了实现更极致的模型部署性能,安装FastTokenizers后只需在`AutoTokenizer` API上打开 `use_faster=True`选项,即可调用C++实现的高性能分词算子,轻松获得超Python百余倍的文本处理加速,更多使用说明可参考[FasterTokenizer文档](./faster_tokenizer)。 +为了实现更极致的模型部署性能,安装FastTokenizers后只需在`AutoTokenizer` API上打开 `use_faster=True`选项,即可调用C++实现的高性能分词算子,轻松获得超Python百余倍的文本处理加速,更多使用说明可参考[FastTokenizer文档](./fast_tokenizer)。 #### ⚡️ FasterGeneration:高性能生成加速库 diff --git a/README_en.md b/README_en.md index ec76f859a9a5..3cc50086f8cd 100644 --- a/README_en.md +++ b/README_en.md @@ -30,6 +30,8 @@ ## News 📢 +* 🔥 **2022.11.12 PaddleNLP added AutoPrompt and won first place in FewCLUE!** + * 🥇 The PaddleNLP team has open-sourced the **AutoPrompt** solution. Built on the open-source Wenxin ERNIE pre-trained language model, it combines domain pre-training with automated prompt learning and ranked first in FewCLUE (an authoritative few-shot learning benchmark) with a 291M-parameter model. [See details](https://mp.weixin.qq.com/s/_JPiAzFA1f0BZ0igdv-EKA). * 🔥 **2022.10.27 [PaddleNLP v2.4.2](https://github.com/PaddlePaddle/PaddleNLP/releases/tag/v2.4.2) Released!** * NLG Upgrade: 📄 Release [**Solution of Text Summarization**](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/text_summarization/pegasus) based on Pegasus;❓ Release [**Solution of Problem Generation**](./examples/question_generation), providing **general problem generation pre-trained model** based on Baidu's UNIMO Text and large-scale multi domain problem generation dataset。Supporting high-performance inference ability based on FasterGeneration , and covering the whole process of training , inference and deployment. * 🔥 **2022.10.14 [PaddleNLP v2.4.1](https://github.com/PaddlePaddle/PaddleNLP/releases/tag/v2.4.1) Released!** diff --git a/applications/neural_search/recall/in_batch_negative/README.md b/applications/neural_search/recall/in_batch_negative/README.md index ed04bd15b4e9..aa07d6908480 100644 --- a/applications/neural_search/recall/in_batch_negative/README.md +++ b/applications/neural_search/recall/in_batch_negative/README.md @@ -229,6 +229,9 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ * `recall_num`: 对 1 个文本召回的相似文本数量 * `similar_text_pair_file`: 由相似文本对构成的评估集 * `corpus_file`: 召回库数据 corpus_file +* `use_recompute`: 使用Recompute策略,用于节省显存,是一种以时间换空间的技术 +* `use_gradient_cache`: 使用Gradient Cache策略,用于节省显存,是一种以时间换空间的技术 +* `chunk_numbers`: 使用Gradient Cache策略的参数,表示的是同一个批次的样本分几次执行 也可以使用bash脚本: diff --git a/applications/neural_search/recall/in_batch_negative/batch_negative/model.py b/applications/neural_search/recall/in_batch_negative/batch_negative/model.py index 911fe0b4d360..050beb62f613 100644 --- a/applications/neural_search/recall/in_batch_negative/batch_negative/model.py +++ b/applications/neural_search/recall/in_batch_negative/batch_negative/model.py @@ -60,14 +60,14 @@ def forward(self, title_cls_embedding, transpose_y=True) - # substract margin from all positive samples cosine_sim() + # Subtract margin from the cosine similarity of all positive samples margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()) cosine_sim = cosine_sim - paddle.diag(margin_diag) - # scale cosine to ease training converge + # Scale cosine similarity to ease training convergence cosine_sim *= self.sacle labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64') @@ -76,3 +76,56 @@ def forward(self, loss =
F.cross_entropy(input=cosine_sim, label=labels) return loss + + +class SemanticIndexCacheNeg(SemanticIndexBase): + + def __init__(self, + pretrained_model, + dropout=None, + margin=0.3, + scale=30, + output_emb_size=None): + super().__init__(pretrained_model, dropout, output_emb_size) + self.margin = margin + # Scaling factor for cosine similarity to ease convergence + self.sacle = scale + + def forward(self, + query_input_ids, + title_input_ids, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None): + + query_cls_embedding = self.get_pooled_embedding(query_input_ids, + query_token_type_ids, + query_position_ids, + query_attention_mask) + + title_cls_embedding = self.get_pooled_embedding(title_input_ids, + title_token_type_ids, + title_position_ids, + title_attention_mask) + + cosine_sim = paddle.matmul(query_cls_embedding, + title_cls_embedding, + transpose_y=True) + + # Subtract margin from the cosine similarity of all positive samples + margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]], + fill_value=self.margin, + dtype=cosine_sim.dtype) + + cosine_sim = cosine_sim - paddle.diag(margin_diag) + + # Scale cosine similarity to ease training convergence + cosine_sim *= self.sacle + + labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64') + labels = paddle.reshape(labels, shape=[-1, 1]) + + return [cosine_sim, labels, query_cls_embedding, title_cls_embedding] diff --git a/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh b/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh index 857302c334a1..9920a045b9dc 100755 --- a/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh +++ b/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh @@ -1,6 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
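The Gradient Cache strategy added above (the `use_gradient_cache` and `chunk_numbers` options in the README, and `SemanticIndexCacheNeg`, which returns the similarity matrix and embeddings instead of the loss) trades extra forward passes for lower peak GPU memory. The following is only a minimal sketch of the idea, not code from this repository: `gradient_cache_step` and `forward_fn` are illustrative names, and the actual implementation in `train_batch_neg.py` below additionally restores the CUDA RNG state before recomputation (so dropout masks match) and supports AMP loss scaling.

```python
import paddle
import paddle.nn.functional as F


def gradient_cache_step(forward_fn, batch, optimizer, chunk_numbers):
    """One optimizer step with gradient caching (illustrative sketch).

    `forward_fn` maps a sub-batch to (cosine_sim, labels), similar to what
    SemanticIndexCacheNeg.forward returns; the batch size is assumed to be
    divisible by `chunk_numbers`.
    """
    # 1. Split the large batch into `chunk_numbers` smaller sub-batches.
    chunked = [paddle.split(t, chunk_numbers, axis=0) for t in batch]
    sub_batches = list(zip(*chunked))

    # 2. Gradient-free forward passes: collect similarity scores without
    #    keeping any encoder activations in memory.
    all_sims, all_labels = [], []
    with paddle.no_grad():
        for sub in sub_batches:
            sim, label = forward_fn(sub)
            all_sims.append(sim)
            all_labels.append(label)
    sims = paddle.concat(all_sims, axis=0)
    labels = paddle.concat(all_labels, axis=0)
    sims.stop_gradient = False  # only d(loss)/d(sims) is needed here

    # 3. Full-batch loss. Backward stops at `sims`, so only the small
    #    gradient w.r.t. the similarity scores is cached.
    loss = F.cross_entropy(input=sims, label=labels)
    loss.backward()
    cached_grads = paddle.split(sims.grad, chunk_numbers, axis=0)

    # 4. Recompute each sub-batch forward pass with gradients enabled and
    #    push the cached gradient through the encoder (chain rule).
    for sub, grad in zip(sub_batches, cached_grads):
        sim, _ = forward_fn(sub)
        (sim * grad).sum().backward()

    optimizer.step()
    optimizer.clear_grad()
    return loss
```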
+ # GPU version root_dir="checkpoints/inbatch" -python -u -m paddle.distributed.launch --gpus "3" --log_dir "recall_log/" \ +python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \ recall.py \ --device gpu \ --recall_result_dir "recall_result_dir" \ @@ -11,7 +25,7 @@ python -u -m paddle.distributed.launch --gpus "3" --log_dir "recall_log/" \ --hnsw_ef 100 \ --batch_size 64 \ --output_emb_size 256\ - --max_seq_length 60 \ + --max_seq_length 64 \ --recall_num 50 \ --similar_text_pair "recall/dev.csv" \ --corpus_file "recall/corpus.csv" diff --git a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py index 10bead311455..fe48bd49ffbc 100644 --- a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py +++ b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py @@ -17,14 +17,17 @@ import time import numpy as np import paddle +import paddle.nn.functional as F from functools import partial + from paddlenlp.utils.log import logger from paddlenlp.data import Tuple, Pad from paddlenlp.datasets import load_dataset, MapDataset from paddlenlp.transformers import AutoModel, AutoTokenizer from paddlenlp.transformers import LinearDecayWithWarmup + from base_model import SemanticIndexBase -from batch_negative.model import SemanticIndexBatchNeg +from batch_negative.model import SemanticIndexBatchNeg, SemanticIndexCacheNeg from data import read_text_pair, convert_example, create_dataloader, gen_id2corpus, gen_text_file from ann_util import build_index @@ -89,6 +92,15 @@ help="evaluate_result") parser.add_argument('--evaluate', action='store_true', help='whether evaluate while training') +parser.add_argument("--use_amp", action="store_true", help="Whether to use AMP.") +parser.add_argument("--amp_loss_scale", default=32768, type=float,help="The value of scale_loss for fp16. 
This is only used for AMP training.") +parser.add_argument("--use_recompute", + action='store_true', + help="Using the recompute to scale up the batch size and save the memory.") +parser.add_argument("--use_gradient_cache", + action='store_true', + help="Using the gradient cache to scale up the batch size and save the memory.") +parser.add_argument("--chunk_numbers",type=int,default=50,help="The number of the chunks for model") args = parser.parse_args() # yapf: enable @@ -161,6 +173,179 @@ def evaluate(model, corpus_data_loader, query_data_loader, recall_result_file, return float(recall_N[1]) +def train(train_data_loader, model, optimizer, lr_scheduler, rank, + corpus_data_loader, query_data_loader, recall_result_file, text_list, + id2corpus, tokenizer): + global_step = 0 + best_recall = 0.0 + tic_train = time.time() + for epoch in range(1, args.epochs + 1): + for step, batch in enumerate(train_data_loader, start=1): + query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch + + loss = model(query_input_ids=query_input_ids, + title_input_ids=title_input_ids, + query_token_type_ids=query_token_type_ids, + title_token_type_ids=title_token_type_ids) + + global_step += 1 + if global_step % args.log_steps == 0 and rank == 0: + print( + "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s" + % (global_step, epoch, step, loss, args.log_steps / + (time.time() - tic_train))) + tic_train = time.time() + loss.backward() + optimizer.step() + lr_scheduler.step() + optimizer.clear_grad() + if not args.evaluate: + if global_step % args.save_steps == 0 and rank == 0: + save_dir = os.path.join(args.save_dir, + "model_%d" % global_step) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_param_path = os.path.join(save_dir, + 'model_state.pdparams') + paddle.save(model.state_dict(), save_param_path) + tokenizer.save_pretrained(save_dir) + if args.evaluate and rank == 0: + print("evaluating") + recall_5 = evaluate(model, corpus_data_loader, query_data_loader, + recall_result_file, text_list, id2corpus) + if recall_5 > best_recall: + best_recall = recall_5 + + save_dir = os.path.join(args.save_dir, "model_best") + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_param_path = os.path.join(save_dir, 'model_state.pdparams') + paddle.save(model.state_dict(), save_param_path) + tokenizer.save_pretrained(save_dir) + with open(os.path.join(save_dir, "train_result.txt"), + 'a', + encoding='utf-8') as fp: + fp.write('epoch=%d, global_step: %d, recall: %s\n' % + (epoch, global_step, recall_5)) + + +def gradient_cache_train(train_data_loader, model, optimizer, lr_scheduler, + rank, tokenizer): + + if args.use_amp: + scaler = paddle.amp.GradScaler(init_loss_scaling=args.amp_loss_scale) + + if args.batch_size % args.chunk_numbers == 0: + chunk_numbers = args.chunk_numbers + else: + raise Exception( + f" Batch_size {args.batch_size} must divides chunk_numbers {args.chunk_numbers} without producing a remainder " + ) + + def split(inputs, chunk_numbers, axis=0): + if inputs.shape[0] % chunk_numbers == 0: + return paddle.split(inputs, chunk_numbers, axis=0) + else: + return paddle.split(inputs, inputs.shape[0], axis=0) + + global_step = 0 + tic_train = time.time() + for epoch in range(1, args.epochs + 1): + for step, batch in enumerate(train_data_loader, start=1): + # Separate large batches into several sub batches + chunked_x = [split(t, chunk_numbers, axis=0) for t in batch] + sub_batchs = [list(s) for s in zip(*chunked_x)] + + all_grads = [] + 
all_CUDA_rnd_state = [] + all_query = [] + all_title = [] + + for sub_batch in sub_batchs: + all_reps = [] + all_labels = [] + sub_query_input_ids, sub_query_token_type_ids, sub_title_input_ids, sub_title_token_type_ids = sub_batch + with paddle.amp.auto_cast( + args.use_amp, + custom_white_list=["layer_norm", "softmax", "gelu"]): + + with paddle.no_grad(): + sub_CUDA_rnd_state = paddle.framework.random.get_cuda_rng_state( + ) + all_CUDA_rnd_state.append(sub_CUDA_rnd_state) + sub_cosine_sim, sub_label, query_embedding, title_embedding = model( + query_input_ids=sub_query_input_ids, + title_input_ids=sub_title_input_ids, + query_token_type_ids=sub_query_token_type_ids, + title_token_type_ids=sub_title_token_type_ids) + all_reps.append(sub_cosine_sim) + all_labels.append(sub_label) + all_title.append(title_embedding) + all_query.append(query_embedding) + + model_reps = paddle.concat(all_reps, axis=0) + model_title = paddle.concat(all_title) + model_query = paddle.concat(all_query) + + model_title = model_title.detach() + model_query = model_query.detach() + + model_query.stop_gradient = False + model_title.stop_gradient = False + model_reps.stop_gradient = False + + model_label = paddle.concat(all_labels, axis=0) + loss = F.cross_entropy(input=model_reps, label=model_label) + loss.backward() + # Store gradients + all_grads.append(model_reps.grad) + + for sub_batch, CUDA_state, grad in zip(sub_batchs, + all_CUDA_rnd_state, + all_grads): + + sub_query_input_ids, sub_query_token_type_ids, sub_title_input_ids, sub_title_token_type_ids = sub_batch + paddle.framework.random.set_cuda_rng_state(CUDA_state) + # Recompute the forward propogation + sub_cosine_sim, sub_label, query_embedding, title_embedding = model( + query_input_ids=sub_query_input_ids, + title_input_ids=sub_title_input_ids, + query_token_type_ids=sub_query_token_type_ids, + title_token_type_ids=sub_title_token_type_ids) + # Chain rule + surrogate = paddle.dot(sub_cosine_sim, grad) + # Backward propogation + if args.use_amp: + scaled = scaler.scale(surrogate) + scaled.backward() + else: + surrogate.backward() + # Update model parameters + if args.use_amp: + scaler.minimize(optimizer, scaled) + else: + optimizer.step() + + global_step += 1 + if global_step % args.log_steps == 0 and rank == 0: + print( + "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s" + % (global_step, epoch, step, loss, args.log_steps / + (time.time() - tic_train))) + tic_train = time.time() + + lr_scheduler.step() + optimizer.clear_grad() + + if global_step % args.save_steps == 0 and rank == 0: + save_dir = os.path.join(args.save_dir, "model_%d" % global_step) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_param_path = os.path.join(save_dir, 'model_state.pdparams') + paddle.save(model.state_dict(), save_param_path) + tokenizer.save_pretrained(save_dir) + + def do_train(): paddle.set_device(args.device) rank = paddle.distributed.get_rank() @@ -173,7 +358,8 @@ def do_train(): data_path=args.train_set_file, lazy=False) - pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) + pretrained_model = AutoModel.from_pretrained( + args.model_name_or_path, enable_recompute=args.use_recompute) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) @@ -197,11 +383,16 @@ def do_train(): batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) - - model = SemanticIndexBatchNeg(pretrained_model, - margin=args.margin, - scale=args.scale, - output_emb_size=args.output_emb_size) + if 
(args.use_gradient_cache): + model = SemanticIndexCacheNeg(pretrained_model, + margin=args.margin, + scale=args.scale, + output_emb_size=args.output_emb_size) + else: + model = SemanticIndexBatchNeg(pretrained_model, + margin=args.margin, + scale=args.scale, + output_emb_size=args.output_emb_size) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) @@ -262,57 +453,13 @@ def do_train(): weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) - global_step = 0 - best_recall = 0.0 - tic_train = time.time() - for epoch in range(1, args.epochs + 1): - for step, batch in enumerate(train_data_loader, start=1): - query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch - - loss = model(query_input_ids=query_input_ids, - title_input_ids=title_input_ids, - query_token_type_ids=query_token_type_ids, - title_token_type_ids=title_token_type_ids) - - global_step += 1 - if global_step % args.log_steps == 0 and rank == 0: - print( - "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s" - % (global_step, epoch, step, loss, 10 / - (time.time() - tic_train))) - tic_train = time.time() - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - if not args.evaluate: - if global_step % args.save_steps == 0 and rank == 0: - save_dir = os.path.join(args.save_dir, - "model_%d" % global_step) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - save_param_path = os.path.join(save_dir, - 'model_state.pdparams') - paddle.save(model.state_dict(), save_param_path) - tokenizer.save_pretrained(save_dir) - if args.evaluate and rank == 0: - print("evaluating") - recall_5 = evaluate(model, corpus_data_loader, query_data_loader, - recall_result_file, text_list, id2corpus) - if recall_5 > best_recall: - best_recall = recall_5 - - save_dir = os.path.join(args.save_dir, "model_best") - if not os.path.exists(save_dir): - os.makedirs(save_dir) - save_param_path = os.path.join(save_dir, 'model_state.pdparams') - paddle.save(model.state_dict(), save_param_path) - tokenizer.save_pretrained(save_dir) - with open(os.path.join(save_dir, "train_result.txt"), - 'a', - encoding='utf-8') as fp: - fp.write('epoch=%d, global_step: %d, recall: %s\n' % - (epoch, global_step, recall_5)) + if (args.use_gradient_cache): + gradient_cache_train(train_data_loader, model, optimizer, lr_scheduler, + rank, tokenizer) + else: + train(train_data_loader, model, optimizer, lr_scheduler, rank, + corpus_data_loader, query_data_loader, recall_result_file, + text_list, id2corpus, tokenizer) if __name__ == "__main__": diff --git a/applications/text_classification/hierarchical/analysis/word_interpret.ipynb b/applications/text_classification/hierarchical/analysis/word_interpret.ipynb index 8c25b87a2711..6ddad7051a32 100644 --- a/applications/text_classification/hierarchical/analysis/word_interpret.ipynb +++ b/applications/text_classification/hierarchical/analysis/word_interpret.ipynb @@ -112,6 +112,7 @@ " items = line.strip().split('\\t')\n", " if items[0] == 'Text':\n", " continue\n", + " items[0] = items[0][:MAX_LENGTH-2]\n", " if len(items) == 3:\n", " yield {'text': items[0], 'label': items[1], 'predict': items[2]}\n", " elif len(items) == 2:\n", diff --git a/applications/text_classification/hierarchical/deploy/predictor/README.md b/applications/text_classification/hierarchical/deploy/predictor/README.md index caff6498386e..46102ba6d554 100644 --- 
a/applications/text_classification/hierarchical/deploy/predictor/README.md +++ b/applications/text_classification/hierarchical/deploy/predictor/README.md @@ -11,7 +11,7 @@ 如果基于GPU部署,请先确保机器已正确安装NVIDIA相关驱动和基础软件,确保CUDA >= 11.2,CuDNN >= 8.2,并使用以下命令安装所需依赖: ```shell -python -m pip install onnxruntime-gpu onnx onnxconverter-common psutil +python -m pip install onnxruntime-gpu onnx onnxconverter-common==1.9.0 psutil ``` 如果基于CPU部署,请使用如下命令安装所需依赖: diff --git a/applications/text_classification/hierarchical/few-shot/README.md b/applications/text_classification/hierarchical/few-shot/README.md index 347e522164f8..c1311eda6d1e 100644 --- a/applications/text_classification/hierarchical/few-shot/README.md +++ b/applications/text_classification/hierarchical/few-shot/README.md @@ -65,9 +65,9 @@ 内存: 630 GB -3. PaddlePaddle 版本:2.3.1 +3. PaddlePaddle 版本:2.4rc -4. PaddleNLP 版本:2.3.5 (develop) +4. PaddleNLP 版本:2.4.3 5. 评估设置 @@ -91,7 +91,7 @@ | model_name | 训练方式 | Micro F1分数 | Macro F1分数 | | ---------- | ------- | ----------- | ----------- | | ernie-3.0-base-zh | 微调学习 | 0.7172 | 0.3821 | - | ernie-3.0-base-zh | 提示学习 | 0.8855 | 0.8443 | + | ernie-3.0-base-zh | 提示学习 | 0.8945 | 0.8516 | @@ -102,10 +102,10 @@ ### 3.1 运行环境 -- python >= 3.6 -- paddlepaddle > 2.3(2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)) -- paddlenlp >= 2.3.5 -- paddle2onnx >= 1.0.0rc3 +- python >= 3.7 +- paddlepaddle >= 2.4rc +- paddlenlp >= 2.4.3 +- paddle2onnx >= 1.0.3 ### 3.2 代码结构 @@ -222,12 +222,12 @@ python train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ --load_best_model_at_end \ ---evaluation_strategy epoch \ ---save_strategy epoch +--eval_steps 100 ``` **多卡训练** @@ -247,12 +247,12 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ --load_best_model_at_end \ ---evaluation_strategy epoch \ ---save_strategy epoch +--eval_steps 100 ``` 可配置参数说明: @@ -273,6 +273,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ - `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。 - `num_train_epochs`: 训练的最大轮数。 - `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。 +- `save_total_limit`: 模型检查点保存数量。 - `device`: 使用的设备,默认为`gpu`。 - `eval_steps`: 评估模型的间隔步数。 - `logging_steps`: 打印日志的间隔步数。 @@ -352,9 +353,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data - 可配置参数说明: - `model_path_prefix`: 导出的静态图模型路径及文件前缀。 -- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 +- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 - `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。 -- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 +- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 - `batch_size`: 每次预测的样本数量。 - `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。 - `device_id`: 指定GPU设备ID。 diff --git a/applications/text_classification/hierarchical/few-shot/infer.py b/applications/text_classification/hierarchical/few-shot/infer.py index 7442641d502f..5fc92a9dd933 100644 --- a/applications/text_classification/hierarchical/few-shot/infer.py +++ b/applications/text_classification/hierarchical/few-shot/infer.py @@ -14,23 +14,24 @@ import os import six +import 
json import psutil import argparse import numpy as np from paddlenlp.utils.log import logger -from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample -from paddlenlp.transformers import AutoTokenizer +from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM import paddle2onnx import onnxruntime as ort # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.") -parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.") +parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.") parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.") -parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") +parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.") parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.") parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.") @@ -103,12 +104,6 @@ def __init__(self, 'device_id': device_id }]) - self.input_handles = [ - self.predictor.get_inputs()[0].name, - self.predictor.get_inputs()[1].name, - self.predictor.get_inputs()[2].name - ] - if device == "gpu": try: assert 'CUDAExecutionProvider' in self.predictor.get_providers() @@ -122,27 +117,52 @@ def __init__(self, logger.info(">>> [InferBackend] Engine Created ...") def infer(self, input_dict: dict): - input_dict = { - k: v - for k, v in input_dict.items() if k in self.input_handles - } result = self.predictor.run(None, input_dict) return result class HierachicalPredictor(object): - def __init__(self, args, label_list): - self._label_list = label_list - self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self._max_seq_length = args.max_seq_length - self._batch_size = args.batch_size - self.inference_backend = InferBackend(args.model_path_prefix, - args.device, args.device_id, - args.use_fp16, args.num_threads) - self._template = AutoTemplate.load_from( - os.path.dirname(args.model_path_prefix), self._tokenizer, - args.max_seq_length) + def __init__(self, args): + self.args = args + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.model = AutoModelForMaskedLM.from_pretrained(args.model_name) + self.template, self.labels, self.input_handles = self.post_init() + self.collate_fn = PromptDataCollatorWithPadding( + self.tokenizer, + padding=True, + return_tensors="np", + return_attention_mask=True) + + self.inference_backend = InferBackend(self.args.model_path_prefix, + self.args.device, + self.args.device_id, + self.args.use_fp16, + self.args.num_threads) + + def post_init(self): + export_path = os.path.dirname(self.args.model_path_prefix) + template_path = os.path.join(export_path, "template_config.json") + with open(template_path, "r") as fp: + prompt = json.load(fp) + template = AutoTemplate.create_from(prompt, self.tokenizer, + self.args.max_length, + self.model) + keywords = 
template.extract_template_keywords(template.prompt) + inputs = [ + "input_ids", "token_type_ids", "position_ids", "attention_mask", + "masked_positions" + ] + if "soft" in keywords: + inputs.append("soft_token_ids") + if "encoder" in keywords: + inputs.append("encoder_ids") + verbalizer_path = os.path.join(export_path, "verbalizer_config.json") + with open(verbalizer_path, "r") as fp: + label_words = json.load(fp) + labels = sorted(list(label_words.keys())) + + return template, labels, inputs def predict(self, input_data: list): encoded_inputs = self.preprocess(input_data) @@ -155,14 +175,27 @@ def _infer(self, input_dict): infer_data = self.inference_backend.infer(input_dict) return infer_data - def infer_batch(self, encoded_inputs): - num_sample = len(encoded_inputs["input_ids"]) + def infer_batch(self, inputs): + num_sample = len(inputs) infer_data = None num_infer_data = None - for idx in range(0, num_sample, self._batch_size): - l, r = idx, idx + self._batch_size - keys = encoded_inputs.keys() - input_dict = {k: encoded_inputs[k][l:r] for k in keys} + for index in range(0, num_sample, self.args.batch_size): + left, right = index, index + self.args.batch_size + batch_dict = self.collate_fn(inputs[left:right]) + input_dict = {} + for key in self.input_handles: + value = batch_dict[key] + if key == "attention_mask": + if value.ndim == 2: + value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4 + elif value.ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}" + .format(value.ndim)) + value = value.astype("float32") + else: + value = value.astype("int64") + input_dict[key] = value results = self._infer(input_dict) if infer_data is None: infer_data = [[x] for x in results] @@ -175,16 +208,8 @@ def infer_batch(self, encoded_inputs): return infer_data def preprocess(self, input_data: list): - text = [InputExample(text_a=x) for x in input_data] - inputs = [self._template.wrap_one_example(x) for x in text] - inputs = { - "input_ids": - np.array([x["input_ids"] for x in inputs], dtype="int64"), - "mask_ids": - np.array([x["mask_ids"] for x in inputs], dtype="int64"), - "soft_token_ids": - np.array([x["soft_token_ids"] for x in inputs], dtype="int64") - } + text = [{"text_a": x} for x in input_data] + inputs = [self.template(x) for x in text] return inputs @staticmethod @@ -197,7 +222,7 @@ def postprocess(self, infer_data): label_ids = np.argwhere(probs > threshold) labels = [[] for _ in range(probs.shape[0])] for idx, label_id in label_ids: - labels[idx].append(self._label_list[label_id]) + labels[idx].append(self.labels[label_id]) return {"label": labels} def printer(self, result, input_data): @@ -212,12 +237,10 @@ def printer(self, result, input_data): for arg_name, arg_value in vars(args).items(): logger.info("{:20}: {}".format(arg_name, arg_value)) - export_path = os.path.dirname(args.model_path_prefix) - labels, _ = Verbalizer.load_from(export_path) + predictor = HierachicalPredictor(args) text_dir = os.path.join(args.data_dir, "data.txt") with open(text_dir, "r", encoding="utf-8") as f: text_list = [x.strip() for x in f.readlines()] - predictor = HierachicalPredictor(args, labels) predictor.predict(text_list) diff --git a/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt b/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt new file mode 100644 index 000000000000..bbe76e363f00 --- /dev/null +++ b/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt @@ -0,0 +1,5 @@ +psutil 
+paddlepaddle>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime diff --git a/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt b/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt new file mode 100644 index 000000000000..66454bd8b6b5 --- /dev/null +++ b/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt @@ -0,0 +1,7 @@ +psutil +paddlepaddle-gpu>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime-gpu +onnx +onnxconverter-common diff --git a/applications/text_classification/hierarchical/few-shot/train.py b/applications/text_classification/hierarchical/few-shot/train.py index c7c0a2daa38a..aac82bff9ab5 100644 --- a/applications/text_classification/hierarchical/few-shot/train.py +++ b/applications/text_classification/hierarchical/few-shot/train.py @@ -15,6 +15,7 @@ from dataclasses import dataclass, field import os import sys +from collections import defaultdict import paddle import paddle.nn.functional as F @@ -41,9 +42,6 @@ class DataArguments: data_dir: str = field(default="./data", metadata={"help": "The dataset dictionary includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional) files."}) prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."}) - soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."}) - encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."}) - @dataclass class ModelArguments: @@ -67,17 +65,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) # Define the template for preprocess and the verbalizer for postprocess. - template = AutoTemplate.create_from( - data_args.prompt, - tokenizer, - training_args.max_seq_length, - model=model, - prompt_encoder=data_args.soft_encoder, - encoder_hidden_size=data_args.encoder_hidden_size) - logger.info("Using template: {}".format(template.template)) + template = AutoTemplate.create_from(data_args.prompt, + tokenizer, + training_args.max_seq_length, + model=model) + logger.info("Using template: {}".format(template.prompt)) label_file = os.path.join(data_args.data_dir, "label.txt") - verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file) + with open(label_file, "r", encoding="utf-8") as fp: + label_words = defaultdict(list) + for line in fp: + data = line.strip().split("==") + word = data[1] if len(data) > 1 else data[0].split("##")[-1] + label_words[data[0]].append(word) + verbalizer = SoftVerbalizer(label_words, tokenizer, model) # Load the few-shot datasets. train_ds, dev_ds, test_ds = load_local_dataset( @@ -139,11 +140,24 @@ def compute_metrics(eval_preds): # Export static model. 
if training_args.do_export: + template = prompt_model.template + template_keywords = template.extract_template_keywords(template.prompt) input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids + InputSpec(shape=[None, None], dtype="int64"), # input_ids, + InputSpec(shape=[None, None], dtype="int64"), # token_type_ids + InputSpec(shape=[None, None], dtype="int64"), # position_ids + InputSpec(shape=[None, None, None, None], + dtype="float32") # attention_mask ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], + dtype="int64")) # masked_positions + if "soft" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # soft_token_ids + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # encoder_ids export_path = os.path.join(training_args.output_dir, 'export') trainer.export_model(export_path, input_spec=input_spec, diff --git a/applications/text_classification/hierarchical/few-shot/utils.py b/applications/text_classification/hierarchical/few-shot/utils.py index 4880432310f3..e29aaa9db61d 100644 --- a/applications/text_classification/hierarchical/few-shot/utils.py +++ b/applications/text_classification/hierarchical/few-shot/utils.py @@ -15,7 +15,6 @@ import os from paddlenlp.datasets import load_dataset -from paddlenlp.prompt import InputExample def load_local_dataset(data_path, splits, label_list): @@ -39,14 +38,14 @@ def _reader(data_file, label_list): for idx, line in enumerate(fp): data = line.strip().split("\t") if len(data) == 1: - yield InputExample(text_a=data[0]) + yield {"text_a": data[0]} else: text, label = data label = label.strip().split(",") label = [ float(1) if x in label else float(0) for x in label_list ] - yield InputExample(text_a=text, labels=label) + yield {"text_a": text, "labels": label} split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"} datasets = [] diff --git a/applications/text_classification/multi_class/analysis/evaluate.py b/applications/text_classification/multi_class/analysis/evaluate.py index 9aa272c650f9..ad716c21b048 100644 --- a/applications/text_classification/multi_class/analysis/evaluate.py +++ b/applications/text_classification/multi_class/analysis/evaluate.py @@ -156,16 +156,17 @@ def evaluate(): logger.info("Dev dataset size: {}".format(len(dev_ds))) logger.info("Accuracy in dev dataset: {:.2f}%".format(report['accuracy'] * 100)) - logger.info("Top-2 accuracy in dev dataset: {:.2f}%".format( - top_k_accuracy_score(y_true=labels, - y_score=probs, - k=2, - labels=list(range(len(label_list)))) * 100)) - logger.info("Top-3 accuracy in dev dataset: {:.2f}%".format( - top_k_accuracy_score(y_true=labels, - y_score=probs, - k=3, - labels=list(range(len(label_list)))) * 100)) + if len(labels) > 2: + logger.info("Top-2 accuracy in dev dataset: {:.2f}%".format( + top_k_accuracy_score(y_true=labels, + y_score=probs, + k=2, + labels=list(range(len(label_list)))) * 100)) + logger.info("Top-3 accuracy in dev dataset: {:.2f}%".format( + top_k_accuracy_score(y_true=labels, + y_score=probs, + k=3, + labels=list(range(len(label_list)))) * 100)) for i, l in enumerate(label_list): logger.info("Class name: {}".format(l)) diff --git a/applications/text_classification/multi_class/analysis/word_interpret.ipynb b/applications/text_classification/multi_class/analysis/word_interpret.ipynb index 
0b83eec1bdf5..ddfee4143137 100644 --- a/applications/text_classification/multi_class/analysis/word_interpret.ipynb +++ b/applications/text_classification/multi_class/analysis/word_interpret.ipynb @@ -112,6 +112,7 @@ " items = line.strip().split('\\t')\n", " if items[0] == 'Text':\n", " continue\n", + " items[0] = items[0][:MAX_LENGTH-2]\n", " if len(items) == 3:\n", " yield {'text': items[0], 'label': items[1], 'predict': items[2]}\n", " elif len(items) == 2:\n", diff --git a/applications/text_classification/multi_class/deploy/predictor/README.md b/applications/text_classification/multi_class/deploy/predictor/README.md index 8959571cb6ab..5da5f714a025 100644 --- a/applications/text_classification/multi_class/deploy/predictor/README.md +++ b/applications/text_classification/multi_class/deploy/predictor/README.md @@ -12,7 +12,7 @@ 如果基于GPU部署,请先确保机器已正确安装NVIDIA相关驱动和基础软件,确保CUDA >= 11.2,CuDNN >= 8.2,并使用以下命令安装所需依赖: ``` -python -m pip install onnxruntime-gpu onnx onnxconverter-common +python -m pip install onnxruntime-gpu onnx onnxconverter-common==1.9.0 ``` 如果基于CPU部署,请使用如下命令安装所需依赖: diff --git a/applications/text_classification/multi_class/few-shot/README.md b/applications/text_classification/multi_class/few-shot/README.md index 3f6e5f759bbf..45f9b009bf34 100644 --- a/applications/text_classification/multi_class/few-shot/README.md +++ b/applications/text_classification/multi_class/few-shot/README.md @@ -65,9 +65,9 @@ 内存: 630 GB -3. PaddlePaddle 版本:2.3.1 +3. PaddlePaddle 版本:2.4rc -4. PaddleNLP 版本:2.3.5 (develop) +4. PaddleNLP 版本:2.4.3 5. 评估设置 @@ -82,7 +82,7 @@ python train.py --dataset_dir "./data/" --save_dir "./checkpoints" --max_seq_len - 提示学习 ``` -python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条新闻写的是" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model accuracy --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch +python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条新闻写的是" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model accuracy --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch --save_total_limit 1 ``` 6. 
精度评价指标:Accuracy @@ -102,10 +102,10 @@ python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条 ### 3.1 运行环境 -- python >= 3.6 -- paddlepaddle > 2.3 (2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)) -- paddlenlp >= 2.3.5 -- paddle2onnx >= 1.0.0rc3 +- python >= 3.7 +- paddlepaddle >= 2.4rc +- paddlenlp >= 2.4.3 +- paddle2onnx >= 1.0.3 ### 3.2 代码结构 @@ -204,14 +204,15 @@ python train.py \ --output_dir ./checkpoints/ \ --prompt "这条新闻标题的主题是" \ --max_seq_length 128 \ ---learning_rate 3e-5 \ ---ppt_learning_rate 3e-4 \ +--learning_rate 3e-6 \ +--ppt_learning_rate 3e-5 \ --do_train \ --do_eval \ --use_rdrop \ --max_steps 1000 \ --eval_steps 10 \ --logging_steps 5 \ +--save_total_limit 1 \ --load_best_model_at_end True \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ @@ -227,8 +228,8 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --output_dir ./checkpoints/ \ --prompt "这条新闻标题的主题是" \ --max_seq_length 128 \ ---learning_rate 3e-5 \ ---ppt_learning_rate 3e-4 \ +--learning_rate 3e-6 \ +--ppt_learning_rate 3e-5 \ --do_train \ --do_eval \ --use_rdrop \ @@ -236,6 +237,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --max_steps 1000 \ --eval_steps 10 \ --logging_steps 5 \ +--save_total_limit 1 \ --load_best_model_at_end True \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ @@ -260,6 +262,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ - `do_predict`: 是否进行预测。 - `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。 - `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。 +- `save_total_limit`: 模型检查点保存数量。 - `eval_steps`: 评估模型的间隔步数。 - `device`: 使用的设备,默认为`gpu`。 - `logging_steps`: 打印日志的间隔步数。 @@ -335,9 +338,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data - 可配置参数说明: - `model_path_prefix`: 导出的静态图模型路径及文件前缀。 -- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 +- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 - `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。 -- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 +- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 - `batch_size`: 每次预测的样本数量。 - `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。 - `device_id`: 指定GPU设备ID。 diff --git a/applications/text_classification/multi_class/few-shot/infer.py b/applications/text_classification/multi_class/few-shot/infer.py index 142a20d1ed76..03715bc29d0e 100644 --- a/applications/text_classification/multi_class/few-shot/infer.py +++ b/applications/text_classification/multi_class/few-shot/infer.py @@ -14,23 +14,24 @@ import os import six +import json import psutil import argparse import numpy as np from paddlenlp.utils.log import logger -from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample -from paddlenlp.transformers import AutoTokenizer +from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM import paddle2onnx import onnxruntime as ort # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.") -parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.") +parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.") 
parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.") -parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") +parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.") parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.") parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.") @@ -103,11 +104,6 @@ def __init__(self, 'device_id': device_id }]) - self.input_handles = [ - self.predictor.get_inputs()[0].name, - self.predictor.get_inputs()[1].name, - self.predictor.get_inputs()[2].name - ] if device == "gpu": try: @@ -122,27 +118,53 @@ def __init__(self, logger.info(">>> [InferBackend] Engine Created ...") def infer(self, input_dict: dict): - input_dict = { - k: v - for k, v in input_dict.items() if k in self.input_handles - } result = self.predictor.run(None, input_dict) return result class MultiClassPredictor(object): - def __init__(self, args, label_list): - self._label_list = label_list - self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self._max_seq_length = args.max_seq_length - self._batch_size = args.batch_size - self.inference_backend = InferBackend(args.model_path_prefix, - args.device, args.device_id, - args.use_fp16, args.num_threads) - self._template = AutoTemplate.load_from( - os.path.dirname(args.model_path_prefix), self._tokenizer, - args.max_seq_length) + def __init__(self, args): + self.args = args + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.model = AutoModelForMaskedLM.from_pretrained(args.model_name) + self.template, self.labels, self.input_handles = self.post_init() + self.collate_fn = PromptDataCollatorWithPadding( + self.tokenizer, + padding=True, + return_tensors="np", + return_attention_mask=True) + + self.inference_backend = InferBackend(self.args.model_path_prefix, + self.args.device, + self.args.device_id, + self.args.use_fp16, + self.args.num_threads) + + def post_init(self): + export_path = os.path.dirname(self.args.model_path_prefix) + template_path = os.path.join(export_path, "template_config.json") + with open(template_path, "r") as fp: + prompt = json.load(fp) + template = AutoTemplate.create_from(prompt, self.tokenizer, + self.args.max_length, + self.model) + keywords = template.extract_template_keywords(template.prompt) + inputs = [ + "input_ids", "token_type_ids", "position_ids", "attention_mask" + ] + if "mask" in keywords: + inputs.append("masked_positions") + if "soft" in keywords: + inputs.append("soft_token_ids") + if "encoder" in keywords: + inputs.append("encoder_ids") + verbalizer_path = os.path.join(export_path, "verbalizer_config.json") + with open(verbalizer_path, "r") as fp: + label_words = json.load(fp) + labels = sorted(list(label_words.keys())) + + return template, labels, inputs def predict(self, input_data: list): encoded_inputs = self.preprocess(input_data) @@ -155,14 +177,27 @@ def _infer(self, input_dict): infer_data = self.inference_backend.infer(input_dict) return infer_data - def infer_batch(self, encoded_inputs): - num_sample = len(encoded_inputs["input_ids"]) + def infer_batch(self, inputs): + num_sample = 
len(inputs) infer_data = None num_infer_data = None - for idx in range(0, num_sample, self._batch_size): - l, r = idx, idx + self._batch_size - keys = encoded_inputs.keys() - input_dict = {k: encoded_inputs[k][l:r] for k in keys} + for index in range(0, num_sample, self.args.batch_size): + left, right = index, index + self.args.batch_size + batch_dict = self.collate_fn(inputs[left:right]) + input_dict = {} + for key in self.input_handles: + value = batch_dict[key] + if key == "attention_mask": + if value.ndim == 2: + value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4 + elif value.ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}" + .format(value.ndim)) + value = value.astype("float32") + else: + value = value.astype("int64") + input_dict[key] = value results = self._infer(input_dict) if infer_data is None: infer_data = [[x] for x in results] @@ -175,21 +210,13 @@ def infer_batch(self, encoded_inputs): return infer_data def preprocess(self, input_data: list): - text = [InputExample(text_a=x) for x in input_data] - inputs = [self._template.wrap_one_example(x) for x in text] - inputs = { - "input_ids": - np.array([x["input_ids"] for x in inputs], dtype="int64"), - "mask_ids": - np.array([x["mask_ids"] for x in inputs], dtype="int64"), - "soft_token_ids": - np.array([x["soft_token_ids"] for x in inputs], dtype="int64") - } + text = [{"text_a": x} for x in input_data] + inputs = [self.template(x) for x in text] return inputs def postprocess(self, infer_data): preds = np.argmax(infer_data[0], axis=-1) - labels = [self._label_list[x] for x in preds] + labels = [self.labels[x] for x in preds] return {"label": labels} def printer(self, result, input_data): @@ -204,12 +231,10 @@ def printer(self, result, input_data): for arg_name, arg_value in vars(args).items(): logger.info("{:20}: {}".format(arg_name, arg_value)) - export_path = os.path.dirname(args.model_path_prefix) - labels, _ = Verbalizer.load_from(export_path) + predictor = MultiClassPredictor(args) text_dir = os.path.join(args.data_dir, "data.txt") with open(text_dir, "r", encoding="utf-8") as f: text_list = [x.strip() for x in f.readlines()] - predictor = MultiClassPredictor(args, labels) predictor.predict(text_list) diff --git a/applications/text_classification/multi_class/few-shot/requirements_cpu.txt b/applications/text_classification/multi_class/few-shot/requirements_cpu.txt new file mode 100644 index 000000000000..bbe76e363f00 --- /dev/null +++ b/applications/text_classification/multi_class/few-shot/requirements_cpu.txt @@ -0,0 +1,5 @@ +psutil +paddlepaddle>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime diff --git a/applications/text_classification/multi_class/few-shot/requirements_gpu.txt b/applications/text_classification/multi_class/few-shot/requirements_gpu.txt new file mode 100644 index 000000000000..66454bd8b6b5 --- /dev/null +++ b/applications/text_classification/multi_class/few-shot/requirements_gpu.txt @@ -0,0 +1,7 @@ +psutil +paddlepaddle-gpu>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime-gpu +onnx +onnxconverter-common diff --git a/applications/text_classification/multi_class/few-shot/train.py b/applications/text_classification/multi_class/few-shot/train.py index a07d27ea5879..0e67750deb23 100644 --- a/applications/text_classification/multi_class/few-shot/train.py +++ b/applications/text_classification/multi_class/few-shot/train.py @@ -14,6 +14,9 @@ from dataclasses import dataclass, field import os +from collections import defaultdict + +import numpy as np 
import paddle from paddle.static import InputSpec @@ -37,8 +40,6 @@ class DataArguments: data_dir: str = field(default="./data/", metadata={"help": "Path to a dataset which includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional)."}) prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."}) - soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."}) - encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."}) @dataclass @@ -63,17 +64,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) # Define the template for preprocess and the verbalizer for postprocess. - template = AutoTemplate.create_from( - data_args.prompt, - tokenizer, - training_args.max_seq_length, - model=model, - prompt_encoder=data_args.soft_encoder, - encoder_hidden_size=data_args.encoder_hidden_size) - logger.info("Using template: {}".format(template.template)) + template = AutoTemplate.create_from(data_args.prompt, + tokenizer, + training_args.max_seq_length, + model=model) + logger.info("Using template: {}".format(template.prompt)) label_file = os.path.join(data_args.data_dir, "label.txt") - verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file) + with open(label_file, "r", encoding="utf-8") as fp: + label_words = defaultdict(list) + for line in fp: + data = line.strip().split("==") + word = data[1] if len(data) > 1 else data[0].split("##")[-1] + label_words[data[0]].append(word) + verbalizer = SoftVerbalizer(label_words, tokenizer, model) # Load the few-shot datasets. train_ds, dev_ds, test_ds = load_local_dataset( @@ -133,11 +137,24 @@ def compute_metrics(eval_preds): # Export static model. 
if training_args.do_export: + template = prompt_model.template + template_keywords = template.extract_template_keywords(template.prompt) input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids + InputSpec(shape=[None, None], dtype="int64"), # input_ids, + InputSpec(shape=[None, None], dtype="int64"), # token_type_ids + InputSpec(shape=[None, None], dtype="int64"), # position_ids + InputSpec(shape=[None, None, None, None], + dtype="float32") # attention_mask ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], + dtype="int64")) # masked_positions + if "soft" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # soft_token_ids + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # encoder_ids export_path = os.path.join(training_args.output_dir, 'export') trainer.export_model(export_path, input_spec=input_spec, diff --git a/applications/text_classification/multi_class/few-shot/utils.py b/applications/text_classification/multi_class/few-shot/utils.py index 907b640ea96b..b440c9627917 100644 --- a/applications/text_classification/multi_class/few-shot/utils.py +++ b/applications/text_classification/multi_class/few-shot/utils.py @@ -15,7 +15,6 @@ import os from paddlenlp.datasets import load_dataset -from paddlenlp.prompt import InputExample def load_local_dataset(data_path, splits, label_list): @@ -38,10 +37,10 @@ def _reader(data_file, label_list): for idx, line in enumerate(fp): data = line.strip().split("\t") if len(data) == 1: - yield InputExample(text_a=data[0]) + yield {"text_a": data[0]} else: text, label = data - yield InputExample(text_a=text, labels=label_list[label]) + yield {"text_a": text, "labels": label_list[label]} assert isinstance(splits, list) and len(splits) > 0 diff --git a/applications/text_classification/multi_label/analysis/word_interpret.ipynb b/applications/text_classification/multi_label/analysis/word_interpret.ipynb index 07dd3f85c839..d6601cd4d4ed 100644 --- a/applications/text_classification/multi_label/analysis/word_interpret.ipynb +++ b/applications/text_classification/multi_label/analysis/word_interpret.ipynb @@ -1,32 +1,8 @@ { - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13-final" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.7.13 64-bit", - "metadata": { - "interpreter": { - "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90" - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 2, "cells": [ { + "cell_type": "markdown", + "metadata": {}, "source": [ "# 词级别可解释性分析\n", "本项目提供模型的词级别可解释性分析,包括LIME、Integrated Gradient、GradShap 三种分析方法,支持分析微调后模型的预测结果,开发者可以通过更改**数据目录**和**模型目录**在自己的任务中使用此项目进行数据分析。\n", @@ -54,34 +30,16 @@ "准予原告胡某甲与被告韩某甲离婚。\n", "...\n", "```\n" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { - "source": [ - "import functools\n", - "import random\n", - "import os\n", - "import argparse\n", - "\n", - "import jieba\n", - "import numpy as np\n", - "from trustai.interpretation import VisualizationTextRecord\n", - "from trustai.interpretation import get_word_offset\n", - "import paddle\n", - "from paddle.io import 
DataLoader, BatchSampler\n", - "from paddlenlp.data import DataCollatorWithPadding\n", - "from paddlenlp.datasets import load_dataset\n", - "from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer\n" - ], "cell_type": "code", - "metadata": {}, "execution_count": 1, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n", "/usr/local/lib/python3.7/dist-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", @@ -96,6 +54,22 @@ " resample=Image.BICUBIC,\n" ] } + ], + "source": [ + "import functools\n", + "import random\n", + "import os\n", + "import argparse\n", + "\n", + "import jieba\n", + "import numpy as np\n", + "from trustai.interpretation import VisualizationTextRecord\n", + "from trustai.interpretation import get_word_offset\n", + "import paddle\n", + "from paddle.io import DataLoader, BatchSampler\n", + "from paddlenlp.data import DataCollatorWithPadding\n", + "from paddlenlp.datasets import load_dataset\n", + "from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer\n" ] }, { @@ -127,11 +101,11 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "## 2.读取待分析数据" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", @@ -148,6 +122,7 @@ " items = line.strip().split('\\t')\n", " if items[0] == 'Text':\n", " continue\n", + " items[0] = items[0][:MAX_LENGTH-2]\n", " if len(items) == 3:\n", " yield {'text': items[0], 'label': items[1], 'predict': items[2]}\n", " elif len(items) == 2:\n", @@ -181,8 +156,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "\u001b[32m[2022-09-28 04:51:03,566] [ INFO]\u001b[0m - We are using to load '../checkpoint/'.\u001b[0m\n", "W0928 04:51:03.570216 4827 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2\n", @@ -229,12 +204,12 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "## 3.开始数据可解释性分析\n", "数据量较大时,数据分析时间较长,请耐心等待" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", @@ -242,8 +217,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Start token level interpretion, it will take some time...\n", "Building prefix dict from the default dictionary ...\n", @@ -292,12 +267,12 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "## 4.数据可解释性分析结果可视化\n", "使用用颜色深浅可视化方式代表句子中词对预测结果的重要程度" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", @@ -364,12 +339,16 @@ "metadata": {}, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": "", - "text/html": "
LabelPredictionKey wordsImportant visualization
不履行家庭义务,婚后分居婚后分居至今 双方 出 分居 。 [CLS] 2015 2 23 被告 原告 家门 原告 居住 娘家 待产 双方 分居 至今 [SEP]
婚后有子女,限制行为能力子女抚养婚后有子女,限制行为能力子女抚养,不履行离婚协议财产 符合 付清 欠条 抚养 [CLS] 被告 孙某 辩称 离婚 协议 关于 财产 分割 给付 资金 符合 法律 规定 只有 离婚 子女 抚养 符合 法律 规定 没有 协议 代表 被告 真实 意思 表示 离婚 协议 没有 约定 付款 时间 而且 被告 原告 出具 欠条 5 年内 付清 原告 期满 起诉 驳回 [SEP]
存在非婚生子,支付抚养费,限制行为能力子女抚养限制行为能力子女抚养,存在非婚生子赵某 并非 认可 之女 表示 [CLS] 被告 董某 认可 赵某 并非 原告 之女 表示 愿意 自行 抚养 赵某 [SEP]
准予离婚准予离婚,法定离婚原告 韩某 准予 离婚 。 [CLS] 准予 原告 胡某 被告 韩某 离婚 [SEP]
" + "text/html": [ + "
LabelPredictionKey wordsImportant visualization
不履行家庭义务,婚后分居婚后分居至今 双方 出 分居 。 [CLS] 2015 2 23 被告 原告 家门 原告 居住 娘家 待产 双方 分居 至今 [SEP]
婚后有子女,限制行为能力子女抚养婚后有子女,限制行为能力子女抚养,不履行离婚协议财产 符合 付清 欠条 抚养 [CLS] 被告 孙某 辩称 离婚 协议 关于 财产 分割 给付 资金 符合 法律 规定 只有 离婚 子女 抚养 符合 法律 规定 没有 协议 代表 被告 真实 意思 表示 离婚 协议 没有 约定 付款 时间 而且 被告 原告 出具 欠条 5 年内 付清 原告 期满 起诉 驳回 [SEP]
存在非婚生子,支付抚养费,限制行为能力子女抚养限制行为能力子女抚养,存在非婚生子赵某 并非 认可 之女 表示 [CLS] 被告 董某 认可 赵某 并非 原告 之女 表示 愿意 自行 抚养 赵某 [SEP]
准予离婚准予离婚,法定离婚原告 韩某 准予 离婚 。 [CLS] 准予 原告 胡某 被告 韩某 离婚 [SEP]
" + ], + "text/plain": [ + "" + ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -377,5 +356,31 @@ "html = visualize(align_res, interpret_ds)" ] } - ] -} \ No newline at end of file + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7.13 64-bit", + "metadata": { + "interpreter": { + "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90" + } + }, + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13-final" + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/applications/text_classification/multi_label/deploy/predictor/README.md b/applications/text_classification/multi_label/deploy/predictor/README.md index 4c7ff45c8aab..fb9aca5a731e 100644 --- a/applications/text_classification/multi_label/deploy/predictor/README.md +++ b/applications/text_classification/multi_label/deploy/predictor/README.md @@ -12,7 +12,7 @@ 如果基于GPU部署,请先确保机器已正确安装NVIDIA相关驱动和基础软件,确保CUDA >= 11.2,CuDNN >= 8.2,并使用以下命令安装所需依赖: ```shell -python -m pip install onnxruntime-gpu onnx onnxconverter-common +python -m pip install onnxruntime-gpu onnx onnxconverter-common==1.9.0 ``` 如果基于CPU部署,请使用如下命令安装所需依赖: diff --git a/applications/text_classification/multi_label/few-shot/README.md b/applications/text_classification/multi_label/few-shot/README.md index d7ee94ff68f2..4bb3bbf51fb5 100644 --- a/applications/text_classification/multi_label/few-shot/README.md +++ b/applications/text_classification/multi_label/few-shot/README.md @@ -70,9 +70,9 @@ 内存: 630 GB -3. PaddlePaddle 版本:2.3.1 +3. PaddlePaddle 版本:2.4rc -4. PaddleNLP 版本:2.3.5 (develop) +4. PaddleNLP 版本:2.4.3 5. 评估设置 @@ -88,7 +88,7 @@ - 提示学习 ``` - python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这句话包含的要素有" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model macro_f1_score --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch + python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这句话包含的要素有" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model macro_f1_score --load_best_model_at_end --eval_steps 100 --save_total_limit 1 ``` 6. 
精度评价指标:Micro F1分数、Macro F1分数 @@ -96,7 +96,7 @@ | model_name | 训练方式 | Micro F1分数 | Macro F1分数 | | ---------- | ------- | ----------- | ----------- | | ernie-3.0-base-zh | 微调学习 | 0.7419 | 0.5105 | - | ernie-3.0-base-zh | 提示学习 | 0.7838 | 0.6985 | + | ernie-3.0-base-zh | 提示学习 | 0.7839 | 0.6003 | ## 3.定制训练 @@ -106,10 +106,10 @@ ### 3.1 运行环境 -- python >= 3.6 -- paddlepaddle > 2.3(2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)) -- paddlenlp >= 2.3.5 -- paddle2onnx >= 1.0.0rc3 +- python >= 3.7 +- paddlepaddle >= 2.4rc +- paddlenlp >= 2.4.3 +- paddle2onnx >= 1.0.3 ### 3.2 代码结构 @@ -223,6 +223,7 @@ python train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ @@ -248,6 +249,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ @@ -274,6 +276,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ - `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。 - `num_train_epochs`: 训练的最大轮数。 - `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。 +- `save_total_limit`: 模型检查点保存数量。 - `device`: 使用的设备,默认为`gpu`。 - `eval_steps`: 评估模型的间隔步数。 - `logging_steps`: 打印日志的间隔步数。 @@ -352,9 +355,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data - 可配置参数说明: - `model_path_prefix`: 导出的静态图模型路径及文件前缀。 -- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 +- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 - `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。 -- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 +- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 - `batch_size`: 每次预测的样本数量。 - `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。 - `device_id`: 指定GPU设备ID。 diff --git a/applications/text_classification/multi_label/few-shot/infer.py b/applications/text_classification/multi_label/few-shot/infer.py index 48d42d294e96..2014afaef56b 100644 --- a/applications/text_classification/multi_label/few-shot/infer.py +++ b/applications/text_classification/multi_label/few-shot/infer.py @@ -14,23 +14,24 @@ import os import six +import json import psutil import argparse import numpy as np from paddlenlp.utils.log import logger -from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample -from paddlenlp.transformers import AutoTokenizer +from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM import paddle2onnx import onnxruntime as ort # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.") -parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.") +parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.") parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.") -parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") 
+parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.") parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.") parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.") @@ -103,11 +104,6 @@ def __init__(self, 'device_id': device_id }]) - self.input_handles = [ - self.predictor.get_inputs()[0].name, - self.predictor.get_inputs()[1].name, - self.predictor.get_inputs()[2].name - ] if device == "gpu": try: @@ -122,27 +118,52 @@ def __init__(self, logger.info(">>> [InferBackend] Engine Created ...") def infer(self, input_dict: dict): - input_dict = { - k: v - for k, v in input_dict.items() if k in self.input_handles - } result = self.predictor.run(None, input_dict) return result class MultiLabelPredictor(object): - def __init__(self, args, label_list): - self._label_list = label_list - self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self._max_seq_length = args.max_seq_length - self._batch_size = args.batch_size - self.inference_backend = InferBackend(args.model_path_prefix, - args.device, args.device_id, - args.use_fp16, args.num_threads) - self._template = AutoTemplate.load_from( - os.path.dirname(args.model_path_prefix), self._tokenizer, - args.max_seq_length) + def __init__(self, args): + self.args = args + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.model = AutoModelForMaskedLM.from_pretrained(args.model_name) + self.template, self.labels, self.input_handles = self.post_init() + self.collate_fn = PromptDataCollatorWithPadding( + self.tokenizer, + padding=True, + return_tensors="np", + return_attention_mask=True) + + self.inference_backend = InferBackend(self.args.model_path_prefix, + self.args.device, + self.args.device_id, + self.args.use_fp16, + self.args.num_threads) + + def post_init(self): + export_path = os.path.dirname(self.args.model_path_prefix) + template_path = os.path.join(export_path, "template_config.json") + with open(template_path, "r") as fp: + prompt = json.load(fp) + template = AutoTemplate.create_from(prompt, self.tokenizer, + self.args.max_length, + self.model) + keywords = template.extract_template_keywords(template.prompt) + inputs = [ + "input_ids", "token_type_ids", "position_ids", "attention_mask", + "masked_positions" + ] + if "soft" in keywords: + inputs.append("soft_token_ids") + if "encoder" in keywords: + inputs.append("encoder_ids") + verbalizer_path = os.path.join(export_path, "verbalizer_config.json") + with open(verbalizer_path, "r") as fp: + label_words = json.load(fp) + labels = sorted(list(label_words.keys())) + + return template, labels, inputs def predict(self, input_data: list): encoded_inputs = self.preprocess(input_data) @@ -155,14 +176,27 @@ def _infer(self, input_dict): infer_data = self.inference_backend.infer(input_dict) return infer_data - def infer_batch(self, encoded_inputs): - num_sample = len(encoded_inputs["input_ids"]) + def infer_batch(self, inputs): + num_sample = len(inputs) infer_data = None num_infer_data = None - for idx in range(0, num_sample, self._batch_size): - l, r = idx, idx + self._batch_size - keys = encoded_inputs.keys() - input_dict = {k: encoded_inputs[k][l:r] for k in keys} + for index in range(0, num_sample, self.args.batch_size): + left, right = index, 
index + self.args.batch_size + batch_dict = self.collate_fn(inputs[left:right]) + input_dict = {} + for key in self.input_handles: + value = batch_dict[key] + if key == "attention_mask": + if value.ndim == 2: + value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4 + elif value.ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}" + .format(value.ndim)) + value = value.astype("float32") + else: + value = value.astype("int64") + input_dict[key] = value results = self._infer(input_dict) if infer_data is None: infer_data = [[x] for x in results] @@ -175,16 +209,8 @@ def infer_batch(self, encoded_inputs): return infer_data def preprocess(self, input_data: list): - text = [InputExample(text_a=x) for x in input_data] - inputs = [self._template.wrap_one_example(x) for x in text] - inputs = { - "input_ids": - np.array([x["input_ids"] for x in inputs], dtype="int64"), - "mask_ids": - np.array([x["mask_ids"] for x in inputs], dtype="int64"), - "soft_token_ids": - np.array([x["soft_token_ids"] for x in inputs], dtype="int64") - } + text = [{"text_a": x} for x in input_data] + inputs = [self.template(x) for x in text] return inputs @staticmethod @@ -197,7 +223,7 @@ def postprocess(self, infer_data): label_ids = np.argwhere(probs > threshold) labels = [[] for _ in range(probs.shape[0])] for idx, label_id in label_ids: - labels[idx].append(self._label_list[label_id]) + labels[idx].append(self.labels[label_id]) return {"label": labels} def printer(self, result, input_data): @@ -212,12 +238,10 @@ def printer(self, result, input_data): for arg_name, arg_value in vars(args).items(): logger.info("{:20}: {}".format(arg_name, arg_value)) - export_path = os.path.dirname(args.model_path_prefix) - labels, _ = Verbalizer.load_from(export_path) + predictor = MultiLabelPredictor(args) text_dir = os.path.join(args.data_dir, "data.txt") with open(text_dir, "r", encoding="utf-8") as f: text_list = [x.strip() for x in f.readlines()] - predictor = MultiLabelPredictor(args, labels) predictor.predict(text_list) diff --git a/applications/text_classification/multi_label/few-shot/requirements_cpu.txt b/applications/text_classification/multi_label/few-shot/requirements_cpu.txt new file mode 100644 index 000000000000..bbe76e363f00 --- /dev/null +++ b/applications/text_classification/multi_label/few-shot/requirements_cpu.txt @@ -0,0 +1,5 @@ +psutil +paddlepaddle>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime diff --git a/applications/text_classification/multi_label/few-shot/requirements_gpu.txt b/applications/text_classification/multi_label/few-shot/requirements_gpu.txt new file mode 100644 index 000000000000..66454bd8b6b5 --- /dev/null +++ b/applications/text_classification/multi_label/few-shot/requirements_gpu.txt @@ -0,0 +1,7 @@ +psutil +paddlepaddle-gpu>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime-gpu +onnx +onnxconverter-common diff --git a/applications/text_classification/multi_label/few-shot/train.py b/applications/text_classification/multi_label/few-shot/train.py index 44f0c190c90f..6d12f6b3e5fd 100644 --- a/applications/text_classification/multi_label/few-shot/train.py +++ b/applications/text_classification/multi_label/few-shot/train.py @@ -15,6 +15,7 @@ from dataclasses import dataclass, field import os import sys +from collections import defaultdict import paddle import paddle.nn.functional as F @@ -41,8 +42,6 @@ class DataArguments: data_dir: str = field(default="./data", metadata={"help": "The dataset dictionary includes train.txt, dev.txt and label.txt 
files."}) prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."}) - soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."}) - encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."}) @dataclass @@ -67,17 +66,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) # Define the template for preprocess and the verbalizer for postprocess. - template = AutoTemplate.create_from( - data_args.prompt, - tokenizer, - training_args.max_seq_length, - model=model, - prompt_encoder=data_args.soft_encoder, - encoder_hidden_size=data_args.encoder_hidden_size) - logger.info("Using template: {}".format(template.template)) + template = AutoTemplate.create_from(data_args.prompt, + tokenizer, + training_args.max_seq_length, + model=model) + logger.info("Using template: {}".format(template.prompt)) label_file = os.path.join(data_args.data_dir, "label.txt") - verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file) + with open(label_file, "r", encoding="utf-8") as fp: + label_words = defaultdict(list) + for line in fp: + data = line.strip().split("==") + word = data[1] if len(data) > 1 else data[0].split("##")[-1] + label_words[data[0]].append(word) + verbalizer = SoftVerbalizer(label_words, tokenizer, model) # Load the few-shot datasets. train_ds, dev_ds, test_ds = load_local_dataset( @@ -139,11 +141,24 @@ def compute_metrics(eval_preds): # Export static model. if training_args.do_export: + template = prompt_model.template + template_keywords = template.extract_template_keywords(template.prompt) input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids + InputSpec(shape=[None, None], dtype="int64"), # input_ids, + InputSpec(shape=[None, None], dtype="int64"), # token_type_ids + InputSpec(shape=[None, None], dtype="int64"), # position_ids + InputSpec(shape=[None, None, None, None], + dtype="float32") # attention_mask ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], + dtype="int64")) # masked_positions + if "soft" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # soft_token_ids + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # encoder_ids export_path = os.path.join(training_args.output_dir, 'export') trainer.export_model(export_path, input_spec=input_spec, diff --git a/applications/text_classification/multi_label/few-shot/utils.py b/applications/text_classification/multi_label/few-shot/utils.py index 6891c22829fc..4855e43e7bf3 100644 --- a/applications/text_classification/multi_label/few-shot/utils.py +++ b/applications/text_classification/multi_label/few-shot/utils.py @@ -15,7 +15,6 @@ import os from paddlenlp.datasets import load_dataset -from paddlenlp.prompt import InputExample def load_local_dataset(data_path, splits, label_list): @@ -39,14 +38,14 @@ def _reader(data_file, label_list): for idx, line in enumerate(fp): data = line.strip().split("\t") if len(data) == 1: - yield InputExample(text_a=data[0]) + yield {"text_a": data[0]} else: text, label = data label = label.strip().split(",") label = [ float(1) if x in label else float(0) for x in label_list ] - yield InputExample(text_a=text, labels=label) + yield {"text_a": text, "labels": label} split_map = 
{"train": "train.txt", "dev": "dev.txt", "test": "test.txt"} datasets = [] diff --git a/docs/advanced_guide/prompt.md b/docs/advanced_guide/prompt.md index e45aca61d4b6..c8b720261302 100644 --- a/docs/advanced_guide/prompt.md +++ b/docs/advanced_guide/prompt.md @@ -22,11 +22,11 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi * [如何定义模板](#如何定义模板) * [离散型模板](#离散型模板) * [连续型模板](#连续型模板) + * [前缀连续型模板](#前缀连续型模板) * [快速定义模板](#快速定义模板) * [如何定义标签词映射](#如何定义标签词映射) - * [单掩码映射](#单掩码映射) - * [多掩码映射](#多掩码映射) - * [标签词映射分类](#标签词映射分类) + * [离散型标签词映射](#离散型标签词映射) + * [连续型标签词映射](#连续型标签词映射) * [快速开始训练](#快速开始训练) * [数据准备](#数据准备) * [预训练参数准备](#预训练参数准备) @@ -39,18 +39,28 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi ## 如何定义模板 -**模板**(Template)的功能是在原有输入文本上增加提示语句,从而将原任务转化为 MLM 任务,可以分为离散型和连续型两种。Prompt API 中提供了统一的数据结构来构造不同类型的模板,输入相应格式的**字符串**,通过解析得到对应的输入模板,即字典构成的列表。 +**模板**(Template)的功能是在原有输入文本上增加提示语句,从而将原任务转化为 MLM 任务,可以分为离散型和连续型两种。Prompt API 中提供了统一的数据结构来构造不同类型的模板,输入相应格式的**字符串**,通过解析得到对应的输入模板。模板由不同字段构成,可任意组合。每个字段中的关键字定义了数据文本或者提示文本,即 `input_ids`,属性可定义该字段是否可截断,以及对应的 `position_ids`,`token_type_ids` 等。 ### 离散型模板 离散型模板 `ManualTemplate` 是直接将提示语句与原始输入文本拼接起来,二者的词向量矩阵共享,均为预训练模型学到的词向量矩阵。可用于实现 PET、RGL 等算法。 -**模板关键字** +**模板关键字及属性** -- ``text`` :数据集中原始输入文本对应的关键字,包括`text_a`和`text_b`。[数据准备](#数据准备)中介绍了如何将自定义数据集转化为统一格式。 -- ``hard`` :自定义的文本提示语句。 +- ``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`、`text_b` 和 `content`。 +- ``hard`` :自定义的提示语句文本。 - ``mask`` :待预测词的占位符。 -- ``sep`` :用于区分不同的句子。`sep`前后的句子对应不同的`token_type_id`。 + - ``length`` :定义 ``mask`` 的数量。 +- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。 +- ``options`` :数据集字典或者文件中的候选标签序列。 + - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。 + - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。 + +**模版通用属性** + +- `position`: 定义当前字段的起始 `position id`。 +- `token_type`: 定义当前字段及后续字段的 `token type id`。 +- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。 **模板定义** @@ -64,16 +74,25 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi “{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'} ``` +``` +{'options': './data/label.txt'}{'sep'}下边两句话间的逻辑关系是什么?{'text': 'text_a'}{'sep': None, 'token_type': 1}{'text': 'text_b'} +``` +其中 `label.txt` 为候选标签的本地文件路径,每行一个候选标签,例如 + +``` +中立 +蕴含 +矛盾 +``` + **样本示例** 例如,对于自然语言推理任务,给定样本 ```python -from paddlenlp.prompt import InputExample -sample = InputExample(uid=0, - text_a="心里有些生畏,又不知畏惧什么", - text_b="心里特别开心", - labels="矛盾") +sample = { + "text_a": "心里有些生畏,又不知畏惧什么", "text_b": "心里特别开心", "labels": "矛盾" +} ``` 按照模板修改拼接后,最终输入模型的文本数据为 @@ -90,17 +109,17 @@ from paddlenlp.prompt import ManualTemplate from paddlenlp.transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -template = ManualTemplate(tokenizer=tokenizer, - max_seq_length=512, - template="“{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'}") -input_dict = template.wrap_one_example(sample) +template = ManualTemplate(prompt="“{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'}", + tokenizer=tokenizer, + max_length=512) +input_dict = template(sample) ``` 其中初始化参数定义如下 +- ``prompt`` :定义提示语句以及与输入文本组合方式的字符串。 - ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 -- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。 -- ``template`` :定义提示语句以及与输入文本组合方式的字符串。 +- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。 **使用技巧** @@ -115,24 +134,42 @@ input_dict = template.wrap_one_example(sample) **模板关键字** -- ``text`` 
:数据集中原始输入文本对应的关键字,包括`text_a`和`text_b`。[数据准备](#数据准备)中介绍了如何将自定义数据集转化为统一格式。 +- ``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`和`text_b`。 - ``hard`` :自定义的文本提示语句。 - ``mask`` :待预测词的占位符。 -- ``sep`` :用于区分不同的句子。`sep`前后的句子对应不同的`token_type_id`。 -- ``soft`` 表示连续型提示。若值为 ``None`` ,则使用对应数量的随机初始化向量作为提示;若值为文本,则使用对应长度的连续性向量作为提示,并预训练词向量中文本对应的向量进行初始化。 +- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。 +- ``soft`` 表示连续型提示。若值为 ``None`` ,则随机初始化提示向量;若值为文本,则使用文本对应的预训练字向量初始化提示向量。 + - ``length`` :定义 ``soft token`` 的数量。若定义文本长度小于该值,超过部分随机初始化。 + - ``encoder`` :定义 `soft token` 的编码器类型,可选 `lstm`,`mlp`。默认为 `None`, 不使用编码器。 + - ``hidden_size`` :定义编码器的隐藏层维度。默认与预训练词向量维度相同。 +- ``options`` :数据集字典或者文件中的候选标签序列。 + - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。 + - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。 + +**模版通用属性** + +- `position`: 定义当前字段的起始 `position id`。 +- `token_type`: 定义当前字段及后续字段的 `token type id`。 +- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。 **模板定义** - 定义长度为 1 的连续型提示,随机初始化: ```python -"{'soft': None}{'text': 'text_a'}{'sep'}{'text': 'text_b'}" +"{'soft'}{'text': 'text_a'}{'sep': None, 'token_type': 1}{'text': 'text_b'}" +``` + +- 定义长度为 10 的连续型提示,随机初始化,编码器为 `mlp`: + +```python +"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': None, 'length':10, 'encoder': 'mlp'}{'mask'}" ``` -- 定义长度为 10 的连续型提示,随机初始化,其中 ``duplicate`` 参数表示连续型提示的长度(仅在随机初始化时有效,即`soft`值为`None`): +- 定义长度为 15 的连续型提示,使用 `请判断` 初始化前三个 soft token,其余随机初始化,编码器为隐藏层维度为 100 的双层 LSTM: ```python -"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': None, `duplicate`:10}{'mask'}" +"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断:', 'length': 15, 'encoder': 'lstm', 'hidden_size': 100}{'mask'}" ``` - 定义长度为 15 的连续型提示,使用 `"请判断这两个句子间的逻辑关系:"` 的预训练词向量逐一进行初始化: @@ -156,38 +193,98 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -template = SoftTemplate(tokenizer=tokenizer, - max_seq_length=512, - model=model, - template="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", - prompt_encoder='lstm', - encoder_hidden_size=200) +template = SoftTemplate(prompt="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", + tokenizer=tokenizer, + max_length=512, + word_embeddings=model.get_input_embeddings()) ``` 其中初始化参数定义如下 +- ``prompt`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。 - ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 -- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。 -- ``model`` : 预训练语言模型,为了取预训练词向量用于连续型提示向量初始化。 -- ``template`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。 -- ``prompt_encoder`` : 连续型提示向量的编码器,可选 ``mlp`` 和 ``lstm``。默认为 ``None`` ,即无编码器,直接使用向量。 -- ``encoder_hidden_size`` : 连续型提示向量的维度。默认为 ``None`` ,即与预训练词向量维度相同。 +- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。 +- ``word_embeddings`` :预训练语言模型的词向量,用于连续型提示向量初始化。 +- ``soft_embeddings`` :连续型提示向量矩阵,可用于不同模板间的连续型参数共享。设置后将覆盖默认连续型向量矩阵。 **使用技巧** - 对于分类任务,推荐的连续型提示长度一般为10-20。 - 对于随机初始化的连续性 prompt 向量,通常用比预训练模型微调更大的学习率来更新参数。 - 与离散型模板相似,连续型模板对初始化参数也比较敏感。自定义提示语句作为连续性 prompt 向量的初始化参数通常比随机初始化效果好。 -- prompt_encoder 为已有论文中的策略,用于建模不同连续型提示向量之间的序列关系。在实际应用中推荐先去掉 prompt_encoder 调整向量初始化。 +- prompt_encoder 为已有论文中的策略,用于建模不同连续型提示向量之间的序列关系。 + + +### 前缀连续型模板 + +`PrefixTemplate` 同样使用了连续型向量作为提示,与 `SoftTemplate` 的不同,该模版的提示向量不仅仅作用于输入层,每层都会有相应的提示向量。可用于实现 P-Tuning 等算法。 + +**模板关键字** + +- 
``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`和`text_b`。 +- ``hard`` :自定义的文本提示语句。 +- ``mask`` :待预测词的占位符。 +- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。 +- ``prefix`` 表示连续型提示,该字段**必须**位于模板首位。若值为 ``None`` ,则随机初始化提示向量;若值为文本,则使用文本对应的预训练字向量初始化提示向量。 + - ``length`` :定义 ``soft token`` 的数量。若定义文本长度小于该值,超过部分随机初始化。 + - ``encoder`` :定义 `soft token` 的编码器类型,可选 `lstm`,`mlp`。默认为 `None`, 不使用编码器。 + - ``hidden_size`` :定义编码器的隐藏层维度。默认与预训练词向量维度相同。 +- ``options`` :数据集字典或者文件中的候选标签序列。 + - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。 + - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。 + +**模版通用属性** + +- `position`: 定义当前字段的起始 `position id`。 +- `token_type`: 定义当前字段及后续字段的 `token type id`。 +- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。 + +**模板定义** + +- 定义长度为 15 的连续型提示,随机初始化: + +```python +"{'prefix': '新闻类别', 'length': 10, 'encoder': 'lstm'}{'text': 'text_a'}" +``` + +- 定义混合模板,这里`prefix`关键字对应的提示和`hard`对应的提示对应两套不同的向量: + +```python +"{'prefix': '自然语言推理任务:', 'encoder': 'mlp'}{'text': 'text_a'}{'sep'}{'text': 'text_b'}这两个句子间的逻辑关系是{'mask'}" +``` + + +**调用 API** + +```python +from paddlenlp.prompt import PrefixTemplate +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM + +model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") +tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") +template = PrefixTemplate(prompt="{'prefix': '任务描述'}{'text': 'text_a'}{'mask'}", + tokenizer=tokenizer, + max_length=512, + model=model, + prefix_dropout=0.1) +``` + +其中初始化参数定义如下 + +- ``prompt`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。 +- ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 +- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。 +- ``model`` :预训练语言模型,用于连续型提示向量初始化,以及根据模型结构生成每层对应的提示向量。 +- ``prefix_dropout`` :连续型提示向量的丢弃概率,用于正则化。 ### 快速定义模板 -PaddleNLP 提供了 ``AutoTemplate`` API 以便快速定义单句输入的手工初始化的连续型模板,同时支持直接按照模板类型自动切换离散型模板和离散型模板。 +PaddleNLP 提供了 ``AutoTemplate`` API 快速定义简化离散型模板,也可根据完整模板字符串自动切换 ManualTemplate、SoftTemplate 和 PrefixTemplate。 **模板定义** -- 只定义用于初始化连续型向量的文本提示,即可得到拼接到句尾的连续型模板输入。例如, +- 快速定义离散型的文本提示。例如, ```python "这篇文章表达了怎样的情感?" 
@@ -196,7 +293,7 @@ PaddleNLP 提供了 ``AutoTemplate`` API 以便快速定义单句输入的手工 等价于 ```python -"{'text': 'text_a'}{'soft': '这篇文章表达了怎样的情感?'}{'mask'}" +"{'text': 'text_a'}{'hard': '这篇文章表达了怎样的情感?'}{'mask'}" ``` - 当输入为完整模板字符串时,解析得到的模板与[离散型模板](#离散型模板)和[连续型模板](#连续型模板)中描述的一致。 @@ -210,40 +307,37 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") # 离散型模板,返回值为 ManualTemplate 实例 -template = AutoTemplate.create_from(template="{'text': 'text_a'}和{'text': 'text_b'}之间的逻辑关系是{'mask'}", +template = AutoTemplate.create_from(prompt="这个句子表达了怎样的情感?", + tokenizer=tokenizer, + max_length=512) + +template = AutoTemplate.create_from(prompt="这个句子表达了怎样的情感?{'text': 'text_a'}{'mask'}", tokenizer=tokenizer, - max_seq_length=512) + max_length=512) # 连续型模板,返回值为 SoftTemplate 实例 -template = AutoTemplate.create_from(template="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", +template = AutoTemplate.create_from(prompt="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", tokenizer=tokenizer, - max_seq_length=512, - model=model, - prompt_encoder='lstm', - encoder_hidden_size=200) + max_length=512, + model=model) -# 快速定义单句连续型模板,返回值为 SoftTemplate 实例 -template = AutoTemplate.create_from(template="这篇文章表达了怎样的情感?", +# 前缀连续型模板,返回值为 PrefixTemplate 实例 +template = AutoTemplate.create_from(prompt="{'prefix': None, 'encoder': 'mlp', 'hidden_size': 50}{'text': 'text_a'}", tokenizer=tokenizer, - max_seq_length=512, - model=model, - prompt_encoder='lstm', - encoder_hidden_size=200) + max_length=512, + model=model) ``` 其中初始化参数定义如下 +- ``prompt`` :定义离散型/连续型提示、初始化以及和输入文本的组合方式。 - ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 -- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。 +- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。 - ``model`` :预训练语言模型,为了取预训练词向量用于连续型提示向量初始化。 -- ``template`` :定义离散型/连续型提示、初始化以及和输入文本的组合方式。 -- ``prompt_encoder`` :连续型提示向量的编码器,可选 ``mlp`` 和 ``lstm`` 。默认为 ``None`` ,即无编码器,直接使用向量。 -- ``encoder_hidden_size`` :连续型提示向量的维度。默认为 ``None`` ,即与预训练词向量维度相同。 - ## 如何定义标签词映射 -**标签词映射**(Verbalizer)也是提示学习中可选的重要模块,用于建立预测词和标签之间的映射,将“预训练-微调”模式中预测标签的任务转换为预测模板中掩码位置的词语,从而将下游任务统一为预训练任务的形式。目前框架支持了离散型标签词映射和 [Word-level Adversarial ReProgramming (WARP)](https://aclanthology.org/2021.acl-long.381/) 方法。 +**标签词映射**(Verbalizer)也是提示学习中可选的重要模块,用于建立预测词和标签之间的映射,将“预训练-微调”模式中预测标签的任务转换为预测模板中掩码位置的词语,从而将下游任务统一为预训练任务的形式。目前框架支持了离散型标签词映射和连续型标签词映射 [Word-level Adversarial ReProgramming (WARP)](https://aclanthology.org/2021.acl-long.381/) 方法。 例如,在情感二分类任务中,微调方法和提示学习的标签体系如下 @@ -259,9 +353,9 @@ template = AutoTemplate.create_from(template="这篇文章表达了怎样的情 具体来说,对于模板 ``{'text':'text_a'}这句话表示我{'mask'}满意。`` ,我们使用映射 ``{'负向': '不', '正向': '很'}`` 将标签 ``负向`` 映射为 ``不`` ,将标签 ``正向`` 映射为 ``很`` 。也就是说,我们期望对于正向情感的文本,预测结果为 ``...这句话表示我很满意。`` ,对于负向情感的文本,预测结果为 ``...这句话表示我不满意。`` -### 单掩码映射 +### 离散型标签词映射 -``ManualVerbalizer`` 支持构造简单的单 ``{'mask'}`` 标签词映射,直接作用于 ``AutoMaskedLM`` 模型结构。当标签对应的预测词长度大于 ``1`` 时取均值。 +``ManualVerbalizer`` 支持构造 ``{'mask'}`` 对应的标签词映射,支持多``{'mask'}``,直接作用于 ``AutoMaskedLM`` 模型结构。当标签对应的预测词长度大于 ``1`` 时,默认取均值。 **调用 API** @@ -271,41 +365,15 @@ from paddlenlp.transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") verbalizer = ManualVerbalizer(tokenizer=tokenizer, - labels=['负向', '正向'], - label_words={'负向': '不', '正向': '很'}, - prefix=None) + label_words={'负向': '不', '正向': '很'}) ``` 其中初始化参数定义如下 +- ``label_words`` : 原标签到预测词之间的映射字典。 - 
``tokenizer`` : 预训练模型的 tokenizer,用于预测词的编码。 -- ``labels`` : 数据集的原标签列表(可选)。 -- ``label_words`` : 原标签到预测词之间的映射字典。如果同时定义了 ``labels`` ,二者的标签集合需要相同。 -- ``prefix`` : 预测词解码前增加的前缀,用于 ``RoBERTa`` 等对前缀敏感的模型,例如 `roberta-large`, `good` 和 ` good` 经过 tokenize 会得到不同的 id。默认为 ``None`` ,无前缀。 - -### 多掩码映射 - -``MultiMaskVerbalizer`` 继承自 ``ManualVerbalizer`` ,支持多 ``{'mask'}`` 标签词映射。预测词长度需与 ``{'mask'}`` 长度一致。 - -**调用 API** - -```python -from paddlenlp.prompt import MultiMaskVerbalizer -from paddlenlp.transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -verbalizer = MultiMaskVerbalizer(tokenizer=tokenizer, - labels=['负向', '正向'], - label_words={'负向': '生气', '正向': '高兴'}, - prefix=None) -``` - - -其中初始化参数定义同[单掩码映射](#单掩码映射) 。 - - -### 标签词映射分类 +### 连续型标签词映射 标签词映射分类器 ``SoftVerbalizer`` 修改了原 ``AutoMaskedLM`` 的模型结构,将预训练模型最后一层“隐藏层-词表”替换为“隐藏层-标签”的映射。该层网络的初始化参数由标签词映射中的预测词词向量来决定,如果预测词长度大于 ``1`` ,则使用词向量均值进行初始化。当前支持的预训练模型包括 ``ErnieForMaskedLM`` 、 ``BertForMaskedLM`` 、 ``AlbertForMaskedLM`` 和 ``RobertaForMaskedLM`` 。可用于实现 WARP 算法。 @@ -318,15 +386,13 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -verbalizer = SoftVerbalizer(tokenizer=tokenizer, - model=model, - labels=['负向', '正向'], - label_words={'负向': '生气', '正向': '高兴'}, - prefix=None) +verbalizer = SoftVerbalizer(label_words={'负向': '生气', '正向': '高兴'}, + tokenizer=tokenizer, + model=model) ``` -其中初始化参数定义同[单掩码映射](#单掩码映射) ,此外 - +- ``label_words`` : 原标签到预测词之间的映射字典。 +- ``tokenizer`` : 预训练模型的 tokenizer,用于预测词的编码。 - ``model`` :预训练语言模型,用于取预训练词向量进行“隐藏层-标签”网络的修改和初始化。 ## 快速开始训练 @@ -335,29 +401,24 @@ verbalizer = SoftVerbalizer(tokenizer=tokenizer, ### 数据准备 -Prompt 框架定义了统一的样本结构 ``InputExample`` 以便进行数据处理,数据集样本需要封装在 ``MapDataset`` 中。 +数据集封装为 ``MapDataset`` 类型。每条数据格式为字典结构,字典中关键字与模板中 `text` 定义的值相对应,统一使用 `labels` 关键字表示样本标签。 -例如,对于文本语义相似度 BUSTM 数据集中的原始样本 +例如,文本语义相似度 BUSTM 数据集中的数据样本 ```python -data = [ +from paddlenlp.datasets import MapDataset + +data_ds = MapDataset([ {'id': 3, 'sentence1': '你晚上吃了什么', 'sentence2': '你晚上吃啥了', 'label': 1}, {'id': 4, 'sentence1': '我想打开滴滴叫的士', 'sentence2': '你叫小欧吗', 'label': 0}, {'id': 5, 'sentence1': '女孩子到底是不是你', 'sentence2': '你不是女孩子吗', 'label': 1} -] -``` +]) +def convert_label_keyword(input_dict): + input_dict["labels"] = input_dict.pop("label") + return input_dict -需要转换为统一格式 - -```python -from paddlenlp.datasets import MapDataset -from paddlenlp.prompt import InputExample - -data_ds = MapDataset([InputExample(uid=example["id"], - text_a=example["sentence1"], - text_b=example["sentence2"], - labels=example["label"]) for example in data]) +data_ds = data_ds.map(convert_label_keyword) ``` ### 预训练参数准备 @@ -383,13 +444,13 @@ from paddlenlp.prompt import ManualVerbalizer from paddlenlp.prompt import PromptModelForSequenceClassification # 定义模板 -template = AutoTemplate.create_from(template="{'text': 'text_a'}和{'text': 'text_b'}说的是{'mask'}同的事情。", +template = AutoTemplate.create_from(prompt="{'text': 'text_a'}和{'text': 'text_b'}说的是{'mask'}同的事情。", tokenizer=tokenizer, - max_seq_length=512) + max_length=512) # 定义标签词映射 -verbalizer = ManualVerbalizer(tokenizer=tokenizer, - label_words={0: '不', 1: '相'}) +verbalizer = ManualVerbalizer(label_words={0: '不', 1: '相'}, + tokenizer=tokenizer) # 定义文本分类提示模型 prompt_model = PromptModelForSequenceClassification(model, @@ -404,8 +465,8 @@ prompt_model = PromptModelForSequenceClassification(model, - ``model`` : 预训练模型实例,支持 
``AutoModelForMaskedLM`` 和 ``AutoModelForSequenceClassification`` 。 - ``template`` : 模板实例。 - ``verbalizer`` : 标签词映射实例。当设为 ``None`` 时,不使用标签词映射,模型输出及损失值计算由 ``model`` 类型定义。 -- ``freeze_plm`` : 在训练时是否固定预训练模型参数。对于规模较小的预训练模型,推荐更新预训练模型参数。 -- ``freeze_dropout`` : 在训练时是否固定预训练模型参数并关闭 ``dropout`` 。 当 ``freeze_dropout=True`` ,``freeze_plm`` 也为 ``True`` 。 +- ``freeze_plm`` : 在训练时固定预训练模型参数,默认为 `False`。对于轻量级预训练模型,推荐使用默认值。 +- ``freeze_dropout`` : 在训练时固定预训练模型参数并关闭 ``dropout`` 。 当 ``freeze_dropout=True`` ,``freeze_plm`` 也为 ``True`` 。 ### 使用PromptTrainer训练 @@ -497,6 +558,8 @@ if training_args.do_train: - WARP: Word-level Adversarial ReProgramming. [[PDF]](https://aclanthology.org/2021.acl-long.381/) - RGL: A Simple yet Effective Relation Graph Augmented Prompt-based Tuning Approach for Few-Shot Learning. [[PDF]](https://aclanthology.org/2022.findings-naacl.81/) - R-Drop: Regularized Dropout for Neural Networks. [[PDF]](https://arxiv.org/abs/2106.14448) +- Openprompt: An open-source framework for prompt-learning. [[PDF]](https://arxiv.org/abs/2111.01998) + ### 附录 @@ -506,9 +569,9 @@ if training_args.do_train: | 参数 | 类型 | 默认值 | 含义 | | ---------------- | ------ | ------- | ------------------------------------------------------- | -| max_seq_length | int | 512 | 模型输入的最大长度,包括模板部分 | -| freeze_plm | bool | False | 是否在训练时固定预训练模型的参数 | -| freeze_dropout | bool | False | 是否在训练时固定预训练模型的参数,同时关闭 dropout | +| max_seq_length | int | 512 | 模型输入的最大长度,包括模板部分 | +| freeze_plm | bool | False | 是否在训练时固定预训练模型的参数 | +| freeze_dropout | bool | False | 是否在训练时固定预训练模型的参数,同时关闭 dropout | | use_rdrop | bool | False | 是否使用 RDrop 策略,详见 [RDrop 论文](https://arxiv.org/abs/2106.14448) | | alpha_rdrop | float | 5.0 | RDrop Loss 的权重 | | use_rgl | bool | False | 是否使用 RGL 策略,详见 [RGL 论文](https://aclanthology.org/2022.findings-naacl.81/) | diff --git a/docs/trainer.md b/docs/trainer.md index f141d15a665d..96566548f811 100644 --- a/docs/trainer.md +++ b/docs/trainer.md @@ -395,6 +395,37 @@ Trainer 是一个简单,但功能完整的 Paddle训练和评估模块,并 The value of initial scale_loss for fp16. (default: 32768) + --sharding + 是否使用Paddle的Sharding数据并行功能,用户的参数。支持sharding `stage1`, `stage2` or `stage3`。 + 其中`stage2``stage3`可以和`offload`组合使用。 + 每个种策略分别为: + stage1 : optimizer 中的参数切分到不同卡 + stage2 : optimizer + gradient 中的参数切分到不同卡 + stage3 : parameter + gradient + optimizer 中的参数都切分到不同卡 + offload : offload parameters to cpu 部分参数存放到cpu中 + (`str`, 可选, 默认为 `` 不使用sharding) + 注意:当前stage3暂时不可用 + + Whether or not to use Paddle Sharding Data Parallel training (in distributed training + only). The base option should be `stage1`, `stage2` or `stage3` and you can add + CPU-offload to `stage2` or `stage3` like this: `stage2 offload` or `stage3 offload`. + Each stage means: + stage1 : optimizer state segmentation + stage2 : optimizer state + gradient segmentation + stage3 : parameter + gradient + optimizer state segmentation + offload : offload parameters to cpu + NOTICE: stage3 is temporarily unavaliable. + + --sharding_degree + 设置sharding的通信组参数,表示通信组的大小。同一个sharding通信组内的参数,进行sharding,分布到不同卡上。 + 不同sharding通信组之间,相当于单纯的数据并行。此选项只在sharding选项开启时候生效。 + 默认值为-1,表示所有训练的卡在同一个通信组内。 + (`int`, 可选, 默认为 `-1`) + + Sharding parameter in certain cards group. For example, aussume we use 2 machines each + with 8 cards, then set sharding_degree=8, sharding will only communication inside machine. + default -1 means sharding parameters between all workers. 
(`int`, *optional*, defaults to `-1`) + --recompute 是否使用重计算训练。可以节省显存。 重新计算前向过程以获取梯度,减少中间变量显存 diff --git a/examples/language_model/moe/dygraph/framework/group_sharded.py b/examples/language_model/moe/dygraph/framework/group_sharded.py index e56e284d7942..682a70b53efc 100644 --- a/examples/language_model/moe/dygraph/framework/group_sharded.py +++ b/examples/language_model/moe/dygraph/framework/group_sharded.py @@ -35,7 +35,6 @@ from paddle.fluid.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.dygraph import base as imperative_base from paddle.fluid import core, layers, framework -from paddle.distributed import collective from paddle.incubate.distributed.models.moe.grad_clip import ClipGradForMOEByGlobalNorm # Old version @@ -72,18 +71,19 @@ def _dygraph_clip(self, params_grads): global_norm_var_normal, sum_dtype \ = self.get_l2_norm_pow(normal_params_grads) if global_norm_var_normal is not None: - collective.all_reduce(global_norm_var_normal, - op=collective.ReduceOp.SUM, - group=self.moe_group) + paddle.distributed.all_reduce(global_norm_var_normal, + op=paddle.distributed.ReduceOp.SUM, + group=self.moe_group) global_norm_var_moe = None if len(moe_params_grads) > 0: global_norm_var_moe, _ \ = self.get_l2_norm_pow(moe_params_grads, sum_dtype) if global_norm_var_moe is not None: - collective.all_reduce(global_norm_var_moe, - op=collective.ReduceOp.SUM, - group=self.moe_group) + paddle.distributed.all_reduce( + global_norm_var_moe, + op=paddle.distributed.ReduceOp.SUM, + group=self.moe_group) if global_norm_var_normal is None and global_norm_var_moe is None: return params_grads diff --git a/examples/language_model/t5/README.md b/examples/language_model/t5/README.md index 313a47c0079d..ef4ca649585f 100644 --- a/examples/language_model/t5/README.md +++ b/examples/language_model/t5/README.md @@ -46,12 +46,53 @@ python run_glue.py \ - `scheduler_type` scheduler类型,可选linear和cosine,默认linear。 - `output_dir` 表示模型保存路径。 +使用trainer进行Fine-tuning: +```shell +python -m paddle.distributed.launch --gpus "0,1,2,3" run_glue_trainer.py \ + --model_name_or_path t5-base \ + --task_name rte \ + --max_seq_length 256 \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 64 \ + --learning_rate 1e-4 \ + --weight_decay 0.01 \ + --warmup_ratio 0.1 \ + --num_train_epochs 10 \ + --eval_steps 200 \ + --logging_steps 20 \ + --save_steps 200 \ + --save_total_limit 3 \ + --metric_for_best_model "eval_accuarcy" \ + --fp16 false \ + --fp16_opt_level "O1" \ + --recompute true \ + --sharding "stage1" \ + --overwrite_output_dir \ + --disable_tqdm true \ + --output_dir outputs/rte/ +``` +具体参数含义请参见: https://paddlenlp.readthedocs.io/zh/latest/trainer.html + ###### t5-base模型在GLUE开发集上的结果: | Model | cola | sst-2 | mrpc | sts-b | qqp | mnli | qnli | rte | mean | |--------------------------------|-------|-------|-------------|------------------|-------------|-------------|------|-------|-------| | | mcc | acc | acc | pearson | acc | acc | acc | acc | | | T5-base-Paddle | 61.74 | 95.18 | 90.44 | 90.09 | 91.60 | 87.18 | 93.56 | 81.95 | 86.4675 | +###### t5_v1_1-base模型在GLUE开发集上的结果: +使用`run_glue_trainer.py`运行,由于`t5_v1_1-base`没有在glue任务上进行训练过,直接生成label的策略需要的训练时间需要更长。 +| Model | cola | sst-2 | mrpc | sts-b | qqp | mnli | qnli | rte | +|--------------------------------|-------|-------|-------------|------------------|-------------|-------------|------|-------| +| | mcc | acc | acc | pearson | acc | acc | acc | acc | +| T5-v1_1-base Paddle | 47.6845 | 94.38 | 84.31 | 87.74 | 88.05 | 85.39 | 
90.518 | 65.70 | +| epoch | 100 | 10 | 100 | 100 | 3 | 3 | 10 | 100 | + +注: +- 直接生成label的finetune方式难度较大,前期基本学习如何正确生成label标签,后期才学习分类任务。 +- 生成的label标签设计,标签差异大一些,效果会更好一些。 +- `qqp`,`mnli`数据集适当增大训练epoch数,可以取得更好效果。 ### GLUE Demo测试 diff --git a/examples/language_model/t5/data.py b/examples/language_model/t5/data.py index 2927533f7626..9df3329d80de 100644 --- a/examples/language_model/t5/data.py +++ b/examples/language_model/t5/data.py @@ -50,6 +50,33 @@ ), ]) +GLUE_1_1_PROCESSED = collections.OrderedDict([ + ("cola", (["cola sentence: "], ["outrageous", "acceptable"])), + ("sst-2", (["sst2 sentence: "], ["negative", "positive"])), + ( + "mrpc", + (["mrpc sentence1: ", " sentence2: "], ["nonidentical", "equivalent"]), + ), + ("sts-b", (["stsb sentence1: ", " sentence2: "], None)), + ("qqp", (["qqp question1: ", " question2: "], ["inequable", "duplicate"])), + ( + "mnli", + ( + ["mnli hypothesis: ", " premise: "], + ["contradiction", "entailment", "neutral"], + ), + ), + ( + "qnli", + (["qnli question: ", " sentence: "], ["entailment", "contradiction"]), + ), + ( + "rte", + (["rte sentence1: ", + " rte sentence2: "], ["entailment", "contradiction"]), + ), +]) + def trans_func(example, tokenizer, args): task_name = args.task_name diff --git a/examples/language_model/t5/run_glue_trainer.py b/examples/language_model/t5/run_glue_trainer.py new file mode 100644 index 000000000000..712062ff7320 --- /dev/null +++ b/examples/language_model/t5/run_glue_trainer.py @@ -0,0 +1,429 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
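+
+# Fine-tunes T5 on GLUE with the PaddleNLP Trainer by casting each task to
+# text-to-text form: the model generates the label words defined in
+# data.GLUE_PROCESSED / GLUE_1_1_PROCESSED, and T5GlueTrainer.prediction_step
+# decodes the generated text back to a label id (or a float for STS-B).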
+ +import logging +import math +import os +from functools import partial +from dataclasses import dataclass, field +from typing import Optional, Dict, Union, Any, List, Tuple + +from tqdm import tqdm +import paddle +import paddle.nn as nn +from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer +from paddlenlp.datasets import load_dataset +from paddlenlp.trainer import get_last_checkpoint +from paddlenlp.data import DataCollatorWithPadding, Pad +from paddlenlp.trainer import ( + PdArgumentParser, + TrainingArguments, + Trainer, +) +from paddlenlp.utils.log import logger + +from utils import load_pickle, save_pickle, GLUE_METRICS +from data import GLUE_PROCESSED, GLUE_1_1_PROCESSED + +label_length_map = { + "cola": 4, + "sst-2": 1, + "mrpc": 5, + "sts-b": 5, + "qqp": 5, + "mnli": 4, + "qnli": 5, + "rte": 5, +} + + +def trans_func(example, tokenizer, args): + task_name = args.task_name + PROCESSED = GLUE_PROCESSED + if "v1_1" in args.cache_dir: + PROCESSED = GLUE_1_1_PROCESSED + processed, label = PROCESSED[task_name] + if label: + id2label = dict(zip(range(len(label)), label)) + else: + id2label = None + + is_test = "labels" not in example + + if not is_test: + if id2label: + label_text = id2label[example["labels"]] + else: + label_text = str(example["labels"]) + target = tokenizer(label_text, + return_token_type_ids=False, + return_attention_mask=True) + + if len(processed) == 1: + text = processed[0] + example["sentence"] + else: + text = processed[0] + example["sentence1"] + processed[1] + example[ + "sentence2"] + + source = tokenizer( + text, + max_seq_len=args.max_seq_length, + padding='max_length', + return_token_type_ids=False, + return_attention_mask=True, + ) + + if not is_test: + return { + "input_ids": source["input_ids"], + "attention_mask": source["attention_mask"], + "labels": target["input_ids"], + "decoder_attention_mask": target["attention_mask"], + } + else: + return { + "input_ids": source["input_ids"], + "attention_mask": source["attention_mask"] + } + + +class BatchDict(object): + + def __init__(self, fn): + assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \ + 'Received fn=%s' % (str(fn)) + + self._fn = fn + + for col_name, ele_fn in self._fn.items(): + assert callable( + ele_fn + ), 'Batchify functions must be callable! type(fn[%d]) = %s' % ( + col_name, str(type(ele_fn))) + + def __call__(self, data): + + ret = {} + if len(data) <= 0: + return ret + + for col_name, ele_fn in self._fn.items(): + # skip unused col_name, such as labels in test mode. 
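+            # trans_func drops "labels" and "decoder_attention_mask" for unlabeled
+            # examples, so only the keys actually present in the batch are collated.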
+ if col_name not in data[0].keys(): + continue + result = ele_fn([ele[col_name] for ele in data]) + ret[col_name] = result + + return ret + + +def get_train_dataset(tokenizer, args): + filename = os.path.join(args.cache_dir, args.task_name + "_train" + ".pkl") + + if os.path.exists(filename): + ds = load_pickle(filename) + else: + ds = load_dataset("glue", args.task_name, splits="train") + ds.map( + partial(trans_func, tokenizer=tokenizer, args=args), + batched=False, + lazy=False, + ) + save_pickle(ds, filename) + + return ds + + +def get_dev_dataset(tokenizer, args): + filename = os.path.join(args.cache_dir, args.task_name + "_dev" + ".pkl") + + if os.path.exists(filename): + ds = load_pickle(filename) + else: + ds = load_dataset("glue", args.task_name, splits="dev") + ds.map( + partial(trans_func, tokenizer=tokenizer, args=args), + batched=False, + lazy=False, + ) + save_pickle(ds, filename) + + return ds + + +def get_mnli_dev_dataset(tokenizer, args, matched=True): + if matched: + split = "dev_matched" + else: + split = "dev_mismatched" + filename = os.path.join(args.cache_dir, + args.task_name + f"_{split}" + ".pkl") + if os.path.exists(filename): + ds = load_pickle(filename) + else: + ds = load_dataset("glue", args.task_name, splits=split) + ds.map( + partial(trans_func, tokenizer=tokenizer, args=args), + batched=False, + lazy=False, + ) + save_pickle(ds, filename) + + return ds + + +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: str = field( + default=None, + metadata={ + "help": "The name of the task to use (via the datasets library)." + }) + + max_seq_length: int = field( + default=128, + metadata={ + "help": + "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + cache_dir: str = field(default="./caches", + metadata={"help": "cache dir for datasets."}) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="t5-small", + metadata={ + "help": + "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" + }) + export_model_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Path to directory to store the exported inference model." 
+ }, + ) + + +class T5GlueTrainer(Trainer): + + def __init__(self, do_generation: bool, label2id, **kwargs): + super().__init__(**kwargs) + self.do_generation = do_generation + self.label2id = label2id + + def prediction_step( + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], + Optional[paddle.Tensor]]: + + if not self.do_generation: + return super().prediction_step( + model, + inputs, + prediction_loss_only=prediction_loss_only, + ignore_keys=ignore_keys) + + all_preds = [] + all_labels = [] + # source_ids, source_mask, labels, target_mask = batch + labels = inputs["labels"] + target_mask = inputs["decoder_attention_mask"] + + with paddle.no_grad(): + outputs = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + max_length=5, + )[0] + + for p, l, m in zip(outputs.numpy(), labels.numpy(), + target_mask.numpy()): + pred = self.tokenizer.decode(p, skip_special_tokens=True).strip() + label = self.tokenizer.decode(l[m.astype("bool")], + skip_special_tokens=True).strip() + + if self.label2id: + # for classifaction task. + label = self.label2id[label] + if pred not in self.label2id: + # set to wrong label if the generated text not in the labal set. + pred = 0 + if label == 0: + pred = 1 + else: + pred = self.label2id[pred] + else: + # for regression task. + label = float(label.replace(" ", "")) + try: + pred = float(pred.replace(" ", "")) + except Exception as e: + # set to zero if the generated text can not convert to float + pred = 0.0 + + all_preds.append(pred) + all_labels.append(label) + + all_preds = paddle.to_tensor(all_preds).detach() + all_labels = paddle.to_tensor(all_labels).detach() + + return (None, all_preds, all_labels) + + +def main(): + parser = PdArgumentParser( + (ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if "v1_1" in model_args.model_name_or_path: + data_args.cache_dir = "./caches_v1_1" + if not os.path.exists(data_args.cache_dir): + os.mkdir(data_args.cache_dir) + + # Log model and data config + training_args.print_config(model_args, "Model") + training_args.print_config(data_args, "Data") + + paddle.set_device(training_args.device) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + ) + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir( + training_args.output_dir + ) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir( + training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome.") + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + PROCESSED = GLUE_PROCESSED + if "v1_1" in data_args.cache_dir: + PROCESSED = GLUE_1_1_PROCESSED + label_name = PROCESSED[data_args.task_name][1] + if label_name: + label2id = dict(zip(label_name, range(len(label_name)))) + else: + label2id = None + metric_list = GLUE_METRICS[data_args.task_name] + generate_max_length = label_length_map[data_args.task_name] + + # get model and tokenizer + model = T5ForConditionalGeneration.from_pretrained( + model_args.model_name_or_path) + tokenizer = T5Tokenizer.from_pretrained(model_args.model_name_or_path) + + # get dataloader + train_dataset = get_train_dataset(tokenizer, data_args) + if data_args.task_name == "mnli": + eval_dataset = get_mnli_dev_dataset(tokenizer, data_args, matched=True) + eval_dataset_mismatch = get_mnli_dev_dataset(tokenizer, + data_args, + matched=False) + else: + eval_dataset = get_dev_dataset(tokenizer, data_args) + + batchify_fn = lambda samples, fn=BatchDict({ + "input_ids": + Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input_ids + "attention_mask": + Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64" + ), # attention_mask + "labels": + Pad(axis=0, pad_val=-100, dtype="int64"), # lm_labels + "decoder_attention_mask": + Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64" + ), # decoder_attention_mask + }): fn(samples) + data_collator = batchify_fn + + # Define the metrics of tasks. + def compute_metrics(p): + preds = p.predictions[0] if isinstance(p.predictions, + tuple) else p.predictions + + results = {} + for metric in metric_list: + results.update(metric(p.label_ids, preds)) + + return results + + trainer = T5GlueTrainer( + model=model, + criterion=None, + args=training_args, + data_collator=data_collator, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + do_generation=True, + label2id=label2id, + ) + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + # trainer.save_model() + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluate and tests model + if training_args.do_eval: + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) + + +if __name__ == "__main__": + main() diff --git a/faster_tokenizer/CMakeLists.txt b/fast_tokenizer/CMakeLists.txt similarity index 86% rename from faster_tokenizer/CMakeLists.txt rename to fast_tokenizer/CMakeLists.txt index c5325955ecca..8a24a4cccd3a 100644 --- a/faster_tokenizer/CMakeLists.txt +++ b/fast_tokenizer/CMakeLists.txt @@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.10) project(tokenizers LANGUAGES CXX C VERSION 1.0) -option(WITH_TESTING "Compile PaddleNLP faster_tokenizer with unit testing" OFF) -option(WITH_PYTHON "Compile PaddleNLP faster_tokenizer with python interpreter" ON) -add_definitions(-DFASTERTOKENIZER_LIB) +option(WITH_TESTING "Compile PaddleNLP fast_tokenizer with unit testing" OFF) +option(WITH_PYTHON "Compile PaddleNLP fast_tokenizer with python interpreter" ON) +add_definitions(-DFASTTOKENIZER_LIB) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set (PUBLIC_DEPEND_LIBS "") @@ -108,17 +108,6 
@@ ELSE(WIN32) set (PUBLIC_DEPEND_LIBS ${CMAKE_DL_LIBS}) ENDIF(WIN32) -# For OpenMP -# openmp not support well for now on windows -if (NOT APPLE AND NOT WIN32) # Linux - find_package(OpenMP) - if (OPENMP_FOUND) - add_definitions(-DWITH_OMP) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - endif() -endif() - set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) set(TOKENIZERS_INSTALL_INCLUDE_DIR ${PROJECT_SOURCE_DIR}) @@ -137,7 +126,7 @@ include_directories(${TOKENIZERS_INSTALL_INCLUDE_DIR}) include(generic) include(third_party) -add_subdirectory(faster_tokenizer) +add_subdirectory(fast_tokenizer) if(WITH_PYTHON) @@ -155,18 +144,18 @@ add_custom_target(build_tokenizers_bdist_wheel ALL DEPENDS copy_python_tokenizers) endif() -else(WITH_PYTHON) # Pack faster_tokenizer cpp lib +else(WITH_PYTHON) # Pack fast_tokenizer cpp lib -set(CPP_PACKAGE_DIR ${CMAKE_BINARY_DIR}/cpp/faster_tokenizer) +set(CPP_PACKAGE_DIR ${CMAKE_BINARY_DIR}/cpp/fast_tokenizer) add_custom_target(build_cpp_package_dir ALL COMMAND ${CMAKE_COMMAND} -E make_directory ${CPP_PACKAGE_DIR}/lib ${CPP_PACKAGE_DIR}/include ${CPP_PACKAGE_DIR}/third_party/include ${CPP_PACKAGE_DIR}/third_party/lib DEPENDS core_tokenizers) # copy cmake -file(COPY ${PROJECT_SOURCE_DIR}/FasterTokenizer.cmake DESTINATION ${CPP_PACKAGE_DIR}/) +file(COPY ${PROJECT_SOURCE_DIR}/FastTokenizer.cmake DESTINATION ${CPP_PACKAGE_DIR}/) # copy headers -file(COPY ${PROJECT_SOURCE_DIR}/faster_tokenizer/ DESTINATION ${CPP_PACKAGE_DIR}/include/faster_tokenizer/ +file(COPY ${PROJECT_SOURCE_DIR}/fast_tokenizer/ DESTINATION ${CPP_PACKAGE_DIR}/include/fast_tokenizer/ FILES_MATCHING PATTERN "*.h" PATTERN "test" EXCLUDE PATTERN "demo" EXCLUDE @@ -181,7 +170,7 @@ add_custom_target(copy_third_party_headers ALL # copy library set(TOKENIZER_CORE_NAME "core_tokenizers") -set(TOKENIZER_CORE_PATH ${CMAKE_BINARY_DIR}/faster_tokenizer) +set(TOKENIZER_CORE_PATH ${CMAKE_BINARY_DIR}/fast_tokenizer) if (WIN32) set(ICU_DLL_DIR ${CMAKE_BINARY_DIR}/third_party/icu/src/extern_icu/icu4c/bin64) set(ICU_LIB_DIR ${CMAKE_BINARY_DIR}/third_party/icu/src/extern_icu/icu4c/lib64) diff --git a/faster_tokenizer/FasterTokenizer.cmake b/fast_tokenizer/FastTokenizer.cmake similarity index 68% rename from faster_tokenizer/FasterTokenizer.cmake rename to fast_tokenizer/FastTokenizer.cmake index d4f2f4debac2..6d7f8e5f4115 100644 --- a/faster_tokenizer/FasterTokenizer.cmake +++ b/fast_tokenizer/FastTokenizer.cmake @@ -16,18 +16,18 @@ endif() set(LIBRARY_NAME core_tokenizers) -set(FASTER_TOKENIZER_INCS "") -list(APPEND FASTER_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/include) -list(APPEND FASTER_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/third_party/include) +set(FAST_TOKENIZER_INCS "") +list(APPEND FAST_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/include) +list(APPEND FAST_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/third_party/include) -set(FASTER_TOKENIZER_LIBS "") +set(FAST_TOKENIZER_LIBS "") find_library(FTLIB ${LIBRARY_NAME} ${CMAKE_CURRENT_LIST_DIR}/lib NO_DEFAULT_PATH) -list(APPEND FASTER_TOKENIZER_LIBS ${FTLIB}) +list(APPEND FAST_TOKENIZER_LIBS ${FTLIB}) if (WIN32) find_library(ICUDT icudt ${CMAKE_CURRENT_LIST_DIR}/third_party/lib NO_DEFAULT_PATH) -list(APPEND FASTER_TOKENIZER_LIBS ${ICUDT}) +list(APPEND FAST_TOKENIZER_LIBS ${ICUDT}) find_library(ICUUC icuuc ${CMAKE_CURRENT_LIST_DIR}/third_party/lib NO_DEFAULT_PATH) -list(APPEND FASTER_TOKENIZER_LIBS ${ICUUC}) +list(APPEND FAST_TOKENIZER_LIBS ${ICUUC}) endif() diff 
--git a/faster_tokenizer/LICENSE b/fast_tokenizer/LICENSE similarity index 100% rename from faster_tokenizer/LICENSE rename to fast_tokenizer/LICENSE diff --git a/fast_tokenizer/README.md b/fast_tokenizer/README.md new file mode 100644 index 000000000000..34313a097200 --- /dev/null +++ b/fast_tokenizer/README.md @@ -0,0 +1,118 @@ +# FastTokenizer + +------------------------------------------------------------------------------------------ + +


+FastTokenizer是一款简单易用、功能强大的跨平台高性能文本预处理库,集成业界多个常用的Tokenizer实现,支持不同NLP场景下的文本预处理功能,如文本分类、阅读理解、序列标注等。结合PaddleNLP Tokenizer模块,为用户在训练、推理阶段提供高效通用的文本预处理能力。
+
+## 特性
+
+- 高性能。由于底层采用C++实现,所以其性能远高于目前常规Python实现的Tokenizer。在文本分类任务上,FastTokenizer对比Python版本Tokenizer加速比最高可达20倍。支持多线程加速多文本批处理分词。默认使用单线程分词。
+- 跨平台。FastTokenizer可在不同的系统平台上使用,目前已支持在Windows x64、Linux x64以及MacOS 10.14+平台上使用。
+- 多编程语言支持。FastTokenizer提供在C++、Python语言上开发的能力。
+- 灵活性强。用户可以通过指定不同的FastTokenizer组件定制满足需求的Tokenizer。
+
+## 快速开始
+
+下面将介绍Python版本FastTokenizer的使用方式,C++版本的使用方式可参考[FastTokenizer C++ Demo](./fast_tokenizer/demo/README.md)。
+
+### 环境依赖
+
+- Windows 64位系统
+- Linux x64系统
+- MacOS 10.14+系统(m1芯片的MacOS,需要使用x86_64版本的Anaconda作为python环境方可安装使用)
+- Python 3.6 ~ 3.10
+
+### 安装FastTokenizer
+
+```shell
+pip install fast_tokenizer
+```
+
+### FastTokenizer使用示例
+
+- 准备词表
+
+```shell
+# Linux或者Mac用户可直接执行以下命令下载测试的词表,Windows 用户可在浏览器上下载到本地并重命名为 ernie_vocab.txt。
+wget https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt -O ernie_vocab.txt
+```
+
+- 切词示例
+
+FastTokenizer库内置NLP任务常用的Tokenizer,如ErnieFastTokenizer。下面将展示FastTokenizer的简单用法。
+
+```python
+import fast_tokenizer
+from fast_tokenizer import ErnieFastTokenizer, models
+
+# 0.(可选)设置线程数
+fast_tokenizer.set_thread_num(1)
+# 1. 加载词表
+vocab = models.WordPiece.read_file("ernie_vocab.txt")
+# 2. 实例化ErnieFastTokenizer对象
+fast_tokenizer = ErnieFastTokenizer(vocab)
+# 3. 切词
+output = fast_tokenizer.encode("我爱中国")
+# 4. 输出结果
+print("ids: ", output.ids)
+print("type_ids: ", output.type_ids)
+print("tokens: ", output.tokens)
+print("offsets: ", output.offsets)
+print("attention_mask: ", output.attention_mask)
+```
+
+### FastTokenizer在PaddleNLP Tokenizer模块加速示例
+
+PaddleNLP Tokenizer模块可简单地应用在模型训练以及推理部署的文本预处理阶段,并通过`AutoTokenizer.from_pretrained`方式实例化相应的Tokenizer。其中`AutoTokenizer`默认加载得到的Tokenizer是常规Python实现的Tokenizer,其性能会低于C++实现的FastTokenizer。为了提升PaddleNLP Tokenizer模块性能,目前PaddleNLP Tokenizer模块已经支持使用FastTokenizer作为Tokenizer的后端加速切词阶段。在现有的Tokenizer加载接口中,仅需添加`use_fast=True`这一关键词参数,其余代码保持不变,即可加载Fast版本的Tokenizer,代码示例如下:
+
+```python
+from paddlenlp.transformers import AutoTokenizer
+
+# 默认加载Python版本的Tokenizer
+tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')
+# 打开use_fast开关,可加载Fast版本Tokenizer
+fast_tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh', use_fast=True)
+
+text1 = tokenizer('自然语言处理')
+text2 = fast_tokenizer('自然语言处理')
+
+print(text1)
+print(text2)
+```
+
+目前PaddleNLP已支持BERT、ERNIE、TinyBERT以及ERNIE-M 4种Tokenizer的Fast版本,其余模型的Tokenizer暂不支持Fast版本。
+
+## FAQ
+
+Q:我在AutoTokenizer.from_pretrained接口上已经打开`use_fast=True`开关,为什么文本预处理阶段性能上好像没有任何变化?
+
+A:在以下三种情况下,打开`use_fast=True`开关可能无法提升性能:
+    1. 没有安装fast_tokenizer。若在没有安装fast_tokenizer库的情况下打开`use_fast`开关,PaddleNLP会给出以下warning:"Can't find the fast_tokenizer package, please ensure install fast_tokenizer correctly. "。
+
+    2. 加载的Tokenizer类型暂不支持Fast版本。目前支持4种Tokenizer的Fast版本,分别是BERT、ERNIE、TinyBERT以及ERNIE-M Tokenizer。若在加载不支持Fast版本的Tokenizer的情况下打开`use_fast`开关,PaddleNLP会给出以下warning:"The tokenizer XXX doesn't have the fast version. Please check the map paddlenlp.transformers.auto.tokenizer.FAST_TOKENIZER_MAPPING_NAMES to see which fast tokenizers are currently supported."
+
+    3. 待切词文本长度过短(如文本平均长度小于5)。这种情况下切词开销可能不是整个文本预处理的性能瓶颈,导致在使用FastTokenizer后仍无法提升整体性能。
+
+Q:如何使用多线程加速分词?
+
+A:可以通过调用 `fast_tokenizer.set_thread_num(xxx)` 使用多线程进行分词(示意代码见下文)。需要谨慎开启多线程加速分词,在以下场景下可以考虑开启多线程:
+    1. CPU资源充足。若在推理阶段使用CPU进行推理,开启多线程分词可能会出现资源竞争情况,从而影响推理阶段的性能。
+
+    2. 文本的批大小较大。若批大小比较小,开启多线程可能不会得到任何加速效果,并且可能会因为线程调度导致延时增长。建议批大小大于4的时候再考虑开启多线程分词。
+
+    3. 文本长度较长。若文本长度较短,开启多线程可能不会得到任何加速效果,并且可能会因为线程调度导致延时增长。建议文本平均长度大于16的时候再考虑开启多线程分词。
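+
+下面给出一段多线程批量分词的简单示意代码(非官方示例,仅供参考):假设已通过 `pip install fast_tokenizer` 完成安装;示例文本 `texts` 为虚构数据,线程数 4 也仅作演示,请结合实际的 CPU 资源与批大小调整。
+
+```python
+import fast_tokenizer
+from paddlenlp.transformers import AutoTokenizer
+
+# 设置分词线程数(默认使用单线程)
+fast_tokenizer.set_thread_num(4)
+
+# 打开 use_fast=True,使用 FastTokenizer 作为切词后端
+tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh', use_fast=True)
+
+# 批量分词:批大小、文本长度较大时,多线程才更可能带来收益
+texts = ["自然语言处理", "深度学习框架", "高性能跨平台文本预处理库"] * 16
+encodings = tokenizer(texts)
+# encodings 中包含每条文本的 input_ids 等切词结果
+```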
+
+## 相关文档
+
+[FastTokenizer编译指南](docs/compile/README.md)
diff --git a/faster_tokenizer/cmake/ByproductsICU.cmake b/fast_tokenizer/cmake/ByproductsICU.cmake
similarity index 100%
rename from faster_tokenizer/cmake/ByproductsICU.cmake
rename to fast_tokenizer/cmake/ByproductsICU.cmake
diff --git a/faster_tokenizer/cmake/FindNumPy.cmake b/fast_tokenizer/cmake/FindNumPy.cmake
similarity index 100%
rename from faster_tokenizer/cmake/FindNumPy.cmake
rename to fast_tokenizer/cmake/FindNumPy.cmake
diff --git a/faster_tokenizer/cmake/dummy.c.in b/fast_tokenizer/cmake/dummy.c.in
similarity index 100%
rename from faster_tokenizer/cmake/dummy.c.in
rename to fast_tokenizer/cmake/dummy.c.in
diff --git a/faster_tokenizer/cmake/external/dart.cmake b/fast_tokenizer/cmake/external/dart.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/dart.cmake
rename to fast_tokenizer/cmake/external/dart.cmake
diff --git a/faster_tokenizer/cmake/external/gflags.cmake b/fast_tokenizer/cmake/external/gflags.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/gflags.cmake
rename to fast_tokenizer/cmake/external/gflags.cmake
diff --git a/faster_tokenizer/cmake/external/glog.cmake b/fast_tokenizer/cmake/external/glog.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/glog.cmake
rename to fast_tokenizer/cmake/external/glog.cmake
diff --git a/faster_tokenizer/cmake/external/gtest.cmake b/fast_tokenizer/cmake/external/gtest.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/gtest.cmake
rename to fast_tokenizer/cmake/external/gtest.cmake
diff --git a/faster_tokenizer/cmake/external/icu.cmake b/fast_tokenizer/cmake/external/icu.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/icu.cmake
rename to fast_tokenizer/cmake/external/icu.cmake
diff --git a/faster_tokenizer/cmake/external/nlohmann_json.cmake b/fast_tokenizer/cmake/external/nlohmann_json.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/nlohmann_json.cmake
rename to fast_tokenizer/cmake/external/nlohmann_json.cmake
diff --git a/faster_tokenizer/cmake/external/protobuf.cmake b/fast_tokenizer/cmake/external/protobuf.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/protobuf.cmake
rename to fast_tokenizer/cmake/external/protobuf.cmake
diff --git a/faster_tokenizer/cmake/external/pybind11.cmake b/fast_tokenizer/cmake/external/pybind11.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/pybind11.cmake
rename to fast_tokenizer/cmake/external/pybind11.cmake
diff --git a/faster_tokenizer/cmake/external/python.cmake b/fast_tokenizer/cmake/external/python.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/python.cmake
rename to fast_tokenizer/cmake/external/python.cmake
diff --git a/faster_tokenizer/cmake/external/re2.cmake b/fast_tokenizer/cmake/external/re2.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/re2.cmake
rename to fast_tokenizer/cmake/external/re2.cmake
diff --git a/faster_tokenizer/cmake/external/utf8proc.cmake b/fast_tokenizer/cmake/external/utf8proc.cmake
similarity index 100%
rename from faster_tokenizer/cmake/external/utf8proc.cmake
rename to fast_tokenizer/cmake/external/utf8proc.cmake
diff --git a/faster_tokenizer/cmake/generic.cmake b/fast_tokenizer/cmake/generic.cmake
similarity index 100%
rename from faster_tokenizer/cmake/generic.cmake
rename to
fast_tokenizer/cmake/generic.cmake diff --git a/faster_tokenizer/cmake/python_module.cmake b/fast_tokenizer/cmake/python_module.cmake similarity index 100% rename from faster_tokenizer/cmake/python_module.cmake rename to fast_tokenizer/cmake/python_module.cmake diff --git a/faster_tokenizer/cmake/third_party.cmake b/fast_tokenizer/cmake/third_party.cmake similarity index 100% rename from faster_tokenizer/cmake/third_party.cmake rename to fast_tokenizer/cmake/third_party.cmake diff --git a/faster_tokenizer/docs/compile/README.md b/fast_tokenizer/docs/compile/README.md similarity index 52% rename from faster_tokenizer/docs/compile/README.md rename to fast_tokenizer/docs/compile/README.md index d7820884e1f4..b510e6c79a17 100644 --- a/faster_tokenizer/docs/compile/README.md +++ b/fast_tokenizer/docs/compile/README.md @@ -1,11 +1,11 @@ -# FasterTokenizer编译指南 +# FastTokenizer编译指南 -本文档说明编译FasterTokenizer C++库、Python库两种编译过程,根据编译的平台参考如下文档 +本文档说明编译FastTokenizer C++库、Python库两种编译过程,根据编译的平台参考如下文档 - [Linux & Mac 编译](./how_to_build_linux_and_mac.md) - [Windows编译](./how_to_build_windows.md) -FasterTokenizer使用CMake编译,其中编译过程中,各平台上编译选项如下表所示 +FastTokenizer使用CMake编译,其中编译过程中,各平台上编译选项如下表所示 | 选项 | 作用 | 备注 | |:---- | :--- | :--- | diff --git a/faster_tokenizer/docs/compile/how_to_build_linux_and_mac.md b/fast_tokenizer/docs/compile/how_to_build_linux_and_mac.md similarity index 92% rename from faster_tokenizer/docs/compile/how_to_build_linux_and_mac.md rename to fast_tokenizer/docs/compile/how_to_build_linux_and_mac.md index 5dc820525176..fd856d8984d1 100644 --- a/faster_tokenizer/docs/compile/how_to_build_linux_and_mac.md +++ b/fast_tokenizer/docs/compile/how_to_build_linux_and_mac.md @@ -9,7 +9,7 @@ ```bash git clone https://github.com/PaddlePaddle/PaddleNLP.git -cd PaddleNLP/faster_tokenizer +cd PaddleNLP/fast_tokenizer mkdir build & cd build cmake .. -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release make -j8 @@ -21,7 +21,7 @@ make -j8 ```bash git clone https://github.com/PaddlePaddle/PaddleNLP.git -cd PaddleNLP/faster_tokenizer +cd PaddleNLP/fast_tokenizer mkdir build & cd build # 设置Python环境 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} diff --git a/faster_tokenizer/docs/compile/how_to_build_windows.md b/fast_tokenizer/docs/compile/how_to_build_windows.md similarity index 94% rename from faster_tokenizer/docs/compile/how_to_build_windows.md rename to fast_tokenizer/docs/compile/how_to_build_windows.md index b7b73bc7834b..e4ce9dc2af9a 100644 --- a/faster_tokenizer/docs/compile/how_to_build_windows.md +++ b/fast_tokenizer/docs/compile/how_to_build_windows.md @@ -13,7 +13,7 @@ ```bash git clone https://github.com/PaddlePaddle/PaddleNLP.git -cd PaddleNLP/faster_tokenizer +cd PaddleNLP/fast_tokenizer mkdir build & cd build cmake .. -G "Ninja" -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ninja -j8 @@ -25,7 +25,7 @@ ninja -j8 ```bash git clone https://github.com/PaddlePaddle/PaddleNLP.git -cd PaddleNLP/faster_tokenizer +cd PaddleNLP/fast_tokenizer mkdir build & cd build # 需要指定Python库 cmake .. 
-G "Ninja" -DWITH_PYTHON=ON ^ diff --git a/faster_tokenizer/faster_tokenizer/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/CMakeLists.txt similarity index 90% rename from faster_tokenizer/faster_tokenizer/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/CMakeLists.txt index cf4abb40395d..61e54191162c 100644 --- a/faster_tokenizer/faster_tokenizer/CMakeLists.txt +++ b/fast_tokenizer/fast_tokenizer/CMakeLists.txt @@ -13,7 +13,7 @@ endif() if (WITH_PYTHON) add_subdirectory(pybind) cc_library(core_tokenizers SHARED - SRCS pybind/pybind.cc tokenizers/ernie_faster_tokenizer.cc + SRCS pybind/pybind.cc tokenizers/ernie_fast_tokenizer.cc DEPS pybind python pybind_normalizers pybind_utils pybind_pretokenizers pybind_models pybind_decoders pybind_postprocessors pybind_tokenizers pybind_exception @@ -33,7 +33,7 @@ endif() else(WITH_PYTHON) # add_subdirectory(tokenizers) cc_library(core_tokenizers SHARED - SRCS tokenizers/ernie_faster_tokenizer.cc + SRCS tokenizers/ernie_fast_tokenizer.cc tokenizers/clip_fast_tokenizer.cc DEPS normalizers pretokenizers models decoders postprocessors core added_vocabulary tokenizer json) diff --git a/faster_tokenizer/faster_tokenizer/core/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/core/CMakeLists.txt similarity index 55% rename from faster_tokenizer/faster_tokenizer/core/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/core/CMakeLists.txt index ea831123e90e..6e97ff1a20db 100644 --- a/faster_tokenizer/faster_tokenizer/core/CMakeLists.txt +++ b/fast_tokenizer/fast_tokenizer/core/CMakeLists.txt @@ -1,3 +1,4 @@ cc_library(added_vocabulary SRCS added_vocabulary.cc DEPS normalizers pretokenizers json) -cc_library(tokenizer SRCS tokenizer.cc DEPS added_vocabulary json decoders trie models postprocessors) -cc_library(core SRCS encoding.cc DEPS json) +cc_library(base SRCS base.cc) +cc_library(tokenizer SRCS tokenizer.cc DEPS added_vocabulary json decoders trie models postprocessors base) +cc_library(core SRCS encoding.cc DEPS json base) diff --git a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc b/fast_tokenizer/fast_tokenizer/core/added_vocabulary.cc similarity index 98% rename from faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc rename to fast_tokenizer/fast_tokenizer/core/added_vocabulary.cc index 2c6d47dcf0ca..bdb05fa136b8 100644 --- a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc +++ b/fast_tokenizer/fast_tokenizer/core/added_vocabulary.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/core/added_vocabulary.h" -#include "faster_tokenizer/models/model.h" -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/core/added_vocabulary.h" +#include "fast_tokenizer/models/model.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/pretokenizers/pretokenizer.h" #include "glog/logging.h" #include "re2/re2.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace core { inline bool StartWithWord(const std::string& sequence) { @@ -420,5 +420,5 @@ void to_json(nlohmann::json& j, const AddedVocabulary& added_vocab) { } } // namespace core -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.h b/fast_tokenizer/fast_tokenizer/core/added_vocabulary.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/core/added_vocabulary.h rename to fast_tokenizer/fast_tokenizer/core/added_vocabulary.h index d425d7d6ffe5..a9b26f677818 100644 --- a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.h +++ b/fast_tokenizer/fast_tokenizer/core/added_vocabulary.h @@ -18,7 +18,7 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/core/base.h" +#include "fast_tokenizer/core/base.h" #include "nlohmann/json.hpp" namespace re2 { @@ -26,7 +26,7 @@ class RE2; } // namespace re2 namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { class Normalizer; @@ -52,7 +52,7 @@ bool EndWithWord(const std::string& sequence); bool StartWithSpace(const std::string& sequence); bool EndWithSpace(const std::string& sequence); -class FASTERTOKENIZER_DECL AddedToken { +class FASTTOKENIZER_DECL AddedToken { public: AddedToken(); AddedToken(const std::string& content, @@ -84,14 +84,14 @@ class FASTERTOKENIZER_DECL AddedToken { friend struct AddedTokenWithId; }; -struct FASTERTOKENIZER_DECL AddedTokenWithId { +struct FASTTOKENIZER_DECL AddedTokenWithId { AddedToken added_token_; uint32_t id_; friend void to_json(nlohmann::json& j, const AddedTokenWithId& added_token); friend void from_json(const nlohmann::json& j, AddedTokenWithId& added_token); }; -class FASTERTOKENIZER_DECL AddedVocabulary { +class FASTTOKENIZER_DECL AddedVocabulary { public: AddedVocabulary(); size_t GetLen() const; @@ -139,15 +139,15 @@ class FASTERTOKENIZER_DECL AddedVocabulary { }; } // namespace core -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp namespace std { template <> -class hash { +class hash { public: size_t operator()( - const paddlenlp::faster_tokenizer::core::AddedToken& added_token) const { + const paddlenlp::fast_tokenizer::core::AddedToken& added_token) const { return std::hash()(added_token.GetContent()); } }; diff --git a/fast_tokenizer/fast_tokenizer/core/base.cc b/fast_tokenizer/fast_tokenizer/core/base.cc new file mode 100644 index 000000000000..ec4a37c74f66 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/core/base.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fast_tokenizer/core/base.h" +#include + +namespace paddlenlp { +namespace fast_tokenizer { +namespace core { + +static int fast_tokenizer_thread_num = 1; + +void SetThreadNum(int thread_num) { fast_tokenizer_thread_num = thread_num; } + +int GetThreadNum() { return fast_tokenizer_thread_num; } + +void RunMultiThread(std::function func, + size_t batch_size) { + int thread_num = GetThreadNum(); + std::vector vectorOfThread; + size_t start_index = 0; + size_t step_index = ceil(batch_size / float(thread_num)); + + for (size_t thread_index = 0; thread_index < thread_num; thread_index++) { + vectorOfThread.emplace_back(std::thread(func, start_index, step_index)); + start_index = start_index + step_index; + } + for (size_t thread_index = 0; thread_index < thread_num; thread_index++) { + vectorOfThread[thread_index].join(); + } +} + +} // namespace core +} // namespace fast_tokenizer +} // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/core/base.h b/fast_tokenizer/fast_tokenizer/core/base.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/core/base.h rename to fast_tokenizer/fast_tokenizer/core/base.h index 0fe8e834c56c..21af2d912f7c 100644 --- a/faster_tokenizer/faster_tokenizer/core/base.h +++ b/fast_tokenizer/fast_tokenizer/core/base.h @@ -21,8 +21,8 @@ limitations under the License. */ #include #include +#include "fast_tokenizer/utils/utils.h" #include "nlohmann/json.hpp" -#include "faster_tokenizer/utils/utils.h" namespace std { template <> @@ -36,17 +36,25 @@ struct hash> { } namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace core { -enum FASTERTOKENIZER_DECL OffsetType { CHAR, BYTE }; -enum FASTERTOKENIZER_DECL Direction { LEFT, RIGHT }; -enum FASTERTOKENIZER_DECL TruncStrategy { +enum FASTTOKENIZER_DECL OffsetType { CHAR, BYTE }; +enum FASTTOKENIZER_DECL Direction { LEFT, RIGHT }; +enum FASTTOKENIZER_DECL TruncStrategy { LONGEST_FIRST, ONLY_FIRST, ONLY_SECOND }; -enum FASTERTOKENIZER_DECL PadStrategy { BATCH_LONGEST, FIXED_SIZE }; +enum FASTTOKENIZER_DECL PadStrategy { BATCH_LONGEST, FIXED_SIZE }; + +enum FASTTOKENIZER_DECL SplitMode { + REMOVED, + ISOLATED, + MERGED_WITH_PREVIOUS, + MERGED_WITH_NEXT, + CONTIGUOUS +}; NLOHMANN_JSON_SERIALIZE_ENUM(OffsetType, { @@ -72,7 +80,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM(PadStrategy, {FIXED_SIZE, "FIXED_SIZE"}, }); -struct FASTERTOKENIZER_DECL TruncMethod { +struct FASTTOKENIZER_DECL TruncMethod { Direction direction_; size_t max_len_; TruncStrategy strategy_; @@ -84,7 +92,7 @@ struct FASTERTOKENIZER_DECL TruncMethod { direction_(RIGHT) {} }; -struct FASTERTOKENIZER_DECL PadMethod { +struct FASTTOKENIZER_DECL PadMethod { PadStrategy strategy_; Direction direction_; uint32_t pad_id_; @@ -160,7 +168,7 @@ inline void to_json(nlohmann::json& j, } } -struct FASTERTOKENIZER_DECL Token { +struct FASTTOKENIZER_DECL Token { uint32_t id_; std::string value_; Offset offset_; @@ -169,7 +177,7 @@ struct FASTERTOKENIZER_DECL Token { : id_(id), value_(value), offset_(offset) {} }; -struct FASTERTOKENIZER_DECL Merge { +struct FASTTOKENIZER_DECL Merge { size_t pos_; uint32_t rank_; 
uint32_t new_id_; @@ -188,7 +196,7 @@ struct FASTERTOKENIZER_DECL Merge { } }; -struct FASTERTOKENIZER_DECL Symbol { +struct FASTTOKENIZER_DECL Symbol { uint32_t ch_; // symbol id int prev_; int next_; @@ -208,7 +216,7 @@ struct FASTERTOKENIZER_DECL Symbol { } }; -struct FASTERTOKENIZER_DECL BPEWord { +struct FASTTOKENIZER_DECL BPEWord { BPEWord() = default; BPEWord(size_t capacity) { Reserve(capacity); } void Reserve(size_t capacity) { symbols_.reserve(capacity); } @@ -358,6 +366,13 @@ struct FASTERTOKENIZER_DECL BPEWord { std::vector symbols_; }; +FASTTOKENIZER_DECL void SetThreadNum(int thread_num); + +FASTTOKENIZER_DECL int GetThreadNum(); + +FASTTOKENIZER_DECL void RunMultiThread(std::function func, + size_t batch_size); + } // namespace core -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/fast_tokenizer/fast_tokenizer/core/encoding.cc similarity index 91% rename from faster_tokenizer/faster_tokenizer/core/encoding.cc rename to fast_tokenizer/fast_tokenizer/core/encoding.cc index 980e192abcbc..379d21df931b 100644 --- a/faster_tokenizer/faster_tokenizer/core/encoding.cc +++ b/fast_tokenizer/fast_tokenizer/core/encoding.cc @@ -12,19 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/encoding.h" #include #include -#include +#include #include #include "glog/logging.h" -#ifdef WITH_OMP -#include -#endif - namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace core { Encoding::Encoding(const std::vector& ids, @@ -77,7 +73,7 @@ Encoding::Encoding(uint32_t capacity) { Encoding::Encoding(const std::vector& tokens, uint32_t type_id) : type_ids_(tokens.size(), type_id), - words_idx_(tokens.size()), + words_idx_(tokens.size(), std::numeric_limits::max()), attention_mask_(tokens.size(), 1), special_tokens_mask_(tokens.size(), 0) { auto length = tokens.size(); @@ -179,10 +175,11 @@ Range Encoding::GetSequenceRange(uint32_t seq_id) const { } void Encoding::ProcessTokenWithOffsets( - std::function process_token_fn) { + std::function + process_token_fn) { auto length = GetLen(); for (int i = 0; i < length; ++i) { - process_token_fn(i, &tokens_[i], &offsets_[i]); + process_token_fn(i, tokens_[i], &offsets_[i]); } } @@ -215,7 +212,7 @@ std::vector Encoding::WordIdxToTokensIdx(uint32_t word_idx, for (uint32_t i = seq_range.first; i < seq_range.second; ++i) { // -1 is the word index of special token if (words_idx_[i] > word_idx && - words_idx_[i] != static_cast(-1)) { + words_idx_[i] != std::numeric_limits::max()) { break; } if (words_idx_[i] == word_idx) { @@ -434,7 +431,8 @@ void Encoding::Pad(uint32_t target_length, ids_.insert(ids_.begin(), pad_len, pad_id); type_ids_.insert(type_ids_.begin(), pad_len, pad_type_id); tokens_.insert(tokens_.begin(), pad_len, pad_token); - words_idx_.insert(words_idx_.begin(), pad_len, UINT_MAX); + words_idx_.insert( + words_idx_.begin(), pad_len, std::numeric_limits::max()); attention_mask_.insert(attention_mask_.begin(), pad_len, 0); special_tokens_mask_.insert(special_tokens_mask_.begin(), pad_len, 1); offsets_.insert(offsets_.begin(), pad_len, {0, 0}); @@ -442,7 +440,8 @@ void Encoding::Pad(uint32_t target_length, ids_.insert(ids_.end(), pad_len, pad_id); type_ids_.insert(type_ids_.end(), pad_len, pad_type_id); 
tokens_.insert(tokens_.end(), pad_len, pad_token); - words_idx_.insert(words_idx_.end(), pad_len, UINT_MAX); + words_idx_.insert( + words_idx_.end(), pad_len, std::numeric_limits::max()); attention_mask_.insert(attention_mask_.end(), pad_len, 0); special_tokens_mask_.insert(special_tokens_mask_.end(), pad_len, 1); offsets_.insert(offsets_.end(), pad_len, {0, 0}); @@ -460,6 +459,27 @@ Encoding Encoding::Merge(const std::vector& encodings, return merged_encoding; } +void Encoding::SetTypeIds(const std::vector& type_ids) { + type_ids_ = type_ids; +} + +bool Encoding::operator==(const Encoding& other) const { + if (overflowing_.size() != other.overflowing_.size()) { + return false; + } + for (int i = 0; i < overflowing_.size(); ++i) { + if (!(overflowing_[i] == other.overflowing_[i])) { + return false; + } + } + return ids_ == other.ids_ && type_ids_ == other.type_ids_ && + tokens_ == other.tokens_ && words_idx_ == other.words_idx_ && + offsets_ == other.offsets_ && + special_tokens_mask_ == other.special_tokens_mask_ && + attention_mask_ == other.attention_mask_ && + sequence_ranges_ == other.sequence_ranges_; +} + std::string Encoding::DebugString() const { std::ostringstream oss; oss << "The Encoding content: \n"; @@ -523,7 +543,6 @@ std::string Encoding::DebugString() const { oss << "{" << iter->first << " : (" << iter->second.first << ", " << iter->second.second << ") }, "; } - oss << "\n"; return oss.str(); } @@ -635,17 +654,6 @@ void PadEncodings(std::vector* encodings, const PadMethod& method) { pad_length += pad_length - pad_length % method.pad_to_multiple_of_; } auto batch_size = encodings->size(); -#ifdef WITH_OMP -#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1) - for (int i = 0; i < batch_size; ++i) { - auto& encoding = (*encodings)[i]; - encoding.Pad(pad_length, - method.pad_id_, - method.pad_token_type_id_, - method.pad_token_, - method.direction_); - } -#else auto func = std::bind(&MultiThreadPadEncodings, encodings, std::ref(method), @@ -653,45 +661,9 @@ void PadEncodings(std::vector* encodings, const PadMethod& method) { std::placeholders::_1, std::placeholders::_2); RunMultiThread(func, batch_size); -#endif } -int GetThreadNum(size_t batch_size) { - char* env_var = std::getenv("OMP_NUM_THREADS"); - int thread_num = std::atoi(env_var); - if (batch_size <= 0) { - thread_num = 1; - VLOG(3) << "batch_size <=0, we set OMP_NUM_THREADS = 1"; - } else { - int best_num = ceil(batch_size / 4.0); - if (thread_num > best_num) { - thread_num = best_num; - VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = " - "batch_size/4"; - } else if (thread_num == 0) { - thread_num = best_num; - VLOG(3) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4"; - } - } - return thread_num; -} - -void RunMultiThread(std::function func, - size_t batch_size) { - int thread_num = GetThreadNum(batch_size); - std::vector vectorOfThread; - size_t start_index = 0; - size_t step_index = ceil(batch_size / float(thread_num)); - - for (size_t thread_index = 0; thread_index < thread_num; thread_index++) { - vectorOfThread.emplace_back(std::thread(func, start_index, step_index)); - start_index = start_index + step_index; - } - for (size_t thread_index = 0; thread_index < thread_num; thread_index++) { - vectorOfThread[thread_index].join(); - } -} } // namespace core -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/fast_tokenizer/fast_tokenizer/core/encoding.h 
similarity index 85% rename from faster_tokenizer/faster_tokenizer/core/encoding.h rename to fast_tokenizer/fast_tokenizer/core/encoding.h index 12a4bb708635..5a9d3a41b714 100644 --- a/faster_tokenizer/faster_tokenizer/core/encoding.h +++ b/fast_tokenizer/fast_tokenizer/core/encoding.h @@ -18,20 +18,19 @@ limitations under the License. */ #include #include #include -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/utils/utils.h" #include #include #include #include -using namespace std; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace core { -class FASTERTOKENIZER_DECL Encoding { +class FASTTOKENIZER_DECL Encoding { public: Encoding() = default; Encoding(const std::vector& ids, @@ -82,7 +81,8 @@ class FASTERTOKENIZER_DECL Encoding { Range GetSequenceRange(uint32_t seq_id) const; void ProcessTokenWithOffsets( - std::function process_token_fn); + std::function + process_token_fn); // token_idx: The index of token in the sequence std::vector TokenIdxToSequenceIds(uint32_t token_idx) const; @@ -109,6 +109,8 @@ class FASTERTOKENIZER_DECL Encoding { static Encoding Merge(const std::vector& encodings, bool growing_offsets); std::string DebugString() const; + void SetTypeIds(const std::vector& type_ids); + bool operator==(const Encoding& other) const; private: std::vector ids_; @@ -122,16 +124,12 @@ class FASTERTOKENIZER_DECL Encoding { std::unordered_map sequence_ranges_; }; -bool FASTERTOKENIZER_DECL TruncateEncodings(Encoding* encoding, - Encoding* pair_encoding, - const TruncMethod& method); -void FASTERTOKENIZER_DECL PadEncodings(std::vector* encoding, - const PadMethod& method); +bool FASTTOKENIZER_DECL TruncateEncodings(Encoding* encoding, + Encoding* pair_encoding, + const TruncMethod& method); +void FASTTOKENIZER_DECL PadEncodings(std::vector* encoding, + const PadMethod& method); -int FASTERTOKENIZER_DECL GetThreadNum(size_t batch_size); - -void FASTERTOKENIZER_DECL -RunMultiThread(std::function func, size_t batch_size); } // namespace core -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/fast_tokenizer/fast_tokenizer/core/tokenizer.cc similarity index 81% rename from faster_tokenizer/faster_tokenizer/core/tokenizer.cc rename to fast_tokenizer/fast_tokenizer/core/tokenizer.cc index 1b6399c4aedf..f7689f6b76ac 100644 --- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc +++ b/fast_tokenizer/fast_tokenizer/core/tokenizer.cc @@ -15,24 +15,19 @@ limitations under the License. 
*/ #include #include "glog/logging.h" -#include "faster_tokenizer/core/added_vocabulary.h" -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/core/encoding.h" -#include "faster_tokenizer/core/tokenizer.h" +#include "fast_tokenizer/core/added_vocabulary.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/tokenizer.h" -#include "faster_tokenizer/decoders/decoders.h" -#include "faster_tokenizer/models/models.h" -#include "faster_tokenizer/normalizers/normalizers.h" -#include "faster_tokenizer/postprocessors/postprocessors.h" -#include "faster_tokenizer/pretokenizers/pretokenizers.h" - - -#ifdef WITH_OMP -#include -#endif +#include "fast_tokenizer/decoders/decoders.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/postprocessors/postprocessors.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace core { normalizers::Normalizer* Tokenizer::GetNormalizerPtr() const { @@ -249,118 +244,116 @@ void Tokenizer::EncodePairStrings(const EncodeInput& encode_input, } } +void Tokenizer::EncodePairStrings(const std::string& text, + const std::string& text_pair, + Encoding* encodings, + bool add_special_tokens) const { + Encoding encoding, pair_encoding; + EncodeSingleString(text, 0, OffsetType::CHAR, &encoding); + EncodeSingleString(text_pair, 1, OffsetType::CHAR, &pair_encoding); + PostProcess(&encoding, &pair_encoding, add_special_tokens, encodings); +} + void Tokenizer::MultiThreadEncodeBatchStrings( - const std::vector& batch_encode_input, + const std::vector& texts, + const std::vector& text_pairs, std::vector* encodings, bool add_special_tokens, size_t start_index, size_t step_index) const { - auto batch_size = batch_encode_input.size(); + if (texts.size() != text_pairs.size()) { + throw std::runtime_error( + "The size of text must equal to the size of text_pair"); + } + auto batch_size = texts.size(); size_t end_index = start_index + step_index; if (end_index > batch_size) end_index = batch_size; for (size_t i = start_index; i < end_index; ++i) { EncodePairStrings( - batch_encode_input[i], &(*encodings)[i], add_special_tokens); + texts[i], text_pairs[i], &(*encodings)[i], add_special_tokens); } } -void Tokenizer::EncodeBatchStrings( +void Tokenizer::MultiThreadEncodeBatchStrings( const std::vector& batch_encode_input, std::vector* encodings, - bool add_special_tokens) const { + bool add_special_tokens, + size_t start_index, + size_t step_index) const { auto batch_size = batch_encode_input.size(); - encodings->resize(batch_size); - -#ifdef WITH_OMP -// (TODO:zhoushunjie): Simply use the batch size to estimate the workload of -// tokenization. -// Use workload to determine whether create omp threads. Need to optimize the -// workload estimation. 
-#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1) - for (int i = 0; i < batch_size; ++i) { + size_t end_index = start_index + step_index; + if (end_index > batch_size) end_index = batch_size; + for (size_t i = start_index; i < end_index; ++i) { EncodePairStrings( batch_encode_input[i], &(*encodings)[i], add_special_tokens); } -#else - auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStrings, - this, - std::ref(batch_encode_input), - encodings, - add_special_tokens, - std::placeholders::_1, - std::placeholders::_2); - RunMultiThread(func, batch_size); -#endif - - if (use_padding_) { - PadEncodings(encodings, pad_method_); - } -} - -void Tokenizer::EncodePairStringsCharOffsets(const EncodeInput& encode_input, - Encoding* encodings, - bool add_special_tokens) const { - const auto& input_string = paddlenlp::get_if(&encode_input); - const auto& input_string_pair = - paddlenlp::get_if>(&encode_input); - Encoding encoding; - Encoding pair_encoding; - if (input_string != nullptr) { - EncodeSingleString(*input_string, 0, OffsetType::CHAR, &encoding); - } else { - EncodeSingleString( - input_string_pair->first, 0, OffsetType::CHAR, &encoding); - EncodeSingleString( - input_string_pair->second, 1, OffsetType::CHAR, &pair_encoding); - } - PostProcess(&encoding, &pair_encoding, add_special_tokens, encodings); } -void Tokenizer::MultiThreadEncodeBatchStringsCharOffsets( - const std::vector& batch_encode_input, +void Tokenizer::MultiThreadEncodeBatchStrings( + const std::vector& texts, std::vector* encodings, bool add_special_tokens, size_t start_index, size_t step_index) const { - auto batch_size = batch_encode_input.size(); + auto batch_size = texts.size(); size_t end_index = start_index + step_index; if (end_index > batch_size) end_index = batch_size; for (size_t i = start_index; i < end_index; ++i) { - Encoding encoding; - EncodePairStringsCharOffsets( - batch_encode_input[i], &encoding, add_special_tokens); - (*encodings)[i] = std::move(encoding); + EncodePairStrings(texts[i], &(*encodings)[i], add_special_tokens); } } -void Tokenizer::EncodeBatchStringsCharOffsets( +void Tokenizer::EncodeBatchStrings( const std::vector& batch_encode_input, std::vector* encodings, bool add_special_tokens) const { auto batch_size = batch_encode_input.size(); encodings->resize(batch_size); -#ifdef WITH_OMP -// (TODO:zhoushunjie): Simply use the batch size to estimate the workload of -// tokenization. -// Use workload to determine whether create omp threads. Need to optimize the -// workload estimation. 
-#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1) - for (int i = 0; i < batch_size; ++i) { - Encoding encoding; - EncodePairStringsCharOffsets( - batch_encode_input[i], &encoding, add_special_tokens); - (*encodings)[i] = std::move(encoding); + auto func = [&](size_t start_index, size_t step_index) { + MultiThreadEncodeBatchStrings(batch_encode_input, + encodings, + add_special_tokens, + start_index, + step_index); + }; + RunMultiThread(func, batch_size); + + if (use_padding_) { + PadEncodings(encodings, pad_method_); } -#else - auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStringsCharOffsets, - this, - std::ref(batch_encode_input), - encodings, - add_special_tokens, - std::placeholders::_1, - std::placeholders::_2); +} + +void Tokenizer::EncodeBatchStrings(const std::vector& texts, + std::vector* encodings, + bool add_special_tokens) const { + auto batch_size = texts.size(); + encodings->resize(batch_size); + auto func = [&](size_t start_index, size_t step_index) { + MultiThreadEncodeBatchStrings( + texts, encodings, add_special_tokens, start_index, step_index); + }; + RunMultiThread(func, batch_size); + + if (use_padding_) { + PadEncodings(encodings, pad_method_); + } +} + +void Tokenizer::EncodeBatchStrings(const std::vector& texts, + const std::vector& text_pairs, + std::vector* encodings, + bool add_special_tokens) const { + auto batch_size = texts.size(); + encodings->resize(batch_size); + auto func = [&](size_t start_index, size_t step_index) { + MultiThreadEncodeBatchStrings(texts, + text_pairs, + encodings, + add_special_tokens, + start_index, + step_index); + }; RunMultiThread(func, batch_size); -#endif if (use_padding_) { PadEncodings(encodings, pad_method_); @@ -398,6 +391,7 @@ Encoding Tokenizer::EncodeTextToEncoding(const std::vector& word_idx, DoTokenize(&pretokenized, type_id, word_idx, offset_type, &encoding); return encoding; } + const AddedVocabulary& Tokenizer::GetAddedVocabulary() const { return added_vocabulary_; } @@ -479,26 +473,11 @@ void Tokenizer::DecodeBatch( bool skip_special_tokens) const { auto batch_size = batch_token_ids.size(); results->resize(batch_size); -#ifdef WITH_OMP -// (TODO:zhoushunjie): Simply use the batch size to estimate the workload of -// tokenization. -// Use workload to determine whether create omp threads. Need to optimize the -// workload estimation. 
-#pragma omp parallel for if (batch_token_ids.size() >= 4 && \ - omp_get_num_threads() > 1) - for (int i = 0; i < batch_token_ids.size(); ++i) { - Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens); - } -#else - auto func = std::bind(&Tokenizer::MultiThreadDecodeBatch, - this, - std::ref(batch_token_ids), - results, - skip_special_tokens, - std::placeholders::_1, - std::placeholders::_2); + auto func = [&](size_t start_index, size_t step_index) { + MultiThreadDecodeBatch( + batch_token_ids, results, skip_special_tokens, start_index, step_index); + }; RunMultiThread(func, batch_size); -#endif } bool Tokenizer::GetUseTruncation() const { return use_truncation_; } @@ -591,6 +570,14 @@ void to_json(nlohmann::json& j, const Tokenizer& tokenizer) { typeid(pretokenizers::SequencePreTokenizer)) { j["pretokenizer"] = *dynamic_cast( tokenizer.pretokenizer_.get()); + } else if (typeid(*tokenizer.pretokenizer_.get()) == + typeid(pretokenizers::ByteLevelPreTokenizer)) { + j["pretokenizer"] = *dynamic_cast( + tokenizer.pretokenizer_.get()); + } else if (typeid(*tokenizer.pretokenizer_.get()) == + typeid(pretokenizers::SplitPreTokenizer)) { + j["pretokenizer"] = *dynamic_cast( + tokenizer.pretokenizer_.get()); } } @@ -599,9 +586,9 @@ void to_json(nlohmann::json& j, const Tokenizer& tokenizer) { if (typeid(*tokenizer.model_.get()) == typeid(models::WordPiece)) { j["model"] = *dynamic_cast(tokenizer.model_.get()); } else if (typeid(*tokenizer.model_.get()) == - typeid(models::FasterWordPiece)) { + typeid(models::FastWordPiece)) { j["model"] = - *dynamic_cast(tokenizer.model_.get()); + *dynamic_cast(tokenizer.model_.get()); } else if (typeid(*tokenizer.model_.get()) == typeid(models::BPE)) { j["model"] = *dynamic_cast(tokenizer.model_.get()); } else if (typeid(*tokenizer.model_.get()) == typeid(models::Unigram)) { @@ -620,6 +607,15 @@ void to_json(nlohmann::json& j, const Tokenizer& tokenizer) { j["postprocessor"] = *dynamic_cast( tokenizer.post_processor_.get()); + } else if (typeid(*tokenizer.post_processor_.get()) == + typeid(postprocessors::RobertaPostProcessor)) { + j["postprocessor"] = *dynamic_cast( + tokenizer.post_processor_.get()); + } else if (typeid(*tokenizer.post_processor_.get()) == + typeid(postprocessors::ByteLevelPostProcessor)) { + j["postprocessor"] = + *dynamic_cast( + tokenizer.post_processor_.get()); } } @@ -705,6 +701,14 @@ void from_json(const nlohmann::json& j, Tokenizer& tokenizer) { pretokenizers::SequencePreTokenizer sequence_pretokenizer; pretokenizer.get_to(sequence_pretokenizer); tokenizer.SetPreTokenizer(sequence_pretokenizer); + } else if (pretokenizer.at("type") == "ByteLevelPreTokenizer") { + pretokenizers::ByteLevelPreTokenizer byte_pretokenizer; + pretokenizer.get_to(byte_pretokenizer); + tokenizer.SetPreTokenizer(byte_pretokenizer); + } else if (pretokenizer.at("type") == "SplitPreTokenizer") { + pretokenizers::SplitPreTokenizer split_pretokenizer; + pretokenizer.get_to(split_pretokenizer); + tokenizer.SetPreTokenizer(split_pretokenizer); } } @@ -715,8 +719,8 @@ void from_json(const nlohmann::json& j, Tokenizer& tokenizer) { models::WordPiece wordpiece; model.get_to(wordpiece); tokenizer.SetModel(wordpiece); - } else if (model.at("type") == "FasterWordPiece") { - models::FasterWordPiece wordpiece; + } else if (model.at("type") == "FastWordPiece") { + models::FastWordPiece wordpiece; model.get_to(wordpiece); tokenizer.SetModel(wordpiece); } else if (model.at("type") == "BPE") { @@ -741,6 +745,14 @@ void from_json(const nlohmann::json& j, Tokenizer& 
tokenizer) { postprocessors::TemplatePostProcessor template_postprocessor; post_processor.get_to(template_postprocessor); tokenizer.SetPostProcessor(template_postprocessor); + } else if (post_processor.at("type") == "RobertaPostProcessor") { + postprocessors::RobertaPostProcessor roberta_postprocessor; + post_processor.get_to(roberta_postprocessor); + tokenizer.SetPostProcessor(roberta_postprocessor); + } else if (post_processor.at("type") == "ByteLevelPostProcessor") { + postprocessors::ByteLevelPostProcessor byte_level_postprocessor; + post_processor.get_to(byte_level_postprocessor); + tokenizer.SetPostProcessor(byte_level_postprocessor); } } @@ -810,12 +822,16 @@ template void Tokenizer::SetPreTokenizer( const pretokenizers::MetaSpacePreTokenizer&); template void Tokenizer::SetPreTokenizer( const pretokenizers::SequencePreTokenizer&); +template void Tokenizer::SetPreTokenizer( + const pretokenizers::ByteLevelPreTokenizer&); +template void Tokenizer::SetPreTokenizer( + const pretokenizers::SplitPreTokenizer&); // Instantiate models template Tokenizer::Tokenizer(const models::WordPiece&); template void Tokenizer::SetModel(const models::WordPiece&); -template Tokenizer::Tokenizer(const models::FasterWordPiece&); -template void Tokenizer::SetModel(const models::FasterWordPiece&); +template Tokenizer::Tokenizer(const models::FastWordPiece&); +template void Tokenizer::SetModel(const models::FastWordPiece&); template Tokenizer::Tokenizer(const models::BPE&); template void Tokenizer::SetModel(const models::BPE&); template Tokenizer::Tokenizer(const models::Unigram&); @@ -826,9 +842,13 @@ template void Tokenizer::SetPostProcessor( const postprocessors::BertPostProcessor&); template void Tokenizer::SetPostProcessor( const postprocessors::TemplatePostProcessor&); +template void Tokenizer::SetPostProcessor( + const postprocessors::RobertaPostProcessor&); +template void Tokenizer::SetPostProcessor( + const postprocessors::ByteLevelPostProcessor&); // Instantiate Decoder template void Tokenizer::SetDecoder(const decoders::WordPiece& decoder); } // namespace core -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.h b/fast_tokenizer/fast_tokenizer/core/tokenizer.h similarity index 81% rename from faster_tokenizer/faster_tokenizer/core/tokenizer.h rename to fast_tokenizer/fast_tokenizer/core/tokenizer.h index d709cc5a5c6e..71fdf6099f64 100644 --- a/faster_tokenizer/faster_tokenizer/core/tokenizer.h +++ b/fast_tokenizer/fast_tokenizer/core/tokenizer.h @@ -16,14 +16,14 @@ limitations under the License. 
*/ #include // For shared_ptr #include -#include "faster_tokenizer/core/added_vocabulary.h" -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/utils/utils.h" -#include "faster_tokenizer/utils/variant.h" +#include "fast_tokenizer/core/added_vocabulary.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/variant.h" #include "nlohmann/json.hpp" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { @@ -60,7 +60,7 @@ using InputString = paddlenlp::variant>; using EncodeInput = paddlenlp::variant>; -class FASTERTOKENIZER_DECL Tokenizer { +class FASTTOKENIZER_DECL Tokenizer { public: Tokenizer() : model_(nullptr), @@ -149,16 +149,17 @@ class FASTERTOKENIZER_DECL Tokenizer { uint32_t type_id, OffsetType offset_type, Encoding* encodings) const; - void EncodePairStrings(const EncodeInput& encode_input, - Encoding* encodings, - bool add_special_tokens = true) const; - void EncodePairStringsCharOffsets(const EncodeInput& encode_input, - Encoding* encodings, - bool add_special_tokens = true) const; void PostProcess(Encoding* encoding, Encoding* pair_encoding, bool add_special_tokens, Encoding* result_encoding) const; + void EncodePairStrings(const EncodeInput& encode_input, + Encoding* encodings, + bool add_special_tokens = true) const; + void EncodePairStrings(const std::string& text, + const std::string& text_pair, + Encoding* encodings, + bool add_special_tokens = true) const; void MultiThreadEncodeBatchStrings( const std::vector& batch_encode_input, @@ -166,22 +167,30 @@ class FASTERTOKENIZER_DECL Tokenizer { bool add_special_tokens, size_t start_index, size_t step_index) const; + // Tokenize the unpretokenized text. + void MultiThreadEncodeBatchStrings(const std::vector& texts, + std::vector* encodings, + bool add_special_tokens, + size_t start_index, + size_t step_index) const; + void MultiThreadEncodeBatchStrings(const std::vector& texts, + const std::vector& text_pairs, + std::vector* encodings, + bool add_special_tokens, + size_t start_index, + size_t step_index) const; void EncodeBatchStrings(const std::vector& batch_encode_input, std::vector* encodings, bool add_special_tokens = true) const; - - void MultiThreadEncodeBatchStringsCharOffsets( - const std::vector& batch_encode_input, - std::vector* encodings, - bool add_special_tokens, - size_t start_index, - size_t step_index) const; - - void EncodeBatchStringsCharOffsets( - const std::vector& batch_encode_input, - std::vector* encodings, - bool add_special_tokens = true) const; + // Tokenize the unpretokenized text. + void EncodeBatchStrings(const std::vector& texts, + std::vector* encodings, + bool add_special_tokens = true) const; + void EncodeBatchStrings(const std::vector& texts, + const std::vector& text_pairs, + std::vector* encodings, + bool add_special_tokens = true) const; // Encode single text which is already pretokenized. 
void EncodeSingleText(const std::vector& pretokenized_texts, @@ -241,5 +250,5 @@ class FASTERTOKENIZER_DECL Tokenizer { }; } // namespace core -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/decoders/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/decoders/CMakeLists.txt similarity index 100% rename from faster_tokenizer/faster_tokenizer/decoders/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/decoders/CMakeLists.txt diff --git a/faster_tokenizer/faster_tokenizer/decoders/decoder.h b/fast_tokenizer/fast_tokenizer/decoders/decoder.h similarity index 85% rename from faster_tokenizer/faster_tokenizer/decoders/decoder.h rename to fast_tokenizer/fast_tokenizer/decoders/decoder.h index 081e6693f13d..7969e22d11f7 100644 --- a/faster_tokenizer/faster_tokenizer/decoders/decoder.h +++ b/fast_tokenizer/fast_tokenizer/decoders/decoder.h @@ -16,17 +16,17 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace decoders { -struct FASTERTOKENIZER_DECL Decoder { +struct FASTTOKENIZER_DECL Decoder { virtual void operator()(const std::vector tokens, std::string* result) const = 0; }; } // namespace decoders -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/decoders/decoders.h b/fast_tokenizer/fast_tokenizer/decoders/decoders.h similarity index 86% rename from faster_tokenizer/faster_tokenizer/decoders/decoders.h rename to fast_tokenizer/fast_tokenizer/decoders/decoders.h index 29f8111df4ca..efc72779de9f 100644 --- a/faster_tokenizer/faster_tokenizer/decoders/decoders.h +++ b/fast_tokenizer/fast_tokenizer/decoders/decoders.h @@ -14,5 +14,5 @@ limitations under the License. */ #pragma once -#include "faster_tokenizer/decoders/decoder.h" -#include "faster_tokenizer/decoders/wordpiece.h" \ No newline at end of file +#include "fast_tokenizer/decoders/decoder.h" +#include "fast_tokenizer/decoders/wordpiece.h" \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/decoders/wordpiece.cc b/fast_tokenizer/fast_tokenizer/decoders/wordpiece.cc similarity index 93% rename from faster_tokenizer/faster_tokenizer/decoders/wordpiece.cc rename to fast_tokenizer/fast_tokenizer/decoders/wordpiece.cc index 385457c15292..e81f1562d242 100644 --- a/faster_tokenizer/faster_tokenizer/decoders/wordpiece.cc +++ b/fast_tokenizer/fast_tokenizer/decoders/wordpiece.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/decoders/wordpiece.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/decoders/wordpiece.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace decoders { WordPiece::WordPiece(const std::string prefix, bool cleanup) @@ -65,5 +65,5 @@ void from_json(const nlohmann::json& j, WordPiece& decoder) { } } // namespace decoders -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/decoders/wordpiece.h b/fast_tokenizer/fast_tokenizer/decoders/wordpiece.h similarity index 84% rename from faster_tokenizer/faster_tokenizer/decoders/wordpiece.h rename to fast_tokenizer/fast_tokenizer/decoders/wordpiece.h index 69d93675eba7..1f41b3f8b5dc 100644 --- a/faster_tokenizer/faster_tokenizer/decoders/wordpiece.h +++ b/fast_tokenizer/fast_tokenizer/decoders/wordpiece.h @@ -14,15 +14,15 @@ limitations under the License. */ #pragma once -#include "faster_tokenizer/decoders/decoder.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/decoders/decoder.h" +#include "fast_tokenizer/utils/utils.h" #include "nlohmann/json.hpp" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace decoders { -struct FASTERTOKENIZER_DECL WordPiece : public Decoder { +struct FASTTOKENIZER_DECL WordPiece : public Decoder { virtual void operator()(const std::vector tokens, std::string* result) const; @@ -38,5 +38,5 @@ struct FASTERTOKENIZER_DECL WordPiece : public Decoder { }; } // namespace decoders -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/demo/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/demo/CMakeLists.txt new file mode 100644 index 000000000000..fc0669fc128c --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/demo/CMakeLists.txt @@ -0,0 +1,41 @@ +cmake_minimum_required(VERSION 3.10) +project(cpp_fast_tokenizer_demo CXX C) + +option(FAST_TOKENIZER_INSTALL_DIR "Path of downloaded fast_tokenizer sdk.") + +# Download ernie vocab for demo +set(ERNIE_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/ernie_vocab.txt) +if (EXISTS ${ERNIE_VOCAB_PATH}) + message(STATUS "The ${ERNIE_VOCAB_PATH} exists already.") +else() + file(DOWNLOAD "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt" ${ERNIE_VOCAB_PATH} SHOW_PROGRESS) + message(STATUS "Already download the vocab.txt of ernie to ${CMAKE_CURRENT_BINARY_DIR} for demo.") +endif() + +# Download clip vocab and merge files +set(CLIP_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/clip_vocab.json) +set(CLIP_MERGES_PATH ${CMAKE_CURRENT_BINARY_DIR}/clip_merges.txt) + +if (EXISTS ${CLIP_VOCAB_PATH}) + message("The ${CLIP_VOCAB_PATH} exists already.") +else() + file(DOWNLOAD "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/vocab.json" ${CLIP_VOCAB_PATH} SHOW_PROGRESS) + message("Already download the vocab.json of clip to ${CMAKE_CURRENT_BINARY_DIR} for test.") +endif() + +if (EXISTS ${CLIP_MERGES_PATH}) + message("The ${CLIP_MERGES_PATH} exists already.") +else() + file(DOWNLOAD "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/merges.txt" ${CLIP_MERGES_PATH} SHOW_PROGRESS) + message("Already download the merges.txt of clip to ${CMAKE_CURRENT_BINARY_DIR} for test.") +endif() + +# Get FAST_TOKENIZER_INCS and FAST_TOKENIZER_LIBS +include(${FAST_TOKENIZER_INSTALL_DIR}/FastTokenizer.cmake) + 
+include_directories(${FAST_TOKENIZER_INCS}) + +add_executable(ernie_fast_tokenizer_demo ${PROJECT_SOURCE_DIR}/ernie_fast_tokenizer_demo.cc) +add_executable(clip_fast_tokenizer_demo ${PROJECT_SOURCE_DIR}/clip_fast_tokenizer_demo.cc) +target_link_libraries(ernie_fast_tokenizer_demo ${FAST_TOKENIZER_LIBS}) +target_link_libraries(clip_fast_tokenizer_demo ${FAST_TOKENIZER_LIBS}) diff --git a/faster_tokenizer/faster_tokenizer/demo/README.md b/fast_tokenizer/fast_tokenizer/demo/README.md similarity index 60% rename from faster_tokenizer/faster_tokenizer/demo/README.md rename to fast_tokenizer/fast_tokenizer/demo/README.md index 1b2c1c7110df..e9c745199376 100644 --- a/faster_tokenizer/faster_tokenizer/demo/README.md +++ b/fast_tokenizer/fast_tokenizer/demo/README.md @@ -1,15 +1,15 @@ -# FasterTokenizer C++ Demo +# FastTokenizer C++ Demo ## 1. 快速安装 -当前版本FasterTokenizer C++库支持不同的操作系统以及硬件平台,并为以下平台提供预编译包: +当前版本FastTokenizer C++库支持不同的操作系统以及硬件平台,并为以下平台提供预编译包: |系统|下载地址| |---|---| -|Linux-x64| [faster_tokenizer-linux-x64-dev.tgz](https://bj.bcebos.com/paddlenlp/faster_tokenizer/faster_tokenizer-linux-x64-dev.tgz) | -|Linux-aarch64| [faster_tokenizer-linux-aarch64-dev.tgz](https://bj.bcebos.com/paddlenlp/faster_tokenizer/faster_tokenizer-linux-aarch64-dev.tgz) | -|Windows| [faster_tokenizer-win-x64-dev.zip](https://bj.bcebos.com/paddlenlp/faster_tokenizer/faster_tokenizer-win-x64-dev.zip) | -|MacOS-x64| [faster_tokenizer-osx-x86_64-dev.tgz](https://bj.bcebos.com/paddlenlp/faster_tokenizer/faster_tokenizer-osx-x86_64-dev.tgz) | -|MacOS-arm64| [faster_tokenizer-osx-arm64-dev.tgz](https://bj.bcebos.com/paddlenlp/faster_tokenizer/faster_tokenizer-osx-arm64-dev.tgz) | +|Linux-x64| [fast_tokenizer-linux-x64-dev.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-dev.tgz) | +|Linux-aarch64| [fast_tokenizer-linux-aarch64-dev.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-aarch64-dev.tgz) | +|Windows| [fast_tokenizer-win-x64-dev.zip](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-dev.zip) | +|MacOS-x64| [fast_tokenizer-osx-x86_64-dev.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-dev.tgz) | +|MacOS-arm64| [fast_tokenizer-osx-arm64-dev.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-arm64-dev.tgz) | ### 环境依赖 @@ -35,15 +35,15 @@ ## 2. 快速开始 -以下以Linux平台为例, 介绍如何使用FasterTokenizer CPP预编译包完成demo示例编译及运行。 +以下以Linux平台为例, 介绍如何使用FastTokenizer CPP预编译包完成demo示例编译及运行。 ### 2.1 下载解压 ```shell -wget -c https://bj.bcebos.com/paddlenlp/faster_tokenizer/faster_tokenizer-linux-x64-dev.tgz +wget -c https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-dev.tgz -tar xvfz faster_tokenizer-linux-x64-dev.tgz -# 解压后为faster_tokenizer目录 +tar xvfz fast_tokenizer-linux-x64-dev.tgz +# 解压后为fast_tokenizer目录 ``` ### 2.1 编译 @@ -53,8 +53,8 @@ tar xvfz faster_tokenizer-linux-x64-dev.tgz mkdir build cd build -# 运行cmake,通过指定faster_tokenizer包的路径,构建Makefile -cmake .. -DFASTER_TOKENIZER_INSTALL_DIR=/path/to/faster_tokenizer +# 运行cmake,通过指定fast_tokenizer包的路径,构建Makefile +cmake .. 
-DFAST_TOKENIZER_INSTALL_DIR=/path/to/fast_tokenizer # 编译 make @@ -63,7 +63,7 @@ make ### 2.2 运行 ```shell -./ernie_faster_tokenizer_demo +./ernie_fast_tokenizer_demo ``` diff --git a/fast_tokenizer/fast_tokenizer/demo/clip_fast_tokenizer_demo.cc b/fast_tokenizer/fast_tokenizer/demo/clip_fast_tokenizer_demo.cc new file mode 100644 index 000000000000..064d9bd54c78 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/demo/clip_fast_tokenizer_demo.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h" +#include +#include +using namespace paddlenlp; + +template +std::ostream& operator<<(std::ostream& os, const std::vector vec) { + os << "["; + for (int i = 0; i < vec.size(); ++i) { + if (i == 0) { + os << vec[i]; + } else { + os << ", " << vec[i]; + } + } + os << "]"; + return os; +} + +fast_tokenizer::tokenizers_impl::ClipFastTokenizer CreateClipFastTokenizer( + const std::string& vocab_path, + const std::string& merge_path, + uint32_t max_length, + bool pad = true) { + fast_tokenizer::tokenizers_impl::ClipFastTokenizer tokenizer( + vocab_path, merge_path, max_length); + if (pad) { + tokenizer.EnablePadMethod(fast_tokenizer::core::RIGHT, + tokenizer.GetPadTokenId(), + 0, + tokenizer.GetPadToken(), + &max_length, + nullptr); + } + return tokenizer; +} + +int main() { + // 1. Define a clip fast tokenizer + auto tokenizer = + CreateClipFastTokenizer("clip_vocab.json", "clip_merges.txt", 77, true); + // 2. Tokenize the input strings + std::vector encodings; + std::vector texts = { + "a photo of an astronaut riding a horse on mars"}; + tokenizer.EncodeBatchStrings(texts, &encodings); + for (auto&& encoding : encodings) { + auto ids = encoding.GetIds(); + std::cout << ids << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/demo/ernie_faster_tokenizer_demo.cc b/fast_tokenizer/fast_tokenizer/demo/ernie_fast_tokenizer_demo.cc similarity index 59% rename from faster_tokenizer/faster_tokenizer/demo/ernie_faster_tokenizer_demo.cc rename to fast_tokenizer/fast_tokenizer/demo/ernie_fast_tokenizer_demo.cc index d97d509a04ae..6cf9812735c2 100644 --- a/faster_tokenizer/faster_tokenizer/demo/ernie_faster_tokenizer_demo.cc +++ b/fast_tokenizer/fast_tokenizer/demo/ernie_fast_tokenizer_demo.cc @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h" +#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h" #include #include using namespace paddlenlp; int main() { - using faster_tokenizer::core::EncodeInput; - // 1. Define a ernie faster tokenizer - faster_tokenizer::tokenizers_impl::ErnieFasterTokenizer tokenizer( + // 1. Define a ernie fast tokenizer + fast_tokenizer::tokenizers_impl::ErnieFastTokenizer tokenizer( "ernie_vocab.txt"); // 2. 
Tokenize the input strings // case 1: tokenize a single string std::cout << "case 1: Tokenize a single string" << std::endl; - faster_tokenizer::core::Encoding encoding; - EncodeInput single_string = + fast_tokenizer::core::Encoding encoding; + std::string single_string = "商赢环球股份有限公司关于延期回复上海证券交易所对" "公司2017年年度报告的事后审核问询函的公告"; tokenizer.EncodePairStrings(single_string, &encoding); @@ -34,16 +33,16 @@ int main() { // case 2: tokenize a pair of strings std::cout << "case 2: Tokenize a pair of strings" << std::endl; - EncodeInput pair_string = - std::pair{"蚂蚁借呗等额还款可以换成先息后本吗", - "借呗有先息到期还本吗"}; - tokenizer.EncodePairStrings(pair_string, &encoding); + std::string text = "蚂蚁借呗等额还款可以换成先息后本吗"; + std::string text_pair = "借呗有先息到期还本吗"; + + tokenizer.EncodePairStrings(text, text_pair, &encoding); std::cout << encoding.DebugString() << std::endl; // case 3: Tokenize a batch of single strings std::cout << "case 3: Tokenize a batch of single strings" << std::endl; - std::vector encodings; - std::vector strings_list = { + std::vector encodings; + std::vector strings_list = { "通过中介公司买了二手房,首付都付了,现在卖家不想卖了。怎么处理?", "凌云研发的国产两轮电动车怎么样,有什么惊喜?", "一辆车的寿命到底多长,最多可以开多久?"}; @@ -54,14 +53,16 @@ int main() { // case 4: Tokenize a batch of pair strings std::cout << "case 4: Tokenize a batch of pair strings" << std::endl; - std::vector pair_strings_list = { - std::pair({"花呗自动从余额宝扣款,需要我自己设置吗", "支付宝余额会自动还花呗吗"}), - std::pair({"这个蚂蚁花呗能恢复正常用不", "我的蚂蚁花呗 怎么用不了"}), - std::pair({"在经济的另一次转变中,人们发现在低地农场饲养羔羊更具成本效益,部分原因" - "是那里有更丰富、更有营养的牧场,因此湖地农场的利润变得更少。", - "人们发现,经济的另一个转变更有营养。"}), - }; - tokenizer.EncodeBatchStrings(pair_strings_list, &encodings); + std::vector texts = { + "花呗自动从余额宝扣款,需要我自己设置吗", + "这个蚂蚁花呗能恢复正常用不", + "在经济的另一次转变中,人们发现在低地农场饲养羔羊更具成本效益,部分原因" + "是那里有更丰富、更有营养的牧场,因此湖地农场的利润变得更少。"}; + std::vector text_pairs = { + "支付宝余额会自动还花呗吗", + "我的蚂蚁花呗 怎么用不了", + "人们发现,经济的另一个转变更有营养。"}; + tokenizer.EncodeBatchStrings(texts, text_pairs, &encodings); for (auto&& encoding : encodings) { std::cout << encoding.DebugString() << std::endl; } diff --git a/faster_tokenizer/faster_tokenizer/models/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/models/CMakeLists.txt similarity index 56% rename from faster_tokenizer/faster_tokenizer/models/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/models/CMakeLists.txt index 05c568cb9a87..f51706a77035 100644 --- a/faster_tokenizer/faster_tokenizer/models/CMakeLists.txt +++ b/fast_tokenizer/fast_tokenizer/models/CMakeLists.txt @@ -1,3 +1,3 @@ cc_library(models - SRCS wordpiece.cc faster_wordpiece.cc bpe.cc unigram.cc + SRCS wordpiece.cc fast_wordpiece.cc bpe.cc unigram.cc DEPS core json trie failure icuuc icudata lattice utils) diff --git a/faster_tokenizer/faster_tokenizer/models/bpe.cc b/fast_tokenizer/fast_tokenizer/models/bpe.cc similarity index 98% rename from faster_tokenizer/faster_tokenizer/models/bpe.cc rename to fast_tokenizer/fast_tokenizer/models/bpe.cc index 89e3a7464779..ad80d2752d6f 100644 --- a/faster_tokenizer/faster_tokenizer/models/bpe.cc +++ b/fast_tokenizer/fast_tokenizer/models/bpe.cc @@ -19,12 +19,12 @@ limitations under the License. 
*/ #include #include "glog/logging.h" -#include "faster_tokenizer/models/bpe.h" -#include "faster_tokenizer/utils/path.h" -#include "faster_tokenizer/utils/utf8.h" +#include "fast_tokenizer/models/bpe.h" +#include "fast_tokenizer/utils/path.h" +#include "fast_tokenizer/utils/utf8.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { const std::string WHITESPACE = " \n\r\t\f\v"; @@ -345,5 +345,5 @@ void from_json(const nlohmann::json& j, BPE& model) { } } // namespace model -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/bpe.h b/fast_tokenizer/fast_tokenizer/models/bpe.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/models/bpe.h rename to fast_tokenizer/fast_tokenizer/models/bpe.h index ee922474f6c7..bb8cfd08cc41 100644 --- a/faster_tokenizer/faster_tokenizer/models/bpe.h +++ b/fast_tokenizer/fast_tokenizer/models/bpe.h @@ -14,16 +14,16 @@ limitations under the License. */ #pragma once -#include "faster_tokenizer/models/model.h" +#include "fast_tokenizer/models/model.h" #include "nlohmann/json.hpp" -#include "faster_tokenizer/utils/cache.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/cache.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { -struct FASTERTOKENIZER_DECL BPE : public Model { +struct FASTTOKENIZER_DECL BPE : public Model { BPE(); BPE(const core::Vocab& vocab, const core::Merges& merges, @@ -78,5 +78,5 @@ struct FASTERTOKENIZER_DECL BPE : public Model { }; } // namespace models -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc b/fast_tokenizer/fast_tokenizer/models/fast_wordpiece.cc similarity index 91% rename from faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc rename to fast_tokenizer/fast_tokenizer/models/fast_wordpiece.cc index 4272b8a3c5a0..bbe712092359 100644 --- a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc +++ b/fast_tokenizer/fast_tokenizer/models/fast_wordpiece.cc @@ -22,19 +22,19 @@ #include "glog/logging.h" #include "unicode/uchar.h" -#include "faster_tokenizer/models/faster_wordpiece.h" -#include "faster_tokenizer/models/wordpiece.h" -#include "faster_tokenizer/utils/path.h" -#include "faster_tokenizer/utils/utf8.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/models/fast_wordpiece.h" +#include "fast_tokenizer/models/wordpiece.h" +#include "fast_tokenizer/utils/path.h" +#include "fast_tokenizer/utils/utf8.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { const std::string WHITESPACE = " \n\r\t\f\v"; -void FasterWordPiece::InitFailureAndTrie() { +void FastWordPiece::InitFailureAndTrie() { unk_token_id_ = vocab_.at(unk_token_); trie_.SetWithPretokenization(with_pretokenization_); trie_.SetUNKToken(unk_token_); @@ -45,10 +45,10 @@ void FasterWordPiece::InitFailureAndTrie() { PrecomputeEncodeValueForSubwordPrefix(); } -FasterWordPiece::FasterWordPiece() +FastWordPiece::FastWordPiece() : WordPiece(), with_pretokenization_(false) {} -FasterWordPiece::FasterWordPiece(const core::Vocab& vocab, +FastWordPiece::FastWordPiece(const core::Vocab& vocab, const std::string& unk_token, size_t max_input_chars_per_word, const std::string& 
continuing_subword_prefix, @@ -63,7 +63,7 @@ FasterWordPiece::FasterWordPiece(const core::Vocab& vocab, InitFailureAndTrie(); } -void FasterWordPiece::PrecomputeEncodeValueForSubwordPrefix() { +void FastWordPiece::PrecomputeEncodeValueForSubwordPrefix() { auto subword_prefix_tokens = WordPiece::Tokenize(continuing_subword_prefix_); encoded_value_for_subword_prefix_.reserve(subword_prefix_tokens.size()); @@ -78,7 +78,7 @@ void FasterWordPiece::PrecomputeEncodeValueForSubwordPrefix() { } } -bool FasterWordPiece::TryFollowFailureLinkAndCollectTokens( +bool FastWordPiece::TryFollowFailureLinkAndCollectTokens( const std::string& sequence, int sequence_offset_in_text, int* curr_offset_in_sequence, @@ -115,7 +115,7 @@ bool FasterWordPiece::TryFollowFailureLinkAndCollectTokens( return true; } -void FasterWordPiece::AppendTokensToOutput( +void FastWordPiece::AppendTokensToOutput( const std::string& sequence, int sequence_offset_in_text, int* curr_offset_in_sequence, @@ -149,7 +149,7 @@ void FasterWordPiece::AppendTokensToOutput( *curr_offset_in_sequence += token_substr_length; } -void FasterWordPiece::ResetOutputAppendUNK( +void FastWordPiece::ResetOutputAppendUNK( int sequence_offset_in_text, int sequence_size, int* original_num_tokens, @@ -162,7 +162,7 @@ void FasterWordPiece::ResetOutputAppendUNK( (*original_num_tokens)++; } -bool FasterWordPiece::TryHandleContinuingSubWordPrefix( +bool FastWordPiece::TryHandleContinuingSubWordPrefix( const std::string& sequence, int sequence_offset_in_text, const utils::Trie::TraversalCursor& curr_node, @@ -193,7 +193,7 @@ bool FasterWordPiece::TryHandleContinuingSubWordPrefix( return true; } -void FasterWordPiece::HandleTheRemainingStringOnTriePath( +void FastWordPiece::HandleTheRemainingStringOnTriePath( const std::string& sequence, int sequence_offset_in_text, utils::Trie::TraversalCursor* curr_node, @@ -229,7 +229,7 @@ void FasterWordPiece::HandleTheRemainingStringOnTriePath( *original_num_tokens = tokens->size(); } -int FasterWordPiece::SkipRemainingOfWordAndTrailingWhiteSpaces( +int FastWordPiece::SkipRemainingOfWordAndTrailingWhiteSpaces( const std::string& sequence, int* curr_idx) const { int seq_len = sequence.length(); uint32_t curr_unicode_char; @@ -251,9 +251,9 @@ int FasterWordPiece::SkipRemainingOfWordAndTrailingWhiteSpaces( return end_of_word; } -std::vector FasterWordPiece::TokenizeWithoutPreTokenize( +std::vector FastWordPiece::TokenizeWithoutPreTokenize( const std::string& sequence) const { - VLOG(6) << "Using FasterWordPiece::TokenizeWithoutPreTokenize to tokenize " + VLOG(6) << "Using FastWordPiece::TokenizeWithoutPreTokenize to tokenize " "sequence"; if (sequence.empty()) { return {}; @@ -295,10 +295,10 @@ std::vector FasterWordPiece::TokenizeWithoutPreTokenize( return all_tokens; } -std::vector FasterWordPiece::TokenizeWithPreTokenize( +std::vector FastWordPiece::TokenizeWithPreTokenize( const std::string& sequence) const { VLOG(6) - << "Using FasterWordPiece::TokenizeWithPreTokenize to tokenize sequence"; + << "Using FastWordPiece::TokenizeWithPreTokenize to tokenize sequence"; // Need to implement if (sequence.empty()) { return {}; @@ -383,7 +383,7 @@ std::vector FasterWordPiece::TokenizeWithPreTokenize( return all_tokens; } -std::vector FasterWordPiece::Tokenize( +std::vector FastWordPiece::Tokenize( const std::string& sequence) { if (!with_pretokenization_) { return TokenizeWithoutPreTokenize(sequence); @@ -391,9 +391,9 @@ std::vector FasterWordPiece::Tokenize( return TokenizeWithPreTokenize(sequence); } -void to_json(nlohmann::json& 
j, const FasterWordPiece& model) { +void to_json(nlohmann::json& j, const FastWordPiece& model) { j = { - {"type", "FasterWordPiece"}, + {"type", "FastWordPiece"}, {"vocab", model.vocab_}, {"unk_token", model.unk_token_}, {"max_input_chars_per_word", model.max_input_chars_per_word_}, @@ -402,7 +402,7 @@ void to_json(nlohmann::json& j, const FasterWordPiece& model) { }; } -void from_json(const nlohmann::json& j, FasterWordPiece& model) { +void from_json(const nlohmann::json& j, FastWordPiece& model) { j["vocab"].get_to(model.vocab_); j["unk_token"].get_to(model.unk_token_); j["max_input_chars_per_word"].get_to(model.max_input_chars_per_word_); @@ -412,5 +412,5 @@ void from_json(const nlohmann::json& j, FasterWordPiece& model) { } } // namespace models -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.h b/fast_tokenizer/fast_tokenizer/models/fast_wordpiece.h similarity index 84% rename from faster_tokenizer/faster_tokenizer/models/faster_wordpiece.h rename to fast_tokenizer/fast_tokenizer/models/fast_wordpiece.h index 3a93f7a2209c..ad9598c7bf0e 100644 --- a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.h +++ b/fast_tokenizer/fast_tokenizer/models/fast_wordpiece.h @@ -15,20 +15,20 @@ #pragma once -#include "faster_tokenizer/models/model.h" -#include "faster_tokenizer/models/wordpiece.h" +#include "fast_tokenizer/models/model.h" +#include "fast_tokenizer/models/wordpiece.h" #include "nlohmann/json.hpp" -#include "faster_tokenizer/utils/failure.h" -#include "faster_tokenizer/utils/trie.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/failure.h" +#include "fast_tokenizer/utils/trie.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { -struct FASTERTOKENIZER_DECL FasterWordPiece : public WordPiece { - FasterWordPiece(); - FasterWordPiece(const core::Vocab& vocab, +struct FASTTOKENIZER_DECL FastWordPiece : public WordPiece { + FastWordPiece(); + FastWordPiece(const core::Vocab& vocab, const std::string& unk_token = "[UNK]", size_t max_input_chars_per_word = 100, const std::string& continuing_subword_prefix = "##", @@ -79,11 +79,11 @@ struct FASTERTOKENIZER_DECL FasterWordPiece : public WordPiece { utils::Trie trie_; utils::FailureArray failure_array_; std::vector encoded_value_for_subword_prefix_; - friend void to_json(nlohmann::json& j, const FasterWordPiece& model); - friend void from_json(const nlohmann::json& j, FasterWordPiece& model); - bool with_pretokenization_; // The end-to-end version of FasterWordPiece + friend void to_json(nlohmann::json& j, const FastWordPiece& model); + friend void from_json(const nlohmann::json& j, FastWordPiece& model); + bool with_pretokenization_; // The end-to-end version of FastWordPiece }; } // namespace models -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/model.h b/fast_tokenizer/fast_tokenizer/models/model.h similarity index 87% rename from faster_tokenizer/faster_tokenizer/models/model.h rename to fast_tokenizer/fast_tokenizer/models/model.h index 2ec7982dab85..8a8f8daddf6f 100644 --- a/faster_tokenizer/faster_tokenizer/models/model.h +++ b/fast_tokenizer/fast_tokenizer/models/model.h @@ -16,14 +16,14 @@ limitations under the License. 
*/ #include #include -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { -struct FASTERTOKENIZER_DECL Model { +struct FASTTOKENIZER_DECL Model { virtual std::vector Tokenize(const std::string& tokens) = 0; virtual bool TokenToId(const std::string& token, uint32_t* id) const = 0; virtual bool IdToToken(uint32_t id, std::string* token) const = 0; @@ -35,5 +35,5 @@ struct FASTERTOKENIZER_DECL Model { }; } // namespace model -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/postprocessors/postprocessors.h b/fast_tokenizer/fast_tokenizer/models/models.h similarity index 73% rename from faster_tokenizer/faster_tokenizer/postprocessors/postprocessors.h rename to fast_tokenizer/fast_tokenizer/models/models.h index 5ddd5c7fa259..feafdd1ae590 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/postprocessors.h +++ b/fast_tokenizer/fast_tokenizer/models/models.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once -#include "faster_tokenizer/postprocessors/bert.h" -#include "faster_tokenizer/postprocessors/postprocessor.h" -#include "faster_tokenizer/postprocessors/template.h" +#include "fast_tokenizer/models/bpe.h" +#include "fast_tokenizer/models/fast_wordpiece.h" +#include "fast_tokenizer/models/model.h" +#include "fast_tokenizer/models/unigram.h" +#include "fast_tokenizer/models/wordpiece.h" \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/models/unigram.cc b/fast_tokenizer/fast_tokenizer/models/unigram.cc similarity index 98% rename from faster_tokenizer/faster_tokenizer/models/unigram.cc rename to fast_tokenizer/fast_tokenizer/models/unigram.cc index fe132779eb44..255ee1c3ca29 100644 --- a/faster_tokenizer/faster_tokenizer/models/unigram.cc +++ b/fast_tokenizer/fast_tokenizer/models/unigram.cc @@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/models/unigram.h" +#include "fast_tokenizer/models/unigram.h" #include #include #include #include "glog/logging.h" -#include "faster_tokenizer/utils/path.h" -#include "faster_tokenizer/utils/unique_ptr.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/path.h" +#include "fast_tokenizer/utils/unique_ptr.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { constexpr float kUnkPenalty = 10.0; @@ -432,5 +432,5 @@ void from_json(const nlohmann::json& j, Unigram& model) { } } // namespace model -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/unigram.h b/fast_tokenizer/fast_tokenizer/models/unigram.h similarity index 90% rename from faster_tokenizer/faster_tokenizer/models/unigram.h rename to fast_tokenizer/fast_tokenizer/models/unigram.h index 633b70aa141e..c66cbbbae5a3 100644 --- a/faster_tokenizer/faster_tokenizer/models/unigram.h +++ b/fast_tokenizer/fast_tokenizer/models/unigram.h @@ -14,21 +14,21 @@ limitations under the License. 
*/ #pragma once -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/models/model.h" -#include "faster_tokenizer/utils/cache.h" -#include "faster_tokenizer/utils/lattice.h" -#include "faster_tokenizer/utils/trie.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/models/model.h" +#include "fast_tokenizer/utils/cache.h" +#include "fast_tokenizer/utils/lattice.h" +#include "fast_tokenizer/utils/trie.h" #include "darts.h" #include "nlohmann/json.hpp" #include "re2/re2.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { -struct FASTERTOKENIZER_DECL Unigram : public Model { +struct FASTTOKENIZER_DECL Unigram : public Model { Unigram(); Unigram(const core::VocabList& vocab, const std::vector& unk_id); Unigram(const Unigram& other); @@ -82,5 +82,5 @@ struct FASTERTOKENIZER_DECL Unigram : public Model { }; } // namespace models -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/wordpiece.cc b/fast_tokenizer/fast_tokenizer/models/wordpiece.cc similarity index 98% rename from faster_tokenizer/faster_tokenizer/models/wordpiece.cc rename to fast_tokenizer/fast_tokenizer/models/wordpiece.cc index 55e844a100bc..d19199e6177b 100644 --- a/faster_tokenizer/faster_tokenizer/models/wordpiece.cc +++ b/fast_tokenizer/fast_tokenizer/models/wordpiece.cc @@ -19,13 +19,13 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/models/wordpiece.h" -#include "faster_tokenizer/utils/path.h" -#include "faster_tokenizer/utils/utf8.h" +#include "fast_tokenizer/models/wordpiece.h" +#include "fast_tokenizer/utils/path.h" +#include "fast_tokenizer/utils/utf8.h" #include "glog/logging.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { const std::string WHITESPACE = " \n\r\t\f\v"; @@ -290,5 +290,5 @@ void WordPieceFactory::GetVocabFromFiles(const std::string& files) { } } // namespace model -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/wordpiece.h b/fast_tokenizer/fast_tokenizer/models/wordpiece.h similarity index 94% rename from faster_tokenizer/faster_tokenizer/models/wordpiece.h rename to fast_tokenizer/fast_tokenizer/models/wordpiece.h index b8fe7ac11e6f..956485522f25 100644 --- a/faster_tokenizer/faster_tokenizer/models/wordpiece.h +++ b/fast_tokenizer/fast_tokenizer/models/wordpiece.h @@ -14,14 +14,14 @@ limitations under the License. 
*/ #pragma once -#include "faster_tokenizer/models/model.h" +#include "fast_tokenizer/models/model.h" #include "nlohmann/json.hpp" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace models { -struct FASTERTOKENIZER_DECL WordPiece : public Model { +struct FASTTOKENIZER_DECL WordPiece : public Model { WordPiece(); WordPiece(const core::Vocab& vocab, const std::string& unk_token = "[UNK]", @@ -84,5 +84,5 @@ struct WordPieceFactory { }; } // namespace models -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/normalizers/CMakeLists.txt similarity index 100% rename from faster_tokenizer/faster_tokenizer/normalizers/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/normalizers/CMakeLists.txt diff --git a/faster_tokenizer/faster_tokenizer/normalizers/bert.cc b/fast_tokenizer/fast_tokenizer/normalizers/bert.cc similarity index 87% rename from faster_tokenizer/faster_tokenizer/normalizers/bert.cc rename to fast_tokenizer/fast_tokenizer/normalizers/bert.cc index 8f73325c44c5..c92d55e94f08 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/bert.cc +++ b/fast_tokenizer/fast_tokenizer/normalizers/bert.cc @@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/bert.h" #include #include #include +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/utils.h" +#include "fast_tokenizer/utils/utils.h" #include "glog/logging.h" -#include "faster_tokenizer/normalizers/strip.h" -#include "faster_tokenizer/normalizers/utils.h" #include "unicode/uchar.h" #include "unicode/unistr.h" -#include "faster_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { BertNormalizer::BertNormalizer(bool clean_text, bool handle_chinese_chars, @@ -35,14 +35,6 @@ BertNormalizer::BertNormalizer(bool clean_text, strip_accents_(strip_accents), lowercase_(lowercase) {} -static bool IsWhiteSpace(int ch) { - const std::string WHITESPACE = " \n\r\t\f\v"; - for (int i = 0; i < WHITESPACE.length(); ++i) { - if (ch == WHITESPACE[i]) return true; - } - return u_isspace(ch); -} - static bool IsControl(int ch) { if (ch == '\t' || ch == '\n' || ch == '\r') return false; // It means (general category "C"). @@ -55,7 +47,7 @@ void BertNormalizer::DoCleanText(NormalizedString* input) const { return !(ch == 0 || ch == 0xfffd || IsControl(ch)); }) .MapChar([](char32_t ch) -> char32_t { - if (IsWhiteSpace(ch)) { + if (utils::IsWhiteSpace(ch)) { return ' '; } return ch; @@ -118,5 +110,5 @@ void from_json(const nlohmann::json& j, BertNormalizer& bert_normalizer) { } } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/bert.h b/fast_tokenizer/fast_tokenizer/normalizers/bert.h similarity index 87% rename from faster_tokenizer/faster_tokenizer/normalizers/bert.h rename to fast_tokenizer/fast_tokenizer/normalizers/bert.h index 6f0dd4ff6863..4312bdefb01e 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/bert.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/bert.h @@ -16,13 +16,13 @@ limitations under the License. 
*/ #include #include "nlohmann/json.hpp" -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { -struct FASTERTOKENIZER_DECL BertNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL BertNormalizer : public Normalizer { BertNormalizer(bool clean_text = true, bool handle_chinese_chars = true, bool strip_accents = true, @@ -43,5 +43,5 @@ struct FASTERTOKENIZER_DECL BertNormalizer : public Normalizer { BertNormalizer& bert_normalizer); }; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc b/fast_tokenizer/fast_tokenizer/normalizers/normalizer.cc similarity index 94% rename from faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc rename to fast_tokenizer/fast_tokenizer/normalizers/normalizer.cc index c4a9bfb63475..bc63e0845063 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc +++ b/fast_tokenizer/fast_tokenizer/normalizers/normalizer.cc @@ -18,10 +18,10 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/utils/utf8.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/utf8.h" -#include "faster_tokenizer/normalizers/unicode.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "glog/logging.h" #include "re2/re2.h" #include "unicode/edits.h" @@ -31,7 +31,7 @@ limitations under the License. */ #include "unicode/utypes.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { NormalizedString::NormalizedString(const std::string& original) @@ -569,7 +569,8 @@ bool NormalizedString::Slice(core::Range range, uint32_t NormalizedString::GetMatch( const std::string& normalized, const re2::RE2& pattern, - std::vector>* matches) const { + std::vector>* matches, + bool invert) const { size_t start = 0; size_t end = normalized.length(); // Construct the matches whose mode is REMOVED. 
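> Note on the `GetMatch` hunks immediately above and below this point: besides the rename, the patch threads a new `invert` flag through `NormalizedString::GetMatch` and `Split`. With `invert == false` the behaviour is unchanged (each pattern match is flagged as `should_remove`, the text between matches is kept); with `invert == true` the roles are swapped. The standalone sketch below is **not** the library's API; the `PartitionMatches` helper, the plain substring pattern, and the `(start, end)` range pair are assumptions made purely to illustrate that flag-flipping partition logic.

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using Range = std::pair<size_t, size_t>;  // [start, end) byte offsets (illustrative)

// Partition `text` into alternating ranges; the bool means "should remove".
// invert == false: occurrences of `pattern` are flagged for removal, gaps are kept.
// invert == true : the gaps between occurrences are flagged instead.
std::vector<std::pair<Range, bool>> PartitionMatches(const std::string& text,
                                                     const std::string& pattern,
                                                     bool invert) {
  std::vector<std::pair<Range, bool>> matches;
  size_t start = 0;
  for (size_t pos = text.find(pattern); pos != std::string::npos;
       pos = text.find(pattern, pos + pattern.size())) {
    if (pos != start) {
      matches.push_back({{start, pos}, invert});                 // gap before the match
    }
    matches.push_back({{pos, pos + pattern.size()}, !invert});   // the match itself
    start = pos + pattern.size();
  }
  if (start < text.size()) {
    matches.push_back({{start, text.size()}, invert});           // trailing gap
  }
  return matches;
}

int main() {
  const std::string text = "a b c";
  for (bool invert : {false, true}) {
    std::cout << "invert=" << invert << ":\n";
    for (const auto& m : PartitionMatches(text, " ", invert)) {
      std::cout << "  [" << m.first.first << ", " << m.first.second << ") "
                << (m.second ? "remove" : "keep") << "\n";
    }
  }
  return 0;
}
```

> With `invert=false` the spaces are flagged for removal, as before; with `invert=true` it is the text between the spaces that gets flagged instead, which is the behaviour the new parameter enables.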
@@ -579,15 +580,22 @@ uint32_t NormalizedString::GetMatch( size_t curr_start = result.data() - normalized.data(); size_t curr_end = curr_start + result.length(); if (start != curr_start) { - matches->push_back({{start, curr_start}, false}); + matches->push_back({{start, curr_start}, invert}); + if (!invert) { + ++reserved_num; + } + } + matches->push_back({{curr_start, curr_end}, !invert}); + if (invert) { ++reserved_num; } - matches->push_back({{curr_start, curr_end}, true}); start = curr_end; } if (start < end) { - matches->push_back({{start, end}, false}); - ++reserved_num; + matches->push_back({{start, end}, invert}); + if (!invert) { + ++reserved_num; + } } return reserved_num; } @@ -595,7 +603,8 @@ uint32_t NormalizedString::GetMatch( uint32_t NormalizedString::GetMatch( const std::string& normalized, const std::function& pattern_func, - std::vector>* matches) const { + std::vector>* matches, + bool invert) const { size_t utf8_len = 0; size_t start = 0; size_t curr_start = 0; @@ -610,30 +619,38 @@ uint32_t NormalizedString::GetMatch( curr_start = utf8_len; curr_end = curr_start + chwidth; if (curr_start != start) { - matches->emplace_back(core::Range{start, curr_start}, false); + matches->emplace_back(core::Range{start, curr_start}, invert); + if (!invert) { + ++reserved_num; + } + } + matches->emplace_back(core::Range{curr_start, curr_end}, !invert); + if (invert) { ++reserved_num; } - matches->emplace_back(core::Range{curr_start, curr_end}, true); start = curr_end; } utf8_len += chwidth; } if (start < normalized.length()) { - matches->emplace_back(core::Range{start, normalized.length()}, false); - ++reserved_num; + matches->emplace_back(core::Range{start, normalized.length()}, invert); + if (!invert) { + ++reserved_num; + } } return reserved_num; } -template void NormalizedString::Split( - const re2::RE2& pattern, - SplitMode mode, - std::vector* normalizes) const; +template void NormalizedString::Split(const re2::RE2& pattern, + core::SplitMode mode, + std::vector* normalizes, + bool invert) const; template void NormalizedString::Split( const std::function& pattern_func, - SplitMode mode, - std::vector* normalizes) const; + core::SplitMode mode, + std::vector* normalizes, + bool invert) const; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.h b/fast_tokenizer/fast_tokenizer/normalizers/normalizer.h similarity index 89% rename from faster_tokenizer/faster_tokenizer/normalizers/normalizer.h rename to fast_tokenizer/fast_tokenizer/normalizers/normalizer.h index d13bdc033a70..9a8b74e687cb 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/normalizer.h @@ -17,31 +17,23 @@ limitations under the License. 
*/ #include #include #include -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/utils/utils.h" namespace re2 { class RE2; } // namespace re2 namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { -enum FASTERTOKENIZER_DECL SplitMode { - REMOVED, - ISOLATED, - MERGED_WITH_PREVIOUS, - MERGED_WITH_NEXT, - CONTIGUOUS -}; - -struct FASTERTOKENIZER_DECL OffsetMapping { +struct FASTTOKENIZER_DECL OffsetMapping { std::u32string u32normalized; std::vector changes; // Same size as normalized }; -class FASTERTOKENIZER_DECL NormalizedString { +class FASTTOKENIZER_DECL NormalizedString { public: NormalizedString(const std::string& original); NormalizedString(NormalizedString&& other); @@ -82,23 +74,24 @@ class FASTERTOKENIZER_DECL NormalizedString { template void Split(const PatternType& pattern, /* re2::RE2 or std::function */ - SplitMode mode, - std::vector* normalizes) const { + core::SplitMode mode, + std::vector* normalizes, + bool invert = false) const { // Vec<(Offsets, should_remove)> std::vector> matches; - auto normalizes_size = GetMatch(normalized_, pattern, &matches); + auto normalizes_size = GetMatch(normalized_, pattern, &matches, invert); // Convert matches switch (mode) { - case REMOVED: + case core::SplitMode::REMOVED: break; - case ISOLATED: { + case core::SplitMode::ISOLATED: { for (auto& match : matches) { match.second = false; } normalizes_size = matches.size(); break; } - case MERGED_WITH_PREVIOUS: { + case core::SplitMode::MERGED_WITH_PREVIOUS: { bool previous_match = false; std::vector> new_matches; for (const auto& match : matches) { @@ -119,7 +112,7 @@ class FASTERTOKENIZER_DECL NormalizedString { normalizes_size = matches.size(); break; } - case MERGED_WITH_NEXT: { + case core::SplitMode::MERGED_WITH_NEXT: { bool previous_match = false; std::vector> new_matches; for (auto it = matches.crbegin(); it != matches.crend(); ++it) { @@ -142,7 +135,7 @@ class FASTERTOKENIZER_DECL NormalizedString { std::reverse(matches.begin(), matches.end()); break; } - case CONTIGUOUS: { + case core::SplitMode::CONTIGUOUS: { bool previous_match = false; std::vector> new_matches; for (const auto& match : matches) { @@ -194,17 +187,19 @@ class FASTERTOKENIZER_DECL NormalizedString { uint32_t GetMatch(const std::string& normalized, const re2::RE2& pattern, - std::vector>* matches) const; + std::vector>* matches, + bool invert = false) const; uint32_t GetMatch(const std::string& normalized, const std::function& pattern_func, - std::vector>* matches) const; + std::vector>* matches, + bool invert = false) const; }; -struct FASTERTOKENIZER_DECL Normalizer { +struct FASTTOKENIZER_DECL Normalizer { virtual void operator()(NormalizedString* mut_str) const = 0; }; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizers.h b/fast_tokenizer/fast_tokenizer/normalizers/normalizers.h similarity index 64% rename from faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizers.h rename to fast_tokenizer/fast_tokenizer/normalizers/normalizers.h index 89abf2003d81..6f29e0c2eb25 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizers.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/normalizers.h @@ -14,8 +14,10 @@ limitations under the License. 
*/ #pragma once -#include "faster_tokenizer/pretokenizers/bert.h" -#include "faster_tokenizer/pretokenizers/metaspace.h" -#include "faster_tokenizer/pretokenizers/pretokenizer.h" -#include "faster_tokenizer/pretokenizers/sequence.h" -#include "faster_tokenizer/pretokenizers/whitespace.h" +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/normalizers/precompiled.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/unicode.h" +#include "fast_tokenizer/normalizers/utils.h" diff --git a/faster_tokenizer/faster_tokenizer/normalizers/precompiled.cc b/fast_tokenizer/fast_tokenizer/normalizers/precompiled.cc similarity index 94% rename from faster_tokenizer/faster_tokenizer/normalizers/precompiled.cc rename to fast_tokenizer/fast_tokenizer/normalizers/precompiled.cc index 52d909d1de2a..7d5189d30f26 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/precompiled.cc +++ b/fast_tokenizer/fast_tokenizer/normalizers/precompiled.cc @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/normalizers/precompiled.h" +#include "fast_tokenizer/normalizers/precompiled.h" #include #include #include "glog/logging.h" -#include "faster_tokenizer/utils/unique_ptr.h" +#include "fast_tokenizer/utils/unique_ptr.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { PrecompiledNormalizer::PrecompiledNormalizer( @@ -83,5 +83,5 @@ void from_json(const nlohmann::json& j, } } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/precompiled.h b/fast_tokenizer/fast_tokenizer/normalizers/precompiled.h similarity index 82% rename from faster_tokenizer/faster_tokenizer/normalizers/precompiled.h rename to fast_tokenizer/fast_tokenizer/normalizers/precompiled.h index 30d261d2abb0..3641952030f6 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/precompiled.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/precompiled.h @@ -16,15 +16,15 @@ limitations under the License. 
*/ #include #include "nlohmann/json.hpp" -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/utils/sentencepiece_normalizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/sentencepiece_normalizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { -struct FASTERTOKENIZER_DECL PrecompiledNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL PrecompiledNormalizer : public Normalizer { PrecompiledNormalizer() = default; explicit PrecompiledNormalizer(const std::string& precompiled_charsmap); PrecompiledNormalizer(const PrecompiledNormalizer& precompiled_normalizer); @@ -40,5 +40,5 @@ struct FASTERTOKENIZER_DECL PrecompiledNormalizer : public Normalizer { PrecompiledNormalizer& precompiled_normalizer); }; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/replace.cc b/fast_tokenizer/fast_tokenizer/normalizers/replace.cc similarity index 91% rename from faster_tokenizer/faster_tokenizer/normalizers/replace.cc rename to fast_tokenizer/fast_tokenizer/normalizers/replace.cc index b24e8260cb3e..1d7f81d09a5f 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/replace.cc +++ b/fast_tokenizer/fast_tokenizer/normalizers/replace.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/normalizers/replace.h" -#include "faster_tokenizer/utils/unique_ptr.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/utils/unique_ptr.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { ReplaceNormalizer::ReplaceNormalizer(const std::string& pattern, @@ -47,5 +47,5 @@ void from_json(const nlohmann::json& j, ReplaceNormalizer& replace_normalizer) { } } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/replace.h b/fast_tokenizer/fast_tokenizer/normalizers/replace.h similarity index 85% rename from faster_tokenizer/faster_tokenizer/normalizers/replace.h rename to fast_tokenizer/fast_tokenizer/normalizers/replace.h index 55e0fbf4fb0b..76141f7669c8 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/replace.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/replace.h @@ -17,15 +17,15 @@ limitations under the License. 
*/ #include #include #include "nlohmann/json.hpp" -#include "faster_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/normalizers/normalizer.h" #include "re2/re2.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { -struct FASTERTOKENIZER_DECL ReplaceNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL ReplaceNormalizer : public Normalizer { ReplaceNormalizer() = default; ReplaceNormalizer(const std::string& pattern, const std::string& content); ReplaceNormalizer(const ReplaceNormalizer& replace_normalizer); @@ -41,5 +41,5 @@ struct FASTERTOKENIZER_DECL ReplaceNormalizer : public Normalizer { }; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/strip.cc b/fast_tokenizer/fast_tokenizer/normalizers/strip.cc similarity index 94% rename from faster_tokenizer/faster_tokenizer/normalizers/strip.cc rename to fast_tokenizer/fast_tokenizer/normalizers/strip.cc index 375c6159b37c..c14c23f27164 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/strip.cc +++ b/fast_tokenizer/fast_tokenizer/normalizers/strip.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/strip.h" #include "unicode/translit.h" #include "unicode/unistr.h" #include "unicode/utypes.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { StripNormalizer::StripNormalizer(bool left /* = true*/, bool right /* = true*/) : left_(left), right_(right) {} @@ -64,5 +64,5 @@ void from_json(const nlohmann::json& j, StripAccentsNormalizer& strip_normalizer) {} } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/strip.h b/fast_tokenizer/fast_tokenizer/normalizers/strip.h similarity index 83% rename from faster_tokenizer/faster_tokenizer/normalizers/strip.h rename to fast_tokenizer/fast_tokenizer/normalizers/strip.h index 1b36d3890cf4..e8af13ac36a6 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/strip.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/strip.h @@ -16,14 +16,14 @@ limitations under the License. 
*/ #include -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { -struct FASTERTOKENIZER_DECL StripNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL StripNormalizer : public Normalizer { StripNormalizer(bool left = true, bool right = true); virtual void operator()(NormalizedString* input) const override; StripNormalizer(StripNormalizer&&) = default; @@ -38,7 +38,7 @@ struct FASTERTOKENIZER_DECL StripNormalizer : public Normalizer { StripNormalizer& strip_normalizer); }; -struct FASTERTOKENIZER_DECL StripAccentsNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL StripAccentsNormalizer : public Normalizer { virtual void operator()(NormalizedString* input) const override; friend void to_json(nlohmann::json& j, const StripAccentsNormalizer& strip_normalizer); @@ -47,5 +47,5 @@ struct FASTERTOKENIZER_DECL StripAccentsNormalizer : public Normalizer { }; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/unicode.cc b/fast_tokenizer/fast_tokenizer/normalizers/unicode.cc similarity index 96% rename from faster_tokenizer/faster_tokenizer/normalizers/unicode.cc rename to fast_tokenizer/fast_tokenizer/normalizers/unicode.cc index e73132e0434b..5c16c0d00eae 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/unicode.cc +++ b/fast_tokenizer/fast_tokenizer/normalizers/unicode.cc @@ -17,14 +17,14 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/normalizers/unicode.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "unicode/edits.h" #include "unicode/errorcode.h" #include "unicode/normalizer2.h" #include "unicode/utypes.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { void NFCNormalizer::operator()(NormalizedString* input) const { input->NFC(); } @@ -99,5 +99,5 @@ void to_json(nlohmann::json& j, const NmtNormalizer& normalizer) { void from_json(const nlohmann::json& j, NmtNormalizer& normalizer) {} } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/unicode.h b/fast_tokenizer/fast_tokenizer/normalizers/unicode.h similarity index 79% rename from faster_tokenizer/faster_tokenizer/normalizers/unicode.h rename to fast_tokenizer/fast_tokenizer/normalizers/unicode.h index 4d71f4b4cd32..6bf9c4b8de42 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/unicode.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/unicode.h @@ -15,43 +15,43 @@ limitations under the License. 
*/ #pragma once #include -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { -struct FASTERTOKENIZER_DECL NFCNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL NFCNormalizer : public Normalizer { virtual void operator()(NormalizedString* input) const override; friend void to_json(nlohmann::json& j, const NFCNormalizer& normalizer); friend void from_json(const nlohmann::json& j, NFCNormalizer& normalizer); }; -struct FASTERTOKENIZER_DECL NFDNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL NFDNormalizer : public Normalizer { virtual void operator()(NormalizedString* input) const override; friend void to_json(nlohmann::json& j, const NFDNormalizer& normalizer); friend void from_json(const nlohmann::json& j, NFDNormalizer& normalizer); }; -struct FASTERTOKENIZER_DECL NFKCNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL NFKCNormalizer : public Normalizer { virtual void operator()(NormalizedString* input) const override; friend void to_json(nlohmann::json& j, const NFKCNormalizer& normalizer); friend void from_json(const nlohmann::json& j, NFKCNormalizer& normalizer); }; -struct FASTERTOKENIZER_DECL NFKDNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL NFKDNormalizer : public Normalizer { virtual void operator()(NormalizedString* input) const override; friend void to_json(nlohmann::json& j, const NFKDNormalizer& normalizer); friend void from_json(const nlohmann::json& j, NFKDNormalizer& normalizer); }; -struct FASTERTOKENIZER_DECL NmtNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL NmtNormalizer : public Normalizer { virtual void operator()(NormalizedString* input) const override; friend void to_json(nlohmann::json& j, const NmtNormalizer& normalizer); friend void from_json(const nlohmann::json& j, NmtNormalizer& normalizer); }; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/utils.cc b/fast_tokenizer/fast_tokenizer/normalizers/utils.cc similarity index 95% rename from faster_tokenizer/faster_tokenizer/normalizers/utils.cc rename to fast_tokenizer/fast_tokenizer/normalizers/utils.cc index af3c43ef25a3..15f6875b8749 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/utils.cc +++ b/fast_tokenizer/fast_tokenizer/normalizers/utils.cc @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/normalizers/utils.h" -#include "faster_tokenizer/normalizers/bert.h" -#include "faster_tokenizer/normalizers/precompiled.h" -#include "faster_tokenizer/normalizers/replace.h" -#include "faster_tokenizer/normalizers/strip.h" -#include "faster_tokenizer/normalizers/unicode.h" +#include "fast_tokenizer/normalizers/utils.h" +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/precompiled.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "unicode/unistr.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { void SequenceNormalizer::AppendNormalizer(Normalizer* normalizer) { @@ -154,5 +154,5 @@ void to_json(nlohmann::json& j, const LowercaseNormalizer& normalizer) { void from_json(const nlohmann::json& j, LowercaseNormalizer& normalizer) {} } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/normalizers/utils.h b/fast_tokenizer/fast_tokenizer/normalizers/utils.h similarity index 84% rename from faster_tokenizer/faster_tokenizer/normalizers/utils.h rename to fast_tokenizer/fast_tokenizer/normalizers/utils.h index 88d10887c805..94fd2c91cdf0 100644 --- a/faster_tokenizer/faster_tokenizer/normalizers/utils.h +++ b/fast_tokenizer/fast_tokenizer/normalizers/utils.h @@ -17,14 +17,14 @@ limitations under the License. */ #include #include #include -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace normalizers { -struct FASTERTOKENIZER_DECL SequenceNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL SequenceNormalizer : public Normalizer { SequenceNormalizer() = default; SequenceNormalizer(const SequenceNormalizer&) = default; SequenceNormalizer(const std::vector& normalizers); @@ -38,7 +38,7 @@ struct FASTERTOKENIZER_DECL SequenceNormalizer : public Normalizer { SequenceNormalizer& normalizer); }; -struct FASTERTOKENIZER_DECL LowercaseNormalizer : public Normalizer { +struct FASTTOKENIZER_DECL LowercaseNormalizer : public Normalizer { virtual void operator()(NormalizedString* input) const override; friend void to_json(nlohmann::json& j, const LowercaseNormalizer& normalizer); friend void from_json(const nlohmann::json& j, @@ -46,5 +46,5 @@ struct FASTERTOKENIZER_DECL LowercaseNormalizer : public Normalizer { }; } // namespace normalizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/postprocessors/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/postprocessors/CMakeLists.txt similarity index 58% rename from faster_tokenizer/faster_tokenizer/postprocessors/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/postprocessors/CMakeLists.txt index ec4a80daf73d..9d4aad766e77 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/CMakeLists.txt +++ b/fast_tokenizer/fast_tokenizer/postprocessors/CMakeLists.txt @@ -1 +1 @@ -cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc DEPS core json) +cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc roberta.cc byte_level.cc DEPS core json) diff --git 
a/faster_tokenizer/faster_tokenizer/postprocessors/bert.cc b/fast_tokenizer/fast_tokenizer/postprocessors/bert.cc similarity index 98% rename from faster_tokenizer/faster_tokenizer/postprocessors/bert.cc rename to fast_tokenizer/fast_tokenizer/postprocessors/bert.cc index 16b77a194296..d40067c9d837 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/bert.cc +++ b/fast_tokenizer/fast_tokenizer/postprocessors/bert.cc @@ -14,12 +14,12 @@ #include -#include "faster_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/encoding.h" #include "glog/logging.h" -#include "faster_tokenizer/postprocessors/bert.h" +#include "fast_tokenizer/postprocessors/bert.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace postprocessors { BertPostProcessor::BertPostProcessor() @@ -204,5 +204,5 @@ void from_json(const nlohmann::json& j, BertPostProcessor& bert_postprocessor) { } } // namespace postprocessors -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/postprocessors/bert.h b/fast_tokenizer/fast_tokenizer/postprocessors/bert.h similarity index 51% rename from faster_tokenizer/faster_tokenizer/postprocessors/bert.h rename to fast_tokenizer/fast_tokenizer/postprocessors/bert.h index 167b6abf7782..cc8c77f9785b 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/bert.h +++ b/fast_tokenizer/fast_tokenizer/postprocessors/bert.h @@ -1,28 +1,28 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#pragma once +#include "fast_tokenizer/postprocessors/postprocessor.h" +#include "fast_tokenizer/utils/utils.h" #include "nlohmann/json.hpp" -#include "faster_tokenizer/postprocessors/postprocessor.h" -#include "faster_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace postprocessors { -struct FASTERTOKENIZER_DECL BertPostProcessor : public PostProcessor { +struct FASTTOKENIZER_DECL BertPostProcessor : public PostProcessor { BertPostProcessor(const std::pair& sep, const std::pair& cls); BertPostProcessor(); @@ -39,5 +39,5 @@ struct FASTERTOKENIZER_DECL BertPostProcessor : public PostProcessor { BertPostProcessor& bert_postprocessor); }; } // namespace postprocessors -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/postprocessors/byte_level.cc b/fast_tokenizer/fast_tokenizer/postprocessors/byte_level.cc new file mode 100644 index 000000000000..51f0b45ecec4 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/postprocessors/byte_level.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fast_tokenizer/postprocessors/byte_level.h" +#include "fast_tokenizer/pretokenizers/byte_level.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace postprocessors { + +ByteLevelPostProcessor::ByteLevelPostProcessor(bool add_prefix_space, + bool trim_offsets, + bool use_regex) + : add_prefix_space_(add_prefix_space), + trim_offsets_(trim_offsets), + use_regex_(use_regex) {} + + +size_t ByteLevelPostProcessor::AddedTokensNum(bool is_pair) const { return 0; } + +void ByteLevelPostProcessor::operator()(core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens, + core::Encoding* result_encoding) const { + if (trim_offsets_) { + pretokenizers::ProcessOffsets(encoding, add_special_tokens); + for (auto& overflowing : encoding->GetMutableOverflowing()) { + pretokenizers::ProcessOffsets(&overflowing, add_special_tokens); + } + if (pair_encoding != nullptr) { + pretokenizers::ProcessOffsets(pair_encoding, add_special_tokens); + for (auto& overflowing : pair_encoding->GetMutableOverflowing()) { + pretokenizers::ProcessOffsets(&overflowing, add_special_tokens); + } + } + } + + encoding->SetSequenceIds(0); + if (pair_encoding != nullptr) { + pair_encoding->SetSequenceIds(1); + } +} + +void to_json(nlohmann::json& j, + const ByteLevelPostProcessor& byte_level_postprocessor) { + j = { + {"type", "ByteLevelPostProcessor"}, + {"add_prefix_space", byte_level_postprocessor.add_prefix_space_}, + {"trim_offsets", byte_level_postprocessor.trim_offsets_}, + {"use_regex", byte_level_postprocessor.use_regex_}, + }; +} + +void from_json(const nlohmann::json& j, + ByteLevelPostProcessor& byte_level_postprocessor) { + j["add_prefix_space"].get_to(byte_level_postprocessor.add_prefix_space_); + 
j["trim_offsets"].get_to(byte_level_postprocessor.trim_offsets_); + j["use_regex"].get_to(byte_level_postprocessor.use_regex_); +} + +} // namespace postprocessors +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/fast_tokenizer/fast_tokenizer/postprocessors/byte_level.h b/fast_tokenizer/fast_tokenizer/postprocessors/byte_level.h new file mode 100644 index 000000000000..75cdd995fec5 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/postprocessors/byte_level.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fast_tokenizer/postprocessors/postprocessor.h" +#include "fast_tokenizer/utils/utils.h" +#include "nlohmann/json.hpp" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace postprocessors { + +struct FASTTOKENIZER_DECL ByteLevelPostProcessor : public PostProcessor { + ByteLevelPostProcessor(bool add_prefix_space = true, + bool trim_offsets = true, + bool use_regex = true); + virtual size_t AddedTokensNum(bool is_pair) const override; + virtual void operator()(core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens, + core::Encoding* result_encoding) const override; + + friend void to_json(nlohmann::json& j, + const ByteLevelPostProcessor& byte_level_postprocessor); + friend void from_json(const nlohmann::json& j, + ByteLevelPostProcessor& byte_level_postprocessor); + bool add_prefix_space_; + bool trim_offsets_; + bool use_regex_; +}; + +} // namespace postprocessors +} // namespace fast_tokenizer +} // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/postprocessors/postprocessor.cc b/fast_tokenizer/fast_tokenizer/postprocessors/postprocessor.cc similarity index 84% rename from faster_tokenizer/faster_tokenizer/postprocessors/postprocessor.cc rename to fast_tokenizer/fast_tokenizer/postprocessors/postprocessor.cc index cbcc813aea84..abe994beb7d8 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/postprocessor.cc +++ b/fast_tokenizer/fast_tokenizer/postprocessors/postprocessor.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/postprocessors/postprocessor.h" -#include "faster_tokenizer/core/encoding.h" +#include "fast_tokenizer/postprocessors/postprocessor.h" +#include "fast_tokenizer/core/encoding.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace postprocessors { void PostProcessor::DefaultProcess(core::Encoding* encoding, @@ -26,12 +26,12 @@ void PostProcessor::DefaultProcess(core::Encoding* encoding, *result_encoding = *encoding; } else { encoding->SetSequenceIds(0); - pair_encoding->SetSequenceIds(0); + pair_encoding->SetSequenceIds(1); encoding->MergeWith(*pair_encoding, false); *result_encoding = *encoding; } } } // namespace postprocessors -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/postprocessors/postprocessor.h b/fast_tokenizer/fast_tokenizer/postprocessors/postprocessor.h similarity index 89% rename from faster_tokenizer/faster_tokenizer/postprocessors/postprocessor.h rename to fast_tokenizer/fast_tokenizer/postprocessors/postprocessor.h index f4ed9ce66e16..34fda8377a2a 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/postprocessor.h +++ b/fast_tokenizer/fast_tokenizer/postprocessors/postprocessor.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace core { class Encoding; @@ -26,7 +26,7 @@ class Encoding; namespace postprocessors { -struct FASTERTOKENIZER_DECL PostProcessor { +struct FASTTOKENIZER_DECL PostProcessor { virtual size_t AddedTokensNum(bool is_pair) const = 0; virtual void operator()(core::Encoding* encoding, core::Encoding* pair_encoding, @@ -37,5 +37,5 @@ struct FASTERTOKENIZER_DECL PostProcessor { core::Encoding* result_encoding); }; } // namespace postprocessors -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/models/models.h b/fast_tokenizer/fast_tokenizer/postprocessors/postprocessors.h similarity index 69% rename from faster_tokenizer/faster_tokenizer/models/models.h rename to fast_tokenizer/fast_tokenizer/postprocessors/postprocessors.h index 378b2d35f884..9427f2478b9a 100644 --- a/faster_tokenizer/faster_tokenizer/models/models.h +++ b/fast_tokenizer/fast_tokenizer/postprocessors/postprocessors.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once -#include "faster_tokenizer/models/bpe.h" -#include "faster_tokenizer/models/faster_wordpiece.h" -#include "faster_tokenizer/models/model.h" -#include "faster_tokenizer/models/unigram.h" -#include "faster_tokenizer/models/wordpiece.h" \ No newline at end of file +#include "fast_tokenizer/postprocessors/bert.h" +#include "fast_tokenizer/postprocessors/byte_level.h" +#include "fast_tokenizer/postprocessors/postprocessor.h" +#include "fast_tokenizer/postprocessors/roberta.h" +#include "fast_tokenizer/postprocessors/template.h" diff --git a/fast_tokenizer/fast_tokenizer/postprocessors/roberta.cc b/fast_tokenizer/fast_tokenizer/postprocessors/roberta.cc new file mode 100644 index 000000000000..4a468847c222 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/postprocessors/roberta.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/postprocessors/roberta.h" +#include "fast_tokenizer/pretokenizers/byte_level.h" +#include "glog/logging.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace postprocessors { + +RobertaPostProcessor::RobertaPostProcessor( + const std::pair& sep, + const std::pair& cls, + bool trim_offsets, + bool add_prefix_space) + : sep_(sep), + cls_(cls), + trim_offsets_(trim_offsets), + add_prefix_space_(add_prefix_space) {} + +size_t RobertaPostProcessor::AddedTokensNum(bool is_pair) const { + if (is_pair) { + return 4; + } + return 2; +} + +void RobertaPostProcessor::operator()(core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens, + core::Encoding* result_encoding) const { + if (trim_offsets_) { + pretokenizers::ProcessOffsets(encoding, add_special_tokens); + for (auto& overflowing : encoding->GetMutableOverflowing()) { + pretokenizers::ProcessOffsets(&overflowing, add_special_tokens); + } + if (pair_encoding != nullptr) { + pretokenizers::ProcessOffsets(pair_encoding, add_special_tokens); + for (auto& overflowing : pair_encoding->GetMutableOverflowing()) { + pretokenizers::ProcessOffsets(&overflowing, add_special_tokens); + } + } + } + encoding->SetTypeIds(std::vector(encoding->GetLen(), 0)); + if (pair_encoding != nullptr) { + pair_encoding->SetTypeIds( + std::vector(pair_encoding->GetLen(), 0)); + } + if (!add_special_tokens) { + DefaultProcess(encoding, pair_encoding, result_encoding); + return; + } +// Construct the sequence as: [CLS] A [SEP] +#define CREATE_PROCESSED_ENCODING_SEQ( \ + encoding_ptr, attr, name, head_value, back_value) \ + auto encoding_##name = encoding_ptr->Get##attr(); \ + decltype(encoding_##name) name(encoding_##name.size() + 2); \ + std::copy(encoding_##name.begin(), encoding_##name.end(), name.begin() + 1); \ + name.front() = head_value; \ + name.back() = back_value + // ids + CREATE_PROCESSED_ENCODING_SEQ(encoding, Ids, ids, cls_.second, sep_.second); + // type_ids. Because there is no nsp task in the roberta, all type_ids are set + // to 0. 
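// For readability, the CREATE_PROCESSED_ENCODING_SEQ(encoding, Ids, ids, cls_.second,
// sep_.second) invocation above expands, roughly, to the following (assuming GetIds()
// returns std::vector<uint32_t>):
//
//   auto encoding_ids = encoding->GetIds();               // ids of sequence A
//   decltype(encoding_ids) ids(encoding_ids.size() + 2);  // room for [CLS] ... [SEP]
//   std::copy(encoding_ids.begin(), encoding_ids.end(), ids.begin() + 1);
//   ids.front() = cls_.second;                            // id of the CLS token
//   ids.back() = sep_.second;                             // id of the SEP token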
+ std::vector type_ids(encoding->GetTypeIds().size() + 2, 0); + // tokens + CREATE_PROCESSED_ENCODING_SEQ( + encoding, Tokens, tokens, cls_.first, sep_.first); + // word_idx + CREATE_PROCESSED_ENCODING_SEQ(encoding, WordsIdx, word_idx, -1, -1); + // offsets + core::Offset empty_offsets = {0, 0}; + CREATE_PROCESSED_ENCODING_SEQ( + encoding, Offsets, offsets, empty_offsets, empty_offsets); + // special_tokens_mask + std::vector special_tokens_mask(ids.size(), 0); + special_tokens_mask.front() = special_tokens_mask.back() = 1; + // attention_mask + std::vector attention_mask(ids.size(), 1); + // sequence_ranges + std::unordered_map sequence_ranges; + sequence_ranges[0] = {1, ids.size() - 1}; + // overflowing + auto& overflowings = encoding->GetMutableOverflowing(); + for (auto& overflow_encoding : overflowings) { + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_encoding), Ids, ids, cls_.second, sep_.second); + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_encoding), TypeIds, type_ids, 0, 0); + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_encoding), Tokens, tokens, cls_.first, sep_.first); + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_encoding), WordsIdx, word_idx, -1, -1); + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_encoding), Offsets, offsets, empty_offsets, empty_offsets); + + std::vector special_tokens_mask(ids.size(), 0); + special_tokens_mask.front() = special_tokens_mask.back() = 1; + + std::vector attention_mask(ids.size(), 1); + + std::unordered_map sequence_ranges; + sequence_ranges[0] = {1, ids.size() - 1}; + + overflow_encoding = std::move( + core::Encoding(std::move(ids), + std::move(type_ids), + std::move(tokens), + std::move(word_idx), + std::move(offsets), + std::move(special_tokens_mask), + std::move(attention_mask), + std::vector(), // No overflowing + std::move(sequence_ranges))); + } + + core::Encoding new_encoding(std::move(ids), + std::move(type_ids), + std::move(tokens), + std::move(word_idx), + std::move(offsets), + std::move(special_tokens_mask), + std::move(attention_mask), + std::move(overflowings), + std::move(sequence_ranges)); + + // Construct the sequence as: [CLS] A [SEP] [SEP] B [SEP] + if (pair_encoding != nullptr) { + // ids + CREATE_PROCESSED_ENCODING_SEQ( + pair_encoding, Ids, ids, sep_.second, sep_.second); + // type_ids. Because there is no nsp task in the roberta, all type_ids are + // set to 0. 
+ std::vector type_ids(pair_encoding->GetTypeIds().size() + 2, 0); + // tokens + CREATE_PROCESSED_ENCODING_SEQ( + pair_encoding, Tokens, tokens, sep_.first, sep_.first); + // word_idx + CREATE_PROCESSED_ENCODING_SEQ(pair_encoding, WordsIdx, word_idx, -1, -1); + // offsets + core::Offset empty_offsets = {0, 0}; + CREATE_PROCESSED_ENCODING_SEQ( + pair_encoding, Offsets, offsets, empty_offsets, empty_offsets); + // special_tokens_mask + std::vector special_tokens_mask(ids.size(), 0); + special_tokens_mask.front() = special_tokens_mask.back() = 1; + // attention_mask + std::vector attention_mask(ids.size(), 1); + // sequence_ranges + std::unordered_map sequence_ranges; + sequence_ranges[1] = {1, ids.size() - 1}; + // overflowing + auto& overflowings = pair_encoding->GetMutableOverflowing(); + for (auto& overflow_pair_encoding : overflowings) { + // ids + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_pair_encoding), Ids, ids, sep_.second, sep_.second); + // type_ids + std::vector type_ids( + overflow_pair_encoding.GetTypeIds().size() + 2, 0); + // tokens + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_pair_encoding), Tokens, tokens, sep_.first, sep_.first); + // word_idx + CREATE_PROCESSED_ENCODING_SEQ( + (&overflow_pair_encoding), WordsIdx, word_idx, -1, -1); + // offsets + core::Offset empty_offsets = {0, 0}; + CREATE_PROCESSED_ENCODING_SEQ((&overflow_pair_encoding), + Offsets, + offsets, + empty_offsets, + empty_offsets); + // special_tokens_mask + std::vector special_tokens_mask(ids.size(), 0); + special_tokens_mask.front() = special_tokens_mask.back() = 1; + // attention_mask + std::vector attention_mask(ids.size(), 1); + // sequence_ranges + std::unordered_map sequence_ranges; + sequence_ranges[0] = {1, ids.size() - 1}; + overflow_pair_encoding = std::move( + core::Encoding(std::move(ids), + std::move(type_ids), + std::move(tokens), + std::move(word_idx), + std::move(offsets), + std::move(special_tokens_mask), + std::move(attention_mask), + std::vector(), // No overflowing + std::move(sequence_ranges))); + } + core::Encoding new_pair_encoding(std::move(ids), + std::move(type_ids), + std::move(tokens), + std::move(word_idx), + std::move(offsets), + std::move(special_tokens_mask), + std::move(attention_mask), + std::move(overflowings), + std::move(sequence_ranges)); + new_encoding.MergeWith(new_pair_encoding, false); + } +#undef CREATE_PROCESSED_ENCODING_SEQ + *result_encoding = std::move(new_encoding); +} + +void to_json(nlohmann::json& j, + const RobertaPostProcessor& roberta_postprocessor) { + j = { + {"type", "RobertaPostProcessor"}, + {"sep", roberta_postprocessor.sep_}, + {"cls", roberta_postprocessor.cls_}, + {"trim_offsets", roberta_postprocessor.trim_offsets_}, + {"add_prefix_space", roberta_postprocessor.add_prefix_space_}, + }; +} + +void from_json(const nlohmann::json& j, + RobertaPostProcessor& roberta_postprocessor) { + j["cls"].get_to(roberta_postprocessor.cls_); + j["sep"].get_to(roberta_postprocessor.sep_); + j["trim_offsets"].get_to(roberta_postprocessor.trim_offsets_); + j["add_prefix_space"].get_to(roberta_postprocessor.add_prefix_space_); +} + +} // namespace postprocessors +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/fast_tokenizer/fast_tokenizer/postprocessors/roberta.h b/fast_tokenizer/fast_tokenizer/postprocessors/roberta.h new file mode 100644 index 000000000000..0601882f1df1 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/postprocessors/roberta.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fast_tokenizer/postprocessors/postprocessor.h" +#include "fast_tokenizer/utils/utils.h" +#include "nlohmann/json.hpp" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace postprocessors { + +struct FASTTOKENIZER_DECL RobertaPostProcessor : public PostProcessor { + RobertaPostProcessor(const std::pair& sep = {"", + 2}, + const std::pair& cls = {"", 0}, + bool trim_offsets = true, + bool add_prefix_space = true); + virtual size_t AddedTokensNum(bool is_pair) const override; + virtual void operator()(core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens, + core::Encoding* result_encoding) const override; + std::pair sep_; + std::pair cls_; + bool trim_offsets_; + bool add_prefix_space_; + friend void to_json(nlohmann::json& j, + const RobertaPostProcessor& roberta_postprocessor); + friend void from_json(const nlohmann::json& j, + RobertaPostProcessor& roberta_postprocessor); +}; +} // namespace postprocessors +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/postprocessors/template.cc b/fast_tokenizer/fast_tokenizer/postprocessors/template.cc similarity index 99% rename from faster_tokenizer/faster_tokenizer/postprocessors/template.cc rename to fast_tokenizer/fast_tokenizer/postprocessors/template.cc index 7bbb8a3e2bd7..28a3eb92587e 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/template.cc +++ b/fast_tokenizer/fast_tokenizer/postprocessors/template.cc @@ -15,12 +15,12 @@ #include #include -#include "faster_tokenizer/core/encoding.h" -#include "faster_tokenizer/postprocessors/template.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/postprocessors/template.h" #include "glog/logging.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace postprocessors { void ParseIdFromString(const std::string& template_id_string, @@ -450,5 +450,5 @@ void from_json(const nlohmann::json& j, } } // namespace postprocessors -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/postprocessors/template.h b/fast_tokenizer/fast_tokenizer/postprocessors/template.h similarity index 93% rename from faster_tokenizer/faster_tokenizer/postprocessors/template.h rename to fast_tokenizer/fast_tokenizer/postprocessors/template.h index 12376ae5087d..aa20de483daa 100644 --- a/faster_tokenizer/faster_tokenizer/postprocessors/template.h +++ b/fast_tokenizer/fast_tokenizer/postprocessors/template.h @@ -18,16 +18,16 @@ limitations under the License. 
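// Worked example for RobertaPostProcessor (illustrative; the default sep/cls strings
// were lost in rendering above and are assumed to be the usual RoBERTa specials
// "</s>" with id 2 and "<s>" with id 0):
//
//   postprocessors::RobertaPostProcessor postprocessor({"</s>", 2}, {"<s>", 0});
//
// If sequence A tokenizes to ids {10, 11, 12} and sequence B to {20, 21}, the
// post-processed pair encoding "<s> A </s></s> B </s>" looks like:
//
//   ids                 = {0, 10, 11, 12, 2, 2, 20, 21, 2}
//   type_ids            = {0, 0, 0, 0, 0, 0, 0, 0, 0}      // no NSP task in RoBERTa
//   special_tokens_mask = {1, 0, 0, 0, 1, 1, 0, 0, 1}
//   attention_mask      = {1, 1, 1, 1, 1, 1, 1, 1, 1}
//   sequence_ranges     ≈ {0: {1, 4}, 1: {6, 8}}           // spans of A and B, assuming
//                                                          // MergeWith shifts the pair range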
*/ #include #include -#include "faster_tokenizer/postprocessors/postprocessor.h" -#include "faster_tokenizer/utils/utils.h" -#include "faster_tokenizer/utils/variant.h" +#include "fast_tokenizer/postprocessors/postprocessor.h" +#include "fast_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/variant.h" #include "nlohmann/json.hpp" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace postprocessors { -enum FASTERTOKENIZER_DECL SequenceType { SEQ_A, SEQ_B }; +enum FASTTOKENIZER_DECL SequenceType { SEQ_A, SEQ_B }; NLOHMANN_JSON_SERIALIZE_ENUM(SequenceType, { {SEQ_A, "A"}, {SEQ_B, "B"}, @@ -47,7 +47,7 @@ void SetTypeId(uint32_t type_id, TemplatePiece* template_piece); void GetTemplatePieceFromString(const std::string& template_string, TemplatePiece* template_piece); -struct FASTERTOKENIZER_DECL SpecialToken { +struct FASTTOKENIZER_DECL SpecialToken { std::string id_; std::vector ids_; std::vector tokens_; @@ -65,7 +65,7 @@ struct FASTERTOKENIZER_DECL SpecialToken { friend void from_json(const nlohmann::json& j, SpecialToken& special_token); }; -struct FASTERTOKENIZER_DECL Template { +struct FASTTOKENIZER_DECL Template { std::vector pieces_; Template() = default; explicit Template(const std::string& template_str) { @@ -131,7 +131,7 @@ struct FASTERTOKENIZER_DECL Template { friend void from_json(const nlohmann::json& j, Template& template_); }; -struct FASTERTOKENIZER_DECL SpecialTokensMap { +struct FASTTOKENIZER_DECL SpecialTokensMap { std::unordered_map tokens_map_; SpecialTokensMap() = default; explicit SpecialTokensMap(const std::vector& special_tokens) { @@ -147,7 +147,7 @@ struct FASTERTOKENIZER_DECL SpecialTokensMap { friend void from_json(const nlohmann::json& j, SpecialTokensMap& tokens_map); }; -struct FASTERTOKENIZER_DECL TemplatePostProcessor : public PostProcessor { +struct FASTTOKENIZER_DECL TemplatePostProcessor : public PostProcessor { TemplatePostProcessor(); TemplatePostProcessor(const Template&, const Template&, @@ -187,5 +187,5 @@ struct FASTERTOKENIZER_DECL TemplatePostProcessor : public PostProcessor { }; } // namespace postprocessors -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/pretokenizers/CMakeLists.txt similarity index 81% rename from faster_tokenizer/faster_tokenizer/pretokenizers/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/pretokenizers/CMakeLists.txt index 065f6a78b6a5..8c9c0741aa50 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/CMakeLists.txt +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/CMakeLists.txt @@ -1,3 +1,3 @@ cc_library(pretokenizers - SRCS pretokenizer.cc whitespace.cc bert.cc metaspace.cc sequence.cc + SRCS pretokenizer.cc whitespace.cc bert.cc metaspace.cc sequence.cc byte_level.cc split.cc DEPS normalizers core json utils) diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/bert.cc b/fast_tokenizer/fast_tokenizer/pretokenizers/bert.cc similarity index 69% rename from faster_tokenizer/faster_tokenizer/pretokenizers/bert.cc rename to fast_tokenizer/fast_tokenizer/pretokenizers/bert.cc index 58d44b252317..8c4962389a10 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/bert.cc +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/bert.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/pretokenizers/bert.h" +#include "fast_tokenizer/pretokenizers/bert.h" +#include "fast_tokenizer/utils/utils.h" #include "glog/logging.h" #include "re2/re2.h" #include "unicode/uchar.h" -#include "faster_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { // Note (zhoushunjie): Init re2::RE2 objects cost too much, @@ -35,7 +35,7 @@ void BertPreTokenizer::operator()(PreTokenizedString* pretokenized) const { std::vector* string_splits) { // Use single character match instead of regex to improve performance normalized->Split([](char32_t ch) -> bool { return u_isUWhiteSpace(ch); }, - normalizers::REMOVED, + core::SplitMode::REMOVED, &normalized_splits); for (auto&& normalize : normalized_splits) { if (!normalize.IsEmpty()) { @@ -44,20 +44,20 @@ void BertPreTokenizer::operator()(PreTokenizedString* pretokenized) const { } }); normalized_splits.clear(); - pretokenized->Split( - [&normalized_splits](int idx, - normalizers::NormalizedString* normalized, - std::vector* string_splits) { - // Use single character match instead of regex to improve performance - normalized->Split( - utils::IsPunctuation, normalizers::ISOLATED, &normalized_splits); - for (auto&& normalize : normalized_splits) { - if (!normalize.IsEmpty()) { - VLOG(6) << "After pretokenized: " << normalize.GetStr(); - string_splits->emplace_back(std::move(normalize)); - } - } - }); + pretokenized->Split([&normalized_splits]( + int idx, + normalizers::NormalizedString* normalized, + std::vector* string_splits) { + // Use single character match instead of regex to improve performance + normalized->Split( + utils::IsPunctuation, core::SplitMode::ISOLATED, &normalized_splits); + for (auto&& normalize : normalized_splits) { + if (!normalize.IsEmpty()) { + VLOG(6) << "After pretokenized: " << normalize.GetStr(); + string_splits->emplace_back(std::move(normalize)); + } + } + }); } void to_json(nlohmann::json& j, const BertPreTokenizer& bert_pre_tokenizer) { @@ -69,5 +69,5 @@ void to_json(nlohmann::json& j, const BertPreTokenizer& bert_pre_tokenizer) { void from_json(const nlohmann::json& j, BertPreTokenizer& bert_pre_tokenizer) {} } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/bert.h b/fast_tokenizer/fast_tokenizer/pretokenizers/bert.h similarity index 81% rename from faster_tokenizer/faster_tokenizer/pretokenizers/bert.h rename to fast_tokenizer/fast_tokenizer/pretokenizers/bert.h index 3f08d29f3c51..283930356aa5 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/bert.h +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/bert.h @@ -15,14 +15,14 @@ limitations under the License. 
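// Behavior sketch for the two Split passes in BertPreTokenizer::operator() above
// (illustrative, not taken from a test in this patch):
//
//   input:                                            "Hello, world!"
//   pass 1, whitespace (core::SplitMode::REMOVED):    ["Hello,", "world!"]
//   pass 2, punctuation (core::SplitMode::ISOLATED):  ["Hello", ",", "world", "!"]
//
// REMOVED drops the matched whitespace itself, while ISOLATED keeps each punctuation
// character as its own split.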
*/ #pragma once #include "nlohmann/json.hpp" -#include "faster_tokenizer/pretokenizers/pretokenizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { -struct FASTERTOKENIZER_DECL BertPreTokenizer : public PreTokenizer { +struct FASTTOKENIZER_DECL BertPreTokenizer : public PreTokenizer { virtual void operator()(PreTokenizedString* pretokenized) const override; friend void to_json(nlohmann::json& j, const BertPreTokenizer& bert_pre_tokenizer); @@ -31,5 +31,5 @@ struct FASTERTOKENIZER_DECL BertPreTokenizer : public PreTokenizer { }; } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/pretokenizers/byte_level.cc b/fast_tokenizer/fast_tokenizer/pretokenizers/byte_level.cc new file mode 100644 index 000000000000..138aa784744b --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/byte_level.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fast_tokenizer/pretokenizers/byte_level.h" +#include +#include +#include "fast_tokenizer/utils/utf8.h" +#include "fast_tokenizer/utils/utils.h" +#include "glog/logging.h" +#include "re2/re2.h" +#include "unicode/uchar.h" + + +namespace paddlenlp { +namespace fast_tokenizer { +namespace pretokenizers { + + +static re2::RE2 pattern( + R"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+)"); + + +static std::unordered_map BYTES_TO_CHARS = + utils::CreateBytesToChars(); +ByteLevelPreTokenizer::ByteLevelPreTokenizer(bool add_prefix_space, + bool use_regex) + : add_prefix_space_(add_prefix_space), use_regex_(use_regex) {} + + +void ByteLevelPreTokenizer::operator()(PreTokenizedString* pretokenized) const { + std::vector normalized_splits; + pretokenized->Split([&normalized_splits, this]( + int idx, + normalizers::NormalizedString* normalized, + std::vector* string_splits) { + if (this->add_prefix_space_ && normalized->GetStr().find(' ') != 0) { + normalized->Prepend(" "); + } + if (this->use_regex_) { + normalized->Split(pattern, core::SplitMode::ISOLATED, &normalized_splits); + for (auto&& normalize : normalized_splits) { + if (!normalize.IsEmpty()) { + string_splits->emplace_back(std::move(normalize)); + } + } + } else { + string_splits->emplace_back(*normalized); + } + }); + pretokenized->Normalize([](normalizers::NormalizedString* normalized) { + const std::string& str = normalized->GetStr(); + std::u32string u32normalized; + std::vector changes; + size_t utf8_len = 0; + uint32_t last_char; + uint32_t curr_char; + while (utf8_len < str.length()) { + auto chwidth = utils::UTF8ToUInt32(str.data() + utf8_len, &curr_char); + curr_char = utils::UTF8ToUnicode(curr_char); + for (int i = 0; i < chwidth; ++i) { + 
u32normalized.push_back(BYTES_TO_CHARS.at(str[i + utf8_len])); + if (i == 0) { + changes.push_back(0); + } else { + changes.push_back(1); + } + } + utf8_len += chwidth; + } + normalized->UpdateNormalized({u32normalized, changes}, 0); + }); +} + + +void to_json(nlohmann::json& j, + const ByteLevelPreTokenizer& byte_pre_tokenizer) { + j = { + {"type", "ByteLevelPreTokenizer"}, + {"add_prefix_space", byte_pre_tokenizer.add_prefix_space_}, + {"use_regex", byte_pre_tokenizer.use_regex_}, + }; +} + + +void from_json(const nlohmann::json& j, + ByteLevelPreTokenizer& byte_pre_tokenizer) { + j.at("add_prefix_space").get_to(byte_pre_tokenizer.add_prefix_space_); + j.at("use_regex").get_to(byte_pre_tokenizer.add_prefix_space_); +} + +void ProcessOffsets(core::Encoding* encoding, bool add_prefix_space) { + auto process_token_fn = [&]( + uint32_t i, const std::string& token, core::Offset* offset) -> void { + uint32_t leading_spaces = 0; + uint32_t trailing_spaces = 0; + + std::wstring_convert, char32_t> conv; + std::u32string u32token = conv.from_bytes(token); + for (int i = 0; i < u32token.size(); ++i) { + if (utils::IsWhiteSpace(u32token[i]) || + u32token[i] == BYTES_TO_CHARS.at(' ')) { + ++leading_spaces; + } else { + break; + } + } + + for (int i = u32token.size() - 1; i >= 0; --i) { + if (utils::IsWhiteSpace(u32token[i]) || + u32token[i] == BYTES_TO_CHARS.at(' ')) { + ++trailing_spaces; + } else { + break; + } + } + + if (leading_spaces > 0 || trailing_spaces > 0) { + if (leading_spaces > 0) { + bool is_first = (i == 0) || (offset->first == 0); + if (is_first && add_prefix_space && leading_spaces == 1) { + leading_spaces = 0; + } + offset->first = + (std::min)(offset->first + leading_spaces, offset->second); + } + } + if (trailing_spaces > 0 && offset->second >= trailing_spaces) { + offset->second = + (std::max)(offset->second - trailing_spaces, offset->first); + } + }; + encoding->ProcessTokenWithOffsets(process_token_fn); +} + +} // namespace pretokenizers +} // namespace fast_tokenizer +} // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/pretokenizers/byte_level.h b/fast_tokenizer/fast_tokenizer/pretokenizers/byte_level.h new file mode 100644 index 000000000000..c06dcc373f6d --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/byte_level.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
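// Illustrative effect of the Normalize step in ByteLevelPreTokenizer above, assuming
// utils::CreateBytesToChars() builds the GPT-2 style byte-to-unicode table (printable
// bytes map to themselves, the rest are shifted to unused code points, e.g. the space
// byte 0x20 becomes U+0120 "Ġ"):
//
//   original split:   " world"
//   after Normalize:  "Ġworld"
//
// Every UTF-8 byte is replaced by exactly one visible character, so the downstream
// byte-level BPE model never sees raw whitespace or control bytes, and ProcessOffsets
// can later strip the leading "Ġ" from token offsets when trim_offsets is enabled.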
+ +#pragma once + +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/utils.h" +#include "nlohmann/json.hpp" + + +namespace paddlenlp { +namespace fast_tokenizer { +namespace pretokenizers { + +struct FASTTOKENIZER_DECL ByteLevelPreTokenizer : public PreTokenizer { + ByteLevelPreTokenizer(bool add_prefix_space = true, bool use_regex = true); + virtual void operator()(PreTokenizedString* pretokenized) const override; + friend void to_json(nlohmann::json& j, + const ByteLevelPreTokenizer& byte_pre_tokenizer); + friend void from_json(const nlohmann::json& j, + ByteLevelPreTokenizer& byte_pre_tokenizer); + +private: + bool add_prefix_space_; + bool use_regex_; +}; + +void FASTTOKENIZER_DECL ProcessOffsets(core::Encoding* encoding, + bool add_prefix_space); + +} // namespace pretokenizers +} // namespace fast_tokenizer +} // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/metaspace.cc b/fast_tokenizer/fast_tokenizer/pretokenizers/metaspace.cc similarity index 93% rename from faster_tokenizer/faster_tokenizer/pretokenizers/metaspace.cc rename to fast_tokenizer/fast_tokenizer/pretokenizers/metaspace.cc index df864a0e9445..add5496f5e0f 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/metaspace.cc +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/metaspace.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/pretokenizers/metaspace.h" -#include "re2/re2.h" -#include "faster_tokenizer/utils/utf8.h" +#include "fast_tokenizer/pretokenizers/metaspace.h" +#include "fast_tokenizer/utils/utf8.h" #include "glog/logging.h" +#include "re2/re2.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { static re2::RE2 pattern(" "); @@ -55,7 +55,7 @@ void MetaSpacePreTokenizer::operator()(PreTokenizedString* pretokenized) const { } normalized->Split( [&](char32_t ch) -> bool { return ch == replacement_char_; }, - normalizers::MERGED_WITH_NEXT, + core::SplitMode::MERGED_WITH_NEXT, &normalized_splits); for (auto&& normalize : normalized_splits) { if (!normalize.IsEmpty()) { @@ -82,5 +82,5 @@ void from_json(const nlohmann::json& j, } } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/metaspace.h b/fast_tokenizer/fast_tokenizer/pretokenizers/metaspace.h similarity index 86% rename from faster_tokenizer/faster_tokenizer/pretokenizers/metaspace.h rename to fast_tokenizer/fast_tokenizer/pretokenizers/metaspace.h index a87c8f98f5ae..e851720b5ca7 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/metaspace.h +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/metaspace.h @@ -15,14 +15,14 @@ limitations under the License. 
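// Behavior sketch for MetaSpacePreTokenizer (illustrative): the replacement string
// "\xe2\x96\x81" is U+2581 "▁", and the split uses core::SplitMode::MERGED_WITH_NEXT,
// so with add_prefix_space = true:
//
//   input:        "hello world"
//   replaced:     "▁hello▁world"
//   after split:  ["▁hello", "▁world"]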
*/ #pragma once #include "nlohmann/json.hpp" -#include "faster_tokenizer/pretokenizers/pretokenizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { -struct FASTERTOKENIZER_DECL MetaSpacePreTokenizer : public PreTokenizer { +struct FASTTOKENIZER_DECL MetaSpacePreTokenizer : public PreTokenizer { // Replaces white space with U+2581 (LOWER ONE EIGHT BLOCK) MetaSpacePreTokenizer(const std::string& replacement = "\xe2\x96\x81", bool add_prefix_space = true); @@ -44,5 +44,5 @@ struct FASTERTOKENIZER_DECL MetaSpacePreTokenizer : public PreTokenizer { }; } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizer.cc b/fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizer.cc similarity index 85% rename from faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizer.cc rename to fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizer.cc index b28d5e1b3376..78efd0a11784 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizer.cc +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizer.cc @@ -16,12 +16,13 @@ limitations under the License. */ #include #include +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/unique_ptr.h" +#include "fast_tokenizer/utils/utf8.h" #include "glog/logging.h" -#include "faster_tokenizer/pretokenizers/pretokenizer.h" -#include "faster_tokenizer/utils/utf8.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { BytesToCharOffsetConverter::BytesToCharOffsetConverter(const std::string& seq) @@ -64,7 +65,7 @@ CharToBytesOffsetConverter::CharToBytesOffsetConverter(const std::string& seq) offset_map_.reserve(u32seq.length() * 4); for (int i = 0; i < u32seq.length(); ++i) { offset_map_.push_back(index); - auto utf8_len = faster_tokenizer::utils::GetUTF8CharLen(u32seq[i]); + auto utf8_len = fast_tokenizer::utils::GetUTF8CharLen(u32seq[i]); index += utf8_len; } offset_map_.push_back(index); @@ -130,7 +131,7 @@ void PreTokenizedString::Split( void PreTokenizedString::Normalize( std::function normalize_fn) { for (auto& split : splits_) { - if (!split.tokens_.empty()) { + if (split.tokens_.empty()) { normalize_fn(&split.normalized_); } } @@ -241,6 +242,35 @@ void PreTokenizedString::SetOriginalStr(const std::string& original) { splits_.emplace_back(original_); } +std::vector>> +PreTokenizedString::GetSplits(bool is_original, + const core::OffsetType& offset_type) const { + std::unique_ptr converter; + if (offset_type == core::OffsetType::BYTE) { + converter = utils::make_unique(original_); + } else { + converter = utils::make_unique(original_); + } + std::vector>> + result; + uint32_t offset = 0; + for (auto&& split : splits_) { + core::Offset curr_offset, split_offset; + if (is_original) { + split_offset = split.normalized_.GetOrginalOffset(); + } else { + auto len = split.normalized_.GetLen(); + offset += len; + split_offset = {offset - len, offset}; + } + + // Convert to char offsets if relevant + converter->convert(split_offset, &curr_offset); + result.emplace_back(split.normalized_.GetStr(), curr_offset, split.tokens_); + } + return result; +} + } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer 
} // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizer.h b/fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizer.h similarity index 84% rename from faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizer.h rename to fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizer.h index 8fdbb14bf2b1..1052807d8a0b 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/pretokenizer.h +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizer.h @@ -16,18 +16,19 @@ limitations under the License. */ #include #include +#include #include #include -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/core/encoding.h" -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { -struct FASTERTOKENIZER_DECL StringSplit { +struct FASTTOKENIZER_DECL StringSplit { normalizers::NormalizedString normalized_; std::vector tokens_; StringSplit(normalizers::NormalizedString&& normalized) @@ -51,7 +52,7 @@ struct FASTERTOKENIZER_DECL StringSplit { } }; -class FASTERTOKENIZER_DECL PreTokenizedString { +class FASTTOKENIZER_DECL PreTokenizedString { public: PreTokenizedString() = default; PreTokenizedString(const std::string& original); @@ -78,31 +79,33 @@ class FASTERTOKENIZER_DECL PreTokenizedString { StringSplit GetSplit(int idx) const; const std::string& GetOriginStr() const; void SetOriginalStr(const std::string& original); + std::vector>> + GetSplits(bool is_original, const core::OffsetType& offset_type) const; private: std::string original_; std::vector splits_; }; -struct FASTERTOKENIZER_DECL PreTokenizer { +struct FASTTOKENIZER_DECL PreTokenizer { virtual void operator()(PreTokenizedString* pretokenized) const = 0; }; -struct FASTERTOKENIZER_DECL OffsetConverter { +struct FASTTOKENIZER_DECL OffsetConverter { OffsetConverter(const std::string&) {} virtual bool convert(const core::Offset&, core::Offset*) const { return true; } }; -struct FASTERTOKENIZER_DECL BytesToCharOffsetConverter +struct FASTTOKENIZER_DECL BytesToCharOffsetConverter : public OffsetConverter { std::vector offset_map_; BytesToCharOffsetConverter(const std::string&); virtual bool convert(const core::Offset&, core::Offset*) const; }; -struct FASTERTOKENIZER_DECL CharToBytesOffsetConverter +struct FASTTOKENIZER_DECL CharToBytesOffsetConverter : public OffsetConverter { std::vector offset_map_; CharToBytesOffsetConverter(const std::string&); @@ -110,5 +113,5 @@ struct FASTERTOKENIZER_DECL CharToBytesOffsetConverter }; } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizers.h b/fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizers.h new file mode 100644 index 000000000000..009fc6748d9f --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/pretokenizers.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
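// A usage sketch for the new PreTokenizedString::GetSplits declared above (names are
// assumed from the hunk: the element type appears to bundle the split string, its
// offset, and its tokens, and core::OffsetType::CHAR is assumed as the non-BYTE value):

#include <tuple>

void InspectSplits() {
  using namespace paddlenlp::fast_tokenizer;
  pretokenizers::PreTokenizedString pretokenized("hello world");
  pretokenizers::WhitespacePreTokenizer()(&pretokenized);
  auto splits = pretokenized.GetSplits(/*is_original=*/true, core::OffsetType::CHAR);
  for (const auto& split : splits) {
    const auto& text = std::get<0>(split);    // e.g. "hello"
    const auto& offset = std::get<1>(split);  // {begin, end} in the original string
    // std::get<2>(split) holds the core::Token list once a model has tokenized the split
    (void)text;
    (void)offset;
  }
}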
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "fast_tokenizer/pretokenizers/bert.h" +#include "fast_tokenizer/pretokenizers/byte_level.h" +#include "fast_tokenizer/pretokenizers/metaspace.h" +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/pretokenizers/sequence.h" +#include "fast_tokenizer/pretokenizers/split.h" +#include "fast_tokenizer/pretokenizers/whitespace.h" diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/sequence.cc b/fast_tokenizer/fast_tokenizer/pretokenizers/sequence.cc similarity index 78% rename from faster_tokenizer/faster_tokenizer/pretokenizers/sequence.cc rename to fast_tokenizer/fast_tokenizer/pretokenizers/sequence.cc index da781aa8144a..285f21f759d8 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/sequence.cc +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/sequence.cc @@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/pretokenizers/pretokenizers.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" +#include "glog/logging.h" +#include "re2/re2.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { SequencePreTokenizer::SequencePreTokenizer( @@ -43,6 +45,15 @@ void SequencePreTokenizer::AppendPreTokenizer(PreTokenizer* pretokenizer) { dynamic_cast(pretokenizer); pretokenizer_ptr = std::make_shared(*cast_pretokenizer); + } else if (typeid(*pretokenizer) == typeid(SplitPreTokenizer)) { + auto cast_pretokenizer = dynamic_cast(pretokenizer); + pretokenizer_ptr = std::make_shared(*cast_pretokenizer); + } else if (typeid(*pretokenizer) == typeid(ByteLevelPreTokenizer)) { + auto cast_pretokenizer = dynamic_cast(pretokenizer); + pretokenizer_ptr = + std::make_shared(*cast_pretokenizer); + } else { + VLOG(6) << "This pretokenizer is not supportted now."; } pretokenzer_ptrs_.push_back(pretokenizer_ptr); } @@ -66,6 +77,10 @@ void to_json(nlohmann::json& j, jitem = *dynamic_cast(ptr.get()); } else if (typeid(*ptr) == typeid(WhitespacePreTokenizer)) { jitem = *dynamic_cast(ptr.get()); + } else if (typeid(*ptr) == typeid(SplitPreTokenizer)) { + jitem = *dynamic_cast(ptr.get()); + } else if (typeid(*ptr) == typeid(ByteLevelPreTokenizer)) { + jitem = *dynamic_cast(ptr.get()); } jlist.push_back(jitem); } @@ -87,10 +102,12 @@ void from_json(const nlohmann::json& j, TRY_APPEND_PRETOKENIZER(WhitespacePreTokenizer); TRY_APPEND_PRETOKENIZER(MetaSpacePreTokenizer); TRY_APPEND_PRETOKENIZER(BertPreTokenizer); + TRY_APPEND_PRETOKENIZER(ByteLevelPreTokenizer); + TRY_APPEND_PRETOKENIZER(SplitPreTokenizer); } #undef TRY_APPEND_PRETOKENIZER } } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/sequence.h b/fast_tokenizer/fast_tokenizer/pretokenizers/sequence.h similarity index 85% rename from faster_tokenizer/faster_tokenizer/pretokenizers/sequence.h rename to 
fast_tokenizer/fast_tokenizer/pretokenizers/sequence.h index 03ff13fbd5a0..1ab813dfc226 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/sequence.h +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/sequence.h @@ -16,14 +16,14 @@ limitations under the License. */ #include #include "nlohmann/json.hpp" -#include "faster_tokenizer/pretokenizers/pretokenizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { -struct FASTERTOKENIZER_DECL SequencePreTokenizer : public PreTokenizer { +struct FASTTOKENIZER_DECL SequencePreTokenizer : public PreTokenizer { SequencePreTokenizer() = default; SequencePreTokenizer(const SequencePreTokenizer&) = default; SequencePreTokenizer(const std::vector& pretokenizers); @@ -39,5 +39,5 @@ struct FASTERTOKENIZER_DECL SequencePreTokenizer : public PreTokenizer { }; } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/pretokenizers/split.cc b/fast_tokenizer/fast_tokenizer/pretokenizers/split.cc new file mode 100644 index 000000000000..0f97c76406e4 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/split.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fast_tokenizer/pretokenizers/split.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/utils/unique_ptr.h" +#include "re2/re2.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace pretokenizers { + +SplitPreTokenizer::SplitPreTokenizer( + const SplitPreTokenizer& split_pretokenizer) + : pattern_(new re2::RE2(split_pretokenizer.pattern_->pattern())) { + split_mode_ = split_pretokenizer.split_mode_; + invert_ = split_pretokenizer.invert_; +} + +SplitPreTokenizer::SplitPreTokenizer(const std::string& pattern, + core::SplitMode split_mode, + bool invert) + : invert_(invert), split_mode_(split_mode) { + pattern_ = utils::make_unique(pattern); +} + +void SplitPreTokenizer::operator()(PreTokenizedString* pretokenized) const { + pretokenized->Split([&](int idx, + normalizers::NormalizedString* normalized, + std::vector* string_splits) { + std::vector normalized_splits; + normalized->Split(*pattern_, split_mode_, &normalized_splits, invert_); + for (auto& normalize : normalized_splits) { + string_splits->push_back(StringSplit(normalize)); + } + }); +} + + +void to_json(nlohmann::json& j, const SplitPreTokenizer& split_pretokenizer) { + j = { + {"type", "SplitPreTokenizer"}, + {"pattern", split_pretokenizer.pattern_->pattern()}, + {"split_mode", split_pretokenizer.split_mode_}, + {"invert", split_pretokenizer.invert_}, + }; +} + +void from_json(const nlohmann::json& j, SplitPreTokenizer& split_pretokenizer) { + split_pretokenizer.pattern_ = + utils::make_unique(j.at("pattern").get()); + j.at("split_mode").get_to(split_pretokenizer.split_mode_); + j.at("invert").get_to(split_pretokenizer.invert_); +} + + +} // namespace pretokenizers +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/fast_tokenizer/fast_tokenizer/pretokenizers/split.h b/fast_tokenizer/fast_tokenizer/pretokenizers/split.h new file mode 100644 index 000000000000..52796f9f4e24 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/split.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/utils.h" + +namespace re2 { +class RE2; +} // namespace re2 + +namespace paddlenlp { +namespace fast_tokenizer { +namespace pretokenizers { + +struct FASTTOKENIZER_DECL SplitPreTokenizer : public PreTokenizer { + SplitPreTokenizer() = default; + SplitPreTokenizer(const std::string& pattern, + core::SplitMode split_mode, + bool invert); + SplitPreTokenizer(const SplitPreTokenizer& split_pretokenizer); + virtual void operator()(PreTokenizedString* pretokenized) const override; + friend void to_json(nlohmann::json& j, + const SplitPreTokenizer& split_pretokenizer); + friend void from_json(const nlohmann::json& j, + SplitPreTokenizer& split_pretokenizer); + +private: + bool invert_; + core::SplitMode split_mode_; + std::unique_ptr pattern_; +}; + +} // namespace pretokenizers +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/whitespace.cc b/fast_tokenizer/fast_tokenizer/pretokenizers/whitespace.cc similarity index 85% rename from faster_tokenizer/faster_tokenizer/pretokenizers/whitespace.cc rename to fast_tokenizer/fast_tokenizer/pretokenizers/whitespace.cc index 24fb5cb3b25e..1ab00b9eeb91 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/whitespace.cc +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/whitespace.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/pretokenizers/whitespace.h" -#include "faster_tokenizer/normalizers/normalizer.h" +#include "fast_tokenizer/pretokenizers/whitespace.h" +#include "fast_tokenizer/normalizers/normalizer.h" #include "re2/re2.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { static re2::RE2 pattern("[\\s\\p{Zs}]+"); @@ -27,7 +27,7 @@ void WhitespacePreTokenizer::operator()( normalizers::NormalizedString* normalized, std::vector* string_splits) { std::vector normalized_splits; - normalized->Split(pattern, normalizers::REMOVED, &normalized_splits); + normalized->Split(pattern, core::SplitMode::REMOVED, &normalized_splits); for (auto& normalize : normalized_splits) { string_splits->push_back(StringSplit(normalize)); } @@ -45,5 +45,5 @@ void from_json(const nlohmann::json& j, WhitespacePreTokenizer& whitespace_pretokenizer) {} } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pretokenizers/whitespace.h b/fast_tokenizer/fast_tokenizer/pretokenizers/whitespace.h similarity index 81% rename from faster_tokenizer/faster_tokenizer/pretokenizers/whitespace.h rename to fast_tokenizer/fast_tokenizer/pretokenizers/whitespace.h index 8bb8eaee7e3a..43aa955ffc06 100644 --- a/faster_tokenizer/faster_tokenizer/pretokenizers/whitespace.h +++ b/fast_tokenizer/fast_tokenizer/pretokenizers/whitespace.h @@ -14,14 +14,14 @@ limitations under the License. 
*/ #pragma once -#include "faster_tokenizer/pretokenizers/pretokenizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pretokenizers { -struct FASTERTOKENIZER_DECL WhitespacePreTokenizer : public PreTokenizer { +struct FASTTOKENIZER_DECL WhitespacePreTokenizer : public PreTokenizer { virtual void operator()(PreTokenizedString* pretokenized) const override; friend void to_json(nlohmann::json& j, const WhitespacePreTokenizer& whitespace_pretokenizer); @@ -30,5 +30,5 @@ struct FASTERTOKENIZER_DECL WhitespacePreTokenizer : public PreTokenizer { }; } // namespace pretokenizers -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/pybind/CMakeLists.txt similarity index 100% rename from faster_tokenizer/faster_tokenizer/pybind/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/pybind/CMakeLists.txt diff --git a/faster_tokenizer/faster_tokenizer/pybind/core.cc b/fast_tokenizer/fast_tokenizer/pybind/core.cc similarity index 93% rename from faster_tokenizer/faster_tokenizer/pybind/core.cc rename to fast_tokenizer/fast_tokenizer/pybind/core.cc index 7534e904a241..26cd25d9317d 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/core.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/core.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include -#include "faster_tokenizer/core/added_vocabulary.h" -#include "faster_tokenizer/core/base.h" -#include "faster_tokenizer/core/encoding.h" -#include "faster_tokenizer/pybind/core.h" +#include "fast_tokenizer/core/added_vocabulary.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/pybind/core.h" #include #include @@ -25,7 +25,7 @@ limitations under the License. 
*/ namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { py::list GetWordIdx(const core::Encoding& self) { @@ -88,6 +88,14 @@ void BindCore(pybind11::module* m) { .value("FIXED_SIZE", core::PadStrategy::FIXED_SIZE) .export_values(); + py::enum_(*m, "SplitMode") + .value("REMOVED", core::SplitMode::REMOVED) + .value("ISOLATED", core::SplitMode::ISOLATED) + .value("MERGED_WITH_PREVIOUS", core::SplitMode::MERGED_WITH_PREVIOUS) + .value("MERGED_WITH_NEXT", core::SplitMode::MERGED_WITH_NEXT) + .value("CONTIGUOUS", core::SplitMode::CONTIGUOUS) + .export_values(); + py::class_(*m, "Encoding") .def(py::init&, const std::vector&, @@ -270,8 +278,11 @@ void BindCore(pybind11::module* m) { .def_property_readonly("lstrip", &core::AddedToken::GetUseLStrip) .def_property_readonly("rstrip", &core::AddedToken::GetUseRStrip) .def_property_readonly("single_word", &core::AddedToken::GetIsSingleWord); + + m->def("set_thread_num", &core::SetThreadNum); + m->def("get_thread_num", &core::GetThreadNum); } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/core.h b/fast_tokenizer/fast_tokenizer/pybind/core.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/pybind/core.h rename to fast_tokenizer/fast_tokenizer/pybind/core.h index 55a0cc1aa351..4d42bdd00cbf 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/core.h +++ b/fast_tokenizer/fast_tokenizer/pybind/core.h @@ -17,11 +17,11 @@ limitations under the License. */ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void BindCore(pybind11::module* m); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/decoders.cc b/fast_tokenizer/fast_tokenizer/pybind/decoders.cc similarity index 93% rename from faster_tokenizer/faster_tokenizer/pybind/decoders.cc rename to fast_tokenizer/fast_tokenizer/pybind/decoders.cc index 00b13f8a636b..0e0bdbe8728f 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/decoders.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/decoders.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/decoders/decoders.h" +#include "fast_tokenizer/decoders/decoders.h" #include -#include "faster_tokenizer/pybind/decoders.h" +#include "fast_tokenizer/pybind/decoders.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { class PyDecoder : public decoders::Decoder { @@ -70,5 +70,5 @@ void BindDecoders(pybind11::module* m) { } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/decoders.h b/fast_tokenizer/fast_tokenizer/pybind/decoders.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/pybind/decoders.h rename to fast_tokenizer/fast_tokenizer/pybind/decoders.h index 0e7f45d69747..27be3049cf67 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/decoders.h +++ b/fast_tokenizer/fast_tokenizer/pybind/decoders.h @@ -17,11 +17,11 @@ limitations under the License. 
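// The two module-level bindings added above expose the tokenizer's thread control.
// A C++-side sketch (signatures assumed to be void core::SetThreadNum(int) and
// int core::GetThreadNum(); only the names are visible in this hunk):
//
//   paddlenlp::fast_tokenizer::core::SetThreadNum(4);         // tokenize with 4 threads
//   int n = paddlenlp::fast_tokenizer::core::GetThreadNum();  // read the setting back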
*/ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void BindDecoders(pybind11::module* m); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/exception.cc b/fast_tokenizer/fast_tokenizer/pybind/exception.cc similarity index 90% rename from faster_tokenizer/faster_tokenizer/pybind/exception.cc rename to fast_tokenizer/fast_tokenizer/pybind/exception.cc index d6768b7eaa7d..35df7987fd54 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/exception.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/exception.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/pybind/exception.h" +#include "fast_tokenizer/pybind/exception.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void ThrowExceptionToPython(std::exception_ptr p) { @@ -31,5 +31,5 @@ void ThrowExceptionToPython(std::exception_ptr p) { } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/exception.h b/fast_tokenizer/fast_tokenizer/pybind/exception.h similarity index 95% rename from faster_tokenizer/faster_tokenizer/pybind/exception.h rename to fast_tokenizer/fast_tokenizer/pybind/exception.h index 6fe8471e159b..49381948179f 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/exception.h +++ b/fast_tokenizer/fast_tokenizer/pybind/exception.h @@ -35,11 +35,11 @@ limitations under the License. */ namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void ThrowExceptionToPython(std::exception_ptr p); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/models.cc b/fast_tokenizer/fast_tokenizer/pybind/models.cc similarity index 93% rename from faster_tokenizer/faster_tokenizer/pybind/models.cc rename to fast_tokenizer/fast_tokenizer/pybind/models.cc index 6d059a70e843..8d1ea7725184 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/models.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/models.cc @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/models/models.h" +#include "fast_tokenizer/models/models.h" #include -#include "faster_tokenizer/pybind/models.h" -#include "faster_tokenizer/pybind/utils.h" +#include "fast_tokenizer/pybind/models.h" +#include "fast_tokenizer/pybind/utils.h" #include "glog/logging.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { class PyModel : public models::Model { @@ -99,12 +99,12 @@ class PyWordPiece : public models::WordPiece { } }; -class PyFasterWordPiece : public models::FasterWordPiece { - using FasterWordPiece::FasterWordPiece; +class PyFastWordPiece : public models::FastWordPiece { + using FastWordPiece::FastWordPiece; virtual std::vector Tokenize( const std::string& tokens) override { PYBIND11_OVERLOAD_NAME(std::vector, - FasterWordPiece, + FastWordPiece, "tokenize", Tokenize, tokens); @@ -113,28 +113,28 @@ class PyFasterWordPiece : public models::FasterWordPiece { virtual bool TokenToId(const std::string& token, uint32_t* id) const override { PYBIND11_OVERLOAD_NAME( - bool, FasterWordPiece, "token_to_id", TokenToId, token, id); + bool, FastWordPiece, "token_to_id", TokenToId, token, id); } virtual bool IdToToken(uint32_t id, std::string* token) const override { PYBIND11_OVERLOAD_NAME( - bool, FasterWordPiece, "id_to_token", IdToToken, id, token); + bool, FastWordPiece, "id_to_token", IdToToken, id, token); } virtual core::Vocab GetVocab() const override { - PYBIND11_OVERLOAD_NAME(core::Vocab, FasterWordPiece, "get_vocab", GetVocab); + PYBIND11_OVERLOAD_NAME(core::Vocab, FastWordPiece, "get_vocab", GetVocab); } virtual size_t GetVocabSize() const override { PYBIND11_OVERLOAD_NAME( - size_t, FasterWordPiece, "get_vocab_size", GetVocabSize); + size_t, FastWordPiece, "get_vocab_size", GetVocabSize); } virtual std::vector Save( const std::string& folder, const std::string& filename_prefix) const override { PYBIND11_OVERLOAD_NAME(std::vector, - FasterWordPiece, + FastWordPiece, "save", Save, folder, @@ -259,8 +259,8 @@ void BindModels(pybind11::module* m) { }, py::arg("folder"), py::arg("prefix") = py::none()); - py::class_(submodule, - "FasterWordPiece") + py::class_(submodule, + "FastWordPiece") .def(py::init<>()) .def(py::init namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void BindModels(pybind11::module* m); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/normalizers.cc b/fast_tokenizer/fast_tokenizer/pybind/normalizers.cc similarity index 99% rename from faster_tokenizer/faster_tokenizer/pybind/normalizers.cc rename to fast_tokenizer/fast_tokenizer/pybind/normalizers.cc index f0663e8b963f..9c561b118503 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/normalizers.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/normalizers.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/normalizers/normalizers.h" #include -#include "faster_tokenizer/pybind/normalizers.h" +#include "fast_tokenizer/pybind/normalizers.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { class PyNormalizer : public normalizers::Normalizer { @@ -458,5 +458,5 @@ void BindNormalizers(pybind11::module* m) { } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/normalizers.h b/fast_tokenizer/fast_tokenizer/pybind/normalizers.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/pybind/normalizers.h rename to fast_tokenizer/fast_tokenizer/pybind/normalizers.h index 562772b260f3..64cd9b6e2ed4 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/normalizers.h +++ b/fast_tokenizer/fast_tokenizer/pybind/normalizers.h @@ -17,11 +17,11 @@ limitations under the License. */ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void BindNormalizers(pybind11::module* m); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/postprocessors.cc b/fast_tokenizer/fast_tokenizer/pybind/postprocessors.cc similarity index 75% rename from faster_tokenizer/faster_tokenizer/pybind/postprocessors.cc rename to fast_tokenizer/fast_tokenizer/pybind/postprocessors.cc index 19e4287d9cda..6795c125481a 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/postprocessors.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/postprocessors.cc @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/postprocessors/postprocessors.h" +#include "fast_tokenizer/postprocessors/postprocessors.h" #include -#include "faster_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/pybind/postprocessors.h" +#include "fast_tokenizer/pybind/utils.h" #include "glog/logging.h" -#include "faster_tokenizer/pybind/postprocessors.h" -#include "faster_tokenizer/pybind/utils.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { class PyPostProcessor : public postprocessors::PostProcessor { @@ -100,6 +100,55 @@ class PyTemplatePostProcessor : public postprocessors::TemplatePostProcessor { } }; +class PyRobertaPostProcessor : public postprocessors::RobertaPostProcessor { +public: + using RobertaPostProcessor::RobertaPostProcessor; + virtual void operator()(core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens, + core::Encoding* result_encoding) const override { + PYBIND11_OVERLOAD_NAME(void, + RobertaPostProcessor, + "__call__", + operator(), + encoding, + pair_encoding, + add_special_tokens, + result_encoding); + } + virtual size_t AddedTokensNum(bool is_pair) const override { + PYBIND11_OVERLOAD_NAME(size_t, + RobertaPostProcessor, + "num_special_tokens_to_add", + AddedTokensNum, + is_pair); + } +}; + +class PyByteLevelPostProcessor : public postprocessors::ByteLevelPostProcessor { +public: + using ByteLevelPostProcessor::ByteLevelPostProcessor; + virtual void operator()(core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens, + core::Encoding* result_encoding) const override { + PYBIND11_OVERLOAD_NAME(void, + ByteLevelPostProcessor, + "__call__", + operator(), + encoding, + pair_encoding, + add_special_tokens, + result_encoding); + } + virtual size_t AddedTokensNum(bool is_pair) const override { + PYBIND11_OVERLOAD_NAME(size_t, + ByteLevelPostProcessor, + "num_special_tokens_to_add", + AddedTokensNum, + is_pair); + } +}; void BindPostProcessors(pybind11::module* m) { auto submodule = @@ -309,8 +358,61 @@ void BindPostProcessors(pybind11::module* m) { py::arg("encoding"), py::arg("pair_encoding"), py::arg("add_special_tokens")); + + py::class_( + submodule, "RobertaPostProcessor") + .def(py::init<>()) + .def(py::init&, + const std::pair&, + bool, + bool>(), + py::arg("sep"), + py::arg("cls"), + py::arg("trim_offsets") = true, + py::arg("add_prefix_space") = true) + .def("num_special_tokens_to_add", + &postprocessors::RobertaPostProcessor::AddedTokensNum, + py::arg("is_pair")) + .def("__call__", + [](const postprocessors::RobertaPostProcessor& self, + core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens) { + core::Encoding result_encoding; + self( + encoding, pair_encoding, add_special_tokens, &result_encoding); + return result_encoding; + }, + py::arg("encoding"), + py::arg("pair_encoding"), + py::arg("add_special_tokens")); + py::class_( + submodule, "ByteLevelPostProcessor") + .def(py::init(), + py::arg("add_prefix_space") = true, + py::arg("trim_offsets") = true, + py::arg("use_regex") = true) + .def("num_special_tokens_to_add", + &postprocessors::ByteLevelPostProcessor::AddedTokensNum, + py::arg("is_pair")) + .def("__call__", + [](const postprocessors::ByteLevelPostProcessor& self, + core::Encoding* encoding, + core::Encoding* pair_encoding, + bool add_special_tokens) { + core::Encoding result_encoding; + self( + encoding, pair_encoding, add_special_tokens, 
&result_encoding); + return result_encoding; + }, + py::arg("encoding"), + py::arg("pair_encoding"), + py::arg("add_special_tokens")); + } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/postprocessors.h b/fast_tokenizer/fast_tokenizer/pybind/postprocessors.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/pybind/postprocessors.h rename to fast_tokenizer/fast_tokenizer/pybind/postprocessors.h index bff6944824ed..b30b31a951ee 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/postprocessors.h +++ b/fast_tokenizer/fast_tokenizer/pybind/postprocessors.h @@ -17,11 +17,11 @@ limitations under the License. */ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void BindPostProcessors(pybind11::module* m); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/pretokenizers.cc b/fast_tokenizer/fast_tokenizer/pybind/pretokenizers.cc similarity index 73% rename from faster_tokenizer/faster_tokenizer/pybind/pretokenizers.cc rename to fast_tokenizer/fast_tokenizer/pybind/pretokenizers.cc index 1b2ab06e22ec..aec6b7027912 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/pretokenizers.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/pretokenizers.cc @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "faster_tokenizer/pretokenizers/pretokenizers.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" #include -#include "faster_tokenizer/pybind/pretokenizers.h" +#include "fast_tokenizer/pybind/pretokenizers.h" +#include "re2/re2.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { class PyPreTokenizer : public pretokenizers::PreTokenizer { @@ -73,6 +74,26 @@ class PySequencePreTokenizer : public pretokenizers::SequencePreTokenizer { } }; +class PyByteLevelPreTokenizer : public pretokenizers::ByteLevelPreTokenizer { +public: + using ByteLevelPreTokenizer::ByteLevelPreTokenizer; + virtual void operator()( + pretokenizers::PreTokenizedString* pretokenized) const override { + PYBIND11_OVERLOAD_NAME( + void, ByteLevelPreTokenizer, "__call__", operator(), pretokenized); + } +}; + +class PySplitPreTokenizer : public pretokenizers::SplitPreTokenizer { +public: + using SplitPreTokenizer::SplitPreTokenizer; + virtual void operator()( + pretokenizers::PreTokenizedString* pretokenized) const override { + PYBIND11_OVERLOAD_NAME( + void, SplitPreTokenizer, "__call__", operator(), pretokenized); + } +}; + void BindPreTokenizers(pybind11::module* m) { auto sub_module = m->def_submodule("pretokenizers", "The pretokenizers module"); @@ -96,6 +117,22 @@ void BindPreTokenizers(pybind11::module* m) { &pretokenizers::PreTokenizedString::GetSplitsSize) .def("get_original_text", &pretokenizers::PreTokenizedString::GetOriginStr) + .def("get_splits", + [](const pretokenizers::PreTokenizedString& self, + const std::string& offset_referential, + const std::string& offset_type) { + bool is_original = true; + if (offset_referential != "original") { + is_original = false; + } + core::OffsetType type = core::OffsetType::CHAR; + if (offset_type != "char") { + type = core::OffsetType::BYTE; + } + return self.GetSplits(is_original, type); + }, + py::arg("offset_referential") = 
"original", + py::arg("offset_type") = "char") .def("to_encoding", [](const pretokenizers::PreTokenizedString& self, const std::vector& word_idx, @@ -152,11 +189,17 @@ void BindPreTokenizers(pybind11::module* m) { pretokenizer_ptr = py_pretokenizer .cast(); + } else if (pybind11::type::of(py_pretokenizer) + .is(py::type::of< + pretokenizers::ByteLevelPreTokenizer>())) { + pretokenizer_ptr = + py_pretokenizer + .cast(); } else { throw py::value_error( "Type of normalizers should be one of `BertPreTokenizer`," " `MetaSpacePreTokenizer`, `SequencePreTokenizer`," - " `WhitespacePreTokenizer`"); + " `WhitespacePreTokenizer`, `ByteLevelPreTokenizer`"); } pretokenizers.push_back(pretokenizer_ptr); } @@ -164,8 +207,21 @@ void BindPreTokenizers(pybind11::module* m) { }), py::arg("pretokenizers")) .def("__call__", &pretokenizers::SequencePreTokenizer::operator()); + py::class_( + sub_module, "ByteLevelPreTokenizer") + .def(py::init(), + py::arg("add_prefix_space") = true, + py::arg("use_regex") = true) + .def("__call__", &pretokenizers::ByteLevelPreTokenizer::operator()); + py::class_( + sub_module, "SplitPreTokenizer") + .def(py::init(), + py::arg("pattern"), + py::arg("split_mode"), + py::arg("invert")) + .def("__call__", &pretokenizers::SplitPreTokenizer::operator()); } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/pretokenizers.h b/fast_tokenizer/fast_tokenizer/pybind/pretokenizers.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/pybind/pretokenizers.h rename to fast_tokenizer/fast_tokenizer/pybind/pretokenizers.h index 32eb5fdf8346..ffe7b1adf83d 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/pretokenizers.h +++ b/fast_tokenizer/fast_tokenizer/pybind/pretokenizers.h @@ -17,11 +17,11 @@ limitations under the License. */ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void BindPreTokenizers(pybind11::module* m); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/pybind.cc b/fast_tokenizer/fast_tokenizer/pybind/pybind.cc similarity index 72% rename from faster_tokenizer/faster_tokenizer/pybind/pybind.cc rename to fast_tokenizer/fast_tokenizer/pybind/pybind.cc index 82f751701e1c..3f7b32f56d88 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/pybind.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/pybind.cc @@ -16,20 +16,20 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/pybind/core.h" -#include "faster_tokenizer/pybind/decoders.h" -#include "faster_tokenizer/pybind/models.h" -#include "faster_tokenizer/pybind/normalizers.h" -#include "faster_tokenizer/pybind/postprocessors.h" -#include "faster_tokenizer/pybind/pretokenizers.h" -#include "faster_tokenizer/pybind/tokenizers.h" +#include "fast_tokenizer/pybind/core.h" +#include "fast_tokenizer/pybind/decoders.h" +#include "fast_tokenizer/pybind/models.h" +#include "fast_tokenizer/pybind/normalizers.h" +#include "fast_tokenizer/pybind/postprocessors.h" +#include "fast_tokenizer/pybind/pretokenizers.h" +#include "fast_tokenizer/pybind/tokenizers.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { PYBIND11_MODULE(core_tokenizers, m) { - m.doc() = "The paddlenlp tokenizers core module."; + m.doc() = "The paddlenlp fast_tokenizer core module."; // 1. 
Bind normalizers submodule BindNormalizers(&m); // 2. Bind pre_tokenizers submodule @@ -47,5 +47,5 @@ PYBIND11_MODULE(core_tokenizers, m) { } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/tokenizers.cc b/fast_tokenizer/fast_tokenizer/pybind/tokenizers.cc similarity index 96% rename from faster_tokenizer/faster_tokenizer/pybind/tokenizers.cc rename to fast_tokenizer/fast_tokenizer/pybind/tokenizers.cc index 0a6e94ffa8e4..fd2eb0a39f8a 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/tokenizers.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/tokenizers.cc @@ -14,24 +14,24 @@ limitations under the License. */ #include -#include "faster_tokenizer/core/tokenizer.h" -#include "faster_tokenizer/decoders/decoders.h" +#include "fast_tokenizer/core/tokenizer.h" +#include "fast_tokenizer/decoders/decoders.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/postprocessors/postprocessors.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" #include "glog/logging.h" -#include "faster_tokenizer/models/models.h" -#include "faster_tokenizer/normalizers/normalizers.h" -#include "faster_tokenizer/postprocessors/postprocessors.h" -#include "faster_tokenizer/pretokenizers/pretokenizers.h" #include -#include "faster_tokenizer/pybind/exception.h" -#include "faster_tokenizer/pybind/tokenizers.h" -#include "faster_tokenizer/pybind/utils.h" +#include "fast_tokenizer/pybind/exception.h" +#include "fast_tokenizer/pybind/tokenizers.h" +#include "fast_tokenizer/pybind/utils.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { PyTypeObject* p_tokenizer_type; // For Tokenizer @@ -155,6 +155,16 @@ static int TokenizerPropertiesSetPreTokenizer(TokenizerObject* self, const auto& pretokenizer = py_obj.cast(); self->tokenizer.SetPreTokenizer(pretokenizer); + } else if (pybind11::type::of(py_obj).is( + py::type::of())) { + const auto& pretokenizer = + py_obj.cast(); + self->tokenizer.SetPreTokenizer(pretokenizer); + } else if (pybind11::type::of(py_obj).is( + py::type::of())) { + const auto& pretokenizer = + py_obj.cast(); + self->tokenizer.SetPreTokenizer(pretokenizer); } else if (py_obj.is(py::none())) { self->tokenizer.ReleasePreTokenizer(); } else { @@ -182,8 +192,8 @@ static int TokenizerPropertiesSetModel(TokenizerObject* self, const auto& model = py_obj.cast(); self->tokenizer.SetModel(model); } else if (pybind11::type::of(py_obj).is( - py::type::of())) { - const auto& model = py_obj.cast(); + py::type::of())) { + const auto& model = py_obj.cast(); self->tokenizer.SetModel(model); } else if (pybind11::type::of(py_obj).is(py::type::of())) { const auto& model = py_obj.cast(); @@ -222,6 +232,16 @@ static int TokenizerPropertiesSetPostProcessor(TokenizerObject* self, const auto& processor = py_obj.cast(); self->tokenizer.SetPostProcessor(processor); + } else if (pybind11::type::of(py_obj).is( + py::type::of())) { + const auto& processor = + py_obj.cast(); + self->tokenizer.SetPostProcessor(processor); + } else if (pybind11::type::of(py_obj).is( + py::type::of())) { + const auto& processor = + py_obj.cast(); + self->tokenizer.SetPostProcessor(processor); } else if (py_obj.is(py::none())) { self->tokenizer.ReleasePostProcessor(); } else { @@ -400,8 +420,8 @@ int TokenizerInit(PyObject* self, PyObject* args, PyObject* kwargs) { const auto& model = 
py_obj.cast(); py_tokenizer_ptr->tokenizer.SetModel(model); } else if (pybind11::type::of(py_obj).is( - py::type::of())) { - const auto& model = py_obj.cast(); + py::type::of())) { + const auto& model = py_obj.cast(); py_tokenizer_ptr->tokenizer.SetModel(model); } else if (pybind11::type::of(py_obj).is(py::type::of())) { const auto& model = py_obj.cast(); @@ -1374,5 +1394,5 @@ void BindTokenizers(pybind11::module* m) { } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/tokenizers.h b/fast_tokenizer/fast_tokenizer/pybind/tokenizers.h similarity index 92% rename from faster_tokenizer/faster_tokenizer/pybind/tokenizers.h rename to fast_tokenizer/fast_tokenizer/pybind/tokenizers.h index 02efce5b5b08..b9e45b957da8 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/tokenizers.h +++ b/fast_tokenizer/fast_tokenizer/pybind/tokenizers.h @@ -17,11 +17,11 @@ limitations under the License. */ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { void BindTokenizers(pybind11::module* m); } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/utils.cc b/fast_tokenizer/fast_tokenizer/pybind/utils.cc similarity index 98% rename from faster_tokenizer/faster_tokenizer/pybind/utils.cc rename to fast_tokenizer/fast_tokenizer/pybind/utils.cc index 978e92ad8ba8..377aeb2ed32e 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/utils.cc +++ b/fast_tokenizer/fast_tokenizer/pybind/utils.cc @@ -15,11 +15,11 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/pybind/utils.h" +#include "fast_tokenizer/pybind/utils.h" namespace py = pybind11; namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { PyObject* ToPyObject(bool value) { @@ -273,5 +273,5 @@ bool PyObject_CheckLongOrConvertToLong(PyObject** obj) { } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/pybind/utils.h b/fast_tokenizer/fast_tokenizer/pybind/utils.h similarity index 98% rename from faster_tokenizer/faster_tokenizer/pybind/utils.h rename to fast_tokenizer/fast_tokenizer/pybind/utils.h index 03150bf77343..448abafe3214 100644 --- a/faster_tokenizer/faster_tokenizer/pybind/utils.h +++ b/fast_tokenizer/fast_tokenizer/pybind/utils.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace pybind { PyObject* ToPyObject(int value); @@ -98,5 +98,5 @@ std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos) { return result; } } // namespace pybind -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/test/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/test/CMakeLists.txt new file mode 100644 index 000000000000..380e63582b16 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/test/CMakeLists.txt @@ -0,0 +1,58 @@ +if(WITH_TESTING) +cc_library(tokenizers_gtest_main SRCS gtest_main.cc DEPS gtest gflags) + +# Test Normalizers modules +cc_test(test_normalizer SRCS test_normalizer.cc DEPS normalizers) +cc_test(test_unicode SRCS test_unicode.cc DEPS normalizers) +cc_test(test_replace SRCS test_replace.cc DEPS normalizers) +cc_test(test_strip SRCS test_strip.cc DEPS normalizers) +cc_test(test_utils SRCS test_utils.cc DEPS normalizers) + +# Test PreTokenizers modules +cc_test(test_whitespace SRCS test_whitespace.cc DEPS pretokenizers) +cc_test(test_bert_pretokenizer SRCS test_bert_pretokenizer.cc DEPS pretokenizers) +cc_test(test_split_pretokenizer SRCS test_split_pretokenizer.cc DEPS pretokenizers) + +# Test Model +cc_test(test_wordpiece SRCS test_wordpiece.cc DEPS models) +cc_test(test_fast_wordpiece SRCS test_fast_wordpiece.cc DEPS models) + +# Download ernie vocab for test +set(ERNIE_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/ernie_vocab.txt) +if (EXISTS ${ERNIE_VOCAB_PATH}) + message("The ${ERNIE_VOCAB_PATH} exists already.") +else() + file(DOWNLOAD "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt" ${ERNIE_VOCAB_PATH} SHOW_PROGRESS) + message("Already download the vocab.txt of ernie to ${CMAKE_CURRENT_BINARY_DIR} for test.") +endif() + +# Download clip vocab and merge files +set(CLIP_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/clip_vocab.json) +set(CLIP_MERGES_PATH ${CMAKE_CURRENT_BINARY_DIR}/clip_merges.txt) + +if (EXISTS ${CLIP_VOCAB_PATH}) + message("The ${CLIP_VOCAB_PATH} exists already.") +else() + file(DOWNLOAD "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/vocab.json" ${CLIP_VOCAB_PATH} SHOW_PROGRESS) + message("Already download the vocab.json of clip to ${CMAKE_CURRENT_BINARY_DIR} for test.") +endif() + +if (EXISTS ${CLIP_MERGES_PATH}) + message("The ${CLIP_MERGES_PATH} exists already.") +else() + file(DOWNLOAD "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/merges.txt" ${CLIP_MERGES_PATH} SHOW_PROGRESS) + message("Already download the merges.txt of clip to ${CMAKE_CURRENT_BINARY_DIR} for test.") +endif() + +# Test Tokenizer +cc_test(test_bert_tokenizer SRCS test_bert_tokenizer.cc DEPS normalizers pretokenizers models postprocessors tokenizer) + +# Test PostProcessor +cc_test(test_roberta_postprocessor SRCS test_roberta_postprocessor.cc DEPS normalizers pretokenizers models postprocessors tokenizer) + +if(NOT WITH_PYTHON) + cc_test(test_ernie_fast_tokenizer SRCS test_ernie_fast_tokenizer.cc DEPS normalizers pretokenizers models postprocessors tokenizer core_tokenizers) + cc_test(test_clip_fast_tokenizer SRCS test_clip_fast_tokenizer.cc DEPS normalizers pretokenizers models postprocessors tokenizer core_tokenizers) +endif() + +endif() diff --git a/faster_tokenizer/faster_tokenizer/test/gtest_main.cc b/fast_tokenizer/fast_tokenizer/test/gtest_main.cc similarity index 100% rename from 
faster_tokenizer/faster_tokenizer/test/gtest_main.cc rename to fast_tokenizer/fast_tokenizer/test/gtest_main.cc diff --git a/faster_tokenizer/faster_tokenizer/test/test_bert_pretokenizer.cc b/fast_tokenizer/fast_tokenizer/test/test_bert_pretokenizer.cc similarity index 93% rename from faster_tokenizer/faster_tokenizer/test/test_bert_pretokenizer.cc rename to fast_tokenizer/fast_tokenizer/test/test_bert_pretokenizer.cc index 5e8502e810cf..f4be133e375d 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_bert_pretokenizer.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_bert_pretokenizer.cc @@ -14,14 +14,14 @@ limitations under the License. */ #include #include +#include "fast_tokenizer/pretokenizers/bert.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "pretokenizers/bert.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { -TEST(pretokenizers, whitespace) { +TEST(pretokenizers, bert) { std::string input = "I \t am good\r at \nsport. I like\tfootball especially!!!"; std::vector expected_outputs = {"I", @@ -45,5 +45,5 @@ TEST(pretokenizers, whitespace) { } } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_bert_tokenizer.cc b/fast_tokenizer/fast_tokenizer/test/test_bert_tokenizer.cc similarity index 88% rename from faster_tokenizer/faster_tokenizer/test/test_bert_tokenizer.cc rename to fast_tokenizer/fast_tokenizer/test/test_bert_tokenizer.cc index f454661c0ea4..6aac0fdf1222 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_bert_tokenizer.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_bert_tokenizer.cc @@ -14,19 +14,19 @@ limitations under the License. */ #include #include #include -#include "core/added_vocabulary.h" -#include "core/base.h" -#include "core/encoding.h" -#include "core/tokenizer.h" +#include "fast_tokenizer/core/added_vocabulary.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/tokenizer.h" +#include "fast_tokenizer/models/wordpiece.h" +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/postprocessors/bert.h" +#include "fast_tokenizer/pretokenizers/bert.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "models/wordpiece.h" -#include "normalizers/bert.h" -#include "postprocessors/bert.h" -#include "pretokenizers/bert.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { template @@ -116,5 +116,5 @@ TEST(tokenizer, bert_tokenizer) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/fast_tokenizer/fast_tokenizer/test/test_clip_fast_tokenizer.cc b/fast_tokenizer/fast_tokenizer/test/test_clip_fast_tokenizer.cc new file mode 100644 index 000000000000..b2604b3cc906 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/test/test_clip_fast_tokenizer.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h" + +#include "fast_tokenizer/test/utils.h" + +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace tests { + +TEST(tokenizer, clip_full) { + std::string vocab_path = "clip_vocab.json"; + std::string merges_path = "clip_merges.txt"; + tokenizers_impl::ClipFastTokenizer clip_tokenizer(vocab_path, merges_path); + + core::Encoding encoding; + std::string input_text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"; + std::vector expected_ids = { + 49406, 320, 1342, 272, 272, 335, 273, 273, 274, 16368, 13439, 2971, + 748, 531, 13610, 323, 1896, 8445, 323, 539, 320, 2368, 49407}; + std::vector expected_tokens = { + "<|startoftext|>", "a", "'ll", "1", "1", + "p", "2", "2", "3", "rf", + "âĺĨ", "ho", "!!", "to", "?'", + "d", "'d", "''", "d", "of", + "a", "cat", "<|endoftext|>"}; + clip_tokenizer.EncodePairStrings(input_text, &encoding); + CheckVectorEqual(expected_ids, encoding.GetIds()); + CheckVectorEqual(expected_tokens, encoding.GetTokens()); +} + +} // namespace tests +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_ernie_faster_tokenizer.cc b/fast_tokenizer/fast_tokenizer/test/test_ernie_fast_tokenizer.cc similarity index 71% rename from faster_tokenizer/faster_tokenizer/test/test_ernie_faster_tokenizer.cc rename to fast_tokenizer/fast_tokenizer/test/test_ernie_fast_tokenizer.cc index b7857df724ee..f3b73f6b42d9 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_ernie_faster_tokenizer.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_ernie_fast_tokenizer.cc @@ -11,40 +11,34 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + #include #include #include -#include "core/added_vocabulary.h" -#include "core/base.h" -#include "core/encoding.h" -#include "core/tokenizer.h" +#include "fast_tokenizer/core/added_vocabulary.h" +#include "fast_tokenizer/core/base.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/tokenizer.h" +#include "fast_tokenizer/models/wordpiece.h" +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/postprocessors/bert.h" +#include "fast_tokenizer/pretokenizers/bert.h" +#include "fast_tokenizer/test/utils.h" +#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h" + #include "glog/logging.h" #include "gtest/gtest.h" -#include "models/wordpiece.h" -#include "normalizers/bert.h" -#include "postprocessors/bert.h" -#include "pretokenizers/bert.h" -#include "tokenizers/ernie_faster_tokenizer.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { -template -void CheckVectorEqual(const std::vector& a, const std::vector& b) { - ASSERT_EQ(a.size(), b.size()); - auto size = a.size(); - for (int i = 0; i < size; ++i) { - ASSERT_EQ(a[i], b[i]); - } -} - -TEST(tokenizer, ernie_faster_tokenizer) { +TEST(tokenizer, ernie_fast_tokenizer) { std::string vocab_file = "ernie_vocab.txt"; - tokenizers_impl::ErnieFasterTokenizer ernie_faster_tokenizer(vocab_file); + tokenizers_impl::ErnieFastTokenizer ernie_fast_tokenizer(vocab_file); std::vector encodings(2); - ernie_faster_tokenizer.EncodePairStrings("今天天气真好", &encodings[0]); - ernie_faster_tokenizer.EncodePairStrings( + ernie_fast_tokenizer.EncodePairStrings("今天天气真好", &encodings[0]); + ernie_fast_tokenizer.EncodePairStrings( "don't know how this missed award nominations.", &encodings[1]); std::vector> expected_tokens = { {"[CLS]", "今", "天", "天", "气", "真", "好", "[SEP]"}, @@ -90,5 +84,5 @@ TEST(tokenizer, ernie_faster_tokenizer) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_faster_wordpiece.cc b/fast_tokenizer/fast_tokenizer/test/test_fast_wordpiece.cc similarity index 67% rename from faster_tokenizer/faster_tokenizer/test/test_faster_wordpiece.cc rename to fast_tokenizer/fast_tokenizer/test/test_fast_wordpiece.cc index 30628631fc21..c30517ebd4a3 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_faster_wordpiece.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_fast_wordpiece.cc @@ -15,28 +15,28 @@ limitations under the License. 
*/ #include #include #include +#include "fast_tokenizer/models/fast_wordpiece.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "models/faster_wordpiece.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { -TEST(model, faster_wordpiece_token_to_id) { - auto vocab = models::FasterWordPiece::GetVocabFromFile("ernie_vocab.txt"); - models::FasterWordPiece faster_wordpiece_model(vocab); +TEST(model, fast_wordpiece_token_to_id) { + auto vocab = models::FastWordPiece::GetVocabFromFile("ernie_vocab.txt"); + models::FastWordPiece fast_wordpiece_model(vocab); // Test tokens in vocab for (const auto& item : vocab) { uint32_t id; - faster_wordpiece_model.TokenToId(item.first, &id); + fast_wordpiece_model.TokenToId(item.first, &id); ASSERT_EQ(item.second, id); } // Test [UNK] token - uint32_t faster_wordpiece_id; - ASSERT_FALSE(faster_wordpiece_model.TokenToId("dasd", &faster_wordpiece_id)); + uint32_t fast_wordpiece_id; + ASSERT_FALSE(fast_wordpiece_model.TokenToId("dasd", &fast_wordpiece_id)); } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_normalizer.cc b/fast_tokenizer/fast_tokenizer/test/test_normalizer.cc similarity index 69% rename from faster_tokenizer/faster_tokenizer/test/test_normalizer.cc rename to fast_tokenizer/fast_tokenizer/test/test_normalizer.cc index f86ec3f42203..97de7ea83073 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_normalizer.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_normalizer.cc @@ -13,16 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "normalizers/bert.h" -#include "normalizers/replace.h" -#include "normalizers/strip.h" -#include "normalizers/unicode.h" #include "re2/re2.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { TEST(normalizers, split) { @@ -30,8 +30,7 @@ TEST(normalizers, split) { std::string input = "The-final--countdown"; normalizers::NormalizedString split_input(input); auto test_split = [&pattern, &split_input]( - normalizers::SplitMode mode, - const std::vector expected_strings) { + core::SplitMode mode, const std::vector expected_strings) { std::vector normalizes; split_input.Split(pattern, mode, &normalizes); ASSERT_EQ(expected_strings.size(), normalizes.size()); @@ -40,16 +39,17 @@ TEST(normalizers, split) { } }; - test_split(normalizers::REMOVED, {"The", "final", "countdown"}); - test_split(normalizers::ISOLATED, + test_split(core::SplitMode::REMOVED, {"The", "final", "countdown"}); + test_split(core::SplitMode::ISOLATED, {"The", "-", "final", "-", "-", "countdown"}); - test_split(normalizers::CONTIGUOUS, {"The", "-", "final", "--", "countdown"}); - test_split(normalizers::MERGED_WITH_PREVIOUS, + test_split(core::SplitMode::CONTIGUOUS, + {"The", "-", "final", "--", "countdown"}); + test_split(core::SplitMode::MERGED_WITH_PREVIOUS, {"The-", "final-", "-", "countdown"}); - test_split(normalizers::MERGED_WITH_NEXT, + test_split(core::SplitMode::MERGED_WITH_NEXT, {"The", "-final", "-", "-countdown"}); } } // namespace tests -} // namespace faster_tokenizer +} // 
namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_replace.cc b/fast_tokenizer/fast_tokenizer/test/test_replace.cc similarity index 85% rename from faster_tokenizer/faster_tokenizer/test/test_replace.cc rename to fast_tokenizer/fast_tokenizer/test/test_replace.cc index 48ac3988ac5f..c32a193e4e92 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_replace.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_replace.cc @@ -13,15 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "normalizers/bert.h" -#include "normalizers/replace.h" -#include "normalizers/strip.h" -#include "normalizers/unicode.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { TEST(normalizers, replace) { @@ -43,5 +43,5 @@ TEST(normalizers, replace) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/fast_tokenizer/fast_tokenizer/test/test_roberta_postprocessor.cc b/fast_tokenizer/fast_tokenizer/test/test_roberta_postprocessor.cc new file mode 100644 index 000000000000..3fabbf6af049 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/test/test_roberta_postprocessor.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/postprocessors/roberta.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace tests { + +TEST(postprocessors, roberta) { + postprocessors::RobertaPostProcessor postprocessor; + core::Encoding encoding( + {core::Token(12, "Hello", {0, 5}), core::Token(14, "there", {6, 11})}, 0); + core::Encoding pair_encoding({core::Token(15, "pair", {0, 4})}, 0); + core::Encoding result_encoding; + + core::Encoding encoding_copy = encoding; + core::Encoding pair_encoding_copy = pair_encoding; + + postprocessor(&encoding_copy, nullptr, true, &result_encoding); + uint32_t special_word_idx = std::numeric_limits::max(); + ASSERT_EQ(result_encoding, + core::Encoding({0, 12, 14, 2}, + {0, 0, 0, 0}, + {"", "Hello", "there", ""}, + std::vector(4, special_word_idx), + {{0, 0}, {0, 5}, {6, 11}, {0, 0}}, + {1, 0, 0, 1}, + {1, 1, 1, 1}, + {}, + {{0, {1, 3}}})); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(2), + std::vector(1, 0)); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(3).size(), 0); + + encoding_copy = encoding; + postprocessor(&encoding_copy, &pair_encoding_copy, true, &result_encoding); + ASSERT_EQ( + result_encoding, + core::Encoding({0, 12, 14, 2, 2, 15, 2}, + {0, 0, 0, 0, 0, 0, 0}, + {"", "Hello", "there", "", "", "pair", ""}, + std::vector(7, special_word_idx), + {{0, 0}, {0, 5}, {6, 11}, {0, 0}, {0, 0}, {0, 4}, {0, 0}}, + {1, 0, 0, 1, 1, 0, 1}, + {1, 1, 1, 1, 1, 1, 1}, + {}, + {{0, {1, 3}}, {1, {5, 6}}})); + + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(2), + std::vector(1, 0)); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(3), std::vector{}); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(4), std::vector{}); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(5), std::vector{1}); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(6), std::vector{}); + + encoding_copy = encoding; + pair_encoding_copy = pair_encoding; + postprocessor(&encoding_copy, &pair_encoding_copy, false, &result_encoding); + ASSERT_EQ(result_encoding, + core::Encoding({12, 14, 15}, + {0, 0, 0}, + {"Hello", "there", "pair"}, + std::vector(3, special_word_idx), + {{0, 5}, {6, 11}, {0, 4}}, + {0, 0, 0}, + {1, 1, 1}, + {}, + {{0, {0, 2}}, {1, {2, 3}}})); + + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(0), std::vector{0}); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(1), std::vector{0}); + ASSERT_EQ(result_encoding.TokenIdxToSequenceIds(2), std::vector{1}); +} + +} // namespace tests +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/fast_tokenizer/fast_tokenizer/test/test_split_pretokenizer.cc b/fast_tokenizer/fast_tokenizer/test/test_split_pretokenizer.cc new file mode 100644 index 000000000000..89c4df06943e --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/test/test_split_pretokenizer.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "fast_tokenizer/pretokenizers/split.h" +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "re2/re2.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace tests { + +TEST(pretokenizers, split_basic) { + std::string input = "How are you doing?"; + // All tokens' id are set to 0. + std::vector>> test_cases = + {{ + core::SplitMode::REMOVED, + std::vector{{0, "How", {0, 3}}, + {0, "are", {4, 7}}, + {0, "you", {8, 11}}, + {0, "doing", {12, 17}}, + {0, "?", {17, 18}}}, + }, + { + core::SplitMode::ISOLATED, + std::vector{{0, "How", {0, 3}}, + {0, " ", {3, 4}}, + {0, "are", {4, 7}}, + {0, " ", {7, 8}}, + {0, "you", {8, 11}}, + {0, " ", {11, 12}}, + {0, "doing", {12, 17}}, + {0, "?", {17, 18}}}, + }, + { + core::SplitMode::MERGED_WITH_PREVIOUS, + std::vector{{0, "How ", {0, 4}}, + {0, "are ", {4, 8}}, + {0, "you ", {8, 12}}, + {0, "doing", {12, 17}}, + {0, "?", {17, 18}}}, + }, + { + core::SplitMode::MERGED_WITH_NEXT, + std::vector{{0, "How", {0, 3}}, + {0, " are", {3, 7}}, + {0, " you", {7, 11}}, + {0, " doing", {11, 17}}, + {0, "?", {17, 18}}}, + }, + { + core::SplitMode::CONTIGUOUS, + std::vector{{0, "How", {0, 3}}, + {0, " ", {3, 4}}, + {0, "are", {4, 7}}, + {0, " ", {7, 8}}, + {0, "you", {8, 11}}, + {0, " ", {11, 12}}, + {0, "doing?", {12, 18}}}, + }}; + std::string pattern = R"(\w+|[^\w\s]+)"; + for (auto&& test : test_cases) { + pretokenizers::PreTokenizedString pretokenized(input); + pretokenizers::SplitPreTokenizer pretok(pattern, test.first, true); + pretok(&pretokenized); + ASSERT_EQ(test.second.size(), pretokenized.GetSplitsSize()); + for (int i = 0; i < test.second.size(); ++i) { + auto&& curr_split = pretokenized.GetSplit(i); + ASSERT_EQ(test.second[i].value_, curr_split.normalized_.GetStr()); + auto original_offset = curr_split.normalized_.GetOrginalOffset(); + ASSERT_EQ(test.second[i].offset_, original_offset); + } + } +} + +TEST(pretokenizers, split_invert) { + std::string input = "Hello Hello Hello"; + pretokenizers::PreTokenizedString pretok_str(input), + pretok_str_for_invert(input); + pretokenizers::SplitPreTokenizer pretok(" ", core::SplitMode::REMOVED, false); + pretokenizers::SplitPreTokenizer pretok_invert( + "Hello", core::SplitMode::REMOVED, true); + + pretok(&pretok_str); + pretok_invert(&pretok_str_for_invert); + + ASSERT_EQ(pretok_str.GetSplitsSize(), pretok_str_for_invert.GetSplitsSize()); + for (int i = 0; i < pretok_str.GetSplitsSize(); ++i) { + ASSERT_EQ(pretok_str.GetSplit(i).normalized_.GetStr(), + pretok_str_for_invert.GetSplit(i).normalized_.GetStr()); + } +} + +} // namespace tests +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_strip.cc b/fast_tokenizer/fast_tokenizer/test/test_strip.cc similarity index 87% rename from faster_tokenizer/faster_tokenizer/test/test_strip.cc rename to fast_tokenizer/fast_tokenizer/test/test_strip.cc index a607cd64be83..370ad9c37d79 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_strip.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_strip.cc @@ -13,15 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "normalizers/bert.h" -#include "normalizers/replace.h" -#include "normalizers/strip.h" -#include "normalizers/unicode.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { TEST(normalizers, strip) { @@ -51,5 +51,5 @@ TEST(normalizers, strip) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_unicode.cc b/fast_tokenizer/fast_tokenizer/test/test_unicode.cc similarity index 89% rename from faster_tokenizer/faster_tokenizer/test/test_unicode.cc rename to fast_tokenizer/fast_tokenizer/test/test_unicode.cc index 7e122ae28510..e4e927b13f35 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_unicode.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_unicode.cc @@ -13,15 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "normalizers/bert.h" -#include "normalizers/replace.h" -#include "normalizers/strip.h" -#include "normalizers/unicode.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { TEST(normalizers, unicode) { @@ -57,5 +57,5 @@ TEST(normalizers, unicode) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_utils.cc b/fast_tokenizer/fast_tokenizer/test/test_utils.cc similarity index 80% rename from faster_tokenizer/faster_tokenizer/test/test_utils.cc rename to fast_tokenizer/fast_tokenizer/test/test_utils.cc index 42e9b9bcdd89..8711f47724b4 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_utils.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_utils.cc @@ -13,15 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "fast_tokenizer/normalizers/bert.h" +#include "fast_tokenizer/normalizers/replace.h" +#include "fast_tokenizer/normalizers/strip.h" +#include "fast_tokenizer/normalizers/unicode.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "normalizers/bert.h" -#include "normalizers/replace.h" -#include "normalizers/strip.h" -#include "normalizers/unicode.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { TEST(normalizers, utils) { @@ -33,5 +33,5 @@ TEST(normalizers, utils) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_whitespace.cc b/fast_tokenizer/fast_tokenizer/test/test_whitespace.cc similarity index 92% rename from faster_tokenizer/faster_tokenizer/test/test_whitespace.cc rename to fast_tokenizer/fast_tokenizer/test/test_whitespace.cc index d778a60ebb76..65a8f84a44cd 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_whitespace.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_whitespace.cc @@ -14,12 +14,12 @@ limitations under the License. */ #include #include +#include "fast_tokenizer/pretokenizers/whitespace.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "pretokenizers/whitespace.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { TEST(pretokenizers, whitespace) { @@ -36,5 +36,5 @@ TEST(pretokenizers, whitespace) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/test/test_wordpiece.cc b/fast_tokenizer/fast_tokenizer/test/test_wordpiece.cc similarity index 97% rename from faster_tokenizer/faster_tokenizer/test/test_wordpiece.cc rename to fast_tokenizer/fast_tokenizer/test/test_wordpiece.cc index 5d3eafe1f59d..3d7661f92319 100644 --- a/faster_tokenizer/faster_tokenizer/test/test_wordpiece.cc +++ b/fast_tokenizer/fast_tokenizer/test/test_wordpiece.cc @@ -15,12 +15,12 @@ limitations under the License. */ #include #include #include +#include "fast_tokenizer/models/wordpiece.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "models/wordpiece.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tests { TEST(model, wordpiece_factory) { @@ -92,5 +92,5 @@ TEST(model, wordpiece_model) { } } // namespace tests -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp \ No newline at end of file diff --git a/fast_tokenizer/fast_tokenizer/test/utils.h b/fast_tokenizer/fast_tokenizer/test/utils.h new file mode 100644 index 000000000000..af1525ebbf1d --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/test/utils.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace tests { + +template +void CheckVectorEqual(const std::vector& a, const std::vector& b) { + ASSERT_EQ(a.size(), b.size()); + auto size = a.size(); + for (int i = 0; i < size; ++i) { + ASSERT_EQ(a[i], b[i]); + } +} + +} // namespace tests +} // namespace fast_tokenizer +} // namespace paddlenlp \ No newline at end of file diff --git a/faster_tokenizer/faster_tokenizer/tokenizers/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/tokenizers/CMakeLists.txt similarity index 100% rename from faster_tokenizer/faster_tokenizer/tokenizers/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/tokenizers/CMakeLists.txt diff --git a/fast_tokenizer/fast_tokenizer/tokenizers/clip_fast_tokenizer.cc b/fast_tokenizer/fast_tokenizer/tokenizers/clip_fast_tokenizer.cc new file mode 100644 index 000000000000..929d6320cf7b --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/tokenizers/clip_fast_tokenizer.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/postprocessors/postprocessors.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" +#include "glog/logging.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace tokenizers_impl { + +ClipFastTokenizer::ClipFastTokenizer( + const std::string& vocab_path, + const std::string& merges_path, + uint32_t max_length, + const std::string& unk_token, + const std::string& pad_token, + const std::string& bos_token, + const std::string& eos_token, + bool add_prefix_space, + const std::string& continuing_subword_prefix, + const std::string& end_of_word_suffix, + bool trim_offsets) { + core::Vocab vocab; + core::Merges merges; + models::BPE::GetVocabAndMergesFromFile( + vocab_path, merges_path, &vocab, &merges); + VLOG(6) << "The vocab size of ClipFastTokenizer is " << vocab.size(); + VLOG(6) << "The merges size of ClipFastTokenizer is " << merges.size(); + + models::BPE bpe(vocab, + merges, + 10000, + {}, + {unk_token}, + {continuing_subword_prefix}, + {end_of_word_suffix}, + false); + // Set tokenizer model + this->SetModel(bpe); + + // Set added tokens + std::vector added_tokens; + uint32_t id; + unk_token_ = unk_token; + if (this->TokenToId(unk_token, &id)) { + added_tokens.emplace_back(unk_token, true); + } + pad_token_ = pad_token; + if (this->TokenToId(pad_token, &id)) { + added_tokens.emplace_back(pad_token, true); + pad_token_id_ = id; + } + bos_token_ = bos_token; + if (this->TokenToId(bos_token, &id)) { + added_tokens.emplace_back(bos_token, true); + bos_token_id_ = id; + } + eos_token_ = eos_token; + if (this->TokenToId(eos_token, &id)) { + added_tokens.emplace_back(eos_token, true); + eos_token_id_ = id; + } + this->AddSpecialTokens(added_tokens); + + // 
Set normalizers + normalizers::NFCNormalizer nfc_normalizer; + normalizers::ReplaceNormalizer replace_normalizer(R"(\s+)", " "); + normalizers::LowercaseNormalizer lower_normalizer; + normalizers::SequenceNormalizer seq_normalizer; + seq_normalizer.AppendNormalizer(&nfc_normalizer); + seq_normalizer.AppendNormalizer(&replace_normalizer); + seq_normalizer.AppendNormalizer(&lower_normalizer); + this->SetNormalizer(seq_normalizer); + + // Set pretokenizers + pretokenizers::ByteLevelPreTokenizer byte_level_pretokenizer(add_prefix_space, + true); + pretokenizers::SplitPreTokenizer split_pretokenizer( + R"('s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)", + core::SplitMode::REMOVED, + true); + pretokenizers::SequencePreTokenizer seq_pretokenizer; + seq_pretokenizer.AppendPreTokenizer(&split_pretokenizer); + seq_pretokenizer.AppendPreTokenizer(&byte_level_pretokenizer); + this->SetPreTokenizer(seq_pretokenizer); + + // Set postprocessors + postprocessors::RobertaPostProcessor roberta_postprocessor( + {eos_token, eos_token_id_}, + {bos_token, bos_token_id_}, + /* trim_offsets= */ false, + add_prefix_space); + this->SetPostProcessor(roberta_postprocessor); + + if (max_length == 0) { + this->DisableTruncMethod(); + } else { + this->EnableTruncMethod(max_length, + 0, + core::Direction::RIGHT, + core::TruncStrategy::LONGEST_FIRST); + } +} + +std::string ClipFastTokenizer::GetPadToken() const { return pad_token_; } + +uint32_t ClipFastTokenizer::GetPadTokenId() const { return pad_token_id_; } + +std::string ClipFastTokenizer::GetUNKToken() const { return unk_token_; } + +uint32_t ClipFastTokenizer::GetUNKTokenId() const { return unk_token_id_; } + +std::string ClipFastTokenizer::GetBOSToken() const { return bos_token_; } + +uint32_t ClipFastTokenizer::GetBOSTokenId() const { return bos_token_id_; } + +std::string ClipFastTokenizer::GetEOSToken() const { return eos_token_; } + +uint32_t ClipFastTokenizer::GetEOSTokenId() const { return eos_token_id_; } + +} // namespace tokenizers_impl +} // namespace fast_tokenizer +} // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/tokenizers/clip_fast_tokenizer.h b/fast_tokenizer/fast_tokenizer/tokenizers/clip_fast_tokenizer.h new file mode 100644 index 000000000000..c01891961556 --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/tokenizers/clip_fast_tokenizer.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/tokenizer.h" +#include "fast_tokenizer/utils/utils.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace tokenizers_impl { + +struct FASTTOKENIZER_DECL ClipFastTokenizer : public core::Tokenizer { + ClipFastTokenizer(const std::string& vocab_path, + const std::string& merges_path, + uint32_t max_length = 0, + const std::string& unk_token = "<|endoftext|>", + const std::string& pad_token = "<|endoftext|>", + const std::string& bos_token = "<|startoftext|>", + const std::string& eos_token = "<|endoftext|>", + bool add_prefix_space = false, + const std::string& continuing_subword_prefix = "", + const std::string& end_of_word_suffix = "", + bool trim_offsets = false); + std::string GetPadToken() const; + uint32_t GetPadTokenId() const; + std::string GetUNKToken() const; + uint32_t GetUNKTokenId() const; + std::string GetBOSToken() const; + uint32_t GetBOSTokenId() const; + std::string GetEOSToken() const; + uint32_t GetEOSTokenId() const; + +private: + std::string pad_token_; + uint32_t pad_token_id_; + std::string unk_token_; + uint32_t unk_token_id_; + std::string bos_token_; + uint32_t bos_token_id_; + std::string eos_token_; + uint32_t eos_token_id_; +}; + +} // namespace fast_tokenizer_impl +} // namespace fast_tokenizer +} // namespace paddlenlp diff --git a/fast_tokenizer/fast_tokenizer/tokenizers/ernie_fast_tokenizer.cc b/fast_tokenizer/fast_tokenizer/tokenizers/ernie_fast_tokenizer.cc new file mode 100644 index 000000000000..2c9d3bacbd5c --- /dev/null +++ b/fast_tokenizer/fast_tokenizer/tokenizers/ernie_fast_tokenizer.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/models/models.h" +#include "fast_tokenizer/normalizers/normalizers.h" +#include "fast_tokenizer/postprocessors/postprocessors.h" +#include "fast_tokenizer/pretokenizers/pretokenizers.h" +#include "fast_tokenizer/utils/utils.h" +#include "glog/logging.h" + +namespace paddlenlp { +namespace fast_tokenizer { +namespace tokenizers_impl { + +ErnieFastTokenizer::ErnieFastTokenizer(const std::string& vocab_path, + const std::string& unk_token, + const std::string& sep_token, + const std::string& cls_token, + const std::string& pad_token, + const std::string& mask_token, + bool clean_text, + bool handle_chinese_chars, + bool strip_accents, + bool lowercase, + const std::string& wordpieces_prefix, + uint32_t max_sequence_len) { + core::Vocab vocab; + utils::GetVocabFromFiles(vocab_path, &vocab); + VLOG(6) << "The vocab size of ErnieFastTokenizer is " << vocab.size(); + Init(vocab, + unk_token, + sep_token, + cls_token, + pad_token, + mask_token, + clean_text, + handle_chinese_chars, + strip_accents, + lowercase, + wordpieces_prefix, + max_sequence_len); +} + + +ErnieFastTokenizer::ErnieFastTokenizer(const core::Vocab& vocab, + const std::string& unk_token, + const std::string& sep_token, + const std::string& cls_token, + const std::string& pad_token, + const std::string& mask_token, + bool clean_text, + bool handle_chinese_chars, + bool strip_accents, + bool lowercase, + const std::string& wordpieces_prefix, + uint32_t max_sequence_len) { + Init(vocab, + unk_token, + sep_token, + cls_token, + pad_token, + mask_token, + clean_text, + handle_chinese_chars, + strip_accents, + lowercase, + wordpieces_prefix, + max_sequence_len); +} + + +void ErnieFastTokenizer::Init(const core::Vocab& vocab, + const std::string& unk_token, + const std::string& sep_token, + const std::string& cls_token, + const std::string& pad_token, + const std::string& mask_token, + bool clean_text, + bool handle_chinese_chars, + bool strip_accents, + bool lowercase, + const std::string& wordpieces_prefix, + uint32_t max_sequence_len) { + models::FastWordPiece wordpiece(vocab, + unk_token, + 100 /* max_input_chars_per_word */, + wordpieces_prefix, + true); + this->SetModel(wordpiece); + + std::vector added_tokens; + uint32_t id; + if (this->TokenToId(unk_token, &id)) { + added_tokens.emplace_back(unk_token, true); + } + if (this->TokenToId(sep_token, &id)) { + added_tokens.emplace_back(sep_token, true); + } + if (this->TokenToId(cls_token, &id)) { + added_tokens.emplace_back(cls_token, true); + } + if (this->TokenToId(pad_token, &id)) { + added_tokens.emplace_back(pad_token, true); + } + if (this->TokenToId(mask_token, &id)) { + added_tokens.emplace_back(mask_token, true); + } + this->AddSpecialTokens(added_tokens); + + + normalizers::BertNormalizer bert_normalizer( + clean_text, handle_chinese_chars, strip_accents, lowercase); + this->SetNormalizer(bert_normalizer); + + if (vocab.size() > 0) { + uint32_t sep_id, cls_id; + if (!this->TokenToId(sep_token, &sep_id)) { + throw std::invalid_argument("sep_token not found in the vocabulary"); + } + if (!this->TokenToId(cls_token, &cls_id)) { + throw std::invalid_argument("cls_token not found in the vocabulary"); + } + postprocessors::BertPostProcessor bert_postprocessor({sep_token, sep_id}, + {cls_token, cls_id}); + this->SetPostProcessor(bert_postprocessor); + } + if (max_sequence_len == 0) { + this->DisableTruncMethod(); + } else { + 
this->EnableTruncMethod(max_sequence_len, + 0, + core::Direction::RIGHT, + core::TruncStrategy::LONGEST_FIRST); + } +} + +} // namespace tokenizers_impl +} // namespace fast_tokenizer +} // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/tokenizers/ernie_faster_tokenizer.h b/fast_tokenizer/fast_tokenizer/tokenizers/ernie_fast_tokenizer.h similarity index 86% rename from faster_tokenizer/faster_tokenizer/tokenizers/ernie_faster_tokenizer.h rename to fast_tokenizer/fast_tokenizer/tokenizers/ernie_fast_tokenizer.h index d91548516c67..b1cf0dbe52b2 100644 --- a/faster_tokenizer/faster_tokenizer/tokenizers/ernie_faster_tokenizer.h +++ b/fast_tokenizer/fast_tokenizer/tokenizers/ernie_fast_tokenizer.h @@ -15,16 +15,16 @@ limitations under the License. */ #include #include -#include "faster_tokenizer/core/encoding.h" -#include "faster_tokenizer/core/tokenizer.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/core/encoding.h" +#include "fast_tokenizer/core/tokenizer.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace tokenizers_impl { -struct FASTERTOKENIZER_DECL ErnieFasterTokenizer : public core::Tokenizer { - ErnieFasterTokenizer(const std::string& vocab_path, +struct FASTTOKENIZER_DECL ErnieFastTokenizer : public core::Tokenizer { + ErnieFastTokenizer(const std::string& vocab_path, const std::string& unk_token = "[UNK]", const std::string& sep_token = "[SEP]", const std::string& cls_token = "[CLS]", @@ -37,7 +37,7 @@ struct FASTERTOKENIZER_DECL ErnieFasterTokenizer : public core::Tokenizer { const std::string& wordpieces_prefix = "##", uint32_t max_sequence_len = 0); - ErnieFasterTokenizer(const core::Vocab& vocab, + ErnieFastTokenizer(const core::Vocab& vocab, const std::string& unk_token = "[UNK]", const std::string& sep_token = "[SEP]", const std::string& cls_token = "[CLS]", @@ -65,6 +65,6 @@ struct FASTERTOKENIZER_DECL ErnieFasterTokenizer : public core::Tokenizer { uint32_t max_sequence_len = 0); }; -} // namespace faster_tokenizer_impl -} // namespace faster_tokenizer +} // namespace fast_tokenizer_impl +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/CMakeLists.txt b/fast_tokenizer/fast_tokenizer/utils/CMakeLists.txt similarity index 100% rename from faster_tokenizer/faster_tokenizer/utils/CMakeLists.txt rename to fast_tokenizer/fast_tokenizer/utils/CMakeLists.txt diff --git a/faster_tokenizer/faster_tokenizer/utils/cache.h b/fast_tokenizer/fast_tokenizer/utils/cache.h similarity index 95% rename from faster_tokenizer/faster_tokenizer/utils/cache.h rename to fast_tokenizer/fast_tokenizer/utils/cache.h index feaf8a16da19..704710572394 100644 --- a/faster_tokenizer/faster_tokenizer/utils/cache.h +++ b/fast_tokenizer/fast_tokenizer/utils/cache.h @@ -17,10 +17,10 @@ #include #include -#include "faster_tokenizer/utils/shared_mutex.h" +#include "fast_tokenizer/utils/shared_mutex.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { static size_t DEFAULT_CACHE_CAPACITY = 10000; @@ -98,5 +98,5 @@ struct Cache { }; } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/failure.cc b/fast_tokenizer/fast_tokenizer/utils/failure.cc similarity index 98% rename from faster_tokenizer/faster_tokenizer/utils/failure.cc rename to 
fast_tokenizer/fast_tokenizer/utils/failure.cc index c950d2fe7034..1ae50b4d334e 100644 --- a/faster_tokenizer/faster_tokenizer/utils/failure.cc +++ b/fast_tokenizer/fast_tokenizer/utils/failure.cc @@ -18,13 +18,13 @@ #include #include "glog/logging.h" -#include "faster_tokenizer/utils/failure.h" -#include "faster_tokenizer/utils/trie.h" -#include "faster_tokenizer/utils/utf8.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/failure.h" +#include "fast_tokenizer/utils/trie.h" +#include "fast_tokenizer/utils/utf8.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { Failure::Failure() @@ -421,5 +421,5 @@ void FailureArray::BuildOutgoingEdgeLabelsFromToken( } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/failure.h b/fast_tokenizer/fast_tokenizer/utils/failure.h similarity index 97% rename from faster_tokenizer/faster_tokenizer/utils/failure.h rename to fast_tokenizer/fast_tokenizer/utils/failure.h index 17d360cca3fe..c302f53496a8 100644 --- a/faster_tokenizer/faster_tokenizer/utils/failure.h +++ b/fast_tokenizer/fast_tokenizer/utils/failure.h @@ -20,12 +20,12 @@ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { class Trie; -// Used in Faster WordPiece Model specially +// Used in Fast WordPiece Model specially struct Failure { uint32_t failure_link_; // Indicate the number of failure_pops @@ -103,5 +103,5 @@ struct FailureArray { }; } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/lattice.cc b/fast_tokenizer/fast_tokenizer/utils/lattice.cc similarity index 99% rename from faster_tokenizer/faster_tokenizer/utils/lattice.cc rename to fast_tokenizer/fast_tokenizer/utils/lattice.cc index fc39925e02d5..14447788cda5 100644 --- a/faster_tokenizer/faster_tokenizer/utils/lattice.cc +++ b/fast_tokenizer/fast_tokenizer/utils/lattice.cc @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "faster_tokenizer/utils/lattice.h" +#include "fast_tokenizer/utils/lattice.h" #include #include @@ -28,10 +28,10 @@ #include #include "glog/logging.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { // Size of nodes pre-allocated in Lattice. 
@@ -542,5 +542,5 @@ std::vector Lattice::Sample(float inv_theta) { } } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/lattice.h b/fast_tokenizer/fast_tokenizer/utils/lattice.h similarity index 98% rename from faster_tokenizer/faster_tokenizer/utils/lattice.h rename to fast_tokenizer/fast_tokenizer/utils/lattice.h index 3949c1221631..daa6523d059d 100644 --- a/faster_tokenizer/faster_tokenizer/utils/lattice.h +++ b/fast_tokenizer/fast_tokenizer/utils/lattice.h @@ -17,10 +17,10 @@ #include #include -#include "faster_tokenizer/utils/string_view.h" +#include "fast_tokenizer/utils/string_view.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { // Copy from https://github.com/google/sentencepiece/blob/master/src/freelist.h @@ -188,5 +188,5 @@ class Lattice { } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/path.h b/fast_tokenizer/fast_tokenizer/utils/path.h similarity index 96% rename from faster_tokenizer/faster_tokenizer/utils/path.h rename to fast_tokenizer/fast_tokenizer/utils/path.h index ca7d5ef48f8c..a58a00af613a 100644 --- a/faster_tokenizer/faster_tokenizer/utils/path.h +++ b/fast_tokenizer/fast_tokenizer/utils/path.h @@ -24,7 +24,7 @@ limitations under the License. */ #endif namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { inline std::string PathJoin(const std::vector& paths, @@ -54,5 +54,5 @@ inline std::string PathJoin(const std::string& folder, } } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/sentencepiece_normalizer.cc b/fast_tokenizer/fast_tokenizer/utils/sentencepiece_normalizer.cc similarity index 97% rename from faster_tokenizer/faster_tokenizer/utils/sentencepiece_normalizer.cc rename to fast_tokenizer/fast_tokenizer/utils/sentencepiece_normalizer.cc index 3c1e730616fb..4a7bf9950ab5 100644 --- a/faster_tokenizer/faster_tokenizer/utils/sentencepiece_normalizer.cc +++ b/fast_tokenizer/fast_tokenizer/utils/sentencepiece_normalizer.cc @@ -13,17 +13,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "faster_tokenizer/utils/sentencepiece_normalizer.h" +#include "fast_tokenizer/utils/sentencepiece_normalizer.h" #include -#include "faster_tokenizer/utils/unique_ptr.h" -#include "faster_tokenizer/utils/utf8.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/unique_ptr.h" +#include "fast_tokenizer/utils/utf8.h" +#include "fast_tokenizer/utils/utils.h" #include "glog/logging.h" #include "unicode/brkiter.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { PrefixMatcher::PrefixMatcher(const std::set& dic) { @@ -181,7 +181,7 @@ std::pair Normalizer::NormalizePrefix( int longest_value = 0; if (trie_ != nullptr) { // Allocates trie_results in stack, which makes the encoding speed 36% - // faster. (38k sentences/sec => 60k sentences/sec). Builder checks that the + // fast. (38k sentences/sec => 60k sentences/sec). Builder checks that the // result size never exceeds kMaxTrieResultsSize. This array consumes // 0.5kByte in stack, which is less than default stack frames (16kByte). 
Darts::DoubleArray::result_pair_type @@ -338,5 +338,5 @@ void Normalizer::Replace(const simple_string_view& new_part, } } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/sentencepiece_normalizer.h b/fast_tokenizer/fast_tokenizer/utils/sentencepiece_normalizer.h similarity index 97% rename from faster_tokenizer/faster_tokenizer/utils/sentencepiece_normalizer.h rename to fast_tokenizer/fast_tokenizer/utils/sentencepiece_normalizer.h index 8821d3fad2d2..3a8543cc39c8 100644 --- a/faster_tokenizer/faster_tokenizer/utils/sentencepiece_normalizer.h +++ b/fast_tokenizer/fast_tokenizer/utils/sentencepiece_normalizer.h @@ -20,12 +20,12 @@ #include #include -#include "faster_tokenizer/utils/string_view.h" +#include "fast_tokenizer/utils/string_view.h" #include "darts.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { struct Cstrless { @@ -110,5 +110,5 @@ class Normalizer { }; } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/shared_mutex.h b/fast_tokenizer/fast_tokenizer/utils/shared_mutex.h similarity index 99% rename from faster_tokenizer/faster_tokenizer/utils/shared_mutex.h rename to fast_tokenizer/fast_tokenizer/utils/shared_mutex.h index 530f45552617..37931ff9f740 100644 --- a/faster_tokenizer/faster_tokenizer/utils/shared_mutex.h +++ b/fast_tokenizer/fast_tokenizer/utils/shared_mutex.h @@ -21,7 +21,7 @@ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { // The code is from http://howardhinnant.github.io/shared_mutex.cpp @@ -300,5 +300,5 @@ inline void swap(shared_lock& x, shared_lock& y) { } } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/string_view.h b/fast_tokenizer/fast_tokenizer/utils/string_view.h similarity index 95% rename from faster_tokenizer/faster_tokenizer/utils/string_view.h rename to fast_tokenizer/fast_tokenizer/utils/string_view.h index b457531e4543..35cacdefed9f 100644 --- a/faster_tokenizer/faster_tokenizer/utils/string_view.h +++ b/fast_tokenizer/fast_tokenizer/utils/string_view.h @@ -17,7 +17,7 @@ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { struct simple_string_view { @@ -49,5 +49,5 @@ struct simple_string_view { }; } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/trie.cc b/fast_tokenizer/fast_tokenizer/utils/trie.cc similarity index 97% rename from faster_tokenizer/faster_tokenizer/utils/trie.cc rename to fast_tokenizer/fast_tokenizer/utils/trie.cc index 6b6feb629e3f..b063e91ff085 100644 --- a/faster_tokenizer/faster_tokenizer/utils/trie.cc +++ b/fast_tokenizer/fast_tokenizer/utils/trie.cc @@ -18,12 +18,12 @@ #include #include "glog/logging.h" -#include "faster_tokenizer/utils/trie.h" -#include "faster_tokenizer/utils/utf8.h" -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/trie.h" +#include "fast_tokenizer/utils/utf8.h" +#include "fast_tokenizer/utils/utils.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { void Trie::CreateTrie(const std::vector& keys, @@ -227,5 +227,5 @@ void 
Trie::SetContinuingSubwordPrefix( } } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/trie.h b/fast_tokenizer/fast_tokenizer/utils/trie.h similarity index 98% rename from faster_tokenizer/faster_tokenizer/utils/trie.h rename to fast_tokenizer/fast_tokenizer/utils/trie.h index 59db802de2ac..b4c9b0cbff4d 100644 --- a/faster_tokenizer/faster_tokenizer/utils/trie.h +++ b/fast_tokenizer/fast_tokenizer/utils/trie.h @@ -23,7 +23,7 @@ #include "darts.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { class Trie { @@ -116,5 +116,5 @@ class Trie { }; } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/unique_ptr.h b/fast_tokenizer/fast_tokenizer/utils/unique_ptr.h similarity index 96% rename from faster_tokenizer/faster_tokenizer/utils/unique_ptr.h rename to fast_tokenizer/fast_tokenizer/utils/unique_ptr.h index 05c808cf925a..767e203fcd2d 100644 --- a/faster_tokenizer/faster_tokenizer/utils/unique_ptr.h +++ b/fast_tokenizer/fast_tokenizer/utils/unique_ptr.h @@ -17,7 +17,7 @@ limitations under the License. */ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { // Trait to select overloads and return types for MakeUnique. @@ -57,5 +57,5 @@ typename MakeUniqueResult::invalid make_unique(Args &&... /* args */) = delete; // NOLINT } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/utf8.h b/fast_tokenizer/fast_tokenizer/utils/utf8.h similarity index 99% rename from faster_tokenizer/faster_tokenizer/utils/utf8.h rename to fast_tokenizer/fast_tokenizer/utils/utf8.h index 661fafee857b..dbb8c92f6732 100644 --- a/faster_tokenizer/faster_tokenizer/utils/utf8.h +++ b/fast_tokenizer/fast_tokenizer/utils/utf8.h @@ -16,7 +16,7 @@ limitations under the License. */ #include namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { static constexpr uint32_t kUnicodeError = 0xFFFD; @@ -221,5 +221,5 @@ inline bool IsValidDecodeUTF8(const char* begin, } } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/utils.cc b/fast_tokenizer/fast_tokenizer/utils/utils.cc similarity index 80% rename from faster_tokenizer/faster_tokenizer/utils/utils.cc rename to fast_tokenizer/fast_tokenizer/utils/utils.cc index e10aa9af398d..dd23726c1bf4 100644 --- a/faster_tokenizer/faster_tokenizer/utils/utils.cc +++ b/fast_tokenizer/fast_tokenizer/utils/utils.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "faster_tokenizer/utils/utils.h" +#include "fast_tokenizer/utils/utils.h" #include "unicode/uchar.h" namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { void GetVocabFromFiles(const std::string& files, @@ -112,6 +112,36 @@ void GetSortedVocab(const std::vector& keys, } } +std::unordered_map CreateBytesToChars() { + std::unordered_map bytes_to_chars; + bool bytes_flag[256] = {false}; + std::vector> ranges = { + {'!', '~'}, {'\xA1', '\xAC'}, {'\xAE', '\xFF'}}; + + for (int i = 0; i < ranges.size(); ++i) { + for (uint32_t c = ranges[i].first; c <= ranges[i].second; ++c) { + bytes_to_chars.insert({c, c}); + bytes_flag[c] = true; + } + } + uint32_t n = 0; + for (uint32_t b = 0; b <= 255; ++b) { + if (!bytes_flag[b]) { + bytes_to_chars.insert({b, (1 << 8) + n}); + n += 1; + } + } + return bytes_to_chars; +} + +bool IsWhiteSpace(int ch) { + const std::string WHITESPACE = " \n\r\t\f\v"; + for (int i = 0; i < WHITESPACE.length(); ++i) { + if (ch == WHITESPACE[i]) return true; + } + return u_isspace(ch); +} + } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/utils.h b/fast_tokenizer/fast_tokenizer/utils/utils.h similarity index 93% rename from faster_tokenizer/faster_tokenizer/utils/utils.h rename to fast_tokenizer/fast_tokenizer/utils/utils.h index ba49c5ae5f12..6a9e418693d3 100644 --- a/faster_tokenizer/faster_tokenizer/utils/utils.h +++ b/fast_tokenizer/fast_tokenizer/utils/utils.h @@ -36,17 +36,17 @@ limitations under the License. */ #endif #if defined(_WIN32) -#ifdef FASTERTOKENIZER_LIB -#define FASTERTOKENIZER_DECL __declspec(dllexport) +#ifdef FASTTOKENIZER_LIB +#define FASTTOKENIZER_DECL __declspec(dllexport) #else -#define FASTERTOKENIZER_DECL __declspec(dllimport) -#endif // FASTERTOKENIZER_LIB +#define FASTTOKENIZER_DECL __declspec(dllimport) +#endif // FASTTOKENIZER_LIB #else -#define FASTERTOKENIZER_DECL __attribute__((visibility("default"))) +#define FASTTOKENIZER_DECL __attribute__((visibility("default"))) #endif // _WIN32 namespace paddlenlp { -namespace faster_tokenizer { +namespace fast_tokenizer { namespace utils { void GetVocabFromFiles(const std::string& files, @@ -66,7 +66,7 @@ void StringReplaceAll(std::string* str, const std::string& from, const std::string& to); -// Used in faster wordpiece model +// Used in fast wordpiece model static constexpr uint32_t kBitToIndicateSuffixToken = 30; static constexpr uint32_t kBitsToEncodeVocabTokenLength = 8; @@ -178,6 +178,10 @@ void GetSortedVocab(const std::vector& keys, std::vector* sorted_keys, std::vector* sorted_values); +std::unordered_map CreateBytesToChars(); + +bool IsWhiteSpace(int ch); + } // namespace utils -} // namespace faster_tokenizer +} // namespace fast_tokenizer } // namespace paddlenlp diff --git a/faster_tokenizer/faster_tokenizer/utils/variant.h b/fast_tokenizer/fast_tokenizer/utils/variant.h similarity index 99% rename from faster_tokenizer/faster_tokenizer/utils/variant.h rename to fast_tokenizer/fast_tokenizer/utils/variant.h index 696f8312afe4..3429c2ce15b3 100644 --- a/faster_tokenizer/faster_tokenizer/utils/variant.h +++ b/fast_tokenizer/fast_tokenizer/utils/variant.h @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // Copy from // https://github.com/mpark/variant/blob/single-header/v1.4.0/variant.hpp // Modify the following points: diff --git a/faster_tokenizer/perf/README.md b/fast_tokenizer/perf/README.md similarity index 53% rename from faster_tokenizer/perf/README.md rename to fast_tokenizer/perf/README.md index c453189ddea6..7d4f4b1d33a9 100644 --- a/faster_tokenizer/perf/README.md +++ b/fast_tokenizer/perf/README.md @@ -1,6 +1,6 @@ -# 飞桨FasterTokenizer性能测试 +# 飞桨FastTokenizer性能测试 -在PaddleNLP v2.2.0版本中PaddleNLP推出了高性能的Transformer类文本分词器,简称飞桨FasterTokenizer。为了验证飞桨FasterTokenizer的性能快的特点,PaddleNLP选取了业内常见的一些文本分词器进行了性能对比比较,主要进行性能参考的是HuggingFace BertTokenizer, Tensorflow-text BertTokenizer. 我们以 bert-base-chinese 模型为例进行了文本分词性能实验对比,在中文的数据下进行性能对比实验,下面是具体实验设置信息: +在PaddleNLP v2.2.0版本中PaddleNLP推出了高性能的Transformer类文本分词器,简称飞桨FastTokenizer。为了验证飞桨FastTokenizer的性能快的特点,PaddleNLP选取了业内常见的一些文本分词器进行了性能对比比较,主要进行性能参考的是HuggingFace BertTokenizer, Tensorflow-text BertTokenizer. 我们以 bert-base-chinese 模型为例进行了文本分词性能实验对比,在中文的数据下进行性能对比实验,下面是具体实验设置信息: * [HuggingFace Tokenizers(Python)](https://github.com/huggingface/tokenizers): ```python @@ -26,12 +26,12 @@ import tensorflow_text as tf_text tf_tokenizer = tf_text.BertTokenizer(vocab) ``` -* [飞桨FasterTokenizer](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/experimental): +* [飞桨FastTokenizer](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/experimental): ```python -from paddlenlp.experimental import FasterTokenizer +from paddlenlp.experimental import FastTokenizer -faster_tokenizer = FasterTokenizer.from_pretrained("bert-base-chinese") +fast_tokenizer = FastTokenizer.from_pretrained("bert-base-chinese") ``` @@ -65,4 +65,4 @@ python perf.py
图片
-飞桨FasterTokenizer与其他框架性能的对比,是在固定文本长度在不同batch size下的分词吞吐量。纵坐标是对数坐标,单位是1w tokens/秒。随着batch size的增大,飞桨FasterTokenizer速度会远远超过其他同类产品的实现,尤其是在大batch文本上飞桨框架能充分发挥多核机器的优势,取得领先的速度。 +飞桨FastTokenizer与其他框架性能的对比,是在固定文本长度在不同batch size下的分词吞吐量。纵坐标是对数坐标,单位是1w tokens/秒。随着batch size的增大,飞桨FastTokenizer速度会远远超过其他同类产品的实现,尤其是在大batch文本上飞桨框架能充分发挥多核机器的优势,取得领先的速度。 diff --git a/faster_tokenizer/perf/perf.py b/fast_tokenizer/perf/perf.py similarity index 95% rename from faster_tokenizer/perf/perf.py rename to fast_tokenizer/perf/perf.py index ba3751c63c71..1c5f95d5008d 100755 --- a/faster_tokenizer/perf/perf.py +++ b/fast_tokenizer/perf/perf.py @@ -21,7 +21,7 @@ import paddle import paddlenlp from paddlenlp.transformers import BertTokenizer -from paddlenlp.experimental import FasterTokenizer +from paddlenlp.experimental import FastTokenizer from paddlenlp.experimental import to_tensor from transformers import AutoTokenizer @@ -55,8 +55,8 @@ data = [text[:max_seq_length]] * num_samples -# BERT Tokenizer using PaddleNLP FasterTokenizer -pp_tokenizer = FasterTokenizer.from_pretrained("bert-base-chinese") +# BERT Tokenizer using PaddleNLP FastTokenizer +pp_tokenizer = FastTokenizer.from_pretrained("bert-base-chinese") batches = [ to_tensor(data[idx:idx + batch_size]) @@ -74,7 +74,7 @@ max_seq_len=max_seq_length) end = time.time() -print("The throughput of paddle FasterTokenizer: {:,.2f} tokens/s".format( +print("The throughput of paddle FastTokenizer: {:,.2f} tokens/s".format( (total_tokens / (end - start)))) hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese", use_fast=True) @@ -93,7 +93,7 @@ encoded_inputs = hf_tokenizer( batch_data) #, padding=True, truncation=True) end = time.time() -print("The throughput of huggingface FasterTokenizer: {:,.2f} tokens/s".format( +print("The throughput of huggingface FastTokenizer: {:,.2f} tokens/s".format( (total_tokens / (end - start)))) # BERT Tokenizer using PaddleNLP BertTokenizer diff --git a/faster_tokenizer/perf/requirements.txt b/fast_tokenizer/perf/requirements.txt similarity index 100% rename from faster_tokenizer/perf/requirements.txt rename to fast_tokenizer/perf/requirements.txt diff --git a/fast_tokenizer/perf/run_all_perf.sh b/fast_tokenizer/perf/run_all_perf.sh new file mode 100644 index 000000000000..052de2a717cd --- /dev/null +++ b/fast_tokenizer/perf/run_all_perf.sh @@ -0,0 +1,27 @@ +# !/bin/sh + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +for seq_len in 32 64 128 256 512; do +for batch_size in 1 2 4 8 16 32 64; do +mkdir -p seq_len_$seq_len/batch_size_$batch_size +for thread_num in 1 2 4 8 16 32 64; do +echo "Experiment setting: thread_num=$thread_num, batch_size=$batch_size, sequence_length=$seq_len" +export OMP_NUM_THREADS=$thread_num +export RAYON_RS_NUM_CPUS=$thread_num +python perf.py --batch_size $batch_size --max_seq_length $seq_len >seq_len_$seq_len/batch_size_$batch_size/parallel$thread_num.log 2>nohup.out +done +done +done \ No newline at end of file diff --git a/faster_tokenizer/python/CMakeLists.txt b/fast_tokenizer/python/CMakeLists.txt similarity index 91% rename from faster_tokenizer/python/CMakeLists.txt rename to fast_tokenizer/python/CMakeLists.txt index 77127bb82596..ab0fe0065e3b 100644 --- a/faster_tokenizer/python/CMakeLists.txt +++ b/fast_tokenizer/python/CMakeLists.txt @@ -10,8 +10,8 @@ add_custom_target(copy_setup ALL # 3. Copy the core_tokenizers.so to python tokenizers directory set(TOKENIZER_CORE_NAME "core_tokenizers") -set(TOKENIZER_DST_DIR ${CMAKE_BINARY_DIR}/python/faster_tokenizer) -set(TOKENIZER_SRC_DIR ${CMAKE_BINARY_DIR}/faster_tokenizer) +set(TOKENIZER_DST_DIR ${CMAKE_BINARY_DIR}/python/fast_tokenizer) +set(TOKENIZER_SRC_DIR ${CMAKE_BINARY_DIR}/fast_tokenizer) set(TOKENIZER_THIRD_PARTY_DIR ${CMAKE_BINARY_DIR}/third_party) IF(WIN32) diff --git a/faster_tokenizer/python/faster_tokenizer/__init__.py b/fast_tokenizer/python/fast_tokenizer/__init__.py similarity index 95% rename from faster_tokenizer/python/faster_tokenizer/__init__.py rename to fast_tokenizer/python/fast_tokenizer/__init__.py index 7c6aaac938df..6deffb5f6445 100644 --- a/faster_tokenizer/python/faster_tokenizer/__init__.py +++ b/fast_tokenizer/python/fast_tokenizer/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.2.0" +__version__ = "1.0.0" from typing import Tuple, Union, Tuple, List import sys @@ -72,4 +72,6 @@ PadStrategy) from .core_tokenizers import models, normalizers, pretokenizers, postprocessors, decoders -from .tokenizers_impl import ErnieFasterTokenizer, SentencePieceBPEFasterTokenizer +from .tokenizers_impl import ErnieFastTokenizer, SentencePieceBPEFastTokenizer + +from .core_tokenizers import (get_thread_num, set_thread_num) \ No newline at end of file diff --git a/faster_tokenizer/python/faster_tokenizer/decoders/__init__.py b/fast_tokenizer/python/fast_tokenizer/decoders/__init__.py similarity index 100% rename from faster_tokenizer/python/faster_tokenizer/decoders/__init__.py rename to fast_tokenizer/python/fast_tokenizer/decoders/__init__.py diff --git a/faster_tokenizer/python/faster_tokenizer/libs/__init__.py b/fast_tokenizer/python/fast_tokenizer/libs/__init__.py similarity index 100% rename from faster_tokenizer/python/faster_tokenizer/libs/__init__.py rename to fast_tokenizer/python/fast_tokenizer/libs/__init__.py diff --git a/faster_tokenizer/python/faster_tokenizer/models/__init__.py b/fast_tokenizer/python/fast_tokenizer/models/__init__.py similarity index 100% rename from faster_tokenizer/python/faster_tokenizer/models/__init__.py rename to fast_tokenizer/python/fast_tokenizer/models/__init__.py diff --git a/faster_tokenizer/python/faster_tokenizer/normalizers/__init__.py b/fast_tokenizer/python/fast_tokenizer/normalizers/__init__.py similarity index 100% rename from faster_tokenizer/python/faster_tokenizer/normalizers/__init__.py rename to fast_tokenizer/python/fast_tokenizer/normalizers/__init__.py diff --git a/faster_tokenizer/python/faster_tokenizer/postprocessors/__init__.py b/fast_tokenizer/python/fast_tokenizer/postprocessors/__init__.py similarity index 100% rename from faster_tokenizer/python/faster_tokenizer/postprocessors/__init__.py rename to fast_tokenizer/python/fast_tokenizer/postprocessors/__init__.py diff --git a/faster_tokenizer/python/faster_tokenizer/pretokenizers/__init__.py b/fast_tokenizer/python/fast_tokenizer/pretokenizers/__init__.py similarity index 100% rename from faster_tokenizer/python/faster_tokenizer/pretokenizers/__init__.py rename to fast_tokenizer/python/fast_tokenizer/pretokenizers/__init__.py diff --git a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/__init__.py b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/__init__.py similarity index 80% rename from faster_tokenizer/python/faster_tokenizer/tokenizers_impl/__init__.py rename to fast_tokenizer/python/fast_tokenizer/tokenizers_impl/__init__.py index ff20cee7da86..babf4cd5d584 100644 --- a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/__init__.py +++ b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
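
For context, a minimal sketch of how the renamed Python package is expected to be used after this change, based on the exports shown in the `fast_tokenizer/__init__.py` diff above (`ErnieFastTokenizer`, `models`, and the new `get_thread_num`/`set_thread_num` helpers); the vocab file path and the thread-count argument are illustrative assumptions, not part of the patch:

```python
# Illustrative sketch only: names follow the renamed exports in this patch.
import fast_tokenizer
from fast_tokenizer import ErnieFastTokenizer, models

fast_tokenizer.set_thread_num(4)   # newly exported threading helper; thread count is an assumed example value
print(fast_tokenizer.get_thread_num())

vocab = models.WordPiece.read_file("ernie_vocab.txt")  # hypothetical local vocab file
tokenizer = ErnieFastTokenizer(vocab)
output = tokenizer.encode("自然语言处理")
print(output.ids, output.tokens)
```
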
-from .base_tokenizer import BaseFasterTokenizer -from .ernie import ErnieFasterTokenizer -from .sentencepiece_bpe import SentencePieceBPEFasterTokenizer +from .base_tokenizer import BaseFastTokenizer +from .ernie import ErnieFastTokenizer +from .sentencepiece_bpe import SentencePieceBPEFastTokenizer diff --git a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/base_tokenizer.py b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/base_tokenizer.py similarity index 98% rename from faster_tokenizer/python/faster_tokenizer/tokenizers_impl/base_tokenizer.py rename to fast_tokenizer/python/fast_tokenizer/tokenizers_impl/base_tokenizer.py index 947f0f623711..5fa31ecb4e6b 100644 --- a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/base_tokenizer.py +++ b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/base_tokenizer.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from faster_tokenizer import Tokenizer +from fast_tokenizer import Tokenizer -__all__ = ['BaseFasterTokenizer'] +__all__ = ['BaseFastTokenizer'] -class BaseFasterTokenizer: +class BaseFastTokenizer: def __init__(self, tokenizer_impl, parma_dict=None): self._tokenizer = tokenizer_impl diff --git a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/ernie.py b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/ernie.py similarity index 81% rename from faster_tokenizer/python/faster_tokenizer/tokenizers_impl/ernie.py rename to fast_tokenizer/python/fast_tokenizer/tokenizers_impl/ernie.py index 07934ca2307f..32194512627b 100644 --- a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/ernie.py +++ b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/ernie.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
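
The renamed `ernie.py` below keeps the switches that select the FastWordPiece model (formerly FasterWordPiece). A minimal sketch of passing those flags, mirroring the updated `test_fast_wordpiece.py` in this patch; the vocab path is a hypothetical placeholder:

```python
# Illustrative sketch only: flag names follow the renamed ernie.py and tests in this patch.
from fast_tokenizer import ErnieFastTokenizer, models

vocab = models.WordPiece.read_file("ernie_vocab.txt")  # hypothetical local vocab file
tokenizer = ErnieFastTokenizer(
    vocab,
    max_sequence_len=128,
    use_fast_wordpiece=True,                       # use the FastWordPiece model instead of WordPiece
    use_fast_wordpiece_with_pretokenization=True)  # let FastWordPiece handle pre-tokenization as well
print(tokenizer.encode("自然语言处理").ids)
```
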
-from .base_tokenizer import BaseFasterTokenizer +from .base_tokenizer import BaseFastTokenizer -from faster_tokenizer.normalizers import BertNormalizer -from faster_tokenizer.pretokenizers import BertPreTokenizer -from faster_tokenizer.models import WordPiece, FasterWordPiece -from faster_tokenizer.postprocessors import BertPostProcessor -from faster_tokenizer import decoders -from faster_tokenizer import Tokenizer +from fast_tokenizer.normalizers import BertNormalizer +from fast_tokenizer.pretokenizers import BertPreTokenizer +from fast_tokenizer.models import WordPiece, FastWordPiece +from fast_tokenizer.postprocessors import BertPostProcessor +from fast_tokenizer import decoders +from fast_tokenizer import Tokenizer -__all__ = ['ErnieFasterTokenizer'] +__all__ = ['ErnieFastTokenizer'] -class ErnieFasterTokenizer(BaseFasterTokenizer): +class ErnieFastTokenizer(BaseFastTokenizer): def __init__(self, vocab=None, @@ -40,17 +40,17 @@ def __init__(self, wordpieces_prefix="##", max_sequence_len=None, max_input_chars_per_word=100, - use_faster_wordpiece=False, - use_faster_wordpiece_with_pretokenization=False): - tokenizer_model = WordPiece if not use_faster_wordpiece else FasterWordPiece + use_fast_wordpiece=False, + use_fast_wordpiece_with_pretokenization=False): + tokenizer_model = WordPiece if not use_fast_wordpiece else FastWordPiece model_kwargs = { "unk_token": str(unk_token), "continuing_subword_prefix": wordpieces_prefix, "max_input_chars_per_word": max_input_chars_per_word, } - if use_faster_wordpiece: + if use_fast_wordpiece: model_kwargs[ - "with_pretokenization"] = use_faster_wordpiece_with_pretokenization + "with_pretokenization"] = use_fast_wordpiece_with_pretokenization else: model_kwargs["handle_chinese_chars"] = handle_chinese_chars if vocab is not None: @@ -74,7 +74,7 @@ def __init__(self, handle_chinese_chars=handle_chinese_chars, strip_accents=strip_accents, lowercase=lowercase) - if not use_faster_wordpiece or not use_faster_wordpiece_with_pretokenization: + if not use_fast_wordpiece or not use_fast_wordpiece_with_pretokenization: tokenizer.pretokenizer = BertPreTokenizer() if vocab is not None: diff --git a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/sentencepiece_bpe.py b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/sentencepiece_bpe.py similarity index 81% rename from faster_tokenizer/python/faster_tokenizer/tokenizers_impl/sentencepiece_bpe.py rename to fast_tokenizer/python/fast_tokenizer/tokenizers_impl/sentencepiece_bpe.py index c24ba080fc1e..8ecbed410f6c 100644 --- a/faster_tokenizer/python/faster_tokenizer/tokenizers_impl/sentencepiece_bpe.py +++ b/fast_tokenizer/python/fast_tokenizer/tokenizers_impl/sentencepiece_bpe.py @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .base_tokenizer import BaseFasterTokenizer -from faster_tokenizer.models import BPE -from faster_tokenizer.normalizers import NFKCNormalizer -from faster_tokenizer import Tokenizer -from faster_tokenizer.pretokenizers import MetaSpacePreTokenizer +from .base_tokenizer import BaseFastTokenizer +from fast_tokenizer.models import BPE +from fast_tokenizer.normalizers import NFKCNormalizer +from fast_tokenizer import Tokenizer +from fast_tokenizer.pretokenizers import MetaSpacePreTokenizer -__all__ = ['SentencePieceBPEFasterTokenizer'] +__all__ = ['SentencePieceBPEFastTokenizer'] -class SentencePieceBPEFasterTokenizer(BaseFasterTokenizer): +class SentencePieceBPEFastTokenizer(BaseFastTokenizer): def __init__(self, vocab=None, @@ -58,4 +58,4 @@ def __init__(self, @staticmethod def from_file(vocab_filename, merges_filename, **kwargs): vocab, merges = BPE.read_file(vocab_filename, merges_filename) - return SentencePieceBPEFasterTokenizer(vocab, merges, **kwargs) + return SentencePieceBPEFastTokenizer(vocab, merges, **kwargs) diff --git a/fast_tokenizer/python/tests/test_byte_level_pretokenizer.py b/fast_tokenizer/python/tests/test_byte_level_pretokenizer.py new file mode 100644 index 000000000000..cb3cb0c0b709 --- /dev/null +++ b/fast_tokenizer/python/tests/test_byte_level_pretokenizer.py @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
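
The new `test_byte_level_pretokenizer.py` below checks the byte-level pre-tokenizer. In GPT-2-style byte-level processing, each input byte is mapped to a printable character and a leading space is rendered as `Ġ` (U+0120), which is why the expected splits in the test carry `Ġ` prefixes. A minimal sketch using the same API as the test, with the expected output inferred from the test's conventions:

```python
# Illustrative sketch only: mirrors the API exercised by the test below.
from fast_tokenizer import pretokenizers

pretokenized = pretokenizers.PreTokenizedString("Hello world")
bytelevel = pretokenizers.ByteLevelPreTokenizer(add_prefix_space=False, use_regex=True)
bytelevel(pretokenized)

# Expected, following the conventions asserted in the test:
# [("Hello", (0, 5)), ("Ġworld", (5, 11))]
print([(piece, offset) for piece, offset, tokens in pretokenized.get_splits()])
```
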
+ +import numpy as np +import os +import unittest +from fast_tokenizer import pretokenizers + + +class TestByteLevelPreTokenizer(unittest.TestCase): + + def setUp(self): + self.pretokenized = pretokenizers.PreTokenizedString( + "Hello my friend, how is your day going?") + + def check_equals(self, add_prefix_space, use_regex, expected_result): + bytelevel = pretokenizers.ByteLevelPreTokenizer( + add_prefix_space=add_prefix_space, use_regex=use_regex) + bytelevel(self.pretokenized) + splits = self.pretokenized.get_splits() + result = [(s, offset) for s, offset, tokens in splits] + self.assertEqual(result, expected_result) + + def test_pretokenize_with_regex(self): + expected_result = [("Hello", (0, 5)), ("Ġmy", (5, 8)), + ("Ġfriend", (8, 15)), (",", (15, 16)), + ("Ġhow", (16, 20)), ("Ġis", (20, 23)), + ("Ġyour", (23, 28)), ("Ġday", (28, 32)), + ("Ġgoing", (32, 38)), ("?", (38, 39))] + + self.check_equals(False, True, expected_result) + + def test_pretokenize_without_regex(self): + expected_result = [("HelloĠmyĠfriend,ĠhowĠisĠyourĠdayĠgoing?", (0, 39))] + self.check_equals(False, False, expected_result) + + def test_pretokenize_with_prefix_with_regex(self): + expected_result = [("ĠHello", (0, 5)), ("Ġmy", (5, 8)), + ("Ġfriend", (8, 15)), (",", (15, 16)), + ("Ġhow", (16, 20)), ("Ġis", (20, 23)), + ("Ġyour", (23, 28)), ("Ġday", (28, 32)), + ("Ġgoing", (32, 38)), ("?", (38, 39))] + + self.check_equals(True, True, expected_result) + + def test_pretokenize_with_prefix_without_regex(self): + expected_result = [("ĠHelloĠmyĠfriend,ĠhowĠisĠyourĠdayĠgoing?", (0, 39)) + ] + self.check_equals(True, False, expected_result) diff --git a/faster_tokenizer/python/tests/test_faster_wordpiece.py b/fast_tokenizer/python/tests/test_fast_wordpiece.py similarity index 68% rename from faster_tokenizer/python/tests/test_faster_wordpiece.py rename to fast_tokenizer/python/tests/test_fast_wordpiece.py index dfc5fb651935..d0a98fed3e98 100644 --- a/faster_tokenizer/python/tests/test_faster_wordpiece.py +++ b/fast_tokenizer/python/tests/test_fast_wordpiece.py @@ -18,7 +18,7 @@ from paddlenlp.utils.log import logger from paddlenlp.transformers import AutoTokenizer from paddlenlp.datasets import load_dataset -from faster_tokenizer import ErnieFasterTokenizer, models +from fast_tokenizer import ErnieFastTokenizer, models logger.logger.setLevel('ERROR') @@ -26,20 +26,20 @@ class TestWordpiece(unittest.TestCase): def set_flag(self): - self.use_faster_wordpiece = False - self.use_faster_wordpiece_with_pretokenization = False + self.use_fast_wordpiece = False + self.use_fast_wordpiece_with_pretokenization = False def setUp(self): self.max_seq_length = 128 self.wordpiece_tokenizer = AutoTokenizer.from_pretrained("ernie-1.0") ernie_vocab = self.wordpiece_tokenizer.vocab.token_to_idx self.set_flag() - self.faster_wordpiece_tokenizer = ErnieFasterTokenizer( + self.fast_wordpiece_tokenizer = ErnieFastTokenizer( ernie_vocab, max_sequence_len=self.max_seq_length, - use_faster_wordpiece=self.use_faster_wordpiece, - use_faster_wordpiece_with_pretokenization=self. - use_faster_wordpiece_with_pretokenization) + use_fast_wordpiece=self.use_fast_wordpiece, + use_fast_wordpiece_with_pretokenization=self. 
+ use_fast_wordpiece_with_pretokenization) self.dataset = [ example["sentence"] for example in load_dataset('clue', 'tnews', splits=['train']) @@ -52,10 +52,10 @@ def test_encode(self): expected_input_ids = wordpiece_result['input_ids'] expected_token_type_ids = wordpiece_result['token_type_ids'] - faster_wordpiece_result = self.faster_wordpiece_tokenizer.encode( + fast_wordpiece_result = self.fast_wordpiece_tokenizer.encode( sentence) - actual_input_ids = faster_wordpiece_result.ids - actual_token_type_ids = faster_wordpiece_result.type_ids + actual_input_ids = fast_wordpiece_result.ids + actual_token_type_ids = fast_wordpiece_result.type_ids self.assertEqual(expected_input_ids, actual_input_ids) self.assertEqual(expected_token_type_ids, actual_token_type_ids) @@ -67,24 +67,24 @@ def test_get_offset_mapping(self): return_offsets_mapping=True) expected_offset_mapping = wordpiece_result['offset_mapping'] - faster_wordpiece_result = self.faster_wordpiece_tokenizer.encode( + fast_wordpiece_result = self.fast_wordpiece_tokenizer.encode( sentence) - actual_offset_mapping = faster_wordpiece_result.offsets + actual_offset_mapping = fast_wordpiece_result.offsets self.assertEqual(expected_offset_mapping, actual_offset_mapping) -class TestFasterWordpiece(TestWordpiece): +class TestFastWordpiece(TestWordpiece): def set_flag(self): - self.use_faster_wordpiece = True - self.use_faster_wordpiece_with_pretokenization = False + self.use_fast_wordpiece = True + self.use_fast_wordpiece_with_pretokenization = False -class TestFasterWordpieceWithPretokenization(TestWordpiece): +class TestFastWordpieceWithPretokenization(TestWordpiece): def set_flag(self): - self.use_faster_wordpiece = True - self.use_faster_wordpiece_with_pretokenization = True + self.use_fast_wordpiece = True + self.use_fast_wordpiece_with_pretokenization = True if __name__ == "__main__": diff --git a/faster_tokenizer/python/tests/test_tokenizer_json.py b/fast_tokenizer/python/tests/test_tokenizer_json.py similarity index 66% rename from faster_tokenizer/python/tests/test_tokenizer_json.py rename to fast_tokenizer/python/tests/test_tokenizer_json.py index 74b87ff0fcad..b15d2d415af4 100644 --- a/faster_tokenizer/python/tests/test_tokenizer_json.py +++ b/fast_tokenizer/python/tests/test_tokenizer_json.py @@ -17,8 +17,8 @@ import unittest from paddlenlp.utils.log import logger from paddlenlp.transformers import AutoTokenizer -import faster_tokenizer -from faster_tokenizer import ErnieFasterTokenizer, models +import fast_tokenizer +from fast_tokenizer import ErnieFastTokenizer, models logger.logger.setLevel('ERROR') @@ -28,63 +28,61 @@ class TestTokenizerJson(unittest.TestCase): def setUp(self): wordpiece_tokenizer = AutoTokenizer.from_pretrained("ernie-1.0") ernie_vocab = wordpiece_tokenizer.vocab.token_to_idx - self.faster_tokenizer = ErnieFasterTokenizer(ernie_vocab) + self.fast_tokenizer = ErnieFastTokenizer(ernie_vocab) class TestNormalizerJson(TestTokenizerJson): def check_normalizer_json(self, normalizer): - self.faster_tokenizer.normalizer = normalizer + self.fast_tokenizer.normalizer = normalizer json_file = str(normalizer.__class__) + ".json" - self.faster_tokenizer.save(json_file) - tokenizer = ErnieFasterTokenizer.from_file(json_file) + self.fast_tokenizer.save(json_file) + tokenizer = ErnieFastTokenizer.from_file(json_file) os.remove(json_file) self.assertEqual(normalizer.__getstate__(), tokenizer.normalizer.__getstate__()) def test_replace(self): - replace_normalizer = faster_tokenizer.normalizers.ReplaceNormalizer( + 
replace_normalizer = fast_tokenizer.normalizers.ReplaceNormalizer( "''", "\"") self.check_normalizer_json(replace_normalizer) def test_strip(self): - strip_normalizer = faster_tokenizer.normalizers.StripNormalizer( + strip_normalizer = fast_tokenizer.normalizers.StripNormalizer( True, True) self.check_normalizer_json(strip_normalizer) def test_strip_accent(self): - strip_normalizer = faster_tokenizer.normalizers.StripAccentsNormalizer() + strip_normalizer = fast_tokenizer.normalizers.StripAccentsNormalizer() self.check_normalizer_json(strip_normalizer) def test_nfc(self): - nfc_normalizer = faster_tokenizer.normalizers.NFCNormalizer() + nfc_normalizer = fast_tokenizer.normalizers.NFCNormalizer() self.check_normalizer_json(nfc_normalizer) def test_nfkc(self): - nfkc_normalizer = faster_tokenizer.normalizers.NFKCNormalizer() + nfkc_normalizer = fast_tokenizer.normalizers.NFKCNormalizer() self.check_normalizer_json(nfkc_normalizer) def test_nfd(self): - nfd_normalizer = faster_tokenizer.normalizers.NFDNormalizer() + nfd_normalizer = fast_tokenizer.normalizers.NFDNormalizer() self.check_normalizer_json(nfd_normalizer) def test_nfkd(self): - nfkd_normalizer = faster_tokenizer.normalizers.NFKDNormalizer() + nfkd_normalizer = fast_tokenizer.normalizers.NFKDNormalizer() self.check_normalizer_json(nfkd_normalizer) def test_nmt(self): - nmt_normalizer = faster_tokenizer.normalizers.NmtNormalizer() + nmt_normalizer = fast_tokenizer.normalizers.NmtNormalizer() self.check_normalizer_json(nmt_normalizer) def test_lowercase(self): - lowercase_normalizer = faster_tokenizer.normalizers.LowercaseNormalizer( - ) + lowercase_normalizer = fast_tokenizer.normalizers.LowercaseNormalizer() self.check_normalizer_json(lowercase_normalizer) def test_sequence(self): - lowercase_normalizer = faster_tokenizer.normalizers.LowercaseNormalizer( - ) - sequence_normalizer = faster_tokenizer.normalizers.SequenceNormalizer( + lowercase_normalizer = fast_tokenizer.normalizers.LowercaseNormalizer() + sequence_normalizer = fast_tokenizer.normalizers.SequenceNormalizer( normalizers=[lowercase_normalizer]) self.check_normalizer_json(sequence_normalizer) diff --git a/faster_tokenizer/run_build_cpp_lib.bat b/fast_tokenizer/run_build_cpp_lib.bat similarity index 100% rename from faster_tokenizer/run_build_cpp_lib.bat rename to fast_tokenizer/run_build_cpp_lib.bat diff --git a/faster_tokenizer/run_build_cpp_lib.sh b/fast_tokenizer/run_build_cpp_lib.sh similarity index 100% rename from faster_tokenizer/run_build_cpp_lib.sh rename to fast_tokenizer/run_build_cpp_lib.sh diff --git a/faster_tokenizer/run_build_py_lib.bat b/fast_tokenizer/run_build_py_lib.bat similarity index 100% rename from faster_tokenizer/run_build_py_lib.bat rename to fast_tokenizer/run_build_py_lib.bat diff --git a/faster_tokenizer/run_build_py_lib.sh b/fast_tokenizer/run_build_py_lib.sh similarity index 100% rename from faster_tokenizer/run_build_py_lib.sh rename to fast_tokenizer/run_build_py_lib.sh diff --git a/faster_tokenizer/setup.py b/fast_tokenizer/setup.py similarity index 78% rename from faster_tokenizer/setup.py rename to fast_tokenizer/setup.py index 7a2a897a1942..3f89179303cc 100644 --- a/faster_tokenizer/setup.py +++ b/fast_tokenizer/setup.py @@ -39,19 +39,19 @@ def finalize_options(self): if os.name != 'nt': - package_data = {"faster_tokenizer": ["core_tokenizers.so", "commit.log"]} - package_data['faster_tokenizer.libs'] = [] + package_data = {"fast_tokenizer": ["core_tokenizers.so", "commit.log"]} + package_data['fast_tokenizer.libs'] = [] else: 
package_data = { - "faster_tokenizer": + "fast_tokenizer": ["core_tokenizers.pyd", "core_tokenizers.lib", "commit.log"] } # Add icu dll - package_data['faster_tokenizer.libs'] = ["icuuc70.dll", "icudt70.dll"] + package_data['fast_tokenizer.libs'] = ["icuuc70.dll", "icudt70.dll"] def get_version(): - f = open(os.path.join("python", "faster_tokenizer", "__init__.py")) + f = open(os.path.join("python", "fast_tokenizer", "__init__.py")) lines = f.readlines() version = "" for line in lines: @@ -62,22 +62,22 @@ def get_version(): return version -long_description = "PaddleNLP Faster Tokenizer Library written in C++ " +long_description = "PaddleNLP Fast Tokenizer Library written in C++ " setup( - name="faster_tokenizer", + name="fast_tokenizer", version=get_version(), author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com", description=long_description, long_description=long_description, zip_safe=False, - url="https://github.com/PaddlePaddle/PaddleNLP/faster_tokenizer", + url="https://github.com/PaddlePaddle/PaddleNLP/fast_tokenizer", package_dir={"": "python"}, packages=[ - "faster_tokenizer", "faster_tokenizer.tokenizers_impl", - "faster_tokenizer.normalizers", "faster_tokenizer.pretokenizers", - "faster_tokenizer.models", "faster_tokenizer.postprocessors", - "faster_tokenizer.libs" + "fast_tokenizer", "fast_tokenizer.tokenizers_impl", + "fast_tokenizer.normalizers", "fast_tokenizer.pretokenizers", + "fast_tokenizer.models", "fast_tokenizer.postprocessors", + "fast_tokenizer.libs" ], package_data=package_data, extras_require={"test": ["pytest>=6.0"]}, diff --git a/faster_tokenizer/tools/codestyle/clang_format.hook b/fast_tokenizer/tools/codestyle/clang_format.hook similarity index 100% rename from faster_tokenizer/tools/codestyle/clang_format.hook rename to fast_tokenizer/tools/codestyle/clang_format.hook diff --git a/faster_tokenizer/tools/codestyle/copyright.hook b/fast_tokenizer/tools/codestyle/copyright.hook similarity index 100% rename from faster_tokenizer/tools/codestyle/copyright.hook rename to fast_tokenizer/tools/codestyle/copyright.hook diff --git a/faster_tokenizer/tools/codestyle/cpplint_pre_commit.hook b/fast_tokenizer/tools/codestyle/cpplint_pre_commit.hook similarity index 100% rename from faster_tokenizer/tools/codestyle/cpplint_pre_commit.hook rename to fast_tokenizer/tools/codestyle/cpplint_pre_commit.hook diff --git a/faster_tokenizer/tools/codestyle/pylint_pre_commit.hook b/fast_tokenizer/tools/codestyle/pylint_pre_commit.hook similarity index 100% rename from faster_tokenizer/tools/codestyle/pylint_pre_commit.hook rename to fast_tokenizer/tools/codestyle/pylint_pre_commit.hook diff --git a/faster_tokenizer/README.md b/faster_tokenizer/README.md deleted file mode 100644 index 6747ff743580..000000000000 --- a/faster_tokenizer/README.md +++ /dev/null @@ -1,105 +0,0 @@ -# FasterTokenizer - ------------------------------------------------------------------------------------------- - -
-FasterTokenizer是一款简单易用、功能强大的跨平台高性能文本预处理库,集成业界多个常用的Tokenizer实现,支持不同NLP场景下的文本预处理功能,如文本分类、阅读理解,序列标注等。结合PaddleNLP Tokenizer模块,为用户在训练、推理阶段提供高效通用的文本预处理能力。 - -## 特性 - -- 高性能。由于底层采用C++实现,所以其性能远高于目前常规Python实现的Tokenizer。在文本分类任务上,FasterTokenizer对比Python版本Tokenizer加速比最高可达20倍。 -- 跨平台。FasterTokenizer可在不同的系统平台上使用,目前已支持Windows x64,Linux x64以及MacOS 10.14+平台上使用。 -- 多编程语言支持。FasterTokenizer提供在C++、Python语言上开发的能力。 -- 灵活性强。用户可以通过指定不同的FasterTokenizer组件定制满足需求的Tokenizer。 - -## 快速开始 - -下面将介绍Python版本FasterTokenizer的使用方式,C++版本的使用方式可参考[FasterTokenizer C++ Demo](./faster_tokenizer/demo/README.md)。 - -### 前置依赖 - -- Windows 64位系统 -- Linux x64系统 -- MacOS 10.14+系统(m1芯片的MacOS,需要使用x86_64版本的Anaconda作为python环境方可安装使用) -- Python 3.6 ~ 3.9 - -### 安装FasterTokenizer - -```python -pip install faster_tokenizer -``` - -### FasterTokenizer使用示例 - -- 准备词表 - -```shell -# Linux或者Mac用户可直接执行以下命令下载测试的词表,Windows 用户可在浏览器上下载到本地。 -wget https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt -``` - -- 切词示例 - -FasterTokenizer库内置NLP任务常用的Tokenizer,如ErnieFasterTokenizer。下面将展示FasterTokenizer的简单用法。 - -```python -from faster_tokenizer import ErnieFasterTokenizer, models -# 1. 加载词表 -vocab = models.WordPiece.read_file("ernie_vocab.txt") -# 2. 实例化ErnieFasterTokenizer对象 -faster_tokenizer = ErnieFasterTokenizer(vocab) -# 3. 切词 -output = faster_tokenizer.encode("我爱中国") -# 4. 输出结果 -print("ids: ", output.ids) -print("type_ids: ", output.type_ids) -print("tokens: ", output.tokens) -print("offsets: ", output.offsets) -print("attention_mask: ", output.attention_mask) -``` - -### FasterTokenizer在PaddleNLP Tokenizer模块加速示例 - -PaddleNLP Tokenizer模块可简单地应用在模型训练以及推理部署的文本预处理阶段,并通过`AutoTokenizer.from_pretrained`方式实例化相应的Tokenizer。其中`AutoTokenizer`默认加载得到的Tokenizer是常规Python实现的Tokenizer,其性能会低于C++实现的FasterTokenizer。为了提升PaddleNLP Tokenizer模块性能,目前PaddleNLP Tokenizer模块已经支持使用FasterTokenizer作为Tokenizer的后端加速切词阶段。在现有的Tokenizer加载接口中,仅需添加`use_faster=True`这一关键词参数,其余代码保持不变,即可加载Faster版本的Tokenizer,代码示例如下: - -```python -from paddlenlp.transformers import AutoTokenizer - -# 默认加载Python版本的Tokenizer -tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') -# 打开use_faster开关,可加载Faster版本Tokenizer -faster_tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh', use_faster=True) - -text1 = tokenizer('自然语言处理') -text2 = faster_tokenizer('自然语言处理') - -print(text1) -print(text2) -``` - -目前PaddleNLP已支持BERT、ERNIE、TinyBERT以及ERNIE-M 4种Tokenizer的Faster版本,其余模型的Tokenizer暂不支持Faster版本。 - -## FAQ - -Q:我在AutoTokenizer.from_pretrained接口上已经打开`use_faster=True`开关,为什么文本预处理阶段性能上好像没有任何变化? - -A:在有三种情况下,打开`use_faster=True`开关可能无法提升性能: - 1. 没有安装faster_tokenizer。若在没有安装faster_tokenizer库的情况下打开`use_faster`开关,PaddleNLP会给出以下warning:"Can't find the faster_tokenizer package, please ensure install faster_tokenizer correctly. "。 - - 2. 加载的Tokenizer类型暂不支持Faster版本。目前支持4种Tokenizer的Faster版本,分别是BERT、ERNIE、TinyBERT以及ERNIE-M Tokenizer。若加载不支持Faster版本的Tokenizer情况下打开`use_faster`开关,PaddleNLP会给出以下warning:"The tokenizer XXX doesn't have the faster version. Please check the map paddlenlp.transformers.auto.tokenizer.FASTER_TOKENIZER_MAPPING_NAMES to see which faster tokenizers are currently supported." - - 3. 
待切词文本长度过短(如文本平均长度小于5)。这种情况下切词开销可能不是整个文本预处理的性能瓶颈,导致在使用FasterTokenizer后仍无法提升整体性能。 - -## 相关文档 - -[FasterTokenizer编译指南](docs/compile/README.md) diff --git a/faster_tokenizer/faster_tokenizer/demo/CMakeLists.txt b/faster_tokenizer/faster_tokenizer/demo/CMakeLists.txt deleted file mode 100644 index 54cf9d3e1acb..000000000000 --- a/faster_tokenizer/faster_tokenizer/demo/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -cmake_minimum_required(VERSION 3.10) -project(cpp_faster_tokenizer_demo CXX C) - -option(FASTER_TOKENIZER_INSTALL_DIR "Path of downloaded faster_tokenizer sdk.") - -# Download ernie vocab for demo -set(ERNIE_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/ernie_vocab.txt) -if (EXISTS ${ERNIE_VOCAB_PATH}) -message("The ${ERNIE_VOCAB_PATH} exists already.") -else() -file(DOWNLOAD "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt" ${ERNIE_VOCAB_PATH} SHOW_PROGRESS) -message("Already download the vocab.txt of ernie to ${CMAKE_CURRENT_BINARY_DIR} for demo.") -endif() - -include(${FASTER_TOKENIZER_INSTALL_DIR}/FasterTokenizer.cmake) - -include_directories(${FASTER_TOKENIZER_INCS}) - -add_executable(ernie_faster_tokenizer_demo ${PROJECT_SOURCE_DIR}/ernie_faster_tokenizer_demo.cc) -target_link_libraries(ernie_faster_tokenizer_demo ${FASTER_TOKENIZER_LIBS}) diff --git a/faster_tokenizer/faster_tokenizer/normalizers/normalizers.h b/faster_tokenizer/faster_tokenizer/normalizers/normalizers.h deleted file mode 100644 index 151508f741a9..000000000000 --- a/faster_tokenizer/faster_tokenizer/normalizers/normalizers.h +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "faster_tokenizer/normalizers/bert.h" -#include "faster_tokenizer/normalizers/normalizer.h" -#include "faster_tokenizer/normalizers/precompiled.h" -#include "faster_tokenizer/normalizers/replace.h" -#include "faster_tokenizer/normalizers/strip.h" -#include "faster_tokenizer/normalizers/unicode.h" -#include "faster_tokenizer/normalizers/utils.h" diff --git a/faster_tokenizer/faster_tokenizer/test/CMakeLists.txt b/faster_tokenizer/faster_tokenizer/test/CMakeLists.txt deleted file mode 100644 index 2535b036b22c..000000000000 --- a/faster_tokenizer/faster_tokenizer/test/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -if(WITH_TESTING) -cc_library(tokenizers_gtest_main SRCS gtest_main.cc DEPS gtest gflags) - -# Test Normalizers modules -cc_test(test_normalizer SRCS test_normalizer.cc DEPS normalizers) -cc_test(test_unicode SRCS test_unicode.cc DEPS normalizers) -cc_test(test_replace SRCS test_replace.cc DEPS normalizers) -cc_test(test_strip SRCS test_strip.cc DEPS normalizers) -cc_test(test_utils SRCS test_utils.cc DEPS normalizers) - -# Test PreTokenizers modules -cc_test(test_whitespace SRCS test_whitespace.cc DEPS pretokenizers) -cc_test(test_bert_pretokenizer SRCS test_bert_pretokenizer.cc DEPS pretokenizers) - -# Test Model -cc_test(test_wordpiece SRCS test_wordpiece.cc DEPS models) -cc_test(test_faster_wordpiece SRCS test_faster_wordpiece.cc DEPS models) - -# Download ernie vocab for test -set(ERNIE_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/ernie_vocab.txt) -if (EXISTS ${ERNIE_VOCAB_PATH}) -message("The ${ERNIE_VOCAB_PATH} exists already.") -else() -file(DOWNLOAD "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt" ${ERNIE_VOCAB_PATH} SHOW_PROGRESS) -message("Already download the vocab.txt of ernie to ${CMAKE_CURRENT_BINARY_DIR} for test.") -endif() - -# Test Tokenizer -cc_test(test_bert_tokenizer SRCS test_bert_tokenizer.cc DEPS normalizers pretokenizers models postprocessors tokenizer) - -if(NOT WITH_PYTHON) -cc_test(test_ernie_faster_tokenizer SRCS test_ernie_faster_tokenizer.cc DEPS normalizers pretokenizers models postprocessors tokenizer core_tokenizers) -endif() - -endif() diff --git a/faster_tokenizer/faster_tokenizer/tokenizers/ernie_faster_tokenizer.cc b/faster_tokenizer/faster_tokenizer/tokenizers/ernie_faster_tokenizer.cc deleted file mode 100644 index 53a4541ab011..000000000000 --- a/faster_tokenizer/faster_tokenizer/tokenizers/ernie_faster_tokenizer.cc +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h" -#include "faster_tokenizer/core/encoding.h" -#include "faster_tokenizer/models/models.h" -#include "faster_tokenizer/normalizers/normalizers.h" -#include "faster_tokenizer/postprocessors/postprocessors.h" -#include "faster_tokenizer/pretokenizers/pretokenizers.h" -#include "faster_tokenizer/utils/utils.h" -#include "glog/logging.h" - -namespace paddlenlp { -namespace faster_tokenizer { -namespace tokenizers_impl { - -ErnieFasterTokenizer::ErnieFasterTokenizer(const std::string& vocab_path, - const std::string& unk_token, - const std::string& sep_token, - const std::string& cls_token, - const std::string& pad_token, - const std::string& mask_token, - bool clean_text, - bool handle_chinese_chars, - bool strip_accents, - bool lowercase, - const std::string& wordpieces_prefix, - uint32_t max_sequence_len) { - core::Vocab vocab; - utils::GetVocabFromFiles(vocab_path, &vocab); - VLOG(6) << "The vocab size of ErnieFasterTokenizer is " << vocab.size(); - Init(vocab, - unk_token, - sep_token, - cls_token, - pad_token, - mask_token, - clean_text, - handle_chinese_chars, - strip_accents, - lowercase, - wordpieces_prefix, - max_sequence_len); -} - - -ErnieFasterTokenizer::ErnieFasterTokenizer(const core::Vocab& vocab, - const std::string& unk_token, - const std::string& sep_token, - const std::string& cls_token, - const std::string& pad_token, - const std::string& mask_token, - bool clean_text, - bool handle_chinese_chars, - bool strip_accents, - bool lowercase, - const std::string& wordpieces_prefix, - uint32_t max_sequence_len) { - Init(vocab, - unk_token, - sep_token, - cls_token, - pad_token, - mask_token, - clean_text, - handle_chinese_chars, - strip_accents, - lowercase, - wordpieces_prefix, - max_sequence_len); -} - - -void ErnieFasterTokenizer::Init(const core::Vocab& vocab, - const std::string& unk_token, - const std::string& sep_token, - const std::string& cls_token, - const std::string& pad_token, - const std::string& mask_token, - bool clean_text, - bool handle_chinese_chars, - bool strip_accents, - bool lowercase, - const std::string& wordpieces_prefix, - uint32_t max_sequence_len) { - models::FasterWordPiece wordpiece(vocab, - unk_token, - 100 /* max_input_chars_per_word */, - wordpieces_prefix, - true); - this->SetModel(wordpiece); - - std::vector added_tokens; - uint32_t id; - if (!this->TokenToId(unk_token, &id)) { - added_tokens.emplace_back(unk_token, true); - } - if (!this->TokenToId(sep_token, &id)) { - added_tokens.emplace_back(sep_token, true); - } - if (!this->TokenToId(cls_token, &id)) { - added_tokens.emplace_back(cls_token, true); - } - if (!this->TokenToId(pad_token, &id)) { - added_tokens.emplace_back(pad_token, true); - } - if (!this->TokenToId(mask_token, &id)) { - added_tokens.emplace_back(mask_token, true); - } - this->AddSpecialTokens(added_tokens); - - - normalizers::BertNormalizer bert_normalizer( - clean_text, handle_chinese_chars, strip_accents, lowercase); - this->SetNormalizer(bert_normalizer); - - if (vocab.size() > 0) { - uint32_t sep_id, cls_id; - if (!this->TokenToId(sep_token, &sep_id)) { - throw std::invalid_argument("sep_token not found in the vocabulary"); - } - if (!this->TokenToId(cls_token, &cls_id)) { - throw std::invalid_argument("cls_token not found in the vocabulary"); - } - postprocessors::BertPostProcessor bert_postprocessor({sep_token, sep_id}, - {cls_token, cls_id}); - this->SetPostProcessor(bert_postprocessor); - } - if (max_sequence_len == 0) { - 
this->DisableTruncMethod(); - } else { - this->EnableTruncMethod(max_sequence_len, - 0, - core::Direction::RIGHT, - core::TruncStrategy::LONGEST_FIRST); - } -} - -} // namespace tokenizers_impl -} // namespace faster_tokenizer -} // namespace paddlenlp diff --git a/faster_tokenizer/perf/run_all_perf.sh b/faster_tokenizer/perf/run_all_perf.sh deleted file mode 100644 index 88114819e443..000000000000 --- a/faster_tokenizer/perf/run_all_perf.sh +++ /dev/null @@ -1,12 +0,0 @@ -# !/bin/sh -for seq_len in 32 64 128 256 512; do -for batch_size in 1 2 4 8 16 32 64; do -mkdir -p seq_len_$seq_len/batch_size_$batch_size -for thread_num in 1 2 4 8 16 32 64; do -echo "Experiment setting: thread_num=$thread_num, batch_size=$batch_size, sequence_length=$seq_len" -export OMP_NUM_THREADS=$thread_num -export RAYON_RS_NUM_CPUS=$thread_num -python perf.py --batch_size $batch_size --max_seq_length $seq_len >seq_len_$seq_len/batch_size_$batch_size/parallel$thread_num.log 2>nohup.out -done -done -done \ No newline at end of file diff --git a/model_zoo/ernie-1.0/run_pretrain_trainer.py b/model_zoo/ernie-1.0/run_pretrain_trainer.py index 7f0b39d3889a..9eb2055a5e63 100644 --- a/model_zoo/ernie-1.0/run_pretrain_trainer.py +++ b/model_zoo/ernie-1.0/run_pretrain_trainer.py @@ -358,7 +358,7 @@ def main(): logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint. diff --git a/model_zoo/ernie-m/README.md b/model_zoo/ernie-m/README.md index 78c70680d960..27cca57e747e 100644 --- a/model_zoo/ernie-m/README.md +++ b/model_zoo/ernie-m/README.md @@ -15,7 +15,7 @@ - **Back-Translation masked language modeling(BTMLM)**: 该方法基于回译机制从单语语料中学习语言间的对齐关系。通过CAMLM 生成伪平行语料,然后让模型学习生成的伪平行句子,使模型可以利用单语语料更好地建模语义对齐关系。 -![framework](./framework.png) +![framework](https://user-images.githubusercontent.com/40912707/201308423-bf4f0100-3ada-4bae-89d5-b07ffec1e2c0.png) 本项目是 ERNIE-M 的 PaddlePaddle 动态图实现, 包含模型训练,模型验证等内容。以下是本例的简要目录结构及说明: diff --git a/model_zoo/ernie-m/framework.png b/model_zoo/ernie-m/framework.png deleted file mode 100644 index 4b09bd153958..000000000000 Binary files a/model_zoo/ernie-m/framework.png and /dev/null differ diff --git a/model_zoo/uie/README.md b/model_zoo/uie/README.md index e20380370ad5..ea2afe215d25 100644 --- a/model_zoo/uie/README.md +++ b/model_zoo/uie/README.md @@ -855,8 +855,6 @@ python evaluate.py \ pip install -r deploy/python/requirements_cpu.txt ``` - ```text - - GPU端 为了在 GPU 上获得最佳的推理性能和稳定性,请先确保机器已正确安装 NVIDIA 相关驱动和基础软件,确保 **CUDA >= 11.2,cuDNN >= 8.1.1**,并使用以下命令安装所需依赖 diff --git a/model_zoo/uie/evaluate.py b/model_zoo/uie/evaluate.py index 3527a4b7e377..c88d748e5735 100644 --- a/model_zoo/uie/evaluate.py +++ b/model_zoo/uie/evaluate.py @@ -45,7 +45,7 @@ def evaluate(model, metric, data_loader, multilingual=False): else: start_prob, end_prob = model(batch["input_ids"], batch["token_type_ids"], - batch["att_mask"], batch["pos_ids"]) + batch["pos_ids"], batch["att_mask"]) start_ids = paddle.cast(batch["start_positions"], 'float32') end_ids = paddle.cast(batch["end_positions"], 'float32') diff --git a/paddlenlp/prompt/prompt_args.py b/paddlenlp/prompt/prompt_args.py index 7c8ca507c82f..fcab68b63754 100644 --- a/paddlenlp/prompt/prompt_args.py +++ b/paddlenlp/prompt/prompt_args.py 
@@ -91,7 +91,7 @@ class PromptTuningArguments(TrainingArguments): metadata={"help": "Epsilon for the AdamW optimizer of prompt."}) def __post_init__(self): - super().__post_init__() + super(PromptTuningArguments, self).__post_init__() if self.use_rgl and self.alpha_rgl == 0.0: logger.warning("Ignore `use_rgl` because `alpha_rgl` = 0. Please "\ "set `alpha_rgl` a positive float to use RGL loss.") diff --git a/paddlenlp/prompt/prompt_tokenizer.py b/paddlenlp/prompt/prompt_tokenizer.py index efb2e80bce40..46af802586e1 100644 --- a/paddlenlp/prompt/prompt_tokenizer.py +++ b/paddlenlp/prompt/prompt_tokenizer.py @@ -16,122 +16,194 @@ import warnings from functools import partial from collections import defaultdict +from typing import Any, Dict, List, Union import numpy as np -from .prompt_utils import InputFeatures - __all__ = ["MLMPromptTokenizer"] class MLMPromptTokenizer(object): - def __init__(self, tokenizer, max_seq_length, **kwargs): - self._tokenizer = tokenizer - self._max_seq_len = max_seq_length - self._num_special_tokens = self._tokenizer.num_special_tokens_to_add() - self._special_map = { - "": "cls_token", - "": "sep_token", - "": "pad_token", - "": "unk_token", - "": "mask_token" - } - self.mask_token_id = self._tokenizer.mask_token_id - self.pad_token_id = self._tokenizer.pad_token_id - self.soft_token_id = self._tokenizer.unk_token_id - - def __call__(self, input_list): - encoded_input = defaultdict(list) - - for input_dict in input_list: - # Format text and special tokens, then convert them to ids. - if input_dict["mask_ids"] == 1: - text = [self.mask_token_id] - - if input_dict["text"] in self._special_map: - special_token = getattr(self._tokenizer, - self._special_map[input_dict["text"]]) - input_dict["text"] = special_token - - soft_ids = input_dict.get("soft_token_ids", None) - if soft_ids is not None and soft_ids == 1: - text = [self.soft_token_id] - else: - text = self._tokenizer.encode( - input_dict["text"], + omask_token = "[O-MASK]" + + def __init__(self, tokenizer, max_length): + self.tokenizer = tokenizer + self.max_length = max_length + + def __call__(self, inputs: List[Dict[str, Any]]): + part_text = [part["text"] for part in inputs] + part_do_truncate = [part["do_truncate"] for part in inputs] + max_lengths = self._create_max_lengths_from_do_truncate( + part_text, part_do_truncate) + + encoded_inputs = defaultdict(list) + option_length = None + last_position = 1 # Id 0 denotes special token '[CLS]'. + last_token_type = 0 + for index, part in enumerate(inputs): + # Create input_ids. + soft_token_ids = part.get("soft_tokens", None) + if soft_token_ids is None or len( + soft_token_ids) == 1 and soft_token_ids[0] == 0: + input_ids = self.tokenizer.encode( + part["text"], add_special_tokens=False, - return_token_type_ids=False)["input_ids"] - encoded_input["input_ids"].append(text) - - # Extend other features as the same length of input ids. 
- for key in input_dict: - if key != "text": - encoded_input[key].append([input_dict[key]] * len(text)) - - max_seq_len = self._max_seq_len - self._num_special_tokens - encoded_input = self.truncate(encoded_input, max_seq_len) - encoded_input.pop("shortenable_ids") - encoded_input = self.join(encoded_input) - - encoded_input = self.add_special_tokens(encoded_input) - encoded_input = self.pad(encoded_input, self._max_seq_len, - self.pad_token_id) - return encoded_input - - def add_special_tokens(self, input_dict): + return_token_type_ids=False, + truncation=True, + max_length=max_lengths[index])["input_ids"] + encoded_inputs["soft_token_ids"].append([0] * len(input_ids)) + else: + input_ids = soft_token_ids + encoded_inputs["soft_token_ids"].append(soft_token_ids) + encoded_inputs["input_ids"].append(input_ids) + part_length = len(input_ids) + + # Create position_ids. + position_ids, last_position = self._create_position_ids_from_part( + input_ids, part, last_position) + encoded_inputs["position_ids"].append(position_ids) + + # Create token_type_ids. + if "token_types" in part: + last_token_type = part["token_types"] + encoded_inputs["token_type_ids"].append([last_token_type] * + part_length) + + # Create other features like encoder_ids. + for name in part: + if name not in [ + "text", "soft_tokens", "positions", "token_types" + ]: + encoded_inputs[name].append([part[name]] * part_length) + + # Record the length of options if exists. + if self.omask_token in part["text"]: + if option_length is not None: + raise ValueError( + "There are more than one sequence of options, which " + "will cause wrong attention masks.") + option_length = len(input_ids) + + encoded_inputs.pop("do_truncate") + encoded_inputs = self.join(encoded_inputs) + encoded_inputs = self.add_special_tokens(encoded_inputs) + attention_mask = self._create_attention_mask( + encoded_inputs["input_ids"], option_length) + if attention_mask is not None: + encoded_inputs["attention_mask"] = attention_mask + masked_positions = self._create_masked_positions( + encoded_inputs["input_ids"], encoded_inputs["soft_token_ids"]) + if masked_positions is not None: + encoded_inputs["masked_positions"] = masked_positions + return encoded_inputs + + def _create_position_ids_from_part(self, input_ids: List[int], + part: Dict[str, + Any], last_position: int): + """ + Create position ids from prompt for each part. + """ + part_length = len(input_ids) + if "positions" in part and part["positions"] > 0: + last_position = part["positions"] + if self.omask_token in part["text"]: + omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token) + omask_index = [ + x for x in range(part_length) if input_ids[x] == omask_id + ] + omask_index = [0] + omask_index + position_ids = [] + max_index = 0 + for start_id, end_id in zip(omask_index[:-1], omask_index[1:]): + position_ids.extend( + list(range(last_position, + last_position + end_id - start_id))) + max_index = max(end_id - start_id, max_index) + if len(position_ids) < part_length: + difference = part_length - len(position_ids) + position_ids.extend( + range(last_position, last_position + difference)) + max_index = max(difference, max_index) + last_position += max_index + else: + position_ids = list( + range(last_position, last_position + part_length)) + last_position += part_length + return position_ids, last_position + + def _create_max_lengths_from_do_truncate(self, part_text: List[str], + part_do_truncate: List[bool]): + """ + Create the max sequence length of each part. 
+ """ + text_length = sum([len(x) for x in part_text]) + if text_length < self.max_length: + return [None] * len(part_text) + + num_special_token = self.tokenizer.num_special_tokens_to_add() + cut_length = text_length - self.max_length + num_special_token + max_lengths = [] + if self.tokenizer.truncation_side == "right": + for index, part in enumerate(part_text[::-1]): + if part_do_truncate[-1 - index] and cut_length > 0: + max_lengths.append(max(len(part) - cut_length, 0)) + cut_length = cut_length - len(part) + else: + max_lengths.append(None) + max_lengths = max_lengths[::-1] + else: + for index, part in enumerate(text): + if part_do_truncate[index] and cut_length > 0: + max_lengths.append(max(len(part) - cut_length, 0)) + cut_length = cut_length - len(part) + else: + max_lengths.append(None) + return max_lengths + + def _create_attention_mask(self, input_ids: List[int], + option_length: Union[int, None]): + if option_length is None: + return None + omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token) + input_ids = np.array(input_ids) + attention_mask = np.zeros([len(input_ids), len(input_ids)]) + pad_index = np.where(input_ids == self.tokenizer.pad_token_id)[0] + attention_mask[:, pad_index] = 1 + attention_mask[pad_index, :] = 1 + omask_index = np.where(input_ids == omask_id)[0].tolist() + opt_begin, opt_end = omask_index[0], omask_index[0] + option_length + attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 1 + omask_index.append(opt_end) + for opt_begin, opt_end in zip(omask_index[:-1], omask_index[1:]): + attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 0 + attention_mask = (1 - attention_mask) * -1e4 + return attention_mask + + def _create_masked_positions(self, input_ids: List[int], + soft_token_ids: List[int]): + non_soft_ids = np.array(input_ids) * (np.array(soft_token_ids) == 0) + mask_id = self.tokenizer.mask_token_id + + masked_positions = np.where(non_soft_ids == mask_id)[0] + if masked_positions.shape[0] == 0: + return None + return masked_positions.tolist() + + def add_special_tokens(self, input_dict: Dict[str, Any]): for key in input_dict: - new_inputs = self._tokenizer.build_inputs_with_special_tokens( + new_inputs = self.tokenizer.build_inputs_with_special_tokens( input_dict[key]) if key != "input_ids": special_mask = np.array( - self._tokenizer.get_special_tokens_mask(input_dict[key])) + self.tokenizer.get_special_tokens_mask(input_dict[key])) new_inputs = np.array(new_inputs) + # TODO (Huijuan): Use different ids according to specific keyword. 
new_inputs[special_mask == 1] = 0 new_inputs = new_inputs.tolist() input_dict[key] = new_inputs return input_dict - @staticmethod - def truncate(input_dict, max_seq_len): - total_tokens = sum([len(text) for text in input_dict["input_ids"]]) - trunc_length = total_tokens - max_seq_len - if trunc_length > 0: - truncated_dict = defaultdict(list) - trunc_mask = input_dict["shortenable_ids"] - for key in input_dict: - content = input_dict[key] - count = trunc_length - for idx, text in enumerate(content[::-1]): - index = -idx - 1 - if len(text) == 0 or trunc_mask[index][0] == 0: - continue - if count < len(text): - content[index] = text[:-count] - else: - content[index] = [] - count -= len(text) - if count <= 0: - break - truncated_dict[key] = content - return truncated_dict - else: - return input_dict - - @staticmethod - def pad(input_dict, max_seq_len, pad_id, other_pad_id=0): - for key, content in input_dict.items(): - if len(content) > max_seq_len: - raise ValueError( - f"Truncated length of {key} is still longer than " - f"{max_seq_len}, please use a shorter prompt.") - if key == "input_ids": - pad_seq = [pad_id] * (max_seq_len - len(content)) - else: - pad_seq = [other_pad_id] * (max_seq_len - len(content)) - input_dict[key].extend(pad_seq) - return input_dict - @staticmethod def join(input_dict): for key in input_dict: diff --git a/paddlenlp/prompt/prompt_trainer.py b/paddlenlp/prompt/prompt_trainer.py index 2ffd2d9fd58e..e46d8f810ed4 100644 --- a/paddlenlp/prompt/prompt_trainer.py +++ b/paddlenlp/prompt/prompt_trainer.py @@ -18,25 +18,22 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle.static import InputSpec from ..datasets import MapDataset from ..utils.log import logger from ..trainer import Trainer, TrainerCallback -from ..trainer.trainer_utils import EvalPrediction +from ..trainer.trainer_utils import EvalPrediction, get_scheduler from ..data import DataCollator from ..losses import RDropLoss from ..transformers import PretrainedTokenizer, export_model from .template import AutoTemplate from .verbalizer import SoftVerbalizer -from .prompt_utils import InputFeatures, signature +from .prompt_utils import signature, PromptDataCollatorWithPadding from .prompt_args import PromptTuningArguments __all__ = ["PromptTrainer", "PromptModelForSequenceClassification"] -PROMPT_NAME = "prompt.pdparams" - class PromptTrainer(Trainer): """ @@ -67,18 +64,20 @@ def __init__(self, args = PromptTuningArguments(output_dir=output_dir) if data_collator is None: - data_collator = InputFeatures.collate_fn - - super().__init__(model=model, - criterion=criterion, - args=args, - data_collator=data_collator, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - compute_metrics=compute_metrics, - callbacks=callbacks, - optimizers=optimizers) + data_collator = PromptDataCollatorWithPadding(tokenizer, + padding=True, + return_tensors='pd') + + super(PromptTrainer, self).__init__(model=model, + criterion=criterion, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers) self.load_state_dict_from_checkpoint(args.resume_from_checkpoint) @@ -88,86 +87,74 @@ def __init__(self, if self.args.use_rdrop: self.rdrop_criterion = RDropLoss() + def _get_model(self): + model = self.model + if isinstance(model, paddle.DataParallel): + model = model._layers + return model + @property def template(self): - return 
self.model.template + return self._get_model().template @template.setter def template(self, template): - self.model.template = template + self._get_model().template = template @property def verbalizer(self): - return getattr(self.model, "verbalizer", None) + return self._get_model().verbalizer @verbalizer.setter def verbalizer(self, verbalizer): - setattr(self.model, "verbalizer", verbalizer) + self._get_model().verbalizer = verbalizer @property - def plm(self): - return self.model.plm + def pretrained_model(self): + self._set_model_attributes(self.model, "plm") - @plm.setter - def plm(self, model): - self.model.plm = model + @pretrained_model.setter + def pretrained_model(self, model): + self._set_model_attributes(self.model, "plm", model) - def _map_dataset(self, dataset): + def _map_dataset(self, dataset: MapDataset): if dataset is None: return None if not isinstance(dataset, MapDataset): raise ValueError("Expected `MapDataset` but received {}.".format( type(dataset))) - return dataset.map(self._convert_example) - def _convert_example(self, example): - encoded_inputs = self.template.wrap_one_example(example) - return encoded_inputs + def encode_with_template(example): + return self.template(example) - def _prepare_input(self, inputs: InputFeatures): + return dataset.map(encode_with_template) + + def _prepare_input(self, inputs: Dict): return inputs - def _save(self, output_dir: Optional[str] = None, state_dict=None): - super()._save(output_dir, state_dict) + def _save(self, + output_dir: Optional[str] = None, + state_dict: Dict[str, Any] = None): + super(PromptTrainer, self)._save(output_dir, state_dict) output_dir = output_dir if output_dir is not None else self.args.output_dir if self.template: - self.template.save_to(output_dir) - if self.verbalizer: - self.verbalizer.save_to(output_dir) + self.template.save(output_dir) + if self.verbalizer is not None: + self.verbalizer.save(output_dir) - def load_state_dict_from_checkpoint(self, resume_from_checkpoint=None): + def load_state_dict_from_checkpoint( + self, resume_from_checkpoint: os.PathLike = None): if resume_from_checkpoint is not None: - self.template = AutoTemplate.load_from( - resume_from_checkpoint, self.tokenizer, - self.args.max_seq_length, self.plm, - getattr(self.template, "_prompt_encoder", None), - getattr(self.template, "encoder_hidden_size", None)) - - super().load_state_dict_from_checkpoint(resume_from_checkpoint) - - def get_eval_dataloader(self, eval_dataset=None): - """ - Return the evaluation [`~paddle.io.DataLoader`]. - - Args: - eval_dataset (`paddlenlp.datasets.MapDataset`): - Created by `paddlenlp.prompt.load_dataset`, - where every item is an InputExample object. - """ - eval_dataset = self._map_dataset(eval_dataset) - return super().get_eval_dataloader(eval_dataset) + self.template = AutoTemplate.load_from(resume_from_checkpoint, + self.tokenizer, + self.args.max_seq_length, + self._get_model()) + super(PromptTrainer, + self).load_state_dict_from_checkpoint(resume_from_checkpoint) def get_test_dataloader(self, test_dataset): - """ - Return the test [`~paddle.io.DataLoader`]. - - Args: - test_dataset (`paddlenlp.datasets.MapDataset`): - The test dataset created by `paddlenlp.prompt.load_dataset`, - where every item is an InputExample object. 
- """ test_dataset = self._map_dataset(test_dataset) - return super().get_test_dataloader(test_dataset) + return super(PromptTrainer, self).get_test_dataloader(test_dataset) def create_optimizer(self, lr_scheduler=None): """ @@ -178,20 +165,21 @@ def create_optimizer(self, lr_scheduler=None): self.args) plm_parameters = [] - if not self.model.freeze_plm: + if not self.args.freeze_plm: plm_parameters.extend([ - p for p in self.model.plm.parameters() + p for p in self._get_model().plm.parameters() if not p.stop_gradient ]) ppt_parameters = [] if self.template is not None: ppt_parameters.extend([ - x for x in self.template.parameters() if not x.stop_gradient + x for n, x in self.template.named_parameters() + if not x.stop_gradient ]) if self.verbalizer is not None: if isinstance(self.verbalizer, SoftVerbalizer): - if not self.model.freeze_plm: + if not self.args.freeze_plm: plm_parameters.extend([ p for n, p in self.verbalizer.non_head_parameters() if not p.stop_gradient @@ -203,7 +191,7 @@ def create_optimizer(self, lr_scheduler=None): [p for n, p in self.verbalizer.parameters()]) decay_parameters = [ - p.name for n, p in self.model.named_parameters() + p.name for n, p in self._get_model().named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] apply_decay_param_fun = lambda x: x in decay_parameters @@ -225,7 +213,16 @@ def create_optimizer(self, lr_scheduler=None): else: params = plm_parameters else: - lr = self.args.ppt_learning_rate + args = self.init_num_steps(self.args, len(self.train_dataset)) + warmup = args.warmup_steps if args.warmup_steps > 0 else int( + args.warmup_ratio * args.num_training_steps) + self.lr_scheduler = get_scheduler( + args.lr_scheduler_type, + learning_rate=self.args.ppt_learning_rate, + num_warmup_steps=warmup, + num_training_steps=args.num_training_steps, + ) + lr = self.lr_scheduler params = ppt_parameters self.optimizer = optim_cls( @@ -243,20 +240,20 @@ def compute_loss(self, model, inputs, return_outputs=False): Compute the total loss for every batch. 
""" if "labels" not in inputs: - raise ValueError("Fail to compute loss as there are no labels "\ - "in {}.".format(inputs)) + raise ValueError( + "Fail to compute loss as `labels` not in {}.".format(inputs)) labels = inputs["labels"] - soft_token_ids = inputs.get("soft_token_ids", None) - outputs, hidden_states = model(inputs["input_ids"], - inputs["mask_ids"], - soft_token_ids, - return_hidden_states=True) + input_dict = inputs.copy() + input_dict["return_hidden_states"] = True + outputs, hidden_states = model(**input_dict) + if self.criterion is not None: loss = self.criterion(outputs, labels) if self.args.use_rdrop: - loss = self._compute_rdrop_loss(model, inputs, outputs, loss) + loss = self._compute_rdrop_loss(model, input_dict, outputs, + loss) if self.args.use_rgl: loss += self._compute_rgl_loss(hidden_states, labels) @@ -267,10 +264,10 @@ def compute_loss(self, model, inputs, return_outputs=False): return (loss, outputs) if return_outputs else loss - def _compute_rdrop_loss(self, model, inputs, outputs, loss): - re_outputs = model(inputs["input_ids"], inputs["mask_ids"], - inputs.get("soft_token_ids", None)) - ce_loss = (self.criterion(re_outputs, inputs["labels"]) + loss) * 0.5 + def _compute_rdrop_loss(self, model, input_dict, outputs, loss): + re_outputs, _ = model(**input_dict) + labels = input_dict["labels"] + ce_loss = (self.criterion(re_outputs, labels) + loss) * 0.5 kl_loss = self.rdrop_criterion(outputs, re_outputs) loss = ce_loss + self.args.alpha_rdrop * kl_loss return loss @@ -312,14 +309,11 @@ def _raw_equal(x, y): return loss - def export_model(self, export_path, input_spec=None, export_type="paddle"): + def export_model(self, export_path, input_spec, export_type="paddle"): os.makedirs(export_path, exist_ok=True) - self.template.save_to(export_path) - self.verbalizer.save_to(export_path) - if input_spec is None and hasattr(self.model, "get_input_spec"): - input_spec = self.model.get_input_spec() - if input_spec is None: - raise ValueError("Please define input_spec to export model.") + self.template.save(export_path) + if self.verbalizer is not None: + self.verbalizer.save(export_path) export_model(self.model, input_spec, export_path, export_type) @@ -334,65 +328,62 @@ def __init__(self, verbalizer=None, freeze_plm: bool = False, freeze_dropout: bool = False): - super().__init__() + super(PromptModelForSequenceClassification, self).__init__() self.plm = model self.template = template self.verbalizer = verbalizer self.freeze_plm = freeze_plm self.freeze_dropout = freeze_dropout - if self.verbalizer is not None and hasattr(verbalizer, "process_model"): - self.plm = self.verbalizer.process_model(self.plm) if self.freeze_plm: for param in self.plm.parameters(): param.stop_gradient = True - if self.freeze_dropout: - self.plm.eval() + if self.freeze_dropout: + self.plm.eval() self.forward_keys = signature(self.plm.forward) self._mask_token_id = self.template.tokenizer.mask_token_id self._pad_token_id = self.template.tokenizer.pad_token_id def forward(self, - input_ids=None, - mask_ids=None, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + masked_positions=None, soft_token_ids=None, + encoder_ids=None, **kwargs): - return_hidden_states = kwargs.pop('return_hidden_states', False) - if self.freeze_dropout: - self.plm.eval() - attention_mask = (input_ids != self._pad_token_id).astype("int64") - inputs = InputFeatures(input_ids=input_ids, - mask_ids=mask_ids, - attention_mask=attention_mask, - soft_token_ids=soft_token_ids) - if 
hasattr(self.template, "process_batch"): - inputs = self.template.process_batch(inputs) + input_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + "masked_positions": masked_positions, + "soft_token_ids": soft_token_ids, + "attention_mask": attention_mask, + "encoder_ids": encoder_ids + } + input_dict = self.template.process_batch(input_dict) model_inputs = { - k: inputs[k] - for k in inputs.keys(keep_none=True) if k in self.forward_keys + k: input_dict[k] + for k in input_dict if k in self.forward_keys } + model_inputs["masked_positions"] = None outputs = self.plm(**model_inputs) - hidden_states = outputs - if hasattr(self.template, "post_process_batch"): - outputs = self.template.post_process_batch(outputs) - if self.verbalizer and hasattr(self.verbalizer, "process_outputs"): - outputs = self.verbalizer.process_outputs(outputs, inputs=inputs) - - if return_hidden_states: - return outputs, hidden_states + if self.verbalizer is not None: + label_outputs = self.verbalizer.process_outputs( + outputs, input_dict["masked_positions"]) + else: + label_outputs = outputs + + if kwargs.pop('return_hidden_states', False): + return label_outputs, outputs else: - return outputs + return label_outputs def prompt_parameters(self): """ Get the parameters of template and verbalizer. """ - return [p for p in self.template.parameters() - ] + [p for p in self.verbalizer.parameters()] - - def get_input_spec(self): - input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64") # soft_token_ids - ] - return input_spec + params = [p for p in self.template.parameters()] + if self.verbalizer is not None: + params += [p for p in self.verbalizer.parameters()] + return params diff --git a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py index f51d64552ade..cbf00cf29bc7 100644 --- a/paddlenlp/prompt/prompt_utils.py +++ b/paddlenlp/prompt/prompt_utils.py @@ -1,217 +1,103 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import defaultdict -import json +""" +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This module defines the itermediate data structure of inputs. 
+""" + import inspect -from dataclasses import dataclass, field +from typing import Any, Dict, List, Union, Optional +from dataclasses import dataclass import numpy as np import paddle -from ..utils.log import logger - -__all__ = ["InputExample", "InputFeatures"] +from ..transformers.tokenizer_utils_base import (PretrainedTokenizerBase, + PaddingStrategy) -@dataclass -class InputExample(object): - """Data structure of every example in datasets.""" - uid: str = field(default=None, - metadata={'help': 'A unique identifier of the example.'}) - text_a: str = field( - default=None, - metadata={'help': 'The first text sequence in each example.'}) - text_b: str = field( - default=None, - metadata={'help': 'The other text sequences in each example.'}) - labels: int = field(default=None, - metadata={'help': 'The label in each example.'}) - meta: dict = field( - default=None, - metadata={ - 'help': 'An optional dictionary of other data for each example.' - }) - - def __repr__(self): - content = {k: v for k, v in self.__dict__.items() if v is not None} - content = json.dumps(content, indent=2, sort_keys=True) + '\n' - return str(content) - - def keys(self, keep_none=False): - return [ - key for key in self.__dict__.keys() - if getattr(self, key) is not None - ] - - -class InputFeatures(dict): - """ - Data structure of every wrapped example or a batch of examples as the input of model. - - Args: - input_ids (paddle.Tensor): - The token ids. - attention_mask (paddle.Tensor): - The mask ids. - token_type_ids (paddle.Tensor, optional): - The token type ids. - inputs_embeds (paddle.Tensor, optional): - The embeddings of soft tokens. - mask_ids (paddle.Tensor, optional): - The mask ids where 1 denotes that a token is a mask, 0 denotes it is not a mask. - labels (list, optional): - The labels of classification task. - uid (list, optional): - The unique id(s) for example(s). 
- """ - input_keys = [ - 'input_ids', 'attention_mask', 'token_type_ids', 'inputs_embeds', 'uid', - 'labels', 'mask_ids', 'soft_token_ids' - ] - tensorable = [ - 'input_ids', 'attention_mask', 'token_type_ids', 'inputs_embeds', - 'labels', 'mask_ids', 'soft_token_ids' - ] - - def __init__(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - inputs_embeds=None, - mask_ids=None, - labels=None, - uid=None, - soft_token_ids=None): - self.input_ids = input_ids - self.attention_mask = attention_mask - self.token_type_ids = token_type_ids - self.inputs_embeds = inputs_embeds - self.labels = labels - self.mask_ids = mask_ids - self.uid = uid - self.soft_token_ids = soft_token_ids - - @classmethod - def add_keys(cls, *args): - cls.input_keys.extend(args) - - def keys(self, keep_none=False): - if keep_none: - return self.input_keys - else: - return [ - key for key in self.input_keys if getattr(self, key) is not None - ] - - @property - def tensorable_keys(self, keep_none=False): - if keep_none: - return self.tensorable - else: - return [ - key for key in self.tensorable if getattr(self, key) is not None - ] - - @tensorable_keys.setter - def tensorable_keys(self, keys): - diff_keys = set(keys) - set(self.input_keys) - if len(diff_keys) > 0: - raise ValueError("{} not in predefined keys.".format( - ["`%s`" % k for k in diff_keys].join(", "))) - self.tensorable = keys - - def values(self, keep_none=False): - return [getattr(self, key) for key in self.keys(keep_none=keep_none)] - - def items(self): - return [(key, getattr(self, key)) for key in self.keys()] - - def __len__(self): - return len(self.keys()) - - def __repr__(self): - content = {} - for key, value in self.items(): - if isinstance(value, paddle.Tensor): - value = value.numpy().tolist() - elif isinstance(value, paddle.static.Variable): - value = value.to_string(True) - content[key] = value - return str(json.dumps(content)) - - def __getitem__(self, key): - return getattr(self, key) - - def __iter__(self): - return iter(self.keys()) - - def __contains__(self, key, keep_none): - return key in self.keys(keep_none) - - def __setitem__(self, key, value): - if key not in self.input_keys: - logger.warning( - "`{}` is not a predefined key in InputFeatures. Perhaps it "\ - "brings unexpected results.".format(key)) - self.add_keys(key) - setattr(self, key, value) - - def __eq__(self, other): - if not isinstance(other, InputFeatures): - return False - if self.keys() != other.keys(): - return False - for key in self.keys(): - value = getattr(self, key) - other_value = getattr(other, key) - if type(value) != type(other_value): - return False - if isinstance(value, paddle.Tensor): - value = value.numpy() - other_value = other_value.numpy() - if isinstance(value, list): - value = np.array(value) - other_value = np.array(other_value) - if not (value == other_value).all(): - return False - return True - - def __hash__(self): - return hash(self.__repr__()) - - @classmethod - def collate_fn(cls, batch): - """Collate batch data in form of InputFeatures.""" - new_batch = {} - for key in batch[0]: - values = [b[key] for b in batch] - if key in cls.tensorable: - new_batch[key] = paddle.to_tensor(values) - else: - new_batch[key] = values - - return InputFeatures(**new_batch) - - -def signature(fn): +def signature(function): """ Obtain the input arguments of the given function. 
""" - sig = inspect.signature(fn) + sig = inspect.signature(function) args = [ p.name for p in sig.parameters.values() if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD ] return args + + +@dataclass +class PromptDataCollatorWithPadding: + """ + Data collator that will group inputs by keywords and dynamically + pad the inputs to the longest sequence in the batch. + + Args: + tokenizer (`paddlennlp.transformers.PretrainedTokenizer`): + The tokenizer used for encoding the data from PromptTokenizer. + """ + + tokenizer: PretrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pd" + return_attention_mask: Optional[bool] = None + default_model_input_names: List = ("input_ids", "token_type_ids", + "special_tokens_mask", "offset_mapping", + "position_ids") + + def _convert_to_tensors(self, data): + if self.return_tensors == "np": + return np.array(data) + else: + return paddle.to_tensor(data) + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + batch = {} + for key in features[0]: + if key in self.default_model_input_names: + batch[key] = [b[key] for b in features] + + batch = self.tokenizer.pad( + batch, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + return_attention_mask=self.return_attention_mask) + max_length = batch["input_ids"].shape[1] + for key in features[0]: + if key not in self.default_model_input_names: + values = [b[key] for b in features] + if key == "masked_positions": + new_values = [] + for index, value in enumerate(values): + value = np.array(value) + index * max_length + new_values.extend(value.tolist()) + values = new_values + elif key == "attention_mask": + new_values = np.zeros( + [len(values), 1, max_length, max_length]) + for index, value in enumerate(values): + length = len(value) + new_values[index][0, :length, :length] = value + values = new_values + elif key != "labels": + for index, value in enumerate(values): + values[index] = value + [0] * (max_length - len(value)) + batch[key] = self._convert_to_tensors(values) + return batch diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py index 400101082d0c..b9b98addd436 100644 --- a/paddlenlp/prompt/template.py +++ b/paddlenlp/prompt/template.py @@ -1,463 +1,814 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +""" +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + +This module provide prompt definition methods. +""" -from abc import abstractmethod import os import re import json +import traceback +from abc import abstractmethod +from typing import Any, Dict, List, Tuple, Optional, Union + +import numpy as np import paddle import paddle.nn as nn +from paddle import Tensor +from paddlenlp.utils.log import logger +from paddlenlp.transformers import PretrainedTokenizer, PretrainedModel -from .prompt_utils import InputExample, InputFeatures from .prompt_tokenizer import MLMPromptTokenizer -from ..utils.log import logger - -__all__ = ["Template", "ManualTemplate", "SoftTemplate", "AutoTemplate"] - -TEMPLATE_FILE = "template.json" - - -def parse_template(inputs: str, part_start="{", part_end="}"): - """ Parse items from the input template text. """ - parsed = [] - i_start = 0 - while i_start < len(inputs): - space = ' ' if (i_start > 0 and inputs[i_start - 1] == ' ') else '' - p = {"add_prefix_space": space} - while i_start < len(inputs) and inputs[i_start] == ' ': - p["add_prefix_space"] = ' ' - i_start += 1 - if i_start == len(inputs): break - - if inputs[i_start] == part_start: - i_end = i_start + 1 - count_part = 1 - while i_end < len(inputs): - if inputs[i_end] == part_end: - count_part -= 1 - if count_part == 0: break - elif inputs[i_end] == part_start: - count_part += 1 - i_end += 1 - if i_end == len(inputs): - raise ValueError( - '{} at position {} has no corresponding {}'.format( - part_start, i_start, part_end)) - try: - part = eval('{%s}' % inputs[i_start + 1:i_end]) - if isinstance(part, set): - part = {k: None for k in part} - p.update(part) - except: - import traceback - logger.error(traceback.format_exc()) - logger.error( - 'syntax error in {}'.format(f"{inputs[i_start + 1:i_end]}")) - exit() - i_start = i_end + 1 - else: - i_end = i_start + 1 - while i_end < len(inputs): - if inputs[i_end] == part_start: - break - i_end += 1 - p['hard'] = inputs[i_start:i_end].rstrip(' ') - i_start = i_end - parsed.append(p) - return parsed +__all__ = [ + "Template", "ManualTemplate", "SoftTemplate", "PrefixTemplate", + "AutoTemplate" +] + +# Template used to be saved in a file. +TEMPLATE_CONFIG_FILE = "template_config.json" +TEMPLATE_PARAMETER_FILE = "template_state.pdparams" + +# Default values for some template attributes. +DEFAULT_MAX_OPTIONS = 10 class Template(nn.Layer): """ - Base template class used to preprocess the inputs of model. + Base class for [`Template`]. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The tokenizer of pretrained models. - + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. 
""" - registered_input_names = ['mask_ids', 'shortenable_ids'] - registered_text_keys = ['text_a', 'text_b'] - - def __init__(self, tokenizer, max_seq_length): - super().__init__() + template_special_tokens = [ + "text", "hard", "soft", "soft_id", "prefix", "sep", "mask", "options" + ] + template_attributes = [ + "length", "encoder", "position", "token_type", "hidden_size", + "add_omask", "add_prompt", "add_space" + ] + input_feature_names = ["do_truncate", "token_types", "positions"] + opt_token = "[OPT]" + omask_token = "[O-MASK]" + + def __init__(self, prompt: str, tokenizer: PretrainedTokenizer, + max_length: int, **kwargs): + super(Template, self).__init__() + for key, value in kwargs.items(): + setattr(self, key, value) self.tokenizer = tokenizer - self.wrapped_tokenizer = MLMPromptTokenizer(tokenizer, max_seq_length) + self.prompt_tokenizer = MLMPromptTokenizer(tokenizer, max_length) + self.prompt = prompt @property - def template(self): - if not hasattr(self, '_template'): - raise RuntimeError( - 'Property template has not been set before used.') - return self._template - - @template.setter - def template(self, template): - if template is None: - return - self._template = template - self._process_template() + def prompt(self): + return self._prompt + + @prompt.setter + def prompt(self, prompt: str): + if prompt is not None: + if isinstance(prompt, str): + self._prompt = self.parse_template_string(prompt) + else: + self._prompt = prompt + self._check_template_special_tokens() + self.example_keys = self.create_example_keys_from_prompt() + self.token_types = self.create_token_type_sequence_from_prompt() + self.do_truncate = self.create_truncation_sequence_from_prompt() + self.positions = self.create_position_sequence_from_prompt() + self.create_prompt_parameters() @abstractmethod - def _process_template(self): - """ A hook to process template text when it is set. """ + def create_prompt_parameters(self): raise NotImplementedError - def parse_inputs(self, inputs): - return parse_template(inputs) + def _check_template_special_tokens(self): + valid_attr = self.template_special_tokens + self.template_attributes + prompt_attr = [] + for part in self._prompt: + prompt_attr.extend(list(part.keys())) + if "add_prompt" in part: + opt_prompt = part["add_prompt"] + if self.opt_token not in opt_prompt: + raise ValueError("'{}' not found in option prompt.".format( + self.opt_token)) + if "add_omask" in part: + self._check_omask_token() + diff_attr = set(prompt_attr) - set(valid_attr) + if len(diff_attr) > 0: + raise ValueError( + "Invalid attributes found in template: {}.".format(diff_attr)) + return True + + def _check_example_name(self, name: str, example: Dict[str, Any]): + if name not in example: + raise ValueError( + "Unexpected value in template. Can not find keyword {} in example: {}" + .format(name, example)) + return True + + def _check_omask_token(self): + omask_example = """ + Add '[O-MASK]' to tokenizer to use `add_omask`. 
+ + Examples: + + ```python + omask_dict = {"additional_special_tokens": ["[O-MASK]"]} + tokenizer.add_special_tokens(omask_dict) + model.resize_token_embeddings(len(tokenizer)) + ```""" + if self.omask_token not in self.tokenizer.additional_special_tokens: + self.tokenizer.add_special_tokens( + {"additional_special_tokens": [self.omask_token]}) + return True + raise ValueError( + "'{}' not found in tokenizer.".format(self.omask_token) + + omask_example) + return True + + def build_inputs_with_prompt( + self, + example: Dict[str, Any], + prompt: Optional[List[Dict[str, Any]]] = None) -> List[str]: + """ + Build input text sequences according to both prompt and example. + + Args: + example (`Dict[str, Any]`): + A data sample with corresponding keys as `prompt`. + prompt (`Optional[List[Dict[str, Any]]]`): + A sequence of dictionary which defines positions of prompt, + input text and special tokens. + """ + inputs = self._prompt.copy() if prompt is None else prompt.copy() + + for index, part in enumerate(inputs): + if "text" in part: + self._check_example_name(part["text"], example) + inputs[index] = str(example[part["text"]]) + elif "mask" in part: + if "length" not in part: + part["length"] = 1 + inputs[index] = self.tokenizer.mask_token * part["length"] + elif "sep" in part: + inputs[index] = self.tokenizer.sep_token + elif "hard" in part: + inputs[index] = part["hard"] + elif "options" in part: + if not isinstance(part["options"], list): + self._check_example_name(part["options"], example) + labels = example[part["options"]] + labels = [labels] if isinstance(labels, str) else labels + else: + labels = part["options"] + if "add_prompt" in part: + opt_prompt = part["add_prompt"] + labels = [ + opt_prompt.replace(self.opt_token, x) for x in labels + ] + if "add_omask" in part: + labels = [self.omask_token + x for x in labels] + inputs[index] = "".join(labels) + else: + inputs[index] = part - def get_default_mask_ids(self): - """ List to denote whether an item in template is a mask token. """ - return [1 if 'mask' in p else 0 for p in self.template] + if "add_space" in part: + inputs[index] = " " + inputs[index] + return inputs - def get_default_shortenable_ids(self): - """ List to denote whther an item in template can be truncated. """ - idx = [] - for p in self.template: - if 'shortenable' in p: - idx.append(1 if p['shortenable'] else 0) + def create_token_type_sequence_from_prompt( + self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt if prompt is None else prompt + last_token_type = 0 + token_type_ids = [] + for part in prompt: + if "token_type" in part: + last_token_type = part["token_type"] + token_type_ids.append(last_token_type) + return token_type_ids + + def create_position_sequence_from_prompt( + self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt if prompt is None else prompt + position_ids = [] + for part in prompt: + if "position" in part: + position_ids.append(part["position"]) else: - idx.append(1 if 'text' in p else 0) - return idx - - def incorporate_template_text(self, example, template=None): - """ Replace each item in template with real text. 
""" - inputs = template.copy( - ) if self.template is None else self.template.copy() - - for i, p in enumerate(inputs): - if 'text' in p: - inputs[i] = p['add_prefix_space'] + getattr(example, p['text']) - elif 'mask' in p: - inputs[i] = self.tokenizer.mask_token - elif 'hard' in p: - inputs[i] = p['add_prefix_space'] + p['hard'] - elif 'sep' in p: - inputs[i] = self.tokenizer.sep_token + position_ids.append(-1) + return position_ids + + def create_truncation_sequence_from_prompt( + self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt.copy() if prompt is None else prompt.copy() + do_truncate = [] + for part in prompt: + if "truncate" in part: + do_truncate.append(part["truncation"]) + prompt_tokens = set(part.keys()) - set(["text"]) + if len(prompt_tokens) > 0 and part["truncation"]: + logger.warning("{} in template will be truncated, ".format( + prompt_tokens) + "which might degrade performance.") + elif "text" in part: + do_truncate.append(True) else: - raise ValueError('Can not parse {}'.format(p)) + do_truncate.append(False) + return do_truncate + + def create_example_keys_from_prompt(self): + example_keys = set() + for part in self._prompt: + if "text" in part: + example_keys.add(part["text"]) + if "options" in part and isinstance(part["options"], list): + example_keys.update(set(part["options"])) + return example_keys + + def encode(self, example: Dict[str, Any]): + input_text = self.build_inputs_with_prompt(example) + input_names, input_values = ["text"], [input_text] + for name in self.input_feature_names: + input_names.append(name) + input_values.append(getattr(self, name, None)) - return inputs + inputs = [] + for value in list(zip(*input_values)): + inputs.append(dict(zip(input_names, value))) - def wrap_one_example(self, example): - """ Process InputExample according to the predefined template. """ - if self.template is None: - raise ValueError('The template has not been initialized.') - if isinstance(example, InputExample): - text = self.incorporate_template_text(example) - - non_empty_keys = example.keys() - for key in self.registered_text_keys: - if key in non_empty_keys: - non_empty_keys.remove(key) - - keys, values = ['text'], [text] - for name in self.registered_input_names: - keys.append(name) - v = None - if hasattr(self, name) and getattr(self, name) is not None: - v = getattr(self, name) - elif hasattr(self, 'get_default_' + name): - v = getattr(self, 'get_default_' + name)() - setattr(self, name, v) - else: - raise ValueError(""" - Template's part attribute '{}' is registered but not - initialized. Try using template.{} = [...] 
to - initialize or create a get_default_{}(self) - method in your template.""".format(name, name, name)) - values.append(v) - - wrapped_parts_to_tokenize = [] - for value in list(zip(*values)): - wrapped_parts_to_tokenize.append(dict(zip(keys, value))) - - wrapped_parts_not_to_tokenize = { - key: getattr(example, key) - for key in non_empty_keys - } - wrapped_parts_to_tokenize = self.wrapped_tokenizer( - wrapped_parts_to_tokenize) - - return InputFeatures(**wrapped_parts_to_tokenize, - **wrapped_parts_not_to_tokenize) - else: - raise TypeError('InputExample') + input_dict = self.prompt_tokenizer(inputs) + unused_example = { + k: v + for k, v in example.items() if k not in self.example_keys + } - def process_batch(self, batch): - return batch + return {**input_dict, **unused_example} - def save_to(self, data_dir): - with open(os.path.join(data_dir, TEMPLATE_FILE), "w") as f: - json.dump(self.template, f) + def __call__(self, example: Dict[str, Any]): + return self.encode(example=example) + @abstractmethod + def process_batch(self, input_dict): + raise NotImplementedError -class ManualTemplate(Template): - """ - ManualTemplate for hard prompt methods, such as PET, EFL. + def save(self, save_path): + if not os.path.exists(save_path): + os.makedirs(save_path, exist_ok=True) + template_config_file = os.path.join(save_path, TEMPLATE_CONFIG_FILE) + with open(template_config_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(self._prompt, ensure_ascii=False)) + template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE) + template_state_dict = self.state_dict() + if len(template_state_dict) > 0: + paddle.save(template_state_dict, template_param_file) + + @staticmethod + def extract_template_keywords(prompt: List[Dict[str, Any]]): + keywords = set() + for part in prompt: + keywords.update(part.keys()) + return keywords + + @staticmethod + def parse_template_string(prompt: str, + left_token: Optional[str] = "{", + right_token: Optional[str] = "}"): + """ + Parse the defined string as a sequence of dictionaries. - Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The same as `Template`. - template (str | list): - It describes how to combine text and prompts. For example, - `str`: "{'text':'text_a'} It is {'mask'}." or a corresponding - list of dictionary/set parsed by `parse_template` method. - """ + Args: + prompt: A string comprised of nestable {}, [], integers and strings. - def __init__(self, tokenizer, max_seq_length, template=None): - super().__init__(tokenizer=tokenizer, max_seq_length=max_seq_length) - self.template = template + Returns: + A list of dictionaries corresponding to the input string. + + For example, if we define `prompt` as - def _process_template(self): - if isinstance(self._template, str): - self._template = self.parse_inputs(self._template) + "{'text': 'hypothesis'}基于这一假设{'mask'}推断出{'options': 'label.txt'}", + then this function returns -class SoftTemplate(Template): + [{"text": "hypothesis"}, {"hard": "基于这一假设"}, {"mask": null}, + {"hard": "推断出"}, {"options": ["正确", "错误"]}]. + + Raises: + ValueError: A error occurred parsing an string with unmatched punctuations. + """ + left_stack = [] + parsed = [] + index = 0 + while index < len(prompt): + # Delete extra spaces. + part = {"add_space": " "} if prompt[index] == " " else {} + while index < len(prompt) and prompt[index] == " ": + index += 1 + if index == len(prompt): + break + # Parse blocks with paired tokens like "{ }". 
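+            # Assumed reading of the block parser below: the stack matches
+            # nested "{"/"}" pairs so a block such as "{'options': 'label.txt'}"
+            # is consumed as a whole before being evaluated into a dictionary.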
+ if prompt[index] == left_token: + left_index = index + while index < len(prompt): + if prompt[index] == left_token: + left_stack.append(index) + elif prompt[index] == right_token: + left_stack.pop() + if len(left_stack) == 0: + break + index += 1 + if index == len(prompt) and len(left_stack) > 0: + raise ValueError( + "{} at position {} has no corresponding {}".format( + left_token, left_index, right_token)) + try: + part_dict = eval(prompt[left_index:index + 1]) + if isinstance(part_dict, set): + part_dict = {k: None for k in part_dict} + part.update(part_dict) + except SyntaxError as error: + logger.error(traceback.format_exc()) + exit() + index += 1 + # Parse simplified discrete prompts. + else: + left_index = index + while index < len(prompt) and prompt[index] != left_token: + index += 1 + part["hard"] = prompt[left_index:index].rstrip(" ") + + if "options" in part: + if os.path.isfile(part["options"]): + with open(part["options"], "r") as fp: + labels = [x.strip() for x in fp] + part["options"] = labels + part["length"] = len(labels) + elif "length" not in "options": + part["length"] = DEFAULT_MAX_OPTIONS + logger.warning( + "[options]: The maximum number of options not defined," + " set as {} by default.".format(DEFAULT_MAX_OPTIONS)) + if "length" in part: + assert part["length"] > 0 + if "hard" in part: + logger.warning( + "Ignore `length` attribute for keyword `hard`.") + if "position" in part: + assert part["position"] >= 0 + if "token_type" in part: + assert part["token_type"] in (0, 1) + parsed.append(part) + return parsed + + +class ManualTemplate(Template): """ - SoftTemplate on the input layer for soft prompt methods, such as p-tuning. + ManualTemplate for discrete prompt methods, such as PET, EFL. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The same as `Template`. - template (str | list): - It describes how to combine text with both manual and soft prompts. - prompt_encoder (str): - The encoder to project soft embeddings. Support `lstm` and 'mlp'. - Use soft embeddings directly when prompt_encoder is `None`. + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. """ - registered_input_names = ['soft_token_ids', 'mask_ids', 'shortenable_ids'] + template_special_tokens = ["text", "hard", "sep", "mask", "options"] + template_attributes = [ + "length", "position", "token_type", "add_prompt", "add_space", + "add_omask" + ] - def __init__(self, - tokenizer, - max_seq_length, - model=None, - template=None, - prompt_encoder=None, - encoder_hidden_size=None): - super().__init__(tokenizer=tokenizer, max_seq_length=max_seq_length) - if model is None: - self.token_embeddings = None - logger.warning( - "SoftTemplate: The pretrained model is not given. 
It would " - "lead to error unless it is initialized for deployment.") - else: - if type(model).__name__.endswith('Model'): - self.token_embeddings = model.embeddings.word_embeddings - else: - for module in model.children(): - if type(module).__name__.endswith('Model'): - self.token_embeddings = module.embeddings.word_embeddings - break - self.token_embeddings.weight.stop_gradient = True - self.embedding_size = self.token_embeddings.weight.shape[-1] - self.encoder_hidden_size = encoder_hidden_size - if self.encoder_hidden_size is not None and prompt_encoder is None: - logger.warning("`prompt_encoder` is not set yet. Use MLP for " - "soft embeddings' projection by default.") - prompt_encoder = "mlp" - self.prompt_encoder = prompt_encoder - self.template = template - - def _process_template(self): - if isinstance(self._template, str): - self._template = self.parse_inputs(self._template) - self.parse_soft_tokens() - self.generate_parameters() + def __init__(self, prompt: str, tokenizer: PretrainedTokenizer, + max_length: int): + super(ManualTemplate, self).__init__(prompt, tokenizer, max_length) - @property - def prompt_encoder(self): - return self._prompt_encoder + def create_prompt_parameters(self): + return None - @prompt_encoder.setter - def prompt_encoder(self, prompt_encoder): + def process_batch(self, input_dict): + return input_dict - if prompt_encoder is None: - return None - if getattr(self, "_prompt_encoder", None) is not None: - logger.warning( - f"Encoder has already set as {self._prompt_encoder}, change " + - "`prompt_encoder` will reset parameters.") +class SoftLSTM(nn.Layer): + """ + LSTM encoder for soft token embeddings. + """ - self._prompt_encoder = prompt_encoder + def __init__(self, input_size, hidden_size, output_size, activation): + super(SoftLSTM, self).__init__() + self.lstm = nn.LSTM(input_size=input_size, + hidden_size=hidden_size, + num_layers=2, + direction='bidirect', + time_major=False) + self.mlp = nn.Sequential(nn.Linear(2 * hidden_size, + hidden_size), activation, + nn.Linear(hidden_size, output_size)) - if self.encoder_hidden_size is None: - hidden_size = self.embedding_size - else: - hidden_size = self.encoder_hidden_size - if prompt_encoder == 'lstm': - self.lstm_head = nn.LSTM(input_size=self.embedding_size, - hidden_size=hidden_size, - num_layers=2, - direction='bidirect', - time_major=False) - self.mlp_head = nn.Sequential( - nn.Linear(2 * hidden_size, hidden_size), nn.ReLU(), - nn.Linear(hidden_size, self.embedding_size)) - elif prompt_encoder == 'mlp': - self.mlp_head = nn.Sequential( - nn.Linear(self.embedding_size, hidden_size), nn.ReLU(), - nn.Linear(hidden_size, self.embedding_size)) - if hasattr(self, "lstm_head"): - delattr(self, "lstm_head") - else: - raise ValueError( - "Unsupported soft token encoder: {}".format(prompt_encoder)) - - def incorporate_template_text(self, example, template=None): - """ Replace each item in template with real text. 
""" - inputs = template.copy( - ) if self.template is None else self.template.copy() - - for i, p in enumerate(inputs): - if 'text' in p: - inputs[i] = p['add_prefix_space'] + getattr(example, p['text']) - elif 'mask' in p: - inputs[i] = self.tokenizer.mask_token - elif 'hard' in p: - inputs[i] = p['add_prefix_space'] + p['hard'] - elif 'soft' in p: - inputs[i] = p['add_prefix_space'] + p['soft'] - elif 'sep' in p: - inputs[i] = self.tokenizer.sep_token - else: - raise ValueError('can not parse {}'.format(p)) + def forward(self, embeds): + hidden_states, _ = self.lstm(embeds) + return self.mlp(hidden_states) - return inputs - def parse_soft_tokens(self): - inputs = [] - soft_token_ids = [] - num_soft_token = 0 - soft2word_init = {} - soft_id_reindex = {} +class SoftTemplate(Template): + """ + SoftTemplate for continuous prompt methods on the input layer. - for part in self._template: - if 'soft' not in part and 'soft_id' not in part: - soft_token_ids.append(0) - inputs.append(part) - continue + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + word_embeddings (`Tensor`): + The word embeddings of pretrained models, which can be obtained by + calling `model.get_input_embeddings().weight`. + soft_embeddings (`Tensor`): + The embeddings of soft tokens, which overwrites `word_embeddings` + as initial weights when defined. + """ + template_special_tokens = [ + "text", "hard", "soft", "soft_id", "sep", "mask", "options" + ] + input_feature_names = [ + "do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids" + ] - if 'soft' in part and part['soft'] is not None: - if 'duplicate' in part: - logger.warning( - 'Ignore ``duplicate``. It is ' - 'incompatible with ``soft`` with text values.') + def __init__(self, + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int, + word_embeddings: Tensor, + soft_embeddings: Tensor = None): + super(SoftTemplate, self).__init__(prompt, + tokenizer, + max_length, + word_embeddings=word_embeddings, + soft_embeddings=soft_embeddings) + + def named_parameters(self): + named_params = [(n, p) + for n, p in self.soft_embeddings.named_parameters()] + named_params.extend([(n, p) + for n, p in self.encoder_list.named_parameters()]) + return named_params + + def parameters(self): + return [p for n, p in self.named_parameters()] + + def create_prompt_parameters(self): + self._prompt, soft_token_config = self.parse_soft_prompt() + self.embed_size = self.word_embeddings.weight.shape[1] + soft2word, self.soft_tokens, self.num_soft_token = soft_token_config + self._init_soft_parameters(soft2word) + self.encoder_ids, self.encoder_list = self._create_soft_encoders() + + def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Convert input_ids to inputs_embeds. + + Soft tokens are encoded soft_embeddings with predefined encoders. + For other tokens, use word embeddings in pretrained model. 
+ """ + word_embeds = self.word_embeddings(input_dict["input_ids"]) + if "attention_mask" not in input_dict or input_dict[ + "attention_mask"] is None: + pad_token_id = self.tokenizer.pad_token_id + attention_mask = paddle.unsqueeze( + (input_dict["input_ids"] == pad_token_id).astype("float32") * + -1e4, + axis=[1, 2]) + input_dict["attention_mask"] = attention_mask + input_dict["input_ids"] = None + soft_embeds = self.soft_embeddings(input_dict["soft_token_ids"]) + soft_shape = soft_embeds.shape + soft_embeds = soft_embeds.reshape([-1, soft_shape[-1]]) + for encoder_id in range(1, len(self.encoder_list)): + to_encode = paddle.where(input_dict["encoder_ids"] == encoder_id) + to_encode = to_encode[0] * soft_shape[1] + to_encode[1] + to_encode = to_encode.squeeze(1) + to_encode_embeds = soft_embeds[to_encode] + to_encode_embeds = to_encode_embeds.reshape( + [soft_shape[0], -1, soft_shape[-1]]) + encoder = self.encoder_list[encoder_id] + encoded = encoder(to_encode_embeds) + encoded = encoded.reshape([-1, soft_shape[-1]]) + soft_embeds = paddle.scatter(soft_embeds, to_encode, encoded) + soft_embeds = soft_embeds.reshape([soft_shape[0], -1, soft_shape[-1]]) + soft_token_ids = input_dict["soft_token_ids"].unsqueeze(2) + input_dict["inputs_embeds"] = paddle.where(soft_token_ids > 0, + soft_embeds, word_embeds) + return input_dict + + def parse_soft_prompt(self): + """ + Unify the form of continuous prompts as {"soft": "xxx"} and create + continuous token id sequence for each part in template. + + Returns: + `List[Dict[str, str]]`: Template with continuous prompt formated as {"soft": "xxx"}. + `Tuple[Dict[int, int], List[List[int]], int]`: + - Mapping from continuous ids to word ids for initialization. + - Continuous ids for each part. Id 0 denotes none-continuous part. + - Number of unique coutinuous tokens. + """ + prompt = self._prompt.copy() + num_soft_token = 1 + soft_prompt = [] + soft_token_ids = [] + soft2word = {} + soft_id_reindex = {} - # Get word tokens and ids for soft token initialization. - init_token_ids = self.tokenizer( - part['add_prefix_space'] + part['soft'], + for part in prompt: + part_prompt = None + # Copy non-continuous prompt part. + if "soft" not in part and "soft_id" not in part: + soft_prompt.append(part) + soft_token_ids.append(None) + + # Deal with continuous prompt with specific initialization. + elif "soft" in part and part["soft"] is not None: + + # Get word tokens for initialization. + if "add_space" in part: + part["soft"] = part["add_space"] + part["soft"] + word_token_ids = self.tokenizer( + part["soft"], add_special_tokens=False, - return_token_type_ids=False)['input_ids'] - init_tokens = self.tokenizer.convert_ids_to_tokens( - init_token_ids) - assert len(init_tokens) == len(init_token_ids) - - # Create soft ids and corresponding ``soft`` part in template. - next_num_soft = num_soft_token + 1 - num_soft_token += len(init_tokens) - id_list = list(range(next_num_soft, num_soft_token + 1)) - - soft_token_ids.extend(id_list) - inputs.extend([{ - 'add_prefix_space': part['add_prefix_space'], - 'soft': token - } for token in init_tokens]) - for soft_id, word_id in zip(id_list, init_token_ids): - soft2word_init[soft_id] = word_id - - # Check the ids of ``soft`` and ``soft_id``. - if 'soft_id' in part: - if part['soft_id'] in soft_id_reindex: - assert id_list == soft_id_reindex[part['soft_id']] + return_token_type_ids=False)["input_ids"] + + # Create continuous token ids. 
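+                # One id is assigned per initialization token, so a word that
+                # tokenizes into several pieces occupies consecutive soft ids.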
+ soft_id_list = list( + range(num_soft_token, num_soft_token + len(word_token_ids))) + num_soft_token += len(word_token_ids) + + for soft_id, word_id in zip(soft_id_list, word_token_ids): + soft2word[soft_id] = word_id + + # Check `length` if exists. + if "length" in part: + if part["length"] < len(word_token_ids): + logger.warning("Ignore `length` because it is less than" + " the length of defined word sequence.") + elif part["length"] > len(word_token_ids): + length = part["length"] - len(word_token_ids) + soft_id_list += list( + range(num_soft_token, num_soft_token + length)) + num_soft_token += length + part["soft"] += self.tokenizer.unk_token * length + + soft_token_ids.append(soft_id_list) + part_prompt = {"soft": part["soft"]} + + # Check or record `soft_id` if exists. + if "soft_id" in part: + if part["soft_id"] in soft_id_reindex: + assert soft_id_list == soft_id_reindex[part["soft_id"]] else: - soft_id_reindex[part['soft_id']] = id_list - continue - - if 'soft_id' in part and part['soft_id'] in soft_id_reindex: - if 'duplicate' in part: - logger.warnings('Ignore ``duplicate``. Initialize ' - '``soft`` by ``soft_id`` directly.') - id_list = soft_id_reindex[part['soft_id']] - - elif 'duplicate' in part: - assert isinstance(part['duplicate'], int) - if 'same' in part: - num_soft_token += 1 - id_list = [num_soft_token for _ in range(part['duplicate'])] - else: - next_num_soft = num_soft_token + 1 - num_soft_token += part['duplicate'] - id_list = list(range(next_num_soft, num_soft_token + 1)) + soft_id_reindex[part["soft_id"]] = soft_id_list + + # Deal with continous prompt defined by `soft_id`. + elif "soft_id" in part and part["soft_id"] in soft_id_reindex: + soft_id_list = soft_id_reindex[part["soft_id"]] + if "length" in part: + logger.warning("Ignore `length` because it is incompatible" + " with existing `soft_id`.") + soft_token_ids.append(soft_id_list) + part_prompt = { + "soft": [self.tokenizer.unk_token] * len(soft_id_list) + } + + # Deal with continous prompt with random initialization. 
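+            # These ids have no word to copy from, so they keep the random
+            # initialization of the soft embedding table.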
else: - num_soft_token += 1 - id_list = [num_soft_token] - - if 'soft_id' in part: - soft_id_reindex[part['soft_id']] = id_list + if "length" not in part: + part["length"] = 1 + soft_id_list = list( + range(num_soft_token, num_soft_token + part["length"])) + num_soft_token += part["length"] + soft_token_ids.append(soft_id_list) + if "soft_id" in part: + soft_id_reindex[part["soft_id"]] = soft_id_list + part_prompt = { + "soft": [self.tokenizer.unk_token] * len(soft_id_list) + } + if part_prompt is not None: + for key in part: + if key not in ["soft", "soft_id", "length", "add_space"]: + part_prompt[key] = part[key] + soft_prompt.append(part_prompt) + + if num_soft_token == 1: + raise ValueError("Soft prompt expected for SoftTemplate, but" + " get {}.".format(self._prompt)) + + soft_token_config = (soft2word, soft_token_ids, num_soft_token) + + return soft_prompt, soft_token_config + + def _init_soft_parameters(self, soft2word: Dict[int, int]): + if self.soft_embeddings is not None: + if self.soft_embeddings.weight.shape[0] != self.num_soft_token: + raise ValueError( + "Given soft embeddings are incompatible with those " + "defined in template \"{}\"".format(self._prompt)) + else: + self.soft_embeddings = nn.Embedding(self.num_soft_token, + self.embed_size) + weight = self.soft_embeddings.weight.clone().detach() + for soft_id, word_id in soft2word.items(): + word_id = paddle.to_tensor(word_id) + weight[soft_id] = self.word_embeddings(word_id) + self.soft_embeddings.weight.set_value(weight) + + def _create_soft_encoders(self, + output_size: int = None, + activation: nn.Layer = None): + encoder_list = [nn.Identity()] + encoder2id = {} + encoder_ids = [] + output_size = self.embed_size if output_size is None else output_size + activation = nn.ReLU() if activation is None else activation + for part in self._prompt: + if "encoder" not in part or part["encoder"] is None: + encoder_ids.append(0) + else: + if part["encoder"] not in encoder2id: + encoder2id[part["encoder"]] = len(encoder_list) + encoder_ids.append(len(encoder_list)) + if "hidden_size" in part: + hidden_size = part["hidden_size"] + else: + hidden_size = self.embed_size + if part["encoder"] == "lstm": + encoder_list.append( + SoftLSTM(self.embed_size, hidden_size, output_size, + activation)) + elif part["encoder"] == "mlp": + encoder_list.append( + nn.Sequential( + nn.Linear(self.embed_size, + hidden_size), activation, + nn.Linear(hidden_size, output_size))) + else: + raise ValueError("Encoder {} not supported.".format( + part["encoder"])) + else: + encoder_ids.append(encoder2id[part["encoder"]]) + encoder_list = nn.LayerList(encoder_list) + return encoder_ids, encoder_list + + def build_inputs_with_prompt( + self, + example: Dict[str, Any], + prompt: Optional[List[Dict[str, Any]]] = None) -> List[str]: + inputs = super(SoftTemplate, + self).build_inputs_with_prompt(example, prompt) + for index, part in enumerate(inputs): + if isinstance(part, dict) and "soft" in part: + inputs[index] = part["soft"] + return inputs - soft_token_ids.extend(id_list) - inputs.extend([{ - 'add_prefix_space': part['add_prefix_space'], - 'soft': self.tokenizer.cls_token - } for _ in range(len(id_list))]) + def save(self, save_path): + super(SoftTemplate, self).save(save_path) + template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE) + paddle.save(self.state_dict(), template_param_file) - self._template = inputs - self.soft_token_ids = soft_token_ids - self.num_soft_token = num_soft_token - self.soft2word_init = soft2word_init - if 
self.num_soft_token == 0: - logger.warning('No soft tokens in template. '\ - 'Use ManualTemplate for better performance.') +class PrefixTemplate(SoftTemplate): + """ + PrefixTemplate for continuous prompt methods on every layer. - def generate_parameters(self): - """ - Generate parameters for soft tokens. - """ - if self.num_soft_token == 0 or self.token_embeddings is None: - return None - self.soft_embeddings = nn.Embedding(self.num_soft_token + 1, - self.embedding_size) - - weight = self.soft_embeddings.weight.clone().detach() - for soft_id, word_id in self.soft2word_init.items(): - weight[soft_id] = self.token_embeddings(paddle.to_tensor(word_id)) - self.soft_embeddings.weight.set_value(weight) - - def process_batch(self, batch): - word_embeds = self.token_embeddings(batch["input_ids"]) - batch["input_ids"] = None - if not hasattr(self, - "soft_embeddings") or batch["soft_token_ids"] is None: - batch["inputs_embeds"] = word_embeds - else: - soft_embeds = self.soft_embeddings(batch["soft_token_ids"]) - if hasattr(self, "lstm_head"): - soft_embeds = self.lstm_head(soft_embeds)[0] - if hasattr(self, "mlp_head"): - soft_embeds = self.mlp_head(soft_embeds) + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + model (`PretrainedModel`): + An instance of PretrainedModel. + """ + template_special_tokens = [ + "text", "hard", "prefix", "soft", "sep", "mask", "options" + ] + input_feature_names = [ + "do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids" + ] - inputs_embeds = paddle.where( - (batch["soft_token_ids"] > 0).unsqueeze(-1), soft_embeds, - word_embeds) - batch["inputs_embeds"] = inputs_embeds - return batch + def __init__(self, + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int, + model: PretrainedModel, + prefix_dropout: float = 0.1): + self.n_layer, self.n_heads = self._get_config(model) + super(PrefixTemplate).__init__(prompt, tokenizer, max_length, + model.get_input_embeddings()) + self.dropout = nn.Dropout(p=prefix_dropout) + + @staticmethod + def _get_config(model): + names = [n for n, p in model.named_parameters() if "layers" in n] + pattern = re.compile(r".*?\.(\d+)\..*?") + indices = [] + for name in names: + result = pattern.match(name) + if result is not None: + indices.append(int(result.group(1))) + num_layer = max(indices) + 1 + layer_names = names[0].split(".")[:-2] + layer = model + for name in layer_names: + layer = getattr(layer, name) + num_heads = layer.num_heads + + return num_layer, num_heads + + def parse_soft_prompt(self): + prompt = self._prompt.copy() + + for index, part in enumerate(prompt): + if "soft" in part: + raise ValueError("Keyward `soft` should not be used in " + "PrefixTemplate.") + if "prefix" not in part: + continue + if index != 0: + raise ValueError("Keyword `prefix` should locate at the " + "beginning of template.") + part["soft"] = part["prefix"] + part.pop("prefix") + prompt[index] = part + + self._prompt = prompt + return super(PrefixTemplate, self).parse_soft_prompt() + + def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + word_embeds = self.word_embeddings(input_dict["input_ids"]) + if "attention_mask" not in input_dict or input_dict[ + "attention_mask"] is None: + pad_token_id = self.tokenizer.pad_token_id + 
attention_mask = paddle.unsqueeze( + (input_dict["input_ids"] == pad_token_id).astype("float32") * + -1e4, + axis=[1, 2]) + input_dict["attention_mask"] = attention_mask + input_dict["input_ids"] = None + + batch_size, _ = input_dict["soft_token_ids"].shape + soft_token_ids = paddle.masked_select(input_dict["soft_token_ids"], + input_dict["soft_token_ids"] > 0) + soft_token_ids = soft_token_ids.reshape([batch_size, -1]) + _, soft_len = soft_token_ids.shape + + input_dict["inputs_embeds"] = word_embeds[:, soft_len:, :] + + soft_embeds = self.soft_embeddings(soft_token_ids) + for encoder_id in range(1, len(self.encoder_list)): + to_encode = paddle.where(input_dict["encoder_ids"] == encoder_id) + encoded = self.encoder_list[encoder_id](to_encode) + soft_embeds = paddle.where(input_dict["encoder_ids"] == encoder_id, + encoded, soft_embeds) + soft_embeds = soft_embeds.reshape([ + batch_size, soft_len, self.n_layer * 2, self.n_heads, + self.embed_size // self.n_heads + ]) + soft_embeds = self.dropout(soft_embeds) + soft_embeds = paddle.transpose(soft_embeds, perm=[2, 0, 3, 1, 4]) + soft_embeds = paddle.split(soft_embeds, num_or_sections=self.n_layer) + soft_embeds = [paddle.split(emb, 2) for emb in soft_embeds] + soft_embeds = [[x.squeeze(0) for x in emb] for emb in soft_embeds] + input_dict["past_key_values"] = tuple( + [tuple(emb) for emb in soft_embeds]) + return input_dict + + def _create_soft_encoders(self): + output_size = self.embed_size * self.n_layer * 2 + activation = nn.Tanh() + return super(PrefixTemplate, + self)._create_soft_encoders(output_size, activation) class AutoTemplate(object): @@ -465,83 +816,70 @@ class AutoTemplate(object): AutoTemplate can help you automatically create the relevant Template given the provided prompt. """ - registered_text_keys = ['text_a', 'text_b'] + default_text_keyword = "text_a" def __init__(self, *args, **kwargs): raise EnvironmentError( '{} is designed to be instantiated using {}.create_from('\ - 'template, tokenizer, text_list, ...)'.format( + 'prompt, tokenizer, max_length, ...)'.format( self.__class__.__name__, self.__class__.__name__)) - @classmethod - def parse_inputs(cls, inputs): - return parse_template(inputs) - @classmethod def create_from(cls, - template, - tokenizer, - max_seq_length, - model=None, - prompt_encoder=None, - encoder_hidden_size=None): - if template is None: - template = "{'soft'}" - if isinstance(template, str): - template = cls.parse_inputs(template) - template_keys = cls._extract_template_keys(template) - if 'text' not in template_keys: - soft_template = [] - for item in template: - if 'hard' in item: - soft_template.append({ - 'add_prefix_space': '', - 'soft': item['hard'] - }) - else: - soft_template.append(item) - text_item = [{ - 'add_prefix_space': ' ', - 'text': cls.registered_text_keys[0] - }] - template = text_item + soft_template - template_keys = cls._extract_template_keys(template) - - if 'mask' not in template_keys: - template.append({'add_prefix_space': ' ', 'mask': None}) - - if 'soft' in template_keys: - return SoftTemplate(tokenizer=tokenizer, - template=template, - max_seq_length=max_seq_length, - model=model, - prompt_encoder=prompt_encoder, - encoder_hidden_size=encoder_hidden_size) + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int = 512, + model: PretrainedModel = None, + soft_embeddings: Tensor = None): + # Default template if not defined. 
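+        # A single trainable soft token is prepended to the `text_a` field,
+        # followed by one mask token for prediction.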
+ if prompt is None: + prompt = "{'soft'}{'text': 'text_a'}{'mask'}" + + if isinstance(prompt, str): + prompt = Template.parse_template_string(prompt) + template_keywords = Template.extract_template_keywords(prompt) + + # Complement simplified template as ManualTemplate-style in form. + if "text" not in template_keywords: + prompt = [{"text": cls.default_text_keyword}] + prompt + if "mask" not in template_keywords: + prompt = prompt + [{"mask": None}] + + # Choose Template according to template keywords. + if "prefix" in template_keywords: + return PrefixTemplate(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length, + model=model) + elif "soft" in template_keywords or "soft_id" in template_keywords: + word_embeddings = model.get_input_embeddings() + return SoftTemplate(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length, + word_embeddings=word_embeddings) else: - return ManualTemplate(tokenizer=tokenizer, - max_seq_length=max_seq_length, - template=template) + return ManualTemplate(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length) @classmethod def load_from(cls, - data_dir, - tokenizer, - max_seq_length, - model=None, - prompt_encoder=None, - encoder_hidden_size=None): - with open(os.path.join(data_dir, TEMPLATE_FILE), "r") as f: - template = json.load(f) - return cls.create_from(template, tokenizer, max_seq_length, model, - prompt_encoder, encoder_hidden_size) - - @classmethod - def _extract_template_keys(cls, inputs: list): - template_keys = set() - for item_dict in inputs: - for key, value in item_dict.items(): - template_keys.add(key) - if key == 'text': - assert value in cls.registered_text_keys, 'No ``{}`` attribute '\ - 'in InputExample.'.format(value) - return template_keys + data_path: os.PathLike, + tokenizer: PretrainedTokenizer, + max_length: int, + model: PretrainedModel = None): + template_config_file = os.path.join(data_path, TEMPLATE_CONFIG_FILE) + if not os.path.isfile(template_config_file): + raise ValueError("{} not found under {}".format( + TEMPLATE_CONFIG_FILE, data_path)) + with open(template_config_file, "r") as fp: + prompt = json.loads(fp.readline().strip()) + # TODO (Huijuan): Load all configs from data_path. + template = cls.create_from(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length, + model=model) + template_param_file = os.path.join(data_path, TEMPLATE_PARAMETER_FILE) + if os.path.isfile(template_param_file): + template.set_state_dict(paddle.load(template_param_file)) + return template diff --git a/paddlenlp/prompt/verbalizer.py b/paddlenlp/prompt/verbalizer.py index ac64c861cc64..bb412259d78b 100644 --- a/paddlenlp/prompt/verbalizer.py +++ b/paddlenlp/prompt/verbalizer.py @@ -12,312 +12,337 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod -from collections import defaultdict import os import copy import json +from abc import abstractmethod +from typing import Any, Dict, List import numpy as np -from typing import List, Dict, Union import paddle import paddle.nn as nn import paddle.nn.functional as F +from paddle import Tensor +from paddlenlp.transformers import PretrainedTokenizer, PretrainedModel +from paddlenlp.utils.log import logger -from ..utils.log import logger - -__all__ = [ - "Verbalizer", "MultiMaskVerbalizer", "ManualVerbalizer", "SoftVerbalizer" -] +__all__ = ["Verbalizer", "ManualVerbalizer", "SoftVerbalizer"] -VERBALIZER_FILE = "verbalizer.json" +# Verbalizer used to be saved in a file. 
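+# These file names are used by `Verbalizer.save` and `Verbalizer.load_from`.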
+VERBALIZER_CONFIG_FILE = "verbalizer_config.json" +VERBALIZER_PARAMETER_FILE = "verbalizer_state.pdparams" class Verbalizer(nn.Layer): """ - Base verbalizer class used to process the outputs and labels. + Base class for [`Verbalizer`]. Args: - labels (list): - The sequence of labels in task. - + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. """ - def __init__(self, labels): - super().__init__() - self.labels = labels + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, + **kwargs): + super(Verbalizer, self).__init__() + for key, value in kwargs.items(): + setattr(self, key, value) + self.tokenizer = tokenizer + self.token_aggregate_type = kwargs.get("token_aggregate_type", "mean") + self.word_aggregate_type = kwargs.get("word_aggregate_type", "mean") + self.mask_aggregate_type = kwargs.get("mask_aggregate_type", "product") + self.post_log_softmax = kwargs.get("post_log_sigmoid", True) + self.label_token_weight = kwargs.get("label_token_weight", None) + if self.label_token_weight is not None: + self.label_token_weight = self.normalize( + self.project(self.label_token_weight.unsqueeze(0))) + self.label_words = label_words @property def labels(self): - labels = getattr(self, "_labels", None) - if labels is None: - raise RuntimeError("`labels` is not set yet.") - return labels + if not hasattr(self, "_labels"): + raise RuntimeError("Attribute `labels` is not set yet.") + return self._labels @labels.setter def labels(self, labels): if labels is not None: self._labels = sorted(labels) - else: - self._labels = None @property def label_words(self): - label_words = getattr(self, "_label_words", None) - if label_words is None: - raise RuntimeError("`label_words not set yet.") - return label_words + if not hasattr(self, "_label_words"): + raise RuntimeError("Mapping from labels to words is not set yet.") + return self._label_words @label_words.setter - def label_words(self, label_words: Union[List, Dict]): + def label_words(self, label_words: Dict): if label_words is None: return None - if isinstance(label_words, dict): - new_labels = sorted(list(label_words.keys())) - if self._labels is None: - self._labels = new_labels - elif new_labels != self.labels: - raise ValueError( - f"The given `label_words` {new_labels} does not match " + - f"predefined labels {self.labels}.") - self._label_words = [label_words[k] for k in self.labels] - elif isinstance(label_words, list): - if self._labels is None: - raise ValueError( - "`labels` should be set as the given `label_words` is " - "a list. Make sure that the order is compatible.") - if len(self.labels) != len(label_words): - raise ValueError( - "The length of given `label_words` and predefined " + - "labels do not match.") - self._label_words = label_words - else: - raise TypeError('Unsupported type {} for label_words'.format( - type(label_words))) - self.process_label_words() - - @property - def labels_to_ids(self): - if not hasattr(self, 'labels'): - raise RuntimeError( - 'Property labels_to_ids has not been set before used.') - return {k: i for i, k in enumerate(self.labels)} - - @property - def ids_to_labels(self): - if not hasattr(self, 'labels'): - raise RuntimeError( - 'Property ids_to_labels has not been set before used.') - return {i: k for i, k in enumerate(self.labels)} - - @staticmethod - def add_prefix(label_words, prefix): - """ Add prefix to get expected token ids. 
""" - if isinstance(label_words[0], str): - label_words = [[word] for word in label_words] - - new_label_words = [] - for words_per_label in label_words: - new_words_per_label = [] - for word in words_per_label: - new_words_per_label.append(prefix + word) - new_label_words.append(new_words_per_label) - return new_label_words + self.labels = list(label_words.keys()) + self.labels_to_ids = { + label: idx + for idx, label in enumerate(self._labels) + } + self._words = [] + for label in self._labels: + words = label_words[label] + if isinstance(words, str): + words = [words] + self._words.append(words) + self._label_words = { + label: word + for label, word in zip(self._labels, self._words) + } + self.preprocess_label_words() + self.create_parameters() @abstractmethod - def process_label_words(self, ): - """ A hook to process verbalizer when it is set. """ - raise NotImplementedError - - @abstractmethod - def project(self, logits, **kwargs): - """ - Project the logits with shape ```[..., vocab_size]``` into - label_word_logits with shape ```[..., label_words]```. + def create_parameters(self): + """ + A hook to create parameters for mapping from labels to words. """ raise NotImplementedError - @staticmethod - def aggregate(embeddings, mask=None, atype='mean', ndim=2): - """ - Aggregate embeddings at the last dimension according to `atype` - if its number of dimensions is greater than `ndim`. - Used to handle multiple tokens for words and multiple words - for labels. + def preprocess_label_words(self): + label_token_ids = [] + for label_word in self._words: + word_token_ids = [] + for word in label_word: + token_ids = self.tokenizer.encode(word, + add_special_tokens=False, + return_token_type_ids=False) + word_token_ids.append(token_ids["input_ids"]) + label_token_ids.append(word_token_ids) + + max_num_words = max([len(words) for words in self._words]) + max_num_tokens = max([ + max([len(token_ids) for token_ids in word_token_ids]) + for word_token_ids in label_token_ids + ]) + token_ids_shape = [len(self.labels), max_num_words, max_num_tokens] + token_ids = np.zeros(token_ids_shape) + word_mask = np.zeros(token_ids_shape[:-1]) + token_mask = np.zeros(token_ids_shape) + for label_id, word_token_ids in enumerate(label_token_ids): + word_mask[label_id][:len(word_token_ids)] = 1 + for word_id, tokens in enumerate(word_token_ids): + token_ids[label_id][word_id][:len(tokens)] = tokens + token_mask[label_id][word_id][:len(tokens)] = 1 + self.token_ids = paddle.to_tensor(token_ids, + dtype="int64", + stop_gradient=True) + self.word_mask = paddle.to_tensor(word_mask, + dtype="float32", + stop_gradient=True) + self.token_mask = paddle.to_tensor(token_mask, + dtype="int64", + stop_gradient=True) - Args: - embeddings (paddle.Tensor): - The original embeddings. - atype (str): - The aggregation strategy, including mean and first. - ndim (str): - The aggregated embeddings' number of dimensions. 
+ def convert_labels_to_ids(self, label: str): + assert isinstance(label, str) + return self.labels_to_ids[label] - """ - if embeddings.ndim > ndim and atype is not None: - if atype == 'mean': - if mask is None: - return embeddings.mean(axis=-1) - return (embeddings * mask.unsqueeze(0)).sum( - axis=-1) / (mask.unsqueeze(0).sum(axis=-1) + 1e-10) - elif atype == 'max': - if mask is None: - return embeddings.max(axis=-1) - return (embeddings - 1e4 * (1 - mask.unsqueeze(0))).max(axis=-1) - elif atype == 'first': - return embeddings[..., 0] - else: - raise ValueError('Unsupported aggregate type {}'.format(atype)) - return embeddings + def convert_ids_to_labels(self, index: int): + assert isinstance(index, int) + return self.labels[index] - def normalize(self, logits): - """ Normalize the logits of every example. """ - new_logits = F.softmax(logits.reshape(logits.shape[0], -1), axis=-1) - return new_logits.reshape(*logits.shape) + def project(self, outputs): + """ + Fetch label word predictions from outputs over vocabulary. + """ + token_ids = self.token_ids.reshape([-1]) + label_token_outputs = outputs.index_select(index=token_ids, axis=-1) + label_shape = [*outputs.shape[:-1], *self.token_ids.shape] + label_token_outputs = label_token_outputs.reshape(label_shape) + label_word_outputs = self.aggregate(label_token_outputs, + self.token_mask, + self.token_aggregate_type) + label_word_outputs -= 1e4 * (1 - self.word_mask) + return label_word_outputs + + def process_outputs(self, outputs, masked_positions: Tensor = None): + """ + Process outputs of `PretrainedModelForMaskedLM` over vocabulary. + """ + if masked_positions is None: + return outputs + batch_size, _, num_pred = outputs.shape + outputs = outputs.reshape([-1, num_pred]) + outputs = paddle.gather(outputs, masked_positions) + outputs = outputs.reshape([batch_size, -1, num_pred]) + return outputs + + def aggregate(self, outputs: Tensor, mask: Tensor, atype: str): + """ + Aggregate multiple tokens/words for each word/label. + """ + mask = mask.unsqueeze(0) + if atype == "mean": + outputs = outputs * mask + outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15) + elif atype == "max": + outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1) + elif atype == "first": + index = paddle.to_tensor([0]) + outputs = paddle.index_select(outputs, index, axis=-1) + else: + raise ValueError( + "Strategy {} is not supported to aggregate multiple " + "tokens.".format(atype)) + return outputs - def from_file(self, path): + def normalize(self, outputs: Tensor): """ - Load labels and corresponding words from files. + Normalize the outputs over the whole vocabulary. """ - raise NotImplementedError + batch_size = outputs.shape[0] + outputs = F.softmax(outputs.reshape([batch_size, -1]), + axis=-1).reshape(outputs.shape) + return outputs - def save_to(self, path): - label_state = [self.labels, self.token_ids.numpy().tolist()] - with open(os.path.join(path, VERBALIZER_FILE), "w") as f: - json.dump(label_state, f) + def calibrate(self, label_word_outputs: Tensor): + """ + Calibrate predictions with pre-defined weights over the whole vocabulary. 
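+        The weights come from the `label_token_weight` argument given at
+        construction, projected onto the label words and normalized.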
+ """ + if self.label_token_weight.dim() != 1: + raise ValueError("Weights of label tokens should be a 1-D tensor.") + weight_shape = self.label_token_weight.shape + output_shape = label_word_outputs.shape + if weight_shape[1:] != output_shape[1:] or weight_shape[0] != 1: + raise ValueError( + "Shapes of label token weights and predictions do not match, " + "got {} and {}.".format(weight_shape, output_shape)) + label_word_outputs /= (self.label_token_weight + 1e-15) + batch_size = label_word_outputs.shape0[0] + label_word_outputs = paddle.mean( + label_word_outputs.reshape([batch_size, -1])).reshape(output_shape) + + return label_word_outputs + + def save(self, save_path): + if not os.path.exists(save_path): + os.makedirs(save_path, exist_ok=True) + verb_config_file = os.path.join(save_path, VERBALIZER_CONFIG_FILE) + with open(verb_config_file, "w", encoding="utf-8") as fp: + json.dump(self.label_words, fp, ensure_ascii=False) + verb_params_file = os.path.join(save_path, VERBALIZER_PARAMETER_FILE) + verb_state_dict = self.state_dict() + if len(verb_state_dict) > 0: + paddle.save(self.state_dict(), VERBALIZER_PARAMETER_FILE) @classmethod - def load_from(cls, path): - with open(os.path.join(path, VERBALIZER_FILE), "r") as f: - label_state = json.load(f) - return label_state + def load_from(cls, data_path: os.PathLike, tokenizer: PretrainedTokenizer): + verb_config_file = os.path.join(data_path, VERBALIZER_CONFIG_FILE) + if not os.path.isfile(verb_config_file): + raise ValueError("{} not found under {}".format( + VERBALIZER_CONFIG_FILE, data_path)) + with open(verb_config_file, "r") as fp: + label_words = json.load(fp) + + verbalizer = cls(label_words, tokenizer) + verb_state_file = os.path.join(data_path, VERBALIZER_PARAMETER_FILE) + if os.path.isfile(verb_state_file): + verbalizer.set_state_dict(paddle.load(verb_state_file)) + logger.info( + "Loading verbalizer state dict from {}".format(verb_state_file)) + return verbalizer class ManualVerbalizer(Verbalizer): """ - Manual Verbalizer to map labels to words for hard prompt methods. + ManualVerbalizer defines mapping from labels to words manually. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The tokenizer of pretrained models. - labels (list): - The sequence of all labels. - label_words (dict or list): - The dictionary or corresponding list to map labels to words. - prefix (str): - The prefix string of words, used in PLMs like RoBERTa, which is sensitive to the prefix. + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. """ - def __init__(self, tokenizer, labels=None, label_words=None, prefix=None): - super().__init__(labels=labels) - self.tokenizer = tokenizer - self.prefix = prefix - self.label_words = label_words - - def process_label_words(self): - """ Create the label-word-token array and its corresponding mask. 
""" - if self.prefix is not None: - self._label_words = self.add_prefix(self.label_words, self.prefix) - - all_ids = [] - for words_per_label in self.label_words: - word_ids = [] - for word in words_per_label: - word_ids.append( - self.tokenizer.encode( - word, - add_special_tokens=False, - return_token_type_ids=False)["input_ids"]) - all_ids.append(word_ids) - - max_num_words = max([len(words) for words in self.label_words]) - max_num_tokens = max([ - max([len(token_ids) for token_ids in word_ids]) - for word_ids in all_ids - ]) - token_ids_shape = [len(self.labels), max_num_words, max_num_tokens] - token_ids = np.zeros(shape=token_ids_shape) - token_mask = np.zeros(shape=token_ids_shape) - word_mask = np.zeros(shape=[len(self.labels), max_num_words]) - for label_i, ids_per_label in enumerate(all_ids): - word_mask[label_i][:len(ids_per_label)] = 1 - for word_i, ids_per_word in enumerate(ids_per_label): - token_ids[label_i][word_i][:len(ids_per_word)] = ids_per_word - token_mask[label_i][word_i][:len(ids_per_word)] = 1 - self.token_ids = paddle.to_tensor(token_ids, - dtype="int64", - stop_gradient=True) - self.token_ids_mask = paddle.to_tensor(token_mask, - dtype="int64", - stop_gradient=True) - self.word_ids_mask = paddle.to_tensor(word_mask, - dtype="float32", - stop_gradient=True) - - def project(self, logits): - word_shape = [*logits.shape[:-1], *self.token_ids.shape] - token_logits = logits.index_select(index=self.token_ids.reshape([-1]), - axis=-1).reshape(word_shape) - word_logits = self.aggregate(token_logits, self.token_ids_mask) - return word_logits - - def process_outputs(self, logits, inputs, **kwargs): - mask_ids = inputs["mask_ids"].unsqueeze(2) - real_len = logits.shape[1] - mask_ids = mask_ids[:, -real_len:] - logits = paddle.where(mask_ids == 1, logits, paddle.zeros_like(logits)) - logits = logits.sum(axis=1) / mask_ids.sum(axis=1) - - word_logits = self.project(logits) - label_logits = self.aggregate(word_logits, self.word_ids_mask) - return label_logits + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer): + super(ManualVerbalizer, self).__init__(label_words=label_words, + tokenizer=tokenizer) + + def create_parameters(self): + return None + + def aggregate_multiple_mask(self, outputs: Tensor, atype: str = None): + if atype is None: + return outputs + assert outputs.ndim == 3 + if atype == "mean": + outputs = outputs.sum(axis=1) + elif atype == "max": + outputs = outputs.max(axis=1) + elif atype == "first": + index = paddle.to_tensor([0]) + outputs = paddle.index_select(outputs, index, axis=1) + elif atype == "product": + new_outputs = outputs[:, 0, :] + for index in range(1, outputs.shape[1]): + new_outputs *= outputs[:, index, :] + outputs = new_outputs + else: + raise ValueError( + "Strategy {} is not supported to aggregate multiple " + "tokens.".format(atype)) + return outputs + + def process_outputs(self, + outputs: Tensor, + masked_positions: Tensor = None, + **kwargs): + """ + Process outputs over the vocabulary, including the following steps: - @classmethod - def from_file(cls, tokenizer, label_file, prefix=None, delimiter="=="): - with open(label_file, "r", encoding="utf-8") as fp: - label_words = defaultdict(list) - for line in fp: - data = line.strip().split(delimiter) - word = data[1] if len(data) > 1 else data[0].split("##")[-1] - label_words[data[0]].append(word) - return cls(tokenizer, - labels=set(label_words.keys()), - label_words=dict(label_words), - prefix=prefix) + (1) Project outputs into the outputs of corresponding word. 
+ If self.post_log_softmax is True: + + (2) Normalize over all label words. -class MultiMaskVerbalizer(ManualVerbalizer): + (3) Calibrate (optional) - def __init__(self, tokenizer, labels=None, label_words=None, prefix=None): - super().__init__(tokenizer, labels, label_words, prefix) + (4) Aggregate multiple words for each label. - def process_outputs(self, logits, inputs, **kwargs): - """ - Process logits according to mask ids and label words. Args: - logits (paddle.Tensor): - The output of ForMaskedLM model with shape - [batch_size, max_seq_length, vocab_size]. - inputs (InputFeatures): - The input features of model, including mask_ids. + outputs (`Tensor`): + The outputs of `PretrainedModel` which class name ends with + `ForMaskedLM`. + Returns: + The prediction outputs over labels (`Tensor`). """ - batch_size, seq_len, vocab_size = logits.shape - batch_ids, word_ids = paddle.where(inputs["mask_ids"] == 1) - mask_ids = batch_ids * seq_len + word_ids - mask_logits = logits.reshape([-1, vocab_size])[mask_ids] - mask_logits = mask_logits.reshape([batch_size, -1, vocab_size]) - return mask_logits + outputs = super(ManualVerbalizer, + self).process_outputs(outputs, masked_positions) + label_word_outputs = self.project(outputs) + + if self.post_log_softmax: + label_word_outputs = self.normalize(label_word_outputs) + + if self.label_token_weight is not None: + label_word_outputs = self.calibrate(label_word_outputs) + + label_word_outputs = paddle.log(label_word_outputs + 1e-15) + label_outputs = self.aggregate(label_word_outputs, self.word_mask, + self.word_aggregate_type) + label_outputs = self.aggregate_multiple_mask(label_outputs, + self.mask_aggregate_type) + return label_outputs -class Identity(nn.Layer): + +class MaskedLMIdentity(nn.Layer): """ - Identity layer to replace the last linear layer in MLM model, which - outputs the input `sequence_output` directly. + Identity layer with the same arguments as the last linear layer in + `PretrainedModel` whose name ends with `ForMaskedLM`. """ def __init__(self): - super().__init__() + super(MaskedLMIdentity, self).__init__() def forward(self, sequence_output, masked_positions=None): return sequence_output @@ -325,59 +350,44 @@ def forward(self, sequence_output, masked_positions=None): class SoftVerbalizer(Verbalizer): """ - Soft Verbalizer to encode labels as embeddings. + SoftVerbalizer for the WARP method. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The tokenizer of pretrained models. - model (paddlenlp.transformers.PretrainedModel): - The pretrained language model. - labels (list): - The sequence of all labels. - label_words (dict or list): - The dictionary or corresponding list to map labels to words. - prefix (str): - The prefix string of words, used in PLMs like RoBERTa, which is sensitive to the prefix. + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. 
+ model (`PretrainedModel`): + An instance of PretrainedModel with class name ends with `ForMaskedLM` """ - LAST_WEIGHT = ["ErnieForMaskedLM", "BertForMaskedLM"] LAST_LINEAR = ["AlbertForMaskedLM", "RobertaForMaskedLM"] - def __init__(self, tokenizer, model, labels, label_words=None, prefix=''): - super().__init__(labels=labels) - self.tokenizer = tokenizer - self.labels = labels - self.prefix = prefix - self.label_words = label_words - - self._extract_head(model) - - def process_label_words(self): - """ Create the label-token array and its corresponding mask. """ - if self.prefix is not None: - self._label_words = self.add_prefix(self.label_words, self.prefix) - - all_ids = [] - for words_per_label in self.label_words: - if len(words_per_label) > 1: - logger.warning("Only the first word used for every label.") - all_ids.append( - self.tokenizer.encode(words_per_label[0], - add_special_tokens=False, - return_token_type_ids=False)["input_ids"]) - - max_num_tokens = max([len(tokens) for tokens in all_ids]) - token_ids = np.zeros(shape=[len(self.labels), max_num_tokens]) - token_mask = np.zeros(shape=[len(self.labels), max_num_tokens]) - for label_i, ids_per_label in enumerate(all_ids): - token_ids[label_i][:len(ids_per_label)] = ids_per_label - token_mask[label_i][:len(ids_per_label)] = 1 - self.token_ids = paddle.to_tensor(token_ids, - dtype="int64", - stop_gradient=True) - self.token_ids_mask = paddle.to_tensor(token_mask, - dtype="int64", - stop_gradient=True) + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, + model: PretrainedModel): + super(SoftVerbalizer, self).__init__(label_words=label_words, + tokenizer=tokenizer, + model=model) + del self.model + setattr(model, self.head_name[0], MaskedLMIdentity()) + + def create_parameters(self): + # Only the first word used for initialization. 
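+        # Extra label words are dropped; only the first word of each label
+        # initializes the trainable head.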
+ if self.token_ids.shape[1] != 1: + logger.warning("Only the first word for each label is used for" + " initialization.") + index = paddle.to_tensor([0]) + self.token_ids = paddle.index_select(self.token_ids, index, axis=1) + self.token_mask = paddle.index_select(self.token_mask, + index, + axis=1) + self.word_mask = paddle.ones([len(self.labels), 1]) + self._extract_head(self.model) + + def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None): + outputs = super(SoftVerbalizer, + self).process_outputs(outputs, masked_positions) + return self.head(outputs).squeeze(1) def head_parameters(self): if isinstance(self.head, nn.Linear): @@ -393,31 +403,6 @@ def non_head_parameters(self): return [(n, p) for n, p in self.head.named_parameters() if self.head_name[1] not in n] - def process_model(self, model): - setattr(model, self.head_name[0], Identity()) - return model - - def process_outputs(self, logits, inputs=None, **kwargs): - mask_ids = inputs["mask_ids"].unsqueeze(2) - real_len = logits.shape[1] - mask_ids = mask_ids[:, -real_len:] - logits = (logits * mask_ids).sum(axis=1) / mask_ids.sum(axis=1) - return self.head(logits) - - @classmethod - def from_file(cls, tokenizer, model, label_file, prefix=None): - with open(label_file, "r", encoding="utf-8") as fp: - label_words = defaultdict(list) - for line in fp: - data = line.strip().split("==") - word = data[1] if len(data) > 1 else data[0] - label_words[data[0]].append(word) - return cls(tokenizer, - model, - labels=set(label_words.keys()), - label_words=dict(label_words), - prefix=prefix) - def _extract_head(self, model): model_type = model.__class__.__name__ if model_type in self.LAST_LINEAR: @@ -439,45 +424,51 @@ def _extract_head(self, model): self.head_name.append(name) break elif model_type in self.LAST_WEIGHT: - # OnlyMLMHead last_name = [n for n, p in model.named_children()][-1] head = getattr(model, last_name) self.head_name = [last_name] - # LMPredictionHead - last_name = [n for n, p in head.named_children()][-1] - self.head = copy.deepcopy(getattr(head, last_name)) - self.head_name.append("decoder") + # OnlyMLMHead + if model_type in ["ErnieForMaskedLM", "BertForMaskedLM"]: + last_name = [n for n, p in head.named_children()][-1] + self.head = copy.deepcopy(getattr(head, last_name)) + self.head_name.append("decoder") + else: + self.head = copy.deepcopy(head) + # LMPredictionHead module = paddle.to_tensor(getattr(self.head, "decoder_weight")) - bias = paddle.to_tensor(getattr(self.head, "decoder_bias")) new_head = nn.Linear(len(self.labels), module.shape[1], bias_attr=False) new_head.weight.set_value(self._create_init_weight(module.T).T) setattr(self.head, "decoder_weight", new_head.weight) getattr(self.head, "decoder_weight").stop_gradient = False - setattr( - self.head, "decoder_bias", - self.head.create_parameter(shape=[len(self.labels)], - dtype=new_head.weight.dtype, - is_bias=True)) - getattr(self.head, "decoder_bias").stop_gradient = False + if hasattr(self.head, "decoder_bias"): + bias = paddle.to_tensor(getattr(self.head, "decoder_bias")) + setattr( + self.head, "decoder_bias", + self.head.create_parameter(shape=[len(self.labels)], + dtype=new_head.weight.dtype, + is_bias=True)) + getattr(self.head, "decoder_bias").stop_gradient = False else: raise NotImplementedError( f"Please open an issue to request for support of {model_type}" + f" or contribute to PaddleNLP.") def _create_init_weight(self, weight, is_bias=False): + token_ids = self.token_ids.squeeze(1) + token_mask = self.token_mask.squeeze(1) + 
aggr_type = self.token_aggregate_type if is_bias: - bias = paddle.index_select(weight, - self.token_ids.reshape([-1]), - axis=0).reshape(self.token_ids.shape) - bias = self.aggregate(bias, self.token_ids_mask) + bias = paddle.index_select(weight, token_ids.reshape([-1]), + axis=0).reshape(token_ids.shape) + bias = self.aggregate(bias, token_mask, aggr_type) return bias else: - word_shape = [weight.shape[0], *self.token_ids.shape] + word_shape = [weight.shape[0], *token_ids.shape] weight = paddle.index_select(weight, - self.token_ids.reshape([-1]), + token_ids.reshape([-1]), axis=1).reshape(word_shape) - weight = self.aggregate(weight, self.token_ids_mask) + weight = self.aggregate(weight, token_mask, aggr_type) return weight diff --git a/paddlenlp/trainer/__init__.py b/paddlenlp/trainer/__init__.py index 1c5bdad43c3c..dd95ea0c72f9 100644 --- a/paddlenlp/trainer/__init__.py +++ b/paddlenlp/trainer/__init__.py @@ -15,7 +15,9 @@ from .argparser import * from .training_args import * from .compression_args import * -from .trainer_base import * +from .trainer import * from .trainer_callback import * from .trainer_utils import * -from .trainer_compress import * \ No newline at end of file +from .trainer_compress import * +from .training_args_seq2seq import * +from .trainer_seq2seq import * \ No newline at end of file diff --git a/paddlenlp/trainer/integrations.py b/paddlenlp/trainer/integrations.py index 806c29075f5b..2de61f24c9ff 100644 --- a/paddlenlp/trainer/integrations.py +++ b/paddlenlp/trainer/integrations.py @@ -21,6 +21,7 @@ from .trainer_callback import TrainerCallback from ..utils.log import logger +from ..transformers import PretrainedModel def is_visualdl_available(): @@ -94,12 +95,17 @@ def on_train_begin(self, args, state, control, **kwargs): self.vdl_writer.add_text("args", args.to_json_string()) if "model" in kwargs: model = kwargs["model"] - if hasattr(model, - "init_config") and model.init_config is not None: + if isinstance(model, PretrainedModel + ) and model.constructed_from_pretrained_config(): + model.config.architectures = [model.__class__.__name__] + self.vdl_writer.add_text("model_config", str(model.config)) + elif hasattr(model, + "init_config") and model.init_config is not None: model_config_json = json.dumps(model.get_model_config(), ensure_ascii=False, indent=2) self.vdl_writer.add_text("model_config", model_config_json) + if hasattr(self.vdl_writer, "add_hparams"): self.vdl_writer.add_hparams(args.to_sanitized_dict(), metrics_list=[]) diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer.py similarity index 76% rename from paddlenlp/trainer/trainer_base.py rename to paddlenlp/trainer/trainer.py index 4fe5766d8458..7df66c6b594c 100644 --- a/paddlenlp/trainer/trainer_base.py +++ b/paddlenlp/trainer/trainer.py @@ -38,7 +38,11 @@ import paddle.nn as nn import paddle.amp.auto_cast as autocast import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients +from paddle.distributed.sharding import group_sharded_parallel +from paddle.fluid.dygraph.parallel import sync_params_buffers + from paddle.io import ( Dataset, DataLoader, @@ -65,9 +69,14 @@ EvalLoopOutput, speed_metrics, OptimizerNames, + ShardingOption, PREFIX_CHECKPOINT_DIR, get_last_checkpoint, get_scheduler, + IterableDatasetShard, + has_length, + find_batch_size, + RemoveColumnsCollator, ) from .trainer_callback import ( CallbackHandler, @@ -179,10 +188,6 @@ def __init__( optimizers: 
Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler] = (None, None), ): - if paddle.distributed.get_world_size() > 1: - if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized( - ): - paddle.distributed.init_parallel_env() if args is None: output_dir = "tmp_trainer" @@ -193,7 +198,7 @@ def __init__( self.args = args self.is_in_train = False - self.do_grad_scaling = args.fp16 + # self.do_grad_scaling = args.fp16 # Seed must be set before instantiating the model when using model set_seed(self.args.seed) @@ -204,6 +209,20 @@ def __init__( if self.args.should_save: os.makedirs(self.args.output_dir, exist_ok=True) + self.sharding = None + if len(args.sharding) > 0: + if args.local_rank == -1: + raise ValueError( + "Using sharding only works in distributed training.") + self.sharding = True + + # init parallel env + if paddle.distributed.get_world_size() > 1: + if self.sharding: + self.hcg = fleet.get_hybrid_communicate_group() + self.dp_group = self.hcg.get_data_parallel_group() + self.sharding_group = self.hcg.get_sharding_parallel_group() + default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding( tokenizer) @@ -223,6 +242,13 @@ def __init__( self.control = TrainerControl() self._signature_columns = None + if (self.sharding is not None) and (self.optimizer is not None + or self.lr_scheduler is not None): + raise RuntimeError( + "Passing `optimizers` is not allowed if sharding is enabled." + "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method." + ) + default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks( self.args.report_to) callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks @@ -243,16 +269,41 @@ def __init__( "train_dataset does not implement __len__, max_steps has to be specified" ) - if args.fp16: - self.scaler = paddle.amp.GradScaler( - init_loss_scaling=self.args.scale_loss) + self.do_grad_scaling = False + if (args.fp16 or args.bf16): logger.info("Using half precision") + self.do_grad_scaling = True + self.amp_dtype = "float16" if args.fp16 else "bfloat16" + + if self.sharding is not None: + self.scaler = paddle.amp.GradScaler( + init_loss_scaling=self.args.scale_loss) + if self.amp_dtype == "float16": + if ShardingOption.SHARD_OP in self.args.sharding: + self.scaler = fleet.distributed_scaler(self.scaler) + else: + # scaler for stage2 and stage3 + if paddle.framework.in_dygraph_mode(): + from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + self.scaler = GroupShardedScaler(self.scaler) + else: + from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + self.scaler = ShardingScaler(self.scaler) + else: + self.do_grad_scaling = False + self.use_cuda_amp = False + self.amp_dtype = None + + else: + self.scaler = paddle.amp.GradScaler( + init_loss_scaling=self.args.scale_loss) if args.recompute: def fn(layer): - if type(layer) == paddle.nn.TransformerEncoder or type( - layer) == paddle.nn.TransformerDecoder: + if hasattr( + layer, + "enable_recompute") and layer.enable_recompute is False: layer.enable_recompute = True model.apply(fn) @@ -314,10 +365,10 @@ def load_state_dict_from_checkpoint(self, resume_from_checkpoint=None): # Load potential model checkpoint if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: - resume_from_checkpoint = get_last_checkpoint(args.output_dir) + resume_from_checkpoint = 
get_last_checkpoint(self.args.output_dir) if resume_from_checkpoint is None: raise ValueError( - f"No valid checkpoint found in output directory ({args.output_dir})" + f"No valid checkpoint found in output directory ({self.args.output_dir})" ) if resume_from_checkpoint is not None: @@ -330,38 +381,15 @@ def load_state_dict_from_checkpoint(self, resume_from_checkpoint=None): logger.info(f"Loading model from {resume_from_checkpoint} .") # We load the model state dict on the CPU to avoid an OOM error. - state_dict = paddle.load( - os.path.join(resume_from_checkpoint, WEIGHTS_NAME)) + state_dict = paddle.load(os.path.join(resume_from_checkpoint, + WEIGHTS_NAME), + return_numpy=True) # If the model is on the GPU, it still works! self._set_state_dict_in_model(state_dict) # release memory del state_dict - @staticmethod - def init_num_steps(args, num_samples_per_epoch): - args.total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size - - num_update_steps_per_epoch = num_samples_per_epoch // args.train_batch_size + int( - num_samples_per_epoch % args.train_batch_size > 0) - num_update_steps_per_epoch //= args.gradient_accumulation_steps - num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) - args.num_update_steps_per_epoch = num_update_steps_per_epoch - - if args.max_steps > 0: - args.num_training_steps = args.max_steps - args.num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( - args.max_steps % num_update_steps_per_epoch > 0) - args.num_train_samples = args.max_steps * args.total_train_batch_size - else: - args.num_training_steps = num_update_steps_per_epoch * args.num_train_epochs - args.num_train_epochs = math.ceil(args.num_train_epochs) - args.num_train_samples = num_samples_per_epoch * args.num_train_epochs - - if args.warmup_steps <= 0: - args.warmup_steps = int(args.warmup_ratio * args.num_training_steps) - return args - def train( self, resume_from_checkpoint: Optional[Union[str, bool]] = None, @@ -401,8 +429,9 @@ def train( logger.info(f"Loading model from {resume_from_checkpoint} .") # TODO: Need to load the model state dict on the CPU to avoid an OOM error. - state_dict = paddle.load( - os.path.join(resume_from_checkpoint, WEIGHTS_NAME)) + state_dict = paddle.load(os.path.join(resume_from_checkpoint, + WEIGHTS_NAME), + return_numpy=True) # If the model is on the GPU, it still works! 
self._set_state_dict_in_model(state_dict) @@ -410,42 +439,86 @@ def train( del state_dict train_dataloader = self.get_train_dataloader() - model = self._wrap_model(self.model_wrapped) + + total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + len_dataloader = None + if has_length(train_dataloader): + len_dataloader = len(train_dataloader) + num_update_steps_per_epoch = len( + train_dataloader) // args.gradient_accumulation_steps + num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) + num_examples = len(self.train_dataset) + + if args.max_steps > 0: + max_steps = args.max_steps + num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( + args.max_steps % num_update_steps_per_epoch > 0) + num_train_samples = args.max_steps * total_train_batch_size + else: + max_steps = num_update_steps_per_epoch * args.num_train_epochs + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = len( + self.train_dataset) * args.num_train_epochs + + if args.minimum_eval_times is not None and args.minimum_eval_times > 0: + if max_steps // args.eval_steps < args.minimum_eval_times: + exp_step = max_steps / args.minimum_eval_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Reset eval step by minimum_eval_times to %d" % + exp_step) + args.eval_steps = exp_step + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size + max_steps = args.max_steps + # Setting a very large number of epochs so we go as many times as necessary over the iterator. + num_train_epochs = sys.maxsize + num_update_steps_per_epoch = max_steps + num_examples = total_train_batch_size * args.max_steps + num_train_samples = args.max_steps * total_train_batch_size + else: + raise ValueError( + f"args.max_steps must be set to a positive value if dataloader does not have a length, was {args.max_steps}" + ) + + # delay_optimizer_creation = ( + # self.sharding is not None + # and ShardingOption.SHARD_OP not in self.args.sharding + # ) + delay_optimizer_creation = self.sharding is None + + if not delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() - args = self.init_num_steps(args, len(self.train_dataset)) + model = self._wrap_model(self.model_wrapped) - if args.minimum_eval_times is not None and args.minimum_eval_times > 0: - if args.num_training_steps // args.eval_steps < args.minimum_eval_times: - exp_step = args.num_training_steps / args.minimum_eval_times - exp_step = max(int(exp_step - exp_step % 10), 10) - logger.info("Reset eval step by minimum_eval_times to %d" % - exp_step) - args.eval_steps = exp_step + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model - self.create_optimizer_and_scheduler( - num_training_steps=args.num_training_steps) + if delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) - num_examples = len(self.train_dataset) - logger.info("***** Running training *****") logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Num Epochs = {num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( - f" Total train batch size (w. 
parallel, distributed & accumulation) = {args.total_train_batch_size}" + f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}" ) - logger.info(f" Total optimization steps = {args.num_training_steps}") - logger.info(f" Total num train samples = {args.num_train_samples}") + logger.info(f" Total optimization steps = {max_steps}") + logger.info(f" Total num train samples = {num_train_samples}") + logger.info( + f" Number of trainable parameters = {sum(p.numel().item() for p in model.parameters() if not p.stop_gradient) }" + ) start_time = time.time() self._globalstep_last_start_time = time.time() @@ -459,10 +532,10 @@ def train( os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)): self.state = TrainerState.load_from_json( os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) - epochs_trained = self.state.global_step // args.num_update_steps_per_epoch + epochs_trained = self.state.global_step // num_update_steps_per_epoch if not args.ignore_data_skip: steps_trained_in_current_epoch = self.state.global_step % ( - args.num_update_steps_per_epoch) + num_update_steps_per_epoch) steps_trained_in_current_epoch *= args.gradient_accumulation_steps else: steps_trained_in_current_epoch = 0 @@ -498,15 +571,17 @@ def train( ) epoch_iterator = train_dataloader - steps_in_epoch = len(epoch_iterator) + # steps_in_epoch = len(epoch_iterator) + steps_in_epoch = (len(epoch_iterator) if len_dataloader is not None else + args.max_steps * args.gradient_accumulation_steps) self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader - self.state.max_steps = int(args.num_training_steps) - self.state.num_train_epochs = args.num_train_epochs + self.state.max_steps = int(max_steps) + self.state.num_train_epochs = num_train_epochs self.state.is_local_process_zero = self.is_local_process_zero() self.state.is_world_process_zero = self.is_world_process_zero() @@ -517,7 +592,7 @@ def train( self._total_loss_scalar = 0.0 self._globalstep_last_logged = self.state.global_step - for epoch in range(epochs_trained, args.num_train_epochs): + for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, paddle.io.DataLoader) and isinstance( train_dataloader.batch_sampler, @@ -560,11 +635,22 @@ def train( self.control = self.callback_handler.on_step_begin( args, self.state, self.control) + dp_enabled = self.args.dp_degree > 1 if self.sharding else args.local_rank != -1 + forbidden_no_sync = False + if self.sharding and (ShardingOption.SHARD_OP + not in self.args.sharding): + # stage2 and stage3 should no_sync, because the is no DDP wrapper and no_sync API + forbidden_no_sync = True + availiable_no_sync = dp_enabled and not forbidden_no_sync + is_no_sync = ((( (step + 1) % args.gradient_accumulation_steps != 0) - and args.local_rank != -1 + and availiable_no_sync and args._no_sync_in_gradient_accumulation) - or (args.recompute and args.local_rank != -1)) + or (args.recompute and availiable_no_sync)) + # sharding + # stage1. the same as ddp + # stage2. manualy collect gradient on dp group if is_no_sync: # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. 
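The `is_no_sync` condition introduced above decides when the backward pass may skip gradient all-reduce: only intermediate accumulation steps under plain data parallelism qualify, while sharding stage2/stage3 always collects gradients manually on the dp group. A framework-free sketch of that decision, paraphrasing this hunk (function and argument names are illustrative, not part of the PR):

```python
def should_skip_sync(step, accumulation_steps, dp_enabled, stage2_or_3,
                     recompute, no_sync_in_accumulation=True):
    """Paraphrase of the trainer's no_sync decision.

    Sharding stage2/stage3 has no DDP wrapper (and no no_sync API), so the
    all-reduce is never skipped there; gradients are collected manually instead.
    """
    forbidden_no_sync = stage2_or_3
    available_no_sync = dp_enabled and not forbidden_no_sync
    in_accumulation = (step + 1) % accumulation_steps != 0
    return available_no_sync and (
        (in_accumulation and no_sync_in_accumulation) or recompute)


# With 4 accumulation steps and plain data parallelism, only every 4th
# micro-step triggers a synchronized backward pass.
for step in range(8):
    print(step, should_skip_sync(step, 4, dp_enabled=True,
                                 stage2_or_3=False, recompute=False))
```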
@@ -580,16 +666,43 @@ def train( steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch): - if (args.recompute and args.local_rank != -1): + # Maunally collect gradients + # Case 1: Use sharding stage 2/3 with dp + # Case 2: Use recompute and dp + # local_rank != -1 don't means dp in networks. + if self.sharding and ShardingOption.SHARD_OP not in self.args.sharding: + if self.args.dp_degree > 1: + fused_allreduce_gradients( + model.parameters(), + fleet.get_hybrid_communicate_group()) + if ShardingOption.FULL_SHARD in self.args.sharding: + # Why need sync on parm again ? + # TODO: fix this. + for p in model.parameters(): + if hasattr(p, "bw_storage"): + assert p.grad is None, "This case shouldn't happen." + p.bw_storage.scale_( + 1.0 / self.dp_group.nranks) + paddle.distributed.all_reduce( + p.bw_storage, group=self.dp_group) + + elif (args.recompute and dp_enabled): fused_allreduce_gradients(list(model.parameters()), None) - + # Optimizer step + optimizer_was_run = True if self.do_grad_scaling: - self.scaler.minimize(self.optimizer, tr_loss) + scale_before = self.scaler._scale.numpy() + self.scaler.step(self.optimizer) + self.scaler.update() + scale_after = self.scaler._scale.numpy() + optimizer_was_run = scale_before <= scale_after else: self.optimizer.step() - self.lr_scheduler.step() + if optimizer_was_run: + self.lr_scheduler.step() + self.optimizer.clear_grad() self.state.global_step += 1 @@ -640,7 +753,7 @@ def train( WEIGHTS_NAME) if os.path.exists(best_model_path): # We load the model state dict on the CPU to avoid an OOM error. - state_dict = paddle.load(best_model_path) + state_dict = paddle.load(best_model_path, return_numpy=True) # If the model is on the GPU, it still works! self._set_state_dict_in_model(state_dict) else: @@ -654,7 +767,7 @@ def train( metrics = speed_metrics("train", start_time, - num_samples=args.num_train_samples, + num_samples=num_train_samples, num_steps=self.state.max_steps) metrics["train_loss"] = train_loss @@ -669,7 +782,7 @@ def train( return TrainOutput(self.state.global_step, train_loss, metrics) def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: - if not isinstance(self.train_dataset, collections.abc.Sized): + if self.train_dataset is None or not has_length(self.train_dataset): return None if self.args.world_size <= 1: @@ -709,14 +822,14 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, logs["learning_rate"] = self._get_learning_rate() logs["global_step"] = int(self.state.global_step) + total_train_batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.world_size + num_steps = self.state.global_step - self._globalstep_last_logged logs.update( speed_metrics( "interval", self._globalstep_last_start_time, - num_samples=self.args.train_batch_size * - self.args.gradient_accumulation_steps, - num_steps=self.state.global_step - - self._globalstep_last_logged, + num_samples=total_train_batch_size * num_steps, + num_steps=num_steps, )) self._total_loss_scalar += tr_loss_scalar @@ -755,6 +868,23 @@ def get_train_dataloader(self): train_dataset = self._remove_unused_columns(train_dataset, description="training") + if self._is_iterable_dataset(train_dataset): + if self.args.world_size > 1: + train_dataset = IterableDatasetShard( + train_dataset, + batch_size=self.args.per_device_train_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.world_size, + process_index=self.args.process_index, + ) + + return DataLoader( + train_dataset, + 
batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + ) + train_sampler = self._get_train_sampler() return DataLoader( @@ -768,7 +898,7 @@ def _get_eval_sampler(self, eval_dataset: Dataset): if self.args.world_size <= 1: return paddle.io.BatchSampler( eval_dataset, - batch_size=self.args.eval_batch_size, + batch_size=self.args.per_device_eval_batch_size, shuffle=False, drop_last=False, ) @@ -777,7 +907,7 @@ def _get_eval_sampler(self, eval_dataset: Dataset): eval_dataset, num_replicas=self.args.world_size, rank=self.args.process_index, - batch_size=self.args.eval_batch_size, + batch_size=self.args.per_device_eval_batch_size, shuffle=False, drop_last=False, ) @@ -804,6 +934,23 @@ def get_eval_dataloader(self, eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") + if self._is_iterable_dataset(eval_dataset): + if self.args.world_size > 1: + eval_dataset = IterableDatasetShard( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.world_size, + process_index=self.args.process_index, + ) + + return DataLoader( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + ) + eval_sampler = self._get_eval_sampler(eval_dataset) return DataLoader( @@ -829,6 +976,25 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: test_dataset = self._remove_unused_columns(test_dataset, description="test") + if self._is_iterable_dataset(test_dataset): + if self.args.world_size > 1: + test_dataset = IterableDatasetShard( + test_dataset, + batch_size=self.args.per_device_eval_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.world_size, + process_index=self.args.process_index, + ) + + return DataLoader( + test_dataset, + batch_size=self.args.per_device_eval_batch_size * + self.world_size, + collate_fn=self. + data_collator, # _get_collator_with_removed_columns + num_workers=self.args.dataloader_num_workers, + ) + test_sampler = self._get_eval_sampler(test_dataset) # We use the same batch_size as for eval. 
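The dataloader builders in these hunks now detect iterable datasets and wrap them in `IterableDatasetShard` (added to `trainer_utils.py` further down in this PR) so that each process consumes a disjoint slice of every global batch. A small usage sketch that relies only on the class introduced by this diff; the toy `Numbers` dataset is illustrative:

```python
from paddle.io import IterableDataset
from paddlenlp.trainer.trainer_utils import IterableDatasetShard  # added by this PR


class Numbers(IterableDataset):
    """Toy stream of 12 integers, standing in for a real iterable dataset."""

    def __iter__(self):
        yield from range(12)


# Two processes, per-device batch size 2: each rank sees its half of every
# global batch of 4, matching the behaviour described in the class docstring below.
shard0 = IterableDatasetShard(Numbers(), batch_size=2, num_processes=2, process_index=0)
shard1 = IterableDatasetShard(Numbers(), batch_size=2, num_processes=2, process_index=1)
print(list(shard0))  # [0, 1, 4, 5, 8, 9]
print(list(shard1))  # [2, 3, 6, 7, 10, 11]
```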
@@ -867,14 +1033,28 @@ def create_optimizer(self, lr_scheduler=None): optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs( self.args) - self.optimizer = optimizer_cls( - learning_rate=self.lr_scheduler - if lr_scheduler is None else lr_scheduler, - apply_decay_param_fun=apply_decay_param_fun, - parameters=self.model.parameters(), - weight_decay=self.args.weight_decay, - grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm), - **optimizer_kwargs) + if ShardingOption.SHARD_OP in self.args.sharding: + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer import DygraphShardingOptimizer + self.optimizer = DygraphShardingOptimizer( + hcg=fleet.get_hybrid_communicate_group(), + user_defined_strategy=None, + params=self.model.parameters(), + inner_optimizer_class=optimizer_cls, + learning_rate=self.lr_scheduler + if lr_scheduler is None else lr_scheduler, + apply_decay_param_fun=apply_decay_param_fun, + weight_decay=self.args.weight_decay, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm), + **optimizer_kwargs) + else: + self.optimizer = optimizer_cls( + learning_rate=self.lr_scheduler + if lr_scheduler is None else lr_scheduler, + apply_decay_param_fun=apply_decay_param_fun, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm), + **optimizer_kwargs) return self.optimizer @@ -962,9 +1142,22 @@ def create_scheduler(self, num_training_steps: int): return self.lr_scheduler + def num_examples(self, dataloader: DataLoader) -> int: + """ + Helper to get number of samples in a [`~paddle.io.DataLoader`] by accessing its dataset. When + dataloader.dataset does not exist or has no length, estimates as best it can + """ + try: + dataset = dataloader.dataset + # Special case for IterableDatasetShard, we need to dig deeper + if isinstance(dataset, IterableDatasetShard): + return len(dataloader.dataset.dataset) + return len(dataloader.dataset) + except (NameError, AttributeError, TypeError + ): # no dataset or length, estimate by length of dataloader + return len(dataloader) * self.args.per_device_train_batch_size + def _wrap_model(self, model, training=True): - if self.args.world_size > 1: - model = paddle.DataParallel(model) # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again if unwrap_model(model) is not model: @@ -975,6 +1168,53 @@ def _wrap_model(self, model, training=True): if not training: return model + # Mixed precision training + if training and self.do_grad_scaling: # self.args.fp16_opt_level=="O2": + # model, self.optimizer + decorated = paddle.amp.decorate(models=model, + optimizers=self.optimizer, + level=self.args.fp16_opt_level, + dtype=self.amp_dtype) + if self.optimizer is None: + model = decorated + else: + model, self.optimizer = decorated + + # Multi-gpu training + if self.args.world_size > 1 and self.sharding is None: + model = paddle.DataParallel(model) + # Distributed training (should be after fp16 initialization) + if self.sharding is not None: + # Sharded DDP! 
+ if ShardingOption.SHARD_OP in self.args.sharding: + model = fleet.distributed_model(model) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + else: + # sync params (broadcast) buffers in dp group + if self.args.dp_degree > 1: + hcg = fleet.get_hybrid_communicate_group() + dp_group = hcg.get_data_parallel_group() + sync_params_buffers(model, + comm_group=dp_group, + src_rank=dp_group.ranks[0]) + + cpu_offload = ShardingOption.OFFLOAD in self.args.sharding + assert self.optimizer is not None, "optimizer is empty!" + level = None + if ShardingOption.SHARD_GRAD_OP in self.args.sharding: + level = "os_g" + if ShardingOption.FULL_SHARD in self.args.sharding: + level = "p_g_os" + + model, optimizer, _ = group_sharded_parallel( + model, + self.optimizer, + level=level, + scaler=None, + group=self.sharding_group, + offload=cpu_offload) + self.optimizer = optimizer + return model def _prepare_input( @@ -1013,14 +1253,14 @@ def autocast_smart_context_manager(self): A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired arguments, depending on the situation. """ - if self.args.fp16: + if self.args.fp16 or self.args.bf16: ctx_manager = autocast(True, custom_black_list=[ "reduce_sum", "c_softmax_with_cross_entropy", - "elementwise_div", ], - level=self.args.fp16_opt_level) + level=self.args.fp16_opt_level, + dtype=self.amp_dtype) else: ctx_manager = contextlib.nullcontext() if sys.version_info >= ( 3, 7) else contextlib.suppress() @@ -1041,6 +1281,8 @@ def compute_loss(self, model, inputs, return_outputs=False): labels = inputs["generator_labels"] else: labels = None + # TODO: label_names pop + outputs = model(**inputs) if self.criterion is not None: @@ -1118,11 +1360,25 @@ def _save_checkpoint(self, model, metrics=None): self.save_model(output_dir) + if self.sharding is not None: + if self.dp_group.rank == 0: + paddle.save( + self.optimizer.state_dict(), + os.path.join( + output_dir, + OPTIMIZER_NAME + f"_shard{self.sharding_group.rank}")) + if self.args.should_save: - paddle.save(self.optimizer.state_dict(), - os.path.join(output_dir, OPTIMIZER_NAME)) + if self.sharding is not None: + # alias for opitimizer state, should be merge on different shard! + paddle.save({}, os.path.join(output_dir, OPTIMIZER_NAME)) + else: + paddle.save(self.optimizer.state_dict(), + os.path.join(output_dir, OPTIMIZER_NAME)) + paddle.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + if self.do_grad_scaling: paddle.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) @@ -1273,8 +1529,22 @@ def _load_optimizer_and_scheduler(self, checkpoint): checkpoint, OPTIMIZER_NAME)) and os.path.isfile( os.path.join(checkpoint, SCHEDULER_NAME)): # Load in optimizer and scheduler states - self.optimizer.set_state_dict( - paddle.load(os.path.join(checkpoint, OPTIMIZER_NAME))) + if self.sharding is not None: + self.optimizer.set_state_dict( + paddle.load(os.path.join( + checkpoint, + OPTIMIZER_NAME + f"_shard{self.sharding_group.rank}"), + return_numpy=True)) + empty_dict = paddle.load(os.path.join(checkpoint, + OPTIMIZER_NAME), + return_numpy=True) + assert len( + empty_dict + ) == 0, "Optimizer file of sharding, should be empty!" 
+ else: + self.optimizer.set_state_dict( + paddle.load(os.path.join(checkpoint, OPTIMIZER_NAME), + return_numpy=True)) self.lr_scheduler.set_state_dict( paddle.load(os.path.join(checkpoint, SCHEDULER_NAME))) if self.do_grad_scaling and os.path.isfile( @@ -1392,9 +1662,9 @@ def evaluation_loop( else: raise ValueError("Only support for paddle.io.DataLoader") - if max_eval_iters <= 0: - num_samples = self.num_examples(dataloader) - else: + num_samples = None + if max_eval_iters > 0: + # on eval limit steps num_samples = batch_size * self.args.world_size * max_eval_iters if isinstance( dataloader, paddle.fluid.dataloader.dataloader_iter. @@ -1402,21 +1672,24 @@ def evaluation_loop( dataloader._batch_sampler, NlpDistributedBatchSampler): consumed_samples = ( (self.state.global_step) // args.eval_steps - ) * max_eval_iters * args.eval_batch_size * args.world_size + ) * max_eval_iters * args.per_device_eval_batch_size * args.world_size dataloader._batch_sampler.set_epoch( consumed_samples=consumed_samples) logger.info(f"***** Running {description} *****") - logger.info(f" Num examples = {num_samples}") + if has_length(dataloader): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + logger.info(f" Total prediction steps = {len(dataloader)}") + else: + logger.info(" Num examples: Unknown") logger.info(f" Pre device batch size = {batch_size}") logger.info(f" Total Batch size = {batch_size * self.args.world_size}") - logger.info(f" Total prediction steps = {len(dataloader)}") model.eval() self.callback_handler.eval_dataloader = dataloader # Do this before wrapping. - # eval_dataset = dataloader.dataset + eval_dataset = dataloader.dataset if args.past_index >= 0: self._past = None @@ -1432,10 +1705,18 @@ def evaluation_loop( all_labels = None # Will be useful when we have an iterable dataset so don't know its length. + observed_num_examples = 0 # Main evaluation loop losses = [] for step, inputs in enumerate(dataloader): # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + # For batch samplers, batch_size is not known by the dataloader in advance. + if batch_size is None: + batch_size = observed_batch_size + # Prediction step loss, logits, labels = self.prediction_step(model, inputs, @@ -1476,6 +1757,22 @@ def evaluation_loop( all_labels = labels if all_labels is None else nested_concat( all_labels, labels, padding_index=-100) + # Number of samples + if num_samples is not None: + pass + elif has_length(eval_dataset): + num_samples = len(eval_dataset) + # The instance check is weird and does not actually check for the type, but whether the dataset has the right + # methods. Therefore we need to make sure it also has the attribute. + elif isinstance(eval_dataset, IterableDatasetShard) and hasattr( + eval_dataset, "num_examples"): + num_samples = eval_dataset.num_examples + else: + if has_length(dataloader): + num_samples = self.num_examples(dataloader) + else: # both len(dataloader.dataset) and len(dataloader) fail + num_samples = observed_num_examples + # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of # samplers has been rounded to a multiple of batch_size, so we truncate. 
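Because an iterable evaluation dataset may have no usable `len()`, the evaluation loop above resolves the number of samples through a chain of fallbacks, counting observed batches (via the new `find_batch_size` helper) only as a last resort. A framework-free paraphrase of that precedence; the helper name and arguments are illustrative:

```python
def resolve_num_samples(limited_eval_samples, dataset_len, shard_num_examples,
                        dataloader_estimate, observed_num_examples):
    """Mirror of the evaluation loop's fallback order:
    explicit eval limit > len(eval_dataset) > IterableDatasetShard.num_examples
    > len(dataloader) * batch_size > examples actually seen during the loop."""
    for candidate in (limited_eval_samples, dataset_len, shard_num_examples,
                      dataloader_estimate):
        if candidate is not None:
            return candidate
    return observed_num_examples


print(resolve_num_samples(None, None, None, None, 137))  # falls through -> 137
print(resolve_num_samples(None, 1000, None, None, 137))  # sized dataset wins -> 1000
```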
if all_losses is not None: @@ -1544,7 +1841,7 @@ def predict(self, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - total_batch_size = self.args.eval_batch_size * self.args.world_size + total_batch_size = self.args.per_device_eval_batch_size * self.args.world_size output.metrics.update( speed_metrics( metric_key_prefix, @@ -1714,6 +2011,15 @@ def _pad_across_processes(self, tensor, pad_index=-100): new_tensor[:, :old_size[1]] = tensor return new_tensor + def _set_signature_columns_if_needed(self): + if self._signature_columns is None: + # Inspect model forward signature to keep only the arguments it accepts. + signature = inspect.signature(self.model.forward) + self._signature_columns = list(signature.parameters.keys()) + # Labels may be named label or label_ids, the default data collator handles that. + self._signature_columns += list( + set(["label", "label_ids"] + self.label_names)) + def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None): @@ -1751,6 +2057,30 @@ def _remove_unused_columns(self, else: return dataset.remove_columns(ignored_columns) + def _get_collator_with_removed_columns( + self, + data_collator: Callable, + description: Optional[str] = None) -> Callable: + """Wrap the data collator in a callable removing unused columns.""" + if not self.args.remove_unused_columns: + return data_collator + self._set_signature_columns_if_needed() + signature_columns = self._signature_columns + + remove_columns_collator = RemoveColumnsCollator( + data_collator=data_collator, + signature_columns=signature_columns, + logger=logger, + description=description, + model_name=self.model.__class__.__name__, + ) + return remove_columns_collator + + def _is_iterable_dataset(self, dataset): + return isinstance(dataset, paddle.io.IterableDataset) or ( + isinstance(dataset, datasets.iterable_dataset.IterableDataset) + if is_datasets_available() else False) + def print_config(self, args=None, key=""): """ print config values diff --git a/paddlenlp/trainer/trainer_compress.py b/paddlenlp/trainer/trainer_compress.py index 0d8fabab9621..60feab55d3c3 100644 --- a/paddlenlp/trainer/trainer_compress.py +++ b/paddlenlp/trainer/trainer_compress.py @@ -36,7 +36,7 @@ from ..transformers.model_outputs import BaseModelOutputWithPoolingAndCrossAttentions from ..metrics import ChunkEvaluator from ..metrics.squad import squad_evaluate, compute_prediction -from .trainer_base import Trainer +from .trainer import Trainer def global_try_import_slim(): diff --git a/paddlenlp/trainer/trainer_seq2seq.py b/paddlenlp/trainer/trainer_seq2seq.py new file mode 100644 index 000000000000..952eb09d3ec1 --- /dev/null +++ b/paddlenlp/trainer/trainer_seq2seq.py @@ -0,0 +1,268 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
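`_set_signature_columns_if_needed` and `_get_collator_with_removed_columns` above pair the model's `forward` signature with the new `RemoveColumnsCollator` (defined in `trainer_utils.py` later in this PR) so that stray dataset columns are dropped before collation instead of being passed to the model. A hedged sketch of the underlying idea, using a plain function in place of a real `model.forward`:

```python
import inspect


def forward(input_ids, attention_mask=None, labels=None):  # stand-in for model.forward
    return input_ids


# Columns the "model" accepts, plus the label aliases the trainer always keeps.
signature_columns = list(inspect.signature(forward).parameters.keys())
signature_columns += ["label", "label_ids"]


def remove_columns(feature: dict) -> dict:
    """Keep only the keys that forward() (or the label aliases) can accept."""
    return {k: v for k, v in feature.items() if k in signature_columns}


features = [{"input_ids": [1, 2, 3], "labels": 0, "text": "raw text the model cannot take"}]
print([remove_columns(f) for f in features])
# -> [{'input_ids': [1, 2, 3], 'labels': 0}]
```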
+ +from typing import Any, Dict, List, Optional, Tuple, Union + +import paddle +from paddle import nn +from paddle.io import Dataset + +from .trainer import Trainer +from .trainer_utils import PredictionOutput +from ..utils.log import logger + +__all__ = [ + "Seq2SeqTrainer", +] + + +class Seq2SeqTrainer(Trainer): + + def evaluate(self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + **gen_kwargs) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get( + "max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = (gen_kwargs["num_beams"] + if gen_kwargs.get("num_beams") is not None + else self.args.generation_num_beams) + self._gen_kwargs = gen_kwargs + + return super().evaluate(eval_dataset, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix) + + def predict(self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "test", + **gen_kwargs) -> PredictionOutput: + """ + Run prediction and returns predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in `evaluate()`. + + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. 
+ num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + + + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. The padding index is -100. + + + + Returns: *NamedTuple* A namedtuple with the following keys: + + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). + """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get( + "max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = (gen_kwargs["num_beams"] + if gen_kwargs.get("num_beams") is not None + else self.args.generation_num_beams) + self._gen_kwargs = gen_kwargs + + return super().predict(test_dataset, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix) + + def prediction_step( + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[paddle.Tensor], + Optional[paddle.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Layer`): + The model to evaluate. + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[paddle.Tensor], Optional[paddle.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, + inputs, + prediction_loss_only=prediction_loss_only, + ignore_keys=ignore_keys) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + gen_kwargs = self._gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get( + "max_new_tokens") is None: + gen_kwargs["max_length"] = self.model.config.max_length + gen_kwargs["num_beams"] = (gen_kwargs["num_beams"] + if gen_kwargs.get("num_beams") is not None + else self.model.config.num_beams) + + if "attention_mask" in inputs: + gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get( + "global_attention_mask", None) + + # prepare generation inputs + # some encoder-decoder models can have varying encoder's and thus + # varying model input names + if hasattr( + self.model, "encoder" + ) and self.model.encoder.main_input_name != self.model.main_input_name: + generation_inputs = inputs[self.model.encoder.main_input_name] + else: + generation_inputs = inputs[self.model.main_input_name] + + generated_tokens = self.model.generate( + generation_inputs, + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if gen_kwargs.get("max_length") is not None and generated_tokens.shape[ + -1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len( + generated_tokens, gen_kwargs["max_length"]) + elif gen_kwargs.get( + "max_new_tokens") is not None and generated_tokens.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1): + generated_tokens = self._pad_tensors_to_max_len( + generated_tokens, gen_kwargs["max_new_tokens"] + 1) + + with paddle.no_grad(): + if has_labels: + with self.compute_loss_context_manager(): + outputs = model(**inputs) + if self.label_smoother is not None: + loss = self.label_smoother( + outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else + outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + if has_labels: + labels = inputs["labels"] + if gen_kwargs.get("max_length") is not None and labels.shape[ + -1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, + gen_kwargs["max_length"]) + elif gen_kwargs.get( + "max_new_tokens") is not None and labels.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1): + labels = self._pad_tensors_to_max_len( + labels, (gen_kwargs["max_new_tokens"] + 1)) + else: + labels = None + + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is not None and hasattr(self.tokenizer, + "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = (self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None else + self.tokenizer.eos_token_id) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError( + "Pad_token_id must be set in the configuration of the model, in order to pad tensors" + ) + # paddle.ones need to support device args. 
+ padded_tensor = pad_token_id * paddle.ones( + (tensor.shape[0], max_length), dtype=tensor.dtype) + padded_tensor[:, :tensor.shape[-1]] = tensor + return padded_tensor diff --git a/paddlenlp/trainer/trainer_utils.py b/paddlenlp/trainer/trainer_utils.py index 6092127e10f2..18f0042d6dcb 100644 --- a/paddlenlp/trainer/trainer_utils.py +++ b/paddlenlp/trainer/trainer_utils.py @@ -24,12 +24,17 @@ import random import re import time +import math from enum import Enum -from typing import Dict, NamedTuple, Optional, Tuple, Union +from typing import Dict, NamedTuple, Optional, Tuple, Union, List import numpy as np +import paddle +from paddle.io import IterableDataset from paddle.optimizer.lr import LambdaDecay + from ..utils.log import logger +from ..transformers.tokenizer_utils_base import BatchEncoding __all__ = [ "TrainOutput", @@ -134,6 +139,21 @@ class OptimizerNames(ExplicitEnum): ADAFACTOR = "adafactor" +class ShardingOption(ExplicitEnum): + """ + Sharding Option + OP for sharding optimizer state + GRAD for sharding gradients + FULL_SHARD for sharding optimizer gradient and parameter + OFFLOAD means offload to cpu. + """ + SHARD_OP = "stage1" + SHARD_GRAD_OP = "stage2" + FULL_SHARD = "stage3" + # NO_SHARD = "no" + OFFLOAD = "offload" + + def is_main_process(local_rank): """ Whether or not the current process is the local process, based on `xm.get_ordinal()` (for TPUs) first, then on @@ -452,7 +472,7 @@ def has_length(dataset): """ try: return len(dataset) is not None - except TypeError: + except (TypeError, ValueError, RuntimeError): # TypeError: len() of unsized object return False @@ -469,3 +489,159 @@ def get_last_checkpoint(folder): folder, max(checkpoints, key=lambda x: int(_re_checkpoint.search(x).groups()[0]))) + + +class IterableDatasetShard(IterableDataset): + """ + Wraps a Paddle `IterableDataset` to generate samples for one of the processes only. Instances of this class will + always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x + num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at the + first batch that would be too small or loop with indices from the beginning. + On two processes with an iterable dataset yielding of `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch size of + 2: + - the shard on process 0 will yield `[0, 1, 4, 5, 8, 9]` so will see batches `[0, 1]`, `[4, 5]`, `[8, 9]` + - the shard on process 1 will yield `[2, 3, 6, 7, 10, 11]` so will see batches `[2, 3]`, `[6, 7]`, `[10, 11]` + Args: + dataset (`paddle.io.IterableDataset`): + The batch sampler to split in several shards. + batch_size (`int`, *optional*, defaults to 1): + The size of the batches per shard. + drop_last (`bool`, *optional*, defaults to `False`): + Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the + beginning. + num_processes (`int`, *optional*, defaults to 1): + The number of processes running concurrently. + process_index (`int`, *optional*, defaults to 0): + The index of the current process. + seed (`int`, *optional*, defaults to 0): + A random seed that will be used for the random number generation in + [`~trainer_utils.IterableDatasetShard.set_epoch`]. 
+ """ + + def __init__( + self, + dataset: IterableDataset, + batch_size: int = 1, + drop_last: bool = False, + num_processes: int = 1, + process_index: int = 0, + seed: int = 0, + ): + self.dataset = dataset + self.batch_size = batch_size + self.drop_last = drop_last + self.num_processes = num_processes + self.process_index = process_index + self.seed = seed + self.epoch = 0 + self.num_examples = 0 + + def set_epoch(self, epoch): + self.epoch = epoch + if hasattr(self.dataset, "set_epoch"): + self.dataset.set_epoch(epoch) + + def __iter__(self): + self.num_examples = 0 + # TODO: support generator seed in sampling. + # + # if ( + # not hasattr(self.dataset, "set_epoch") + # and hasattr(self.dataset, "generator") + # and isinstance(self.dataset.generator, paddle.fluid.Generator) + # ): + # self.dataset.generator.manual_seed(self.seed + self.epoch) + real_batch_size = self.batch_size * self.num_processes + process_slice = range(self.process_index * self.batch_size, + (self.process_index + 1) * self.batch_size) + + first_batch = None + current_batch = [] + for element in self.dataset: + self.num_examples += 1 + current_batch.append(element) + # Wait to have a full batch before yielding elements. + if len(current_batch) == real_batch_size: + for i in process_slice: + yield current_batch[i] + if first_batch is None: + first_batch = current_batch.copy() + current_batch = [] + + # Finished if drop_last is True, otherwise complete the last batch with elements from the beginning. + if not self.drop_last and len(current_batch) > 0: + if first_batch is None: + first_batch = current_batch.copy() + while len(current_batch) < real_batch_size: + current_batch += first_batch + for i in process_slice: + yield current_batch[i] + + def __len__(self): + # Will raise an error if the underlying dataset is not sized. + if self.drop_last: + return (len(self.dataset) // + (self.batch_size * self.num_processes)) * self.batch_size + else: + return math.ceil( + len(self.dataset) / + (self.batch_size * self.num_processes)) * self.batch_size + + +def find_batch_size(tensors): + """ + Find the first dimension of a tensor in a nested list/tuple/dict of tensors. 
+ """ + if isinstance(tensors, (list, tuple)): + for t in tensors: + result = find_batch_size(t) + if result is not None: + return result + elif isinstance(tensors, (dict, BatchEncoding)): + for key, value in tensors.items(): + result = find_batch_size(value) + if result is not None: + return result + elif isinstance(tensors, paddle.Tensor): + return tensors.shape[0] if len(tensors.shape) >= 1 else None + elif isinstance(tensors, np.ndarray): + return tensors.shape[0] if len(tensors.shape) >= 1 else None + + +class RemoveColumnsCollator: + """Wrap the data collator to remove unused columns before they are passed to the collator.""" + + def __init__( + self, + data_collator, + signature_columns, + logger=None, + model_name: Optional[str] = None, + description: Optional[str] = None, + ): + self.data_collator = data_collator + self.signature_columns = signature_columns + self.logger = logger + self.description = description + self.model_name = model_name + self.message_logged = False + + def _remove_columns(self, feature: dict) -> dict: + if not isinstance(feature, dict): + return feature + if not self.message_logged and self.logger and self.model_name: + ignored_columns = list( + set(feature.keys()) - set(self.signature_columns)) + if len(ignored_columns) > 0: + dset_description = "" if self.description is None else f"in the {self.description} set" + self.logger.info( + f"The following columns {dset_description} don't have a corresponding argument in " + f"`{self.model_name}.forward` and have been ignored: {', '.join(ignored_columns)}." + f" If {', '.join(ignored_columns)} are not expected by `{self.model_name}.forward`, " + " you can safely ignore this message.") + self.message_logged = True + return {k: v for k, v in feature.items() if k in self.signature_columns} + + def __call__(self, features: List[dict]): + features = [self._remove_columns(feature) for feature in features] + return self.data_collator(features) diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 9a85680f8d60..0db34bd87347 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -21,17 +21,20 @@ import math import os from dataclasses import asdict, dataclass, field +import warnings from enum import Enum import types from typing import Any, Dict, List, Optional import paddle +from paddle.distributed import fleet from ..utils.log import logger from .trainer_utils import ( SchedulerType, IntervalStrategy, OptimizerNames, + ShardingOption, ) __all__ = [ @@ -175,6 +178,19 @@ class TrainingArguments: fp16_opt_level (`str`, *optional*, defaults to 'O1'): For `fp16` training, AMP optimization level selected in ['O0', 'O1', 'O2']. See details at https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/amp/auto_cast_cn.html + sharding (`str`, *optional*, defaults to ``): + Whether or not to use Paddle Sharding Data Parallel training (in distributed training + only). The base option should be `stage1`, `stage2` or `stage3` and you can add + CPU-offload to `stage2` or `stage3` like this: `stage2 offload` or `stage3 offload`. + Each stage means: + stage1 : optimizer state segmentation + stage2 : optimizer state + gradient segmentation + stage3 : parameter + gradient + optimizer state segmentation + offload : offload parameters to cpu + sharding_degree (`int`, *optional*, defaults to `-1`) + Sharding parameter in certain cards group. 
For example, aussume we use 2 machines each with 8 cards, + then set sharding_degree=8, sharding will only communication inside machine. + default -1 means sharding parameters between all workers. recompute (`bool`, *optional*, defaults to `False`): Recompute the forward pass to calculate gradients. Used for saving memory. Only support for networks with transformer blocks. @@ -388,6 +404,15 @@ class TrainingArguments: "help": "Random seed that will be set at the beginning of training." }) + bf16: bool = field( + default=False, + metadata={ + "help": + ("Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA" + " architecture or using CPU (no_cuda). This is an experimental API and it may change." + ) + }, + ) fp16: bool = field( default=False, metadata={ @@ -403,7 +428,40 @@ class TrainingArguments: ) }, ) + bf16_full_eval: bool = field( + default=False, + metadata={ + "help": + ("Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may" + " change.") + }, + ) + fp16_full_eval: bool = field( + default=False, + metadata={ + "help": "Whether to use full float16 evaluation instead of 32-bit" + }, + ) + sharding: str = field( + default="", + metadata={ + "help": + ("Whether or not to use Paddle Sharding Data Parallel training (in distributed training" + " only). The base option should be `stage1`, `stage2` or `stage3` and you can add" + " CPU-offload to `stage2` or `stage3` like this: stage2 offload` or `stage3" + " offload`. ") + }, + ) + sharding_degree: int = field( + default=-1, + metadata={ + "help": + ("Sharding parameter in certain cards group. For example, aussume we use 2 machines each with 8 cards, " + "then set sharding_degree=8, sharding will only communication inside machine. " + "default -1 means sharding parameters between all workers.") + }, + ) recompute: bool = field( default=False, metadata={ @@ -412,7 +470,6 @@ class TrainingArguments: "Only support for networks with transformer blocks." }, ) - scale_loss: float = field( default=2**15, metadata={"help": "The value of initial scale_loss for fp16."}) @@ -593,8 +650,45 @@ def __post_init__(self): if self.run_name is None: self.run_name = self.output_dir + if self.fp16 and self.bf16: + raise ValueError( + "At most one of fp16 and bf16 can be True, but not both") + + if self.fp16_full_eval and self.bf16_full_eval: + raise ValueError( + "At most one of fp16 and bf16 can be True for full eval, but not both" + ) + self.optim = OptimizerNames(self.optim) + if isinstance(self.sharding, bool): + self.sharding = "stage1" if self.sharding else "" + if isinstance(self.sharding, str): + self.sharding = [ShardingOption(s) for s in self.sharding.split()] + if self.sharding == [ShardingOption.OFFLOAD]: + raise ValueError( + "`--sharding offload` can't work on its own. It needs to be added to `--sharding stage2` or " + '`--sharding stage3`. For example, `--sharding "stage2 offload"`.' + ) + elif len(self.sharding) > (ShardingOption.OFFLOAD in self.sharding) + 1: + raise ValueError("`--sharding` recived too many arguments.") + + if len(self.sharding) == 0 and self.sharding_degree > 0: + warnings.warn( + "`--sharding_degree` is useful only when `--sharding` is specified." 
+ ) + if len(self.sharding) > 0: + if self.sharding_degree == -1: + # self.sharding_degree = self.world_size + self.sharding_degree = paddle.distributed.get_world_size() + + assert self.world_size % self.sharding_degree == 0, ( + "The world size for workers should be divided by sharding_degree, " + "sharding_degree:{sharding_degree}, world_size:{self.world_size}" + ) + if ShardingOption.OFFLOAD in self.sharding or ShardingOption.FULL_SHARD in self.sharding: + warnings.warn("`offload` and `stage3` is not supported NOW!") + if self.report_to is None: logger.info( "The default value for the training argument `--report_to` will change in v5 (from all installed " @@ -660,7 +754,25 @@ def world_size(self): The number of processes used in parallel. """ if self.local_rank != -1: - return paddle.distributed.get_world_size() + world_size = paddle.distributed.get_world_size() + # TODO use paddle.distributed.is_initialized() after paddle 2.4rc + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized( + ): + if len(self.sharding) > 0: + self.dp_degree = world_size // self.sharding_degree + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": self.dp_degree, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": self.sharding_degree + } + fleet.init(is_collective=True, strategy=strategy) + logger.info(strategy) + else: + paddle.distributed.init_parallel_env() + + return world_size return 1 @property diff --git a/paddlenlp/trainer/training_args_seq2seq.py b/paddlenlp/trainer/training_args_seq2seq.py new file mode 100644 index 000000000000..ea7870bc2c3f --- /dev/null +++ b/paddlenlp/trainer/training_args_seq2seq.py @@ -0,0 +1,74 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from dataclasses import dataclass, field +from typing import Optional + +from .training_args import TrainingArguments +from .utils import add_start_docstrings + +from ..utils.log import logger + +__all__ = [ + "Seq2SeqTrainingArguments", +] + + +@dataclass +@add_start_docstrings(TrainingArguments.__doc__) +class Seq2SeqTrainingArguments(TrainingArguments): + """ + Args: + sortish_sampler (`bool`, *optional*, defaults to `False`): + Whether to use a *sortish sampler* or not. Only possible if the underlying datasets are *Seq2SeqDataset* + for now but will become generally available in the near future. + + It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness + for the training set. + predict_with_generate (`bool`, *optional*, defaults to `False`): + Whether to use generate to calculate generative metrics (ROUGE, BLEU). + generation_max_length (`int`, *optional*): + The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + generation_num_beams (`int`, *optional*): + The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. 
Will default to the + `num_beams` value of the model configuration. + """ + + sortish_sampler: bool = field( + default=False, + metadata={"help": "Whether to use SortishSampler or not."}) + predict_with_generate: bool = field( + default=False, + metadata={ + "help": + "Whether to use generate to calculate generative metrics (ROUGE, BLEU)." + }) + generation_max_length: Optional[int] = field( + default=None, + metadata={ + "help": + ("The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `max_length` value of the model configuration.") + }, + ) + generation_num_beams: Optional[int] = field( + default=None, + metadata={ + "help": + ("The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `num_beams` value of the model configuration.") + }, + ) diff --git a/paddlenlp/trainer/utils/__init__.py b/paddlenlp/trainer/utils/__init__.py index f5f5e56b0560..670a5684bab4 100644 --- a/paddlenlp/trainer/utils/__init__.py +++ b/paddlenlp/trainer/utils/__init__.py @@ -12,4 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .helper import * \ No newline at end of file +from .helper import * + +from .doc import ( + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) \ No newline at end of file diff --git a/paddlenlp/trainer/utils/doc.py b/paddlenlp/trainer/utils/doc.py new file mode 100644 index 000000000000..74aeb1e019de --- /dev/null +++ b/paddlenlp/trainer/utils/doc.py @@ -0,0 +1,64 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Doc utilities: Utilities related to documentation +""" + +import functools +import re +import types + + +def add_start_docstrings(*docstr): + + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ + if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_start_docstrings_to_model_forward(*docstr): + + def docstring_decorator(fn): + docstring = "".join(docstr) + (fn.__doc__ + if fn.__doc__ is not None else "") + class_name = f"[`{fn.__qualname__.split('.')[0]}`]" + intro = f" The {class_name} forward method, overrides the `__call__` special method." + note = r""" + + + + Although the recipe for forward pass needs to be defined within this function, one should call the [`Layer`] + instance afterwards instead of this since the former takes care of running the pre and post processing steps while + the latter silently ignores them. 
+ + +""" + + fn.__doc__ = intro + note + docstring + return fn + + return docstring_decorator + + +def add_end_docstrings(*docstr): + + def docstring_decorator(fn): + fn.__doc__ = (fn.__doc__ + if fn.__doc__ is not None else "") + "".join(docstr) + return fn + + return docstring_decorator diff --git a/paddlenlp/transformers/bart/modeling.py b/paddlenlp/transformers/bart/modeling.py index 694b41e19787..6db19abd1e18 100644 --- a/paddlenlp/transformers/bart/modeling.py +++ b/paddlenlp/transformers/bart/modeling.py @@ -22,6 +22,16 @@ from paddle.nn import Layer, Embedding from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + ModelOutput, + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + convert_encoder_output, +) __all__ = [ 'BartModel', 'BartPretrainedModel', 'BartEncoder', 'BartDecoder', @@ -181,7 +191,13 @@ def __init__(self, self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers) self.apply(self.init_weights) - def forward(self, input_ids=None, attention_mask=None, **kwargs): + def forward(self, + input_ids=None, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + **kwargs): """ The BartEncoder forward method, overrides the `__call__()` special method. @@ -190,9 +206,20 @@ def forward(self, input_ids=None, attention_mask=None, **kwargs): See :class:`BartModel`. attention_mask (Tensor, optional): See :class:`BartModel`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. Returns: - Tensor: Returns tensor `encoder_output`, which is the output at the last layer of the model. + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `encoder_outputs` which is the output at the last layer of the model. Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. """ @@ -215,7 +242,11 @@ def forward(self, input_ids=None, attention_mask=None, **kwargs): attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True - encoder_output = self.encoder(encoder_input, src_mask=attention_mask) + encoder_output = self.encoder(encoder_input, + src_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) return encoder_output @@ -266,7 +297,10 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, memory_mask=None, - cache=None): + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): """ The BartDecoder forward method, overrides the `__call__()` special method. @@ -281,9 +315,20 @@ def forward(self, See :class:`BartModel`. cache (Tensor, optional): See :class:`BartModel`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. 
Returns: - Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_outputs` which is the output at the last layer of the model. Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. """ @@ -302,11 +347,16 @@ def forward(self, hidden_states = self.decoder_layernorm_embedding(hidden_states) decoder_input = self.decoder_dropout(hidden_states) - decoder_output = self.decoder(tgt=decoder_input, - memory=encoder_output, - tgt_mask=decoder_attention_mask, - memory_mask=memory_mask, - cache=cache) + decoder_output = self.decoder( + tgt=decoder_input, + memory=encoder_output if isinstance( + encoder_output, type(decoder_input)) else encoder_output[0], + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) return decoder_output @@ -439,7 +489,10 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r''' The BartModel forward method, overrides the `__call__()` special method. @@ -482,9 +535,22 @@ def forward(self, See `TransformerDecoder.gen_cache `__ for more details. It is only used for inference and should be None for training. Default to `None`. - + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. Returns: - Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_output`, which is the output at the last layer of the model. Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. 
Example: @@ -523,16 +589,53 @@ def forward(self, attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True if encoder_output is None: - encoder_output = self.encoder(input_ids, attention_mask) + encoder_output = self.encoder( + input_ids, + attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_output, ModelOutput): + if isinstance(encoder_output, type(decoder_input_ids)): + encoder_output = (encoder_output, ) + encoder_output = convert_encoder_output(encoder_output) + if isinstance(encoder_output, type(decoder_input_ids)): + encoder_last_hidden_state = encoder_output + else: + encoder_last_hidden_state = encoder_output[0] if use_cache: if cache is None: - cache = self.decoder.decoder.gen_cache(encoder_output) + cache = self.decoder.decoder.gen_cache( + encoder_last_hidden_state) else: cache = None - decoder_output = self.decoder(decoder_input_ids, decoder_attention_mask, - encoder_output, attention_mask, cache) - - return decoder_output + decoder_output = self.decoder(decoder_input_ids, + decoder_attention_mask, + encoder_last_hidden_state, + attention_mask, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + if not return_dict: + if isinstance(decoder_output, type(decoder_input_ids)): + decoder_output = (decoder_output, ) + if isinstance(encoder_output, type(decoder_input_ids)): + encoder_output = (encoder_output, ) + return decoder_output + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_output.last_hidden_state, + past_key_values=decoder_output.past_key_values, + decoder_hidden_states=decoder_output.hidden_states, + decoder_attentions=decoder_output.attentions, + cross_attentions=decoder_output.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) class BartClassificationHead(Layer): @@ -580,6 +683,7 @@ class BartForSequenceClassification(BartPretrainedModel): def __init__(self, bart, num_labels=2, dropout=None): super().__init__() self.bart = bart + self.num_labels = num_labels self.classifier = BartClassificationHead( self.bart.config['d_model'], self.bart.config['d_model'], num_labels, dropout if dropout else self.bart.config['dropout']) @@ -592,7 +696,11 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The BartForSequenceClassification forward method, overrides the __call__() special method. @@ -611,9 +719,25 @@ def forward(self, See :class:`BartModel`. cache (Tensor, optional): See :class:`BartModel`. + labels (Tensor, optional): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + num_labels - 1]`. If `num_labels > 1` a classification loss is computed (Cross-Entropy). + Default to `None`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. + Returns: - Tensor: Returns tensor `logits`, a tensor of the input text classification logits. 
+ An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. Shape as `[batch_size, num_labels]` and dtype as float32. Example: @@ -629,11 +753,19 @@ def forward(self, inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} logits = model(**inputs) """ - output = self.bart(input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, encoder_output, use_cache, - cache) - if use_cache: - output = output[0] + outputs = self.bart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + output = outputs[0] eos_mask = paddle.cast(input_ids == self.bart.config['eos_token_id'], dtype='int64') if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: @@ -648,7 +780,37 @@ def forward(self, sentence_representation = output.reshape( [output_shape[0], -1, output_shape[-1]])[:, -1, :] logits = self.classifier(sentence_representation) - return logits + + loss = None + if labels is not None: + if self.num_labels == 1: + loss_fct = nn.MSELoss() + loss = loss_fct(logits, labels) + elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), + labels.reshape((-1, ))) + else: + loss_fct = nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + if len(outputs) == 2: + return (loss, logits) if loss is not None else logits + output = (logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) class BartForQuestionAnswering(BartPretrainedModel): @@ -674,7 +836,12 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + start_positions=None, + end_positions=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The BartForQuestionAnswering forward method, overrides the __call__() special method. @@ -693,9 +860,30 @@ def forward(self, See :class:`BartModel`. cache (Tensor, optional): See :class:`BartModel`. + start_positions (Tensor, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + end_positions (Tensor, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. 
+ Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. Returns: - tuple: Returns tuple (`start_logits`, `end_logits`). + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `start_positions=end_positions=None`, + returns tuple (`start_logits`, `end_logits`). With the fields: @@ -722,13 +910,54 @@ def forward(self, start_logits = outputs[0] end_logits =outputs[1] """ - output = self.bart(input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, encoder_output, use_cache, - cache) - logits = self.classifier(output[0] if use_cache else output, ) + outputs = self.bart(input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + logits = self.classifier(outputs[0]) logits = paddle.transpose(logits, perm=[2, 0, 1]) start_logits, end_logits = paddle.unstack(x=logits, axis=0) - return start_logits, end_logits + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + outputs = (start_logits, + end_logits) + (outputs[1:] if len(outputs) > 2 else ()) + return ((total_loss, ) + outputs) if total_loss else outputs + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) class BartForConditionalGeneration(BartPretrainedModel): @@ -796,7 +1025,11 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The BartForConditionalGeneration forward method, overrides the __call__() special method. 
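The hunks above give the BART task heads optional label inputs (`labels`, `start_positions`, `end_positions`) and a `return_dict` switch that wraps outputs in the new dataclasses from `model_outputs.py`. Below is a minimal sketch (not part of the patch) of how the extended sequence-classification interface could be exercised, assuming a PaddleNLP build that already contains these changes; the `bart-base` checkpoint name follows the existing docstring examples, and the output fields come from `Seq2SeqSequenceClassifierOutput`.

```python
# Illustrative sketch only; assumes this patch is installed.
import paddle
from paddlenlp.transformers import BartForSequenceClassification, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("bart-base")
model = BartForSequenceClassification.from_pretrained("bart-base")

inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
labels = paddle.to_tensor([1])

# With `labels` and `return_dict=True`, the forward pass now returns a
# Seq2SeqSequenceClassifierOutput whose `loss` field is populated.
outputs = model(**inputs, labels=labels, return_dict=True)
print(outputs.loss, outputs.logits.shape)

# Legacy behaviour is preserved: without labels and with the default
# return_dict=False, the call still returns the bare logits tensor.
logits = model(**inputs)
```

The same pattern applies to `BartForQuestionAnswering` (with `start_positions`/`end_positions`) and, further below, to the MBart heads.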
@@ -815,19 +1048,32 @@ def forward(self, See :class:`BartModel`. cache (Tensor, optional): See :class:`BartModel`. + labels (Tensor, optional): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + A tensor of shape `(batch_size, sequence_length)`. Default to `None`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. Returns: - Tensor or tuple: Returns Tensor `lm_logits` if `use_cache` is `False`, otherwise, returns tuple (`lm_logits`, `cache`). - + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. + Especially, When `use_cache=return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. + With the fields: - `lm_logits` (Tensor): The generated sentence of the model. Its data type should be float32 and has a shape of [batch_size, sequence_length, vocab_size]. - - `cache` (Tensor): - See :class:`BartModel`. - Example: .. code-block:: @@ -842,18 +1088,47 @@ def forward(self, outputs = model(**inputs) """ - output = self.bart(input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, encoder_output, use_cache, - cache) + outputs = self.bart(input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) lm_logits = paddle.tensor.matmul( - output[0] if use_cache else output, - self.lm_head_weight, + outputs[0], self.lm_head_weight, transpose_y=True) + self.final_logits_bias - if use_cache: - cache = output[1] - return lm_logits, cache - else: - return lm_logits + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + lm_logits.reshape((-1, self.bart.config['vocab_size'])), + labels.reshape((-1, ))) + + if not return_dict: + if len(outputs) == 2: + return (masked_lm_loss, + lm_logits) if masked_lm_loss is not None else lm_logits + else: + outputs = (lm_logits, ) + outputs[1:] + return ((masked_lm_loss, ) + + outputs) if masked_lm_loss is not None else outputs + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) def prepare_decoder_input_ids_from_labels(self, labels): return shift_tokens_right(labels, diff --git a/paddlenlp/transformers/generation_utils.py b/paddlenlp/transformers/generation_utils.py index 8c5accd4469a..39c4277ff821 100644 --- a/paddlenlp/transformers/generation_utils.py +++ b/paddlenlp/transformers/generation_utils.py @@ -971,7 +971,8 @@ def greedy_search(self, input_ids, 
logits_processors, max_length, probs = F.softmax(logits) probs = paddle.log(probs) next_tokens = paddle.argmax(probs, axis=-1).unsqueeze(-1) - next_scores = paddle.index_sample(probs, next_tokens) + next_scores = paddle.index_sample(probs.astype("float32"), + next_tokens) if eos_token_id is not None: next_tokens = paddle.where( diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index 9e15a039ff7c..11031d194fd2 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ b/paddlenlp/transformers/mbart/modeling.py @@ -22,6 +22,15 @@ from paddle.nn import Layer, Embedding from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + ModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + convert_encoder_output, +) __all__ = [ 'MBartModel', 'MBartPretrainedModel', 'MBartEncoder', 'MBartDecoder', @@ -252,7 +261,13 @@ def __init__(self, nn.LayerNorm(d_model)) self.apply(self.init_weights) - def forward(self, input_ids=None, attention_mask=None, **kwargs): + def forward(self, + input_ids=None, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + **kwargs): """ The MBartEncoder forward method, overrides the `__call__()` special method. @@ -286,7 +301,11 @@ def forward(self, input_ids=None, attention_mask=None, **kwargs): attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True - encoder_output = self.encoder(encoder_input, src_mask=attention_mask) + encoder_output = self.encoder(encoder_input, + src_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) return encoder_output @@ -335,12 +354,17 @@ def __init__(self, nn.LayerNorm(d_model)) self.apply(self.init_weights) - def forward(self, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_output=None, - memory_mask=None, - cache=None): + def forward( + self, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + memory_mask=None, + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): """ The MBartDecoder forward method, overrides the `__call__()` special method. @@ -381,7 +405,10 @@ def forward(self, memory=encoder_output, tgt_mask=decoder_attention_mask, memory_mask=memory_mask, - cache=cache) + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) return decoder_output @@ -514,7 +541,10 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r''' The MBartModel forward method, overrides the `__call__()` special method. @@ -557,9 +587,23 @@ def forward(self, See `TransformerDecoder.gen_cache `__ for more details. It is only used for inference and should be None for training. Default to `None`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. 
+ return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. Returns: - Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_output`, which is the output at the last layer of the model. Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. Example: @@ -597,16 +641,56 @@ def forward(self, attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True if encoder_output is None: - encoder_output = self.encoder(input_ids, attention_mask) + encoder_output = self.encoder( + input_ids, + attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_output, ModelOutput): + if isinstance(encoder_output, type(decoder_input_ids)): + encoder_output = (encoder_output, ) + encoder_output = convert_encoder_output(encoder_output) + if isinstance(encoder_output, type(decoder_input_ids)): + encoder_last_hidden_state = encoder_output + else: + encoder_last_hidden_state = encoder_output[0] + if use_cache: if cache is None: cache = self.decoder.decoder.gen_cache(encoder_output) else: cache = None - decoder_output = self.decoder(decoder_input_ids, decoder_attention_mask, - encoder_output, attention_mask, cache) - - return decoder_output + decoder_output = self.decoder( + decoder_input_ids, + decoder_attention_mask, + encoder_last_hidden_state, + attention_mask, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + if isinstance(decoder_output, type(decoder_input_ids)): + decoder_output = (decoder_output, ) + if isinstance(encoder_output, type(decoder_input_ids)): + encoder_output = (encoder_output, ) + return decoder_output + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_output.last_hidden_state, + past_key_values=decoder_output.past_key_values, + decoder_hidden_states=decoder_output.hidden_states, + decoder_attentions=decoder_output.attentions, + cross_attentions=decoder_output.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) class MBartClassificationHead(Layer): @@ -666,7 +750,11 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The MBartForSequenceClassification forward method, overrides the __call__() special method. @@ -685,9 +773,24 @@ def forward(self, See :class:`MBartModel`. cache (Tensor, optional): See :class:`MBartModel`. 
+ labels (Tensor, optional): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + num_labels - 1]`. If `num_labels > 1` a classification loss is computed (Cross-Entropy). + Default to `None`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. Returns: - Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + `An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. Shape as `[batch_size, num_labels]` and dtype as float32. Example: @@ -703,11 +806,19 @@ def forward(self, inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} logits = model(**inputs) """ - output = self.mbart(input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, encoder_output, use_cache, - cache) - if use_cache: - output = output[0] + outputs = self.mbart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + output = outputs[0] eos_mask = paddle.cast(input_ids == self.mbart.config['eos_token_id'], dtype='int64') if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: @@ -722,7 +833,37 @@ def forward(self, sentence_representation = output.reshape( [output_shape[0], -1, output_shape[-1]])[:, -1, :] logits = self.classifier(sentence_representation) - return logits + + loss = None + if labels is not None: + if self.num_labels == 1: + loss_fct = nn.MSELoss() + loss = loss_fct(logits, labels) + elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), + labels.reshape((-1, ))) + else: + loss_fct = nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + if len(outputs) == 2: + return (loss, logits) if loss is not None else logits + output = (logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) class MBartForQuestionAnswering(MBartPretrainedModel): @@ -748,7 +889,12 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + start_positions=None, + end_positions=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The MBartForQuestionAnswering forward method, overrides the __call__() special method. @@ -767,9 +913,30 @@ def forward(self, See :class:`MBartModel`. 
cache (Tensor, optional): See :class:`MBartModel`. + start_positions (Tensor, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + end_positions (Tensor, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. Returns: - tuple: Returns tuple (`start_logits`, `end_logits`). + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `start_positions=end_positions=None`, + returns tuple (`start_logits`, `end_logits`). With the fields: @@ -796,13 +963,56 @@ def forward(self, start_logits = outputs[0] end_logits =outputs[1] """ - output = self.mbart(input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, encoder_output, use_cache, - cache) - logits = self.classifier(output[0] if use_cache else output, ) + outputs = self.mbart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = self.classifier(outputs[0]) logits = paddle.transpose(logits, perm=[2, 0, 1]) start_logits, end_logits = paddle.unstack(x=logits, axis=0) - return start_logits, end_logits + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + outputs = (start_logits, + end_logits) + (outputs[1:] if len(outputs) > 2 else ()) + return ((total_loss, ) + outputs) if total_loss else outputs + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + 
encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) class MBartForConditionalGeneration(MBartPretrainedModel): @@ -864,7 +1074,11 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, use_cache=False, - cache=None): + cache=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The MBartForConditionalGeneration forward method, overrides the __call__() special method. @@ -883,9 +1097,25 @@ def forward(self, See :class:`MBartModel`. cache (Tensor, optional): See :class:`MBartModel`. + abels (Tensor, optional): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + A tensor of shape `(batch_size, sequence_length)`. Default to `None`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. Returns: - Tensor or tuple: Returns Tensor `lm_logits` if `use_cache` is `False`, otherwise, returns tuple (`lm_logits`, `cache`). + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. + Especially, When `use_cache=return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. 
With the fields: @@ -910,19 +1140,50 @@ def forward(self, outputs = model(**inputs) """ - output = self.mbart(input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, encoder_output, use_cache, - cache) + outputs = self.mbart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) lm_logits = paddle.tensor.matmul( - output[0] if use_cache else output, - self.lm_head_weight, + outputs[0], self.lm_head_weight, transpose_y=True) + self.final_logits_bias - if use_cache: - cache = output[1] - return lm_logits, cache - else: - return lm_logits + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + lm_logits.reshape((-1, self.mbart.config['vocab_size'])), + labels.reshape((-1, ))) + + if not return_dict: + if len(outputs) == 2: + return (masked_lm_loss, + lm_logits) if masked_lm_loss is not None else lm_logits + else: + outputs = (lm_logits, ) + outputs[1:] + return ((masked_lm_loss, ) + + outputs) if masked_lm_loss is not None else outputs + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) def prepare_inputs_for_generation(self, decoder_input_ids, diff --git a/paddlenlp/transformers/model_outputs.py b/paddlenlp/transformers/model_outputs.py index 3e5043fab7f0..1bc12cb91eb9 100644 --- a/paddlenlp/transformers/model_outputs.py +++ b/paddlenlp/transformers/model_outputs.py @@ -41,6 +41,22 @@ def tuple_output(outputs: Tuple[Tensor], loss: Optional[Tensor] = None): return outputs +def convert_encoder_output(encoder_output): + """ + Convert encoder_output from tuple to class:`~paddlenlp.transformers.model_outputs.BaseModelOutput`. + + Args: + encoder_output (tuple or ModleOutput): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. 
+ """ + return BaseModelOutput( + last_hidden_state=encoder_output[0], + hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, + attentions=encoder_output[2] if len(encoder_output) > 2 else None, + ) + + def layer_init_wrapper(func): @functools.wraps(func) @@ -55,54 +71,6 @@ def _impl(self, *args, **kwargs): return _impl -def _transformer_decoder_layer_fwd(self, - tgt, - memory=None, - tgt_mask=None, - memory_mask=None, - cache=None): - tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) - - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - if cache is None: - tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None) - else: - tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, - cache[0]) - tgt = residual + self.dropout1(tgt) - if not self.normalize_before: - tgt = self.norm1(tgt) - - residual = tgt - - if memory is not None: - memory_mask = _convert_attention_mask(memory_mask, memory.dtype) - - if self.normalize_before: - tgt = self.norm2(tgt) - if cache is None: - tgt = self.cross_attn(tgt, memory, memory, memory_mask, None) - else: - tgt, static_cache = self.cross_attn(tgt, memory, memory, - memory_mask, cache[1]) - tgt = residual + self.dropout2(tgt) - if not self.normalize_before: - tgt = self.norm2(tgt) - - residual = tgt - - if self.normalize_before: - tgt = self.norm3(tgt) - tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = residual + self.dropout3(tgt) - if not self.normalize_before: - tgt = self.norm3(tgt) - return tgt if cache is None else (tgt, ( - incremental_cache, static_cache if memory is not None else None)) - - def _transformer_encoder_layer_fwd(self, src, src_mask=None, @@ -139,45 +107,163 @@ def _transformer_encoder_layer_fwd(self, (src, ) + outputs[::-1]) # hidden_states, cache, attentions +def _transformer_decoder_layer_fwd( + self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + cache=None, + output_attentions=False, +): + residual = tgt + + # self attention + self.self_attn.need_weights = output_attentions + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + if self.normalize_before: + tgt = self.norm1(tgt) + + self_attn_outputs = self.self_attn(tgt, tgt, tgt, tgt_mask, + cache[0] if cache else None) + # self_attn_outputs = (tgt, attn_weights, incremental_cache) or only tgt + if isinstance(self_attn_outputs, type(tgt)): + tgt = self_attn_outputs + else: + tgt = self_attn_outputs[0] + if output_attentions: + self_attn_weights = self_attn_outputs[1] + if cache: + incremental_cache = self_attn_outputs[-1] + + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + + # cross attention + if memory is not None: + self.cross_attn.need_weights = output_attentions + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + + if self.normalize_before: + tgt = self.norm2(tgt) + + cross_attn_outputs = self.cross_attn(tgt, memory, memory, memory_mask, + cache[1] if cache else None) + if isinstance(cross_attn_outputs, type(tgt)): + tgt = cross_attn_outputs + else: + tgt = cross_attn_outputs[0] + if output_attentions: + cross_attn_weights = cross_attn_outputs[1] + if cache: + static_cache = cross_attn_outputs[-1] + + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = 
self.norm3(tgt) + + if not output_attentions and cache is None: + return tgt + else: + outputs = (tgt, ) + if output_attentions: + outputs += (self_attn_weights, + cross_attn_weights if memory is not None else None) + if cache: + outputs += ((incremental_cache, + static_cache if memory is not None else None), ) + return outputs + + def _transformer_decoder_fwd(self, tgt, memory=None, tgt_mask=None, memory_mask=None, - cache=None): + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) if memory is not None: memory_mask = _convert_attention_mask(memory_mask, memory.dtype) - output = tgt - new_caches = [] + new_caches = [] if cache else None + all_hidden_states = [tgt] if output_hidden_states else None + all_self_attns = [] if output_attentions else None + all_cross_attns = [] if output_attentions else None + for i, mod in enumerate(self.layers): if cache is None: if self.enable_recompute: - output = recompute(mod, - output, - memory, - tgt_mask, - memory_mask, - cache=None) + outputs = recompute(mod, tgt, memory, tgt_mask, memory_mask, + None, output_attentions) else: - output = mod(output, - memory, - tgt_mask=tgt_mask, - memory_mask=memory_mask, - cache=None) + outputs = mod( + tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=None, + output_attentions=output_attentions, + ) + else: + outputs = mod(tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=cache[i] if cache else None, + output_attentions=output_attentions) + if isinstance(outputs, type(tgt)): + tgt = outputs else: - output, new_cache = mod(output, - memory, - tgt_mask=tgt_mask, - memory_mask=memory_mask, - cache=cache[i]) - new_caches.append(new_cache) + tgt = outputs[0] + if cache: + new_caches.append(outputs[-1]) + if output_attentions: + all_self_attns.append(outputs[1]) + all_cross_attns.append(outputs[2]) + if output_hidden_states: + all_hidden_states.append(tgt) if self.norm is not None: - output = self.norm(output) + tgt = self.norm(tgt) + if output_hidden_states: + all_hidden_states[-1] = tgt + + if not return_dict: + if isinstance(outputs, type(tgt)): + return tgt + + temp_list = [ + tgt, + new_caches if cache else None, + all_hidden_states, + all_self_attns, + all_cross_attns, + ] + return tuple(v for v in temp_list if v is not None) - return output if cache is None else (output, new_caches) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=tgt, + past_key_values=new_caches, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attns, + ) def _transformer_encoder_fwd(self, @@ -261,8 +347,8 @@ def _transformer_encoder_fwd(self, # patches of paddle.nn.Transformer to get all hidden_states and attentions paddle.nn.TransformerEncoderLayer.forward = _transformer_encoder_layer_fwd -paddle.nn.TransformerEncoder.forward = _transformer_encoder_fwd paddle.nn.TransformerDecoderLayer.forward = _transformer_decoder_layer_fwd +paddle.nn.TransformerEncoder.forward = _transformer_encoder_fwd paddle.nn.TransformerDecoder.forward = _transformer_decoder_fwd _encoder_init = paddle.nn.TransformerEncoder.__init__ @@ -915,3 +1001,137 @@ class Seq2SeqLMOutput(ModelOutput): encoder_last_hidden_state: Optional[paddle.Tensor] = None encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of 
sequence-to-sequence question answering models. + + Args: + loss (`paddle.Tensor` ,optional): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + A Tensor of shape `(1,)`, returned when `labels` is provided. + start_logits (`paddle.Tensor`): + Span-start scores (before SoftMax). Tensor of shape `(batch_size, sequence_length)`). + end_logits (`paddle.Tensor`): + Span-end scores (before SoftMax). Tensor of shape `(batch_size, sequence_length)`). + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): + Tuple of `tuple(paddle.Tensor)` of length `n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + Tensor of shape `(batch_size, sequence_length, hidden_size)`. + encoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (`paddle.Tensor` optional): + Classification (or regression if config.num_labels==1) loss of shape `(1,)`. Returned when `label` is provided). + logits (`paddle.Tensor`): + Classification (or regression if config.num_labels==1) scores (before SoftMax) of shape `(batch_size, config.num_labels)` + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + Tensor of shape `(batch_size, sequence_length, hidden_size)`. + encoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. 
+ encoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index e92fed48aaf3..2fff87872b56 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -186,6 +186,7 @@ class is a pretrained model class adding layers on top of the base model, resource_files_names = {"model_state": "model_state.pdparams"} pretrained_resource_files_map = {} base_model_prefix = "" + main_input_name = "input_ids" config_class = None # a list of `re` patterns of `state_dict` keys that should be removed from the list of missing @@ -571,13 +572,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Allow the float16 model to load float32 weights, which decreases memory # usage in model loading stage and is useful to big models. dtype_prefix_len = len("paddle.") # paddle.float16 + for k, v in model_to_load.state_dict().items(): if not isinstance(v, np.ndarray): dtype = str(v.dtype)[dtype_prefix_len:] # TODO(guosheng): add warnings for unmatched dtypes if k in state_to_load: if paddle.in_dynamic_mode(): - state_to_load[k] = paddle.cast(state_to_load[k], dtype) + if isinstance(state_to_load[k], np.ndarray): + state_to_load[k] = state_to_load[k].astype(dtype) + else: + state_to_load[k] = paddle.cast(state_to_load[k], dtype) else: # there are some latent error when case dtype in static-mode, so let's: # 1. 
convert fluid.*.Tensor -> numpy.ndarray diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index c924fb7c6d24..58896ee09424 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -23,8 +23,10 @@ import paddle.nn as nn import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute from ..model_utils import PretrainedModel, register_base_model +from ...utils.log import logger from ..nezha.modeling import ACT2FN from ..model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, @@ -32,6 +34,7 @@ Seq2SeqLMOutput, BaseModelOutput, ModelOutput, + convert_encoder_output, ) __all__ = [ @@ -43,6 +46,8 @@ "t5-small", "t5-base", "t5-large", + "t5-3b", + "t5-11b", ] @@ -190,6 +195,7 @@ def __init__(self, self.n_heads = num_heads self.dropout = dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.enable_recompute = False # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) @@ -364,6 +370,8 @@ def project(hidden_states, proj_layer, key_value_states, cache): shape=(1, self.n_heads, real_seq_length, key_length), dtype=scores.dtype, ) + if self.training and self.enable_recompute: + position_bias.stop_gradient = False else: position_bias = self.compute_bias(real_seq_length, key_length) @@ -732,6 +740,42 @@ class T5PretrainedModel(PretrainedModel): "initializer_factor": 1.0, "feed_forward_proj": "gated-gelu", }, + "t5-3b": { + "tie_word_embeddings": True, + "pad_token_id": 0, + "bos_token_id": 0, + "eos_token_id": 1, + "vocab_size": 32128, + "d_model": 1024, + "d_kv": 128, + "d_ff": 16384, + "num_layers": 24, + "num_decoder_layers": 24, + "num_heads": 32, + "relative_attention_num_buckets": 32, + "dropout_rate": 0.1, + "layer_norm_epsilon": 1e-06, + "initializer_factor": 1.0, + "feed_forward_proj": "relu" + }, + "t5-11b": { + "tie_word_embeddings": True, + "pad_token_id": 0, + "bos_token_id": 0, + "eos_token_id": 1, + "vocab_size": 32128, + "d_model": 1024, + "d_kv": 128, + "d_ff": 65536, + "num_layers": 24, + "num_decoder_layers": 24, + "num_heads": 128, + "relative_attention_num_buckets": 32, + "dropout_rate": 0.1, + "layer_norm_epsilon": 1e-06, + "initializer_factor": 1.0, + "feed_forward_proj": "relu" + }, } pretrained_resource_files_map = { "model_state": { @@ -741,6 +785,10 @@ class T5PretrainedModel(PretrainedModel): "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-base/model_state.pdparams", "t5-large": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-large/model_state.pdparams", + "t5-3b": + "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-3b/model_state.pdparams", + "t5-11b": + "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-11b/model_state.pdparams", "t5-v1_1-base": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-base/model_state.pdparams", "t5-v1_1-large": @@ -913,7 +961,8 @@ def __init__(self, feed_forward_proj, d_ff, embed_tokens=None, - is_decoder=False): + is_decoder=False, + enable_recompute=False): super().__init__() self.is_decoder = is_decoder self.embed_tokens = embed_tokens @@ -932,6 +981,7 @@ def __init__(self, ]) self.final_layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) + self.enable_recompute = enable_recompute def get_input_embeddings(self): return self.embed_tokens @@ -948,16 +998,32 @@ def forward(self, attention_mask=None, encoder_hidden_states=None, 
encoder_attention_mask=None, + inputs_embeds=None, cache=None, use_cache=False, output_attentions=False, output_hidden_states=False, return_dict=False): - assert input_ids is not None, "input_ids can not be None" - input_shape = input_ids.shape - input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) - inputs_embeds = self.embed_tokens(input_ids) + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds" + ) + + if inputs_embeds is None: + assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" + inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -1015,17 +1081,39 @@ def forward(self, if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - cache=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) + if self.enable_recompute and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with `config.enable_recompute=True`. Setting " + "`use_cache=False`...") + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return tuple( + module(*inputs, use_cache, output_attentions)) + + return custom_forward + + layer_outputs = recompute(create_custom_forward(layer_module), + hidden_states, + extended_attention_mask, + position_bias, encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, None) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + cache=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) @@ -1164,11 +1252,14 @@ def invert_attention_mask(self, encoder_attention_mask): 1.0 - encoder_extended_attention_mask) * -1e4 elif self.dtype == paddle.float32: encoder_extended_attention_mask = ( - 1.0 - encoder_extended_attention_mask) * -1e9 + 1.0 - encoder_extended_attention_mask) * -1e4 else: - raise ValueError( - f"{self.dtype} not recognized. `dtype` should be set to either `paddle.float32` or `paddle.float16`" - ) + encoder_extended_attention_mask = ( + 1.0 - encoder_extended_attention_mask) * -1e4 + + # raise ValueError( + # f"{self.dtype} not recognized. 
`dtype` should be set to either `paddle.float32` or `paddle.float16`" + # ) return encoder_extended_attention_mask @@ -1226,23 +1317,26 @@ class T5Model(T5PretrainedModel): """ - def __init__(self, - tie_word_embeddings=True, - pad_token_id=0, - bos_token_id=0, - eos_token_id=1, - initializer_factor=1.0, - vocab_size=32128, - d_model=768, - d_kv=64, - d_ff=3072, - num_layers=12, - num_decoder_layers=12, - num_heads=12, - relative_attention_num_buckets=32, - dropout_rate=0.1, - layer_norm_epsilon=1e-06, - feed_forward_proj="relu"): + def __init__( + self, + tie_word_embeddings=True, + pad_token_id=0, + bos_token_id=0, + eos_token_id=1, + initializer_factor=1.0, + vocab_size=32128, + d_model=768, + d_kv=64, + d_ff=3072, + num_layers=12, + num_decoder_layers=12, + num_heads=12, + relative_attention_num_buckets=32, + dropout_rate=0.1, + layer_norm_epsilon=1e-06, + feed_forward_proj="relu", + enable_recompute=False, + ): super().__init__() self.tie_word_embeddings = tie_word_embeddings self.pad_token_id = pad_token_id @@ -1272,7 +1366,8 @@ def __init__(self, feed_forward_proj, d_ff, self.shared, - is_decoder=False) + is_decoder=False, + enable_recompute=enable_recompute) self.decoder = T5Stack(d_model, num_decoder_layers, layer_norm_epsilon, @@ -1283,7 +1378,8 @@ def __init__(self, feed_forward_proj, d_ff, self.shared, - is_decoder=True) + is_decoder=True, + enable_recompute=enable_recompute) self.init_weights() @@ -1308,6 +1404,8 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, cache=None, + inputs_embeds=None, + decoder_inputs_embeds=None, use_cache=True, output_attentions=False, output_hidden_states=False, @@ -1351,6 +1449,20 @@ def forward(self, The `input_ids` which have their past given to this model should not be passed as input ids as they have already been computed. Defaults to `None`. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor, optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). + This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Default to None. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. use_cache (bool, optional): Whether or not to use cache. If set to `True`, `past_buckets_states` states are returned and can be used to speed up decoding. 
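The block below is a minimal usage sketch of the new `inputs_embeds` / `decoder_inputs_embeds` path added in this change; it is illustrative only. The `t5-small` checkpoint, the use of the shared token embedding exposed as `model.shared`, and starting the decoder from the pad token are assumptions made for the example, not part of the diff itself.

```python
import paddle
from paddlenlp.transformers import T5Model, T5Tokenizer

# Hypothetical example: checkpoint name and the model.shared lookup are assumptions.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5Model.from_pretrained("t5-small")
model.eval()

encoded = tokenizer("Welcome to use PaddleNLP!")
input_ids = paddle.to_tensor([encoded["input_ids"]])

# Embed the tokens ourselves and feed embeddings instead of token ids.
inputs_embeds = model.shared(input_ids)
# T5 conventionally starts decoding from the pad token.
decoder_inputs_embeds = model.shared(paddle.to_tensor([[model.pad_token_id]]))

with paddle.no_grad():
    outputs = model(inputs_embeds=inputs_embeds,
                    decoder_inputs_embeds=decoder_inputs_embeds,
                    return_dict=True)
print(outputs.last_hidden_state.shape)  # [1, 1, d_model]
```

The `enable_recompute` flag introduced on `T5Model`/`T5Stack` in this change can presumably be switched on at construction time in the same way, trading extra forward computation for lower activation memory during training.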
@@ -1444,6 +1556,7 @@ def forward(self, encoder_output = self.encoder( input_ids=input_ids, attention_mask=attention_mask, + inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) @@ -1455,6 +1568,7 @@ def forward(self, decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, cache=cache, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, @@ -1529,7 +1643,9 @@ def forward(self, encoder_output=None, cache=None, labels=None, - use_cache=True, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=False): @@ -1554,6 +1670,20 @@ def forward(self, selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]`. Shape is [batch_size, sequence_length] and dtype is int64. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor , optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `past_key_values` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). This is useful + if you want more control over how to convert `decoder_input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. Default to None. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. use_cache (bool, optional): See :class:`T5Model`. 
output_attentions (bool, optional): @@ -1623,24 +1753,26 @@ def forward(self, """ + use_cache = use_cache if use_cache is not None else False # Encode if needed (training, first prediction pass) if encoder_output is None: # Convert encoder inputs in embeddings if needed encoder_output = self.t5.encoder( input_ids=input_ids, attention_mask=attention_mask, + inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) else: - if isinstance(encoder_output, paddle.Tensor): + if isinstance(encoder_output, type(decoder_input_ids)): encoder_output = (encoder_output, ) if return_dict and not isinstance(encoder_output, BaseModelOutput): encoder_output = convert_encoder_output(encoder_output) hidden_states = encoder_output[0] - if labels is not None and decoder_input_ids is None: + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(labels) @@ -1657,6 +1789,7 @@ def forward(self, decoder_outputs = self.t5.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, cache=cache, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, @@ -1681,8 +1814,10 @@ def forward(self, loss = None if labels is not None: loss_fct = nn.CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]), - labels.flatten()) + loss = loss_fct( + lm_logits.reshape( + shape=[-1, lm_logits.shape[-1]]).astype("float32"), + labels.flatten()) if not return_dict: output = (lm_logits, ) + decoder_outputs[1:] + encoder_output @@ -1869,6 +2004,7 @@ def forward( encoder_hidden_states: Optional[Tuple[Tensor]] = None, encoder_attention_mask: Optional[Tensor] = None, cache=None, + inputs_embeds: Optional[Tensor] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, @@ -1877,6 +2013,7 @@ def forward( encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, + inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, cache=cache, @@ -1889,19 +2026,3 @@ def forward( T5EncoderModel.base_model_class = T5EncoderModel - - -def convert_encoder_output(encoder_output): - """ - Convert encoder_output from tuple to class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. - - Args: - encoder_output (tuple or ModleOutput): - The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). - The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. 
- """ - return BaseModelOutput( - last_hidden_state=encoder_output[0], - hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, - attentions=encoder_output[2] if len(encoder_output) > 2 else None, - ) diff --git a/paddlenlp/transformers/t5/tokenizer.py b/paddlenlp/transformers/t5/tokenizer.py index 549a9bdccf9c..38c057417812 100644 --- a/paddlenlp/transformers/t5/tokenizer.py +++ b/paddlenlp/transformers/t5/tokenizer.py @@ -28,6 +28,8 @@ "t5-small": 512, "t5-base": 512, "t5-large": 512, + "t5-3b": 512, + "t5-11b": 512, } @@ -69,6 +71,10 @@ class T5Tokenizer(AlbertEnglishTokenizer): "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-base/spiece.model", "t5-large": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-large/spiece.model", + "t5-3b": + "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-3b/spiece.model", + "t5-11b": + "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-11b/spiece.model", "t5-v1_1-base": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-base/spiece.model", "t5-v1_1-large": @@ -86,6 +92,12 @@ class T5Tokenizer(AlbertEnglishTokenizer): "t5-large": { "do_lower_case": False }, + "t5-3b": { + "do_lower_case": False + }, + "t5-11b": { + "do_lower_case": False + }, "t5-v1_1-base": { "do_lower_case": False }, diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index aa6cb0a2b6d1..696f2bf8ac89 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -19,6 +19,7 @@ from paddle.nn import TransformerEncoder from .. import PretrainedModel, register_base_model +from ..model_outputs import CausalLMOutputWithCrossAttentions __all__ = [ "UnifiedTransformerPretrainedModel", @@ -343,7 +344,10 @@ def forward(self, attention_mask=None, use_cache=False, cache=None, - role_ids=None): + role_ids=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The UnifiedTransformerModel forward method, overrides the special :meth:`__call__` method. @@ -392,17 +396,25 @@ def forward(self, Indices of role ids indicated different roles. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. Defaults to None. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. + If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: - Tensor|tuple: If `use_cache` is False, it is a tensor - representing the output of :class:`UnifiedTransformerModel`, with + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. 
+ Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=None`, + returns a tensor representing the output of :class:`UnifiedTransformerModel`, with shape [batch_size, sequence_length, hidden_size]. The data type is - float32 or float64. Otherwise, it is a tuple, besides the output of - :class:`UnifiedTransformerModel`, the tuple also includes the new - cache which is same as input `cache` but `incremental_cache` in it - has an incremental length. - See :meth:`paddle.nn.MultiHeadAttention.gen_cache` method and - :meth:`paddle.nn.MultiHeadAttention.forward` method for more details. + float32 or float64. Example: .. code-block:: @@ -429,16 +441,18 @@ def forward(self, token_type_ids, position_ids, role_ids=role_ids) - if use_cache: - if cache is None: - cache = self.encoder.gen_cache(embedding_output) - sequence_output, cache = self.encoder(embedding_output, - attention_mask, cache) - return sequence_output, cache - else: - sequence_output = self.encoder(embedding_output, attention_mask) + if use_cache and cache is None: + cache = self.encoder.gen_cache(embedding_output) - return sequence_output + sequence_output = self.encoder( + embedding_output, + attention_mask, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return sequence_output class UnifiedTransformerLMHead(nn.Layer): @@ -502,7 +516,11 @@ def forward(self, masked_positions=None, use_cache=False, cache=None, - role_ids=None): + role_ids=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False): r""" The UnifiedTransformerLMHeadModel forward method, overrides the special :meth:`__call__` method. @@ -522,17 +540,26 @@ def forward(self, See :class:`UnifiedTransformerModel`. role_ids: (Tensor, optional): See :class:`UnifiedTransformerModel`. + labels: (Tensor, optional): + Labels for computing the left-to-right language modeling loss. Indices should be in + `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., vocab_size]` + output_attentions (bool, optional): + See :class: `UnifiedTransformerModel` + output_hidden_states (bool, optional): + See :class: `UnifiedTransformerModel` + return_dict (bool, optional): + See :class: `UnifiedTransformerModel` Returns: - Tensor|tuple: If `use_cache` is False, it is a tensor - representing the output of :class:`UnifiedTransformerLMHeadModel`, + An instance of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=labels=None`, + returns a tensor representing the output of :class:`UnifiedTransformerLMHeadModel`, with shape [batch_size, sequence_length, vocab_size]. The data type - is float32 or float64. Otherwise, it is a tuple, besides the output - of :class:`UnifiedTransformerLMHeadModel`, the tuple also includes - the new cache which is same as input `cache` but `incremental_cache` - in it has an incremental length. - See :meth:`paddle.nn.MultiHeadAttention.gen_cache` method and - :meth:`paddle.nn.MultiHeadAttention.forward` method for more details. + is float32 or float64. 
Example: .. code-block:: @@ -551,20 +578,43 @@ def forward(self, logits = model(**inputs) """ - outputs = self.unified_transformer(input_ids, - token_type_ids, - position_ids, - attention_mask, - use_cache, - cache, - role_ids=role_ids) - sequence_output = outputs[0] if use_cache else outputs + outputs = self.unified_transformer( + input_ids, + token_type_ids, + position_ids, + attention_mask, + use_cache, + cache, + role_ids=role_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs if isinstance(outputs, + type(input_ids)) else outputs[0] logits = self.lm_head(sequence_output, masked_positions) - if use_cache: - cache = outputs[1] - return logits, cache - else: - return logits + + lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + lm_loss = loss_fct(logits.reshape((-1, logits.shape[-1])), + labels.reshape([-1])) + if not return_dict: + if isinstance(outputs, type(input_ids)): + return (lm_loss, logits) if lm_loss is not None else logits + else: + outputs = (logits, ) + outputs[1:] + return ((lm_loss, ) + + outputs) if lm_loss is not None else outputs + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) def prepare_faster_entry(self, kwargs): from paddlenlp.ops import FasterUnifiedTransformer diff --git a/pipelines/README.md b/pipelines/README.md index af13526da685..70efdf4d3590 100644 --- a/pipelines/README.md +++ b/pipelines/README.md @@ -1,6 +1,6 @@ -## PaddleNLP Pipelines:智能文本产线 +## PaddleNLP Pipelines:NLP流水线系统 -PaddleNLP Pipelines 是一个端到端智能文本产线框架,面向 NLP **全场景**,帮助用户**低门槛**构建强大**产品级系统**。 +PaddleNLP Pipelines 是一个端到端NLP流水线系统框架,面向 NLP **全场景**,帮助用户**低门槛**构建强大**产品级系统**。
@@ -8,12 +8,12 @@ PaddleNLP Pipelines 是一个端到端智能文本产线框架,面向 NLP ** 更多效果展示Demo请参考 [效果展示](#效果展示) -## 智能文本产线特色 +## NLP流水线系统特色 * **全场景支持**:依托灵活的插拔式组件产线化设计,支持各类 NLP 场景任务,包括:信息抽取、情感倾向分析、阅读理解、检索系统、问答系统、文本分类、文本生成等。 * **低门槛开发**:依托丰富的预置组件,像搭积木一样快速构建产品级系统,预置组件覆盖文档解析、数据处理、模型组网、预测部署、Web 服务、UI 界面等全流程系统功能。 -* **高精度预测**:基于前沿的预训练模型、成熟的系统方案,可构建效果领先的产品级系统,如[智能文本产线库](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/pipelines#智能文本产线库)中预置的语义检索系统、阅读理解式智能问答系统等。 +* **高精度预测**:基于前沿的预训练模型、成熟的系统方案,可构建效果领先的产品级系统,如[NLP流水线系统](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/pipelines#NLP流水线系统)中预置的语义检索系统、阅读理解式智能问答系统等。 * **灵活可定制**:除深度兼容 PaddleNLP 模型组件外,还可嵌入飞桨生态下任意模型、[AI 开放平台算子](https://ai.baidu.com/)、其它开源项目如 Elasticsearch 等作为基础组件,快速扩展,从而实现任意复杂系统的灵活定制开发。 @@ -25,9 +25,9 @@ PaddleNLP Pipelines 是一个端到端智能文本产线框架,面向 NLP ** 更多的Benchmarks的信息请参考文档[Benchmarks](./benchmarks/README.md) -## 智能文本产线库 +## NLP流水线系统 -PaddleNLP Pipelines 智能文本产线库针对 NLP 部分高频场景开源了经过充分打磨的产品级系统,并会不断开放其它场景的产品级系统,用户可以基于智能文本产线库提供的系统能力快速开发出适配业务数据的产品。 +PaddleNLP Pipelines NLP流水线系统针对 NLP 部分高频场景开源了经过充分打磨的产品级系统,并会不断开放其它场景的产品级系统,用户可以基于NLP流水线系统提供的系统能力快速开发出适配业务数据的产品。 * 快速搭建产品级[**语义检索**](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/pipelines/examples/semantic-search)系统:使用自然语言文本通过语义进行智能文档查询,而不是关键字匹配 * 快速搭建产品级[**智能问答**](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/pipelines/examples/question-answering)系统:用自然语言提问,即可获得精准答案片段 diff --git a/pipelines/examples/FAQ/Install_windows.md b/pipelines/examples/FAQ/Install_windows.md index fb1e50dcfe68..20e1d04bf783 100644 --- a/pipelines/examples/FAQ/Install_windows.md +++ b/pipelines/examples/FAQ/Install_windows.md @@ -26,9 +26,9 @@ python setup.py install ```bash # 我们建议在 GPU 环境下运行本示例,运行速度较快 -python examples/frequently-asked-question/dense_faq_example.py --device gpu +python examples/FAQ/dense_faq_example.py --device gpu # 如果只有 CPU 机器,安装CPU版本的Paddle后,可以通过 --device 参数指定 cpu 即可, 运行耗时较长 -python examples/frequently-asked-question/dense_faq_example.py --device cpu +python examples/FAQ/dense_faq_example.py --device cpu ``` ### 1.4 构建 Web 可视化FAQ系统 diff --git a/pipelines/examples/FAQ/README.md b/pipelines/examples/FAQ/README.md index 627aeca9e4e7..fa6621cc11d3 100644 --- a/pipelines/examples/FAQ/README.md +++ b/pipelines/examples/FAQ/README.md @@ -6,7 +6,7 @@ ## 2. 产品功能介绍 -本项目提供了低成本搭建端到端FAQ智能问答的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的检索系统模型(召回模型、排序模型)快速搭建一个针对自己业务数据的问答系统,并可以提供 Web 化产品服务。 +本项目提供了低成本搭建端到端FAQ智能问答的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的检索系统模型(召回模型、排序模型)快速搭建一个针对自己业务数据的问答系统,并可以提供 Web 化产品服务。以下是使用预置模型的教程,如果用户想训练并接入自己训练的模型,模型训练可以参考[FAQ Finance](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/question_answering/faq_finance),模型的接入流程参考Pipelines语义检索中Neural Search模型接入流程即可。
@@ -66,10 +66,10 @@ FAQ智能问答数据库的数据来自于[8000 多条保险行业问答数据]( # 我们建议在 GPU 环境下运行本示例,运行速度较快 # 设置 1 个空闲的 GPU 卡,此处假设 0 卡为空闲 GPU export CUDA_VISIBLE_DEVICES=0 -python examples/frequently-asked-question/dense_faq_example.py --device gpu +python examples/FAQ/dense_faq_example.py --device gpu # 如果只有 CPU 机器,可以通过 --device 参数指定 cpu 即可, 运行耗时较长 unset CUDA_VISIBLE_DEVICES -python examples/frequently-asked-question/dense_faq_example.py --device cpu +python examples/FAQ/dense_faq_example.py --device cpu ``` `dense_faq_example.py`中`DensePassageRetriever`和`ErnieRanker`的模型介绍请参考[API介绍](../../API.md) @@ -136,7 +136,7 @@ python rest_api/application.py 8891 Linux 用户推荐采用 Shell 脚本来启动服务:: ```bash -sh examples/frequently-asked-question/run_faq_server.sh +sh examples/FAQ/run_faq_server.sh ``` 启动后可以使用curl命令验证是否成功运行: @@ -155,7 +155,7 @@ python -m streamlit run ui/webapp_faq.py --server.port 8502 Linux 用户推荐采用 Shell 脚本来启动服务:: ```bash -sh examples/frequently-asked-question/run_faq_web.sh +sh examples/FAQ/run_faq_web.sh ``` 到这里您就可以打开浏览器访问 http://127.0.0.1:8502 地址体验FAQ智能问答系统服务了。 diff --git a/pipelines/examples/document-intelligence/docprompt_example.py b/pipelines/examples/document-intelligence/docprompt_example.py index 8ecd3385e532..22fc74c6c17c 100644 --- a/pipelines/examples/document-intelligence/docprompt_example.py +++ b/pipelines/examples/document-intelligence/docprompt_example.py @@ -34,7 +34,7 @@ def docprompt_pipeline(): preprocessor = DocOCRProcessor(use_gpu=use_gpu) docprompter = DocPrompter(use_gpu=use_gpu, batch_size=args.batch_size) - pipe = DocPipeline(preprocessor=preprocessor, modelrunner=docprompter) + pipe = DocPipeline(preprocessor=preprocessor, docreader=docprompter) # image link input meta = { "doc": diff --git a/pipelines/examples/question-answering/README.md b/pipelines/examples/question-answering/README.md index 46f94630284c..065081d27267 100644 --- a/pipelines/examples/question-answering/README.md +++ b/pipelines/examples/question-answering/README.md @@ -12,7 +12,7 @@ ## 2. 产品功能介绍 -本项目提供了低成本搭建端到端问答系统的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的问答系统模型(召回模型、排序模型、阅读理解模型)快速搭建一个针对自己业务数据的问答系统,并可以提供基于[Streamlit](https://streamlit.io/) 的 Web 可视化服务。 +本项目提供了低成本搭建端到端问答系统的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的问答系统模型(召回模型、排序模型、阅读理解模型)快速搭建一个针对自己业务数据的问答系统,并可以提供基于[Streamlit](https://streamlit.io/) 的 Web 可视化服务。以下是使用预置模型的教程,如果用户想训练并接入自己训练的模型,对于召回和排序模型训练可以参考[Neural Search](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search),对于其中的答案抽取模型,训练教程请参考[machine_reading_comprehension](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/machine_reading_comprehension/DuReader-robust),召回和排序模型接入流程参考语义检索的Neural Search接入流程即可,阅读理解模型只需要在加载模型的时候,把模型名称换成您的模型的路径即可。
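下面是把预置模型名称替换为本地微调模型路径的一个示意写法(仅为草图:`./checkpoints/...` 均为假设的占位路径,参数名取自本仓库 pipelines 示例中 `DensePassageRetriever` 与 `ErnieRanker` 的用法,是否支持本地路径以对应节点的实际实现为准):

```python
from pipelines.document_stores import FAISSDocumentStore
from pipelines.nodes import DensePassageRetriever, ErnieRanker

# 假设索引库已按教程离线构建完成
document_store = FAISSDocumentStore.load("faiss_index")

# 示意:将内置模型名称替换为自己训练好的模型路径(以下路径均为假设的占位值)
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="./checkpoints/my_query_encoder",
    passage_embedding_model="./checkpoints/my_passage_encoder",
    embed_title=False,
)
ranker = ErnieRanker(model_name_or_path="./checkpoints/my_ranker")
# 阅读理解(答案抽取)模型同理:在加载时把模型名称换成本地路径即可
```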
diff --git a/pipelines/examples/semantic-search/README.md b/pipelines/examples/semantic-search/README.md index 7e719ffbab79..ae1eb16cec60 100644 --- a/pipelines/examples/semantic-search/README.md +++ b/pipelines/examples/semantic-search/README.md @@ -17,7 +17,7 @@ ## 2. 产品功能介绍 -本项目提供了低成本搭建端到端语义检索系统的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的语义检索系统模型(召回模型、排序模型)快速搭建一个针对自己业务数据的问答系统,并可以提供 Web 化产品服务。以下是使用预置模型的教程,如果用户想接入自己训练的模型,可以参考[Neural Search的流程](./Neural_Search.md)。 +本项目提供了低成本搭建端到端语义检索系统的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的语义检索系统模型(召回模型、排序模型)快速搭建一个针对自己业务数据的问答系统,并可以提供 Web 化产品服务。以下是使用预置模型的教程,如果用户想训练并接入自己训练的模型,模型训练可以参考[Neural Search](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search),接入流程可以参考[Neural Search的流程](./Neural_Search.md)。
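语义检索部分的核心调用逻辑大致如下(仅为示意草图,节点与参数取自本次新增的 `unsupervised_question_answering_example.py`,`faiss_index` 假设已离线构建完成):

```python
from pipelines.document_stores import FAISSDocumentStore
from pipelines.nodes import DensePassageRetriever, ErnieRanker
from pipelines.pipelines import SemanticSearchPipeline
from pipelines.utils import print_documents

# 假设 faiss_index 已通过离线建库流程构建完成
document_store = FAISSDocumentStore.load("faiss_index")

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="rocketqa-zh-dureader-query-encoder",
    passage_embedding_model="rocketqa-zh-dureader-query-encoder",
    embed_title=False,
)
ranker = ErnieRanker(model_name_or_path="rocketqa-zh-dureader-cross-encoder")

pipe = SemanticSearchPipeline(retriever, ranker)
prediction = pipe.run(query="世界上最早的地雷发明者是谁?",
                      params={"Retriever": {"top_k": 50}, "Ranker": {"top_k": 1}})
print_documents(prediction, print_name=False, print_meta=True)
```

其中 `Retriever` 先从索引库召回 top_k 候选,`Ranker` 再对候选做精排,这与教程中描述的检索、排序两阶段流程一致。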
diff --git a/pipelines/examples/unsupervised-question-answering/README.md b/pipelines/examples/unsupervised-question-answering/README.md
new file mode 100644
index 000000000000..71f801016603
--- /dev/null
+++ b/pipelines/examples/unsupervised-question-answering/README.md
@@ -0,0 +1,224 @@
+# 无监督智能检索问答系统
+
+## 1. 场景概述
+
+智能问答(QA)是获取信息和知识的更直接、更高效的方式之一。传统的信息检索系统只能找到相关的文档,而问答系统能够直接找到精准的答案,极大地节省了人们获取信息的时间。问答系统中最关键的挑战之一是标注数据的稀缺性:为目标领域获取问答对或常见问答对(FAQ)的成本很高,需要消耗大量的人力和时间。受上述因素制约,检索式问答系统往往难以落地,解决此问题的一种方法是依据上下文或大量非结构化文本自动生成QA问答对。
+
+本项目,即无监督智能检索问答(基于自动生成问答对的检索式问答),基于PaddleNLP问题生成、UIE、检索式问答能力,支持以非结构化文本作为上下文自动生成QA问答对,生成的问答对语料可以通过无监督的方式构建检索式问答系统。
+
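问答对自动生成部分的核心调用逻辑可以用下面的示意代码概括(仅为草图,节点名称与参数取自本项目新增的示例脚本,实际可用的模型版本以发布为准):

```python
from pipelines.nodes import AnswerExtractor, QuestionGenerator, QAFilter
from pipelines.pipelines import QAGenerationPipeline

# 三个节点分别负责答案抽取、问题生成和问答对过滤
answer_extractor = AnswerExtractor(model="uie-base-answer-extractor",
                                   schema=["答案"],
                                   position_prob=0.01)
question_generator = QuestionGenerator(
    model="unimo-text-1.0-question-generation", num_return_sequences=2)
qa_filter = QAFilter(model="uie-base-qa-filter",
                     schema=["答案"],
                     position_prob=0.1)

pipe = QAGenerationPipeline(answer_extractor=answer_extractor,
                            question_generator=question_generator,
                            qa_filter=qa_filter)

# meta 中的每一条文本都会被当作一个上下文,自动生成候选问答对并过滤
meta = ["世界上最早的电影院是美国洛杉矶的“电气剧场”,建于1902年。"]
prediction = pipe.run(meta=meta, params={"QAFilter": {"is_filter": True}})
print(prediction["filtered_cqa_triples"])
```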
+ +
+
+若开发者已有FAQ语料,请参考FAQ检索式问答。
+## 2. 产品功能介绍
+
+本项目提供了低成本搭建问答对自动生成智能检索问答系统的能力。开发者只需要提供非结构化的纯文本,就可以使用本项目预置的问答对生成模块生成大量的问答对,并基于此快速搭建一个针对自己业务的检索式问答系统,同时提供Web可视化产品服务。Web可视化产品服务支持问答检索、在线问答对生成、在线文件上传和解析、在线索引库更新等功能,用户也可根据需要自行调整。
+
+**【注意】** 以下教程使用预置模型,如果用户想训练并接入自己的模型,可以参考[intelligent-qa](paddle/paddlenlp/unsupervised_qa_pipelines/PaddleNLP/applications/question_answering)。
+
+### 2.1 系统特色
++ 低成本
+  + 可通过自动生成的方式快速大量合成QA语料,大大降低人力成本
+  + 可控性好,合成语料和语义检索解耦,可以人工筛查和删除合成的问答对,也可以添加人工标注的问答对
++ 端到端
+  + 提供包括问答语料生成、索引库构建、模型服务部署、WebUI可视化在内的一整套端到端智能问答系统能力
+  + 支持Txt、Word、PDF、Image等多源数据上传,同时支持离线、在线QA语料生成和ANN数据库更新
++ 效果好
+  + 可通过自动问答对生成提升问答对语料覆盖度,缓解中长尾问题覆盖较少的问题
+  + 依托百度领先的NLP技术,预置效果领先的深度学习模型
+
+## 3. 快速开始: 快速搭建无监督智能检索问答系统
+
+以下是针对 Mac 和 Linux 的搭建流程。
+
+### 3.1 运行环境和安装说明
+
+本项目在以下运行环境中完成验证,详细说明如下,用户也可以在自己的GPU硬件环境下运行:
+
+a. 软件环境:
+- python >= 3.7.0
+- paddlenlp >= 2.4.3
+- paddlepaddle-gpu >=2.3
+- CUDA Version: 10.2
+- NVIDIA Driver Version: 440.64.00
+- Ubuntu 16.04.6 LTS (Docker)
+
+b. 硬件环境:
+
+- NVIDIA Tesla V100 16GB x4卡
+- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+c. 依赖安装:
+首先需要安装PaddlePaddle,PaddlePaddle的安装请参考[官方安装文档](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)。然后需要安装paddle-pipelines依赖,使用pip安装命令如下:
+```bash
+# pip一键安装
+pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+或者进入pipelines目录下,基于源码进行安装:
+```bash
+# 源码进行安装
+cd PaddleNLP/pipelines/
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+python setup.py install
+```
+**【注意】** 以下所有的流程都只需要在`pipelines`根目录下进行,不需要跳转目录
+
+### 3.2 数据说明
+我们以提供的纯文本文件[source_file.txt](https://paddlenlp.bj.bcebos.com/applications/unsupervised_qa/source_file.txt)为例,系统将其中的每一条文本都视为一个上下文并基于此生成多个问答对,然后基于生成的问答对构建索引库。该文件可直接下载放入./data,开发者也可以使用自己的文件。
+
+
+### 3.3 一键体验无监督智能检索问答系统
+
+开发者可以通过如下命令快速体验无监督智能检索问答系统的效果,系统将自动根据提供的纯文本文件构建问答对语料库,并基于生成的问答对语料库构造检索数据库。
+我们建议在GPU环境下运行本示例,运行速度较快,运行命令如下:
+```bash
+# GPU环境下运行示例
+# 设置1个空闲的GPU卡,此处假设0卡为空闲GPU
+export CUDA_VISIBLE_DEVICES=0
+python examples/unsupervised-question-answering/unsupervised_question_answering_example.py --device gpu --source_file data/source_file.txt --doc_dir data/my_data --index_name faiss_index --retriever_batch_size 16
+```
+关键参数释义如下:
+- `device`: 使用的设备,默认为'gpu',可选择['cpu', 'gpu']。
+- `source_file`: 源文件路径,指定该路径将自动为其生成问答对至`doc_dir`。
+- `doc_dir`: 生成的问答对语料保存的位置,系统将根据该位置自动构建检索数据库,默认为'data/my_data'。
+- `index_name`: FAISS的ANN索引名称,默认为'faiss_index'。
+- `retriever_batch_size`: 构建ANN索引时的批量大小,默认为16。
+
+如果只有CPU机器,可以通过--device参数指定cpu即可, 运行耗时较长,运行命令如下:
+```bash
+# CPU环境下运行示例
+unset CUDA_VISIBLE_DEVICES
+python examples/unsupervised-question-answering/unsupervised_question_answering_example.py --device cpu --source_file data/source_file.txt --doc_dir data/my_data
+```
+**【注意】** `unsupervised_question_answering_example.py`中`DensePassageRetriever`和`ErnieRanker`的模型介绍请参考[API介绍](../../API.md)
+
+### 3.4 构建Web可视化无监督智能检索问答系统
+
+整个Web可视化无监督智能检索问答系统主要包含3大组件:
+1. 基于ElasticSearch的ANN服务搭建在线索引库
+2. 基于RestAPI构建模型后端服务
+3. 
基于Streamlit构建前端WebUI + +接下来我们依次搭建这些个服务,得到可视化、可交互的无监督智能检索问答系统。 + + +#### 3.4.1 离线生成问答对语料 +执行以下命令将自动根据提供的纯文本文件离线构建问答对语料库: +```bash +# GPU环境下运行示例 +# 设置1个空闲的GPU卡,此处假设0卡为空闲GPU +export CUDA_VISIBLE_DEVICES=0 +python examples/unsupervised-question-answering/offline_question_answer_pairs_generation.py --device gpu --source_file data/source_file.txt --doc_dir data/my_data +``` +关键参数释义如下: +- `device`: 使用的设备,默认为'gpu',可选择['cpu', 'gpu']。 +- `source_file`: 源文件路径,指定该路径将自动为其生成问答对至`doc_dir`。 +- `doc_dir`: 生成的问答对语料保存的位置,系统将根据该位置自动构建检索数据库,默认为'data/my_data'。 + + +如果只有CPU机器,可以通过--device参数指定cpu即可, 运行耗时较长,运行命令如下: +```bash +# CPU环境下运行示例 +unset CUDA_VISIBLE_DEVICES +python examples/unsupervised-question-answering/offline_question_answer_pairs_generation.py --device cpu --source_file data/source_file.txt --doc_dir data/my_data +``` + +#### 3.4.2 启动ElasticSearch ANN服务 +1. 参考官方文档下载安装 [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。 +2. 启动ElasticSearch服务。 + +首先修改`config/elasticsearch.yml`的配置: +``` +xpack.security.enabled: false +``` +然后启动elasticsearch: +```bash +./bin/elasticsearch +``` +3. 检查确保ElasticSearch服务启动成功。 + +执行以下命令,如果ElasticSearch里面没有数据,结果会输出为空,即{ }。 +```bash +curl http://localhost:9200/_aliases?pretty=true +``` + +备注:ElasticSearch服务默认开启端口为 9200 + +#### 3.4.3 ANN索引库构建 +执行以下命令建立ANN索引库: +``` +python utils/offline_ann.py --index_name my_data \ + --doc_dir data/my_data \ + --split_answers \ + --delete_index +``` +参数含义说明 +* `index_name`: 索引的名称 +* `doc_dir`: txt文本数据的路径 +* `host`: Elasticsearch的IP地址 +* `port`: Elasticsearch的端口号 +* `split_answers`: 是否切分每一行的数据为query和answer两部分 +* `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false + +执行以下命令打印几条数据,检测ANN索引库是否构建成功: +``` +curl http://localhost:9200/my_data/_search +``` +如果索引库正常会输出类似如下的结果: +``` +{"took":1,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":5,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"my_data","_id":"fb308738f2767626d72282f5a35402e5","_score":1.0,"_source":{"content":...... +``` + +#### 3.4.4 启动RestAPI模型后端 +```bash +export CUDA_VISIBLE_DEVICES=0 +# 指定无监督智能检索问答系统的Yaml配置文件 +export PIPELINE_YAML_PATH=rest_api/pipeline/unsupervised_qa.yaml +# 使用端口号8896启动模型服务 +python rest_api/application.py 8896 +``` +Linux 用户推荐采用Shell脚本来启动服务:: + +```bash +sh examples/unsupervised-question-answering/run_unsupervised_question_answering_server.sh +``` +启动后可以使用curl命令验证是否成功运行: +``` +curl -X POST -k http://localhost:8896/query -H 'Content-Type: application/json' -d '{"query": "企业如何办理养老保险?","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}' +``` +如果成功运行,则会返回结果。 + +#### 3.4.5 启动Streamlit WebUI前端 +```bash +# 配置模型服务地址 +export API_ENDPOINT=http://127.0.0.1:8896 +# 在指定端口 8502 启动 WebUI +python -m streamlit run ui/webapp_unsupervised_question_answering.py --server.port 8508 +``` +Linux 用户推荐采用 Shell 脚本来启动服务:: + +```bash +sh examples/unsupervised-question-answering/run_unsupervised_question_answering_web.sh +``` + +到这里您就可以打开浏览器访问地址 http://127.0.0.1:8508 体验无监督智能检索问答系统服务了。 + + + +**【注意】** 如果安装遇见问题可以查看[FAQ文档](../../FAQ.md) + +## Reference +[1]Y. Sun et al., “[ERNIE 3.0: Large-scale Knowledge Enhanced Pre-training for Language Understanding and Generation](https://arxiv.org/pdf/2107.02137.pdf),” arXiv:2107.02137 [cs], Jul. 2021, Accessed: Jan. 17, 2022. [Online]. Available: http://arxiv.org/abs/2107.02137 + +[2]Y. 
Qu et al., “[RocketQA: An Optimized Training Approach to Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2010.08191),” arXiv:2010.08191 [cs], May 2021, Accessed: Aug. 16, 2021. [Online]. Available: http://arxiv.org/abs/2010.08191
+
+[3]H. Tang, H. Li, J. Liu, Y. Hong, H. Wu, and H. Wang, “[DuReader_robust: A Chinese Dataset Towards Evaluating Robustness and Generalization of Machine Reading Comprehension in Real-World Applications](https://arxiv.org/pdf/2004.11142.pdf).” arXiv, Jul. 21, 2021. Accessed: May 15, 2022. [Online]. Available: http://arxiv.org/abs/2004.11142
+
+[4]Li, Wei, et al. "Unimo: Towards unified-modal understanding and generation via cross-modal contrastive learning." arXiv preprint arXiv:2012.15409 (2020).
+
+## Acknowledge
+
+我们借鉴了 Deepset.ai [Haystack](https://github.com/deepset-ai/haystack) 优秀的框架设计,在此对[Haystack](https://github.com/deepset-ai/haystack)作者及其开源社区表示感谢。
+
+We learn from the excellent framework design of Deepset.ai [Haystack](https://github.com/deepset-ai/haystack), and we would like to express our thanks to the authors of Haystack and their open source community.
diff --git a/pipelines/examples/unsupervised-question-answering/offline_question_answer_pairs_generation.py b/pipelines/examples/unsupervised-question-answering/offline_question_answer_pairs_generation.py
new file mode 100644
index 000000000000..1c604799d1ab
--- /dev/null
+++ b/pipelines/examples/unsupervised-question-answering/offline_question_answer_pairs_generation.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+ +import argparse +import logging +import os +from pprint import pprint + +import paddle +from pipelines.nodes import AnswerExtractor, QAFilter, QuestionGenerator +from pipelines.nodes import ErnieRanker, DensePassageRetriever +from pipelines.document_stores import FAISSDocumentStore +from pipelines.utils import convert_files_to_dicts, fetch_archive_from_http, print_documents +from pipelines.pipelines import QAGenerationPipeline, SemanticSearchPipeline + +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run dense_qa system, defaults to gpu.") +parser.add_argument("--doc_dir", default="data/my_data", type=str, help="The question-answer piars file to be loaded when building ANN index.") +parser.add_argument("--source_file", default=None, type=str, help="The source raw texts file to be loaded when creating question-answer pairs.") + +args = parser.parse_args() +# yapf: enable + + +def offline_qa_generation(): + answer_extractor = AnswerExtractor( + model='uie-base-answer-extractor-v1', + device=args.device, + schema=['答案'], + position_prob=0.01, + ) + + question_generator = QuestionGenerator( + model='unimo-text-1.0-question-generator-v1', + device=args.device, + ) + + qa_filter = QAFilter( + model='uie-base-qa-filter-v1', + device=args.device, + schema=['答案'], + position_prob=0.1, + ) + + pipe = QAGenerationPipeline(answer_extractor=answer_extractor, + question_generator=question_generator, + qa_filter=qa_filter) + pipeline_params = {"QAFilter": {"is_filter": True}} + + if args.source_file: + meta = [] + with open(args.source_file, 'r', encoding='utf-8') as rf: + for line in rf: + meta.append(line.strip()) + prediction = pipe.run(meta=meta, params=pipeline_params) + prediction = prediction['filtered_cqa_triples'] + if not os.path.exists(args.doc_dir): + os.makedirs(args.doc_dir) + with open(os.path.join(args.doc_dir, 'generated_qa_pairs.txt'), + 'w', + encoding='utf-8') as wf: + for pair in prediction: + wf.write(pair['synthetic_question'].strip() + '\t' + + pair['synthetic_answer'].strip() + '\n') + + +if __name__ == "__main__": + offline_qa_generation() diff --git a/pipelines/examples/unsupervised-question-answering/run_unsupervised_question_answering_server.sh b/pipelines/examples/unsupervised-question-answering/run_unsupervised_question_answering_server.sh new file mode 100644 index 000000000000..07fe42ebc70e --- /dev/null +++ b/pipelines/examples/unsupervised-question-answering/run_unsupervised_question_answering_server.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# 环境变量设置 +export CUDA_VISIBLE_DEVICES=0 + +# 指定语义检索系统的Yaml配置文件 +export PIPELINE_YAML_PATH=rest_api/pipeline/unsupervised_qa.yaml + +# 使用端口号 8896 启动模型服务 +python rest_api/application.py 8896 \ No newline at end of file diff --git a/pipelines/examples/unsupervised-question-answering/run_unsupervised_question_answering_web.sh b/pipelines/examples/unsupervised-question-answering/run_unsupervised_question_answering_web.sh new file mode 100644 index 000000000000..da69ca06d019 --- /dev/null +++ b/pipelines/examples/unsupervised-question-answering/run_unsupervised_question_answering_web.sh @@ -0,0 +1,23 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# 环境变量设置 +unset http_proxy && unset https_proxy +export CUDA_VISIBLE_DEVICES=0 + +# 配置模型服务地址 +export API_ENDPOINT=http://127.0.0.1:8896 + +# 在指定端口8896启动WebUI +python -m streamlit run ui/webapp_unsupervised_question_answering.py --server.port 8508 \ No newline at end of file diff --git a/pipelines/examples/unsupervised-question-answering/unsupervised_question_answering_example.py b/pipelines/examples/unsupervised-question-answering/unsupervised_question_answering_example.py new file mode 100644 index 000000000000..890e61c4c9a0 --- /dev/null +++ b/pipelines/examples/unsupervised-question-answering/unsupervised_question_answering_example.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import logging +import os +from pprint import pprint + +import paddle +from pipelines.nodes import AnswerExtractor, QAFilter, QuestionGenerator +from pipelines.nodes import ErnieRanker, DensePassageRetriever +from pipelines.document_stores import FAISSDocumentStore +from pipelines.utils import convert_files_to_dicts, fetch_archive_from_http, print_documents +from pipelines.pipelines import QAGenerationPipeline, SemanticSearchPipeline + +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run dense_qa system, defaults to gpu.") +parser.add_argument("--index_name", default='faiss_index', type=str, help="The ann index name of FAISS.") +parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.") +parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.") +parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.") +parser.add_argument("--doc_dir", default="data/my_data", type=str, help="The question-answer piars file to be loaded when building ANN index.") +parser.add_argument("--source_file", default=None, type=str, help="The source raw texts file to be loaded when creating question-answer pairs.") + +args = parser.parse_args() +# yapf: enable + + +def dense_faq_pipeline(): + use_gpu = True if args.device == 'gpu' else False + faiss_document_store = "faiss_document_store.db" + if os.path.exists(args.index_name) and os.path.exists(faiss_document_store): + # connect to existed FAISS Index + document_store = FAISSDocumentStore.load(args.index_name) + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="rocketqa-zh-dureader-query-encoder", + passage_embedding_model="rocketqa-zh-dureader-query-encoder", + max_seq_len_query=args.max_seq_len_query, + max_seq_len_passage=args.max_seq_len_passage, + batch_size=args.retriever_batch_size, + use_gpu=use_gpu, + embed_title=False, + ) + else: + dicts = convert_files_to_dicts(dir_path=args.doc_dir, + split_paragraphs=True, + split_answers=True, + encoding='utf-8') + + if os.path.exists(args.index_name): + os.remove(args.index_name) + if os.path.exists(faiss_document_store): + os.remove(faiss_document_store) + + document_store = FAISSDocumentStore(embedding_dim=768, + faiss_index_factory_str="Flat") + document_store.write_documents(dicts) + + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="rocketqa-zh-dureader-query-encoder", + passage_embedding_model="rocketqa-zh-dureader-query-encoder", + max_seq_len_query=args.max_seq_len_query, + max_seq_len_passage=args.max_seq_len_passage, + batch_size=args.retriever_batch_size, + use_gpu=use_gpu, + embed_title=False, + ) + + # update Embedding + document_store.update_embeddings(retriever) + + # save index + document_store.save(args.index_name) + + ### Ranker + ranker = ErnieRanker( + model_name_or_path="rocketqa-zh-dureader-cross-encoder", + use_gpu=use_gpu) + + pipe = SemanticSearchPipeline(retriever, ranker) + + pipeline_params = {"Retriever": {"top_k": 50}, "Ranker": {"top_k": 1}} + prediction = pipe.run(query="世界上最早的地雷发明者是谁?", params=pipeline_params) + + print_documents(prediction, print_name=False, print_meta=True) + + +def qa_generation_pipeline(): + answer_extractor = 
AnswerExtractor( + model='uie-base-answer-extractor', + device=args.device, + schema=['答案'], + max_answer_candidates=3, + position_prob=0.01, + ) + + question_generator = QuestionGenerator( + model='unimo-text-1.0-question-generation', + device=args.device, + num_return_sequences=2, + ) + + qa_filter = QAFilter( + model='uie-base-qa-filter', + device=args.device, + schema=['答案'], + position_prob=0.1, + ) + + pipe = QAGenerationPipeline(answer_extractor=answer_extractor, + question_generator=question_generator, + qa_filter=qa_filter) + pipeline_params = {"QAFilter": {"is_filter": True}} + + # list example + meta = [ + "世界上最早的电影院是美国洛杉矶的“电气剧场”,建于1902年。", + "以脸书为例,2020年时,54%的成年人表示,他们从该平台获取新闻。而现在,这个数字下降到了44%。与此同时,YouTube在过去几年里一直保持平稳,约有三分之一的用户在该平台上获取新闻。" + ] + prediction = pipe.run(meta=meta, params=pipeline_params) + prediction = prediction['filtered_cqa_triples'] + pprint(prediction) + + # file example + if args.source_file: + meta = [] + with open(args.source_file, 'r', encoding='utf-8') as rf: + for line in rf: + meta.append(line.strip()) + prediction = pipe.run(meta=meta, params=pipeline_params) + prediction = prediction['filtered_cqa_triples'] + if not os.path.exists(args.doc_dir): + os.makedirs(args.doc_dir) + with open(os.path.join(args.doc_dir, 'generated_qa_pairs.txt'), + 'w', + encoding='utf-8') as wf: + for pair in prediction: + wf.write(pair['synthetic_question'].strip() + '\t' + + pair['synthetic_answer'].strip() + '\n') + + +if __name__ == "__main__": + qa_generation_pipeline() + dense_faq_pipeline() diff --git a/pipelines/pipelines/__init__.py b/pipelines/pipelines/__init__.py index 4e4edb9dba82..a704e7362caa 100644 --- a/pipelines/pipelines/__init__.py +++ b/pipelines/pipelines/__init__.py @@ -37,11 +37,9 @@ from pipelines.schema import Document, Answer, Label, Span from pipelines.nodes import BaseComponent from pipelines.pipelines import Pipeline -from pipelines.pipelines.standard_pipelines import (BaseStandardPipeline, - ExtractiveQAPipeline, - SemanticSearchPipeline, - DocPipeline, - TextToImagePipeline) +from pipelines.pipelines.standard_pipelines import ( + BaseStandardPipeline, ExtractiveQAPipeline, SemanticSearchPipeline, + TextToImagePipeline, QAGenerationPipeline, DocPipeline) import pandas as pd pd.options.display.max_colwidth = 80 diff --git a/pipelines/pipelines/nodes/__init__.py b/pipelines/pipelines/nodes/__init__.py index 4b2a2e02aacb..ab753d078255 100644 --- a/pipelines/pipelines/nodes/__init__.py +++ b/pipelines/pipelines/nodes/__init__.py @@ -31,3 +31,5 @@ from pipelines.nodes.retriever import BaseRetriever, DensePassageRetriever from pipelines.nodes.document import DocOCRProcessor, DocPrompter from pipelines.nodes.text_to_image_generator import ErnieTextToImageGenerator +from pipelines.nodes.answer_extractor import AnswerExtractor, QAFilter, AnswerExtractorPreprocessor, QAFilterPostprocessor +from pipelines.nodes.question_generator import QuestionGenerator diff --git a/pipelines/pipelines/nodes/answer_extractor/__init__.py b/pipelines/pipelines/nodes/answer_extractor/__init__.py new file mode 100644 index 000000000000..87089b190666 --- /dev/null +++ b/pipelines/pipelines/nodes/answer_extractor/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pipelines.nodes.answer_extractor.answer_extractor import AnswerExtractor +from pipelines.nodes.answer_extractor.answer_extractor_preprocessor import AnswerExtractorPreprocessor +from pipelines.nodes.answer_extractor.qa_filter import QAFilter +from pipelines.nodes.answer_extractor.qa_filter_postprocessor import QAFilterPostprocessor diff --git a/pipelines/pipelines/nodes/answer_extractor/answer_extractor.py b/pipelines/pipelines/nodes/answer_extractor/answer_extractor.py new file mode 100644 index 000000000000..01b9affe8f29 --- /dev/null +++ b/pipelines/pipelines/nodes/answer_extractor/answer_extractor.py @@ -0,0 +1,192 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 deepset GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import json +import sys +import argparse +import re +from tqdm import tqdm + +import paddle +from paddlenlp import Taskflow +from pipelines.nodes.base import BaseComponent +from paddlenlp.utils.env import PPNLP_HOME +from paddlenlp.taskflow.utils import download_file +from paddle.dataset.common import md5file + + +class AnswerExtractor(BaseComponent): + """ + Answer Extractor based on Universal Information Extraction. 
+ """ + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json" + } + + resource_files_urls = { + "uie-base-answer-extractor": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/pipelines/answer_generator/uie-base-answer-extractor/uie-base-answer-extractor-v1/model_state.pdparams", + "c8619f631a0c20434199840d34bb8b8c" + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/pipelines/answer_generator/uie-base-answer-extractor/uie-base-answer-extractor-v1/model_config.json", + "74f033ab874a1acddb3aec9b9c4d9cde" + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/pipelines/answer_generator/uie-base-answer-extractor/uie-base-answer-extractor-v1/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8" + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/pipelines/answer_generator/uie-base-answer-extractor/uie-base-answer-extractor-v1/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec" + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/pipelines/answer_generator/uie-base-answer-extractor/uie-base-answer-extractor-v1/tokenizer_config.json", + "3e623b57084882fd73e17f544bdda47d" + ] + }, + } + + return_no_answers: bool + outgoing_edges = 1 + query_count = 0 + query_time = 0 + + def __init__(self, + model='uie-base-answer-extractor', + schema=['答案'], + task_path=None, + device="gpu", + batch_size=64, + position_prob=0.01, + max_answer_candidates=5): + paddle.set_device(device) + self.model = model + self._from_taskflow = False + self._custom_model = False + if task_path: + self._task_path = task_path + self._custom_model = True + else: + if model in ["uie-base"]: + self._task_path = None + self._from_taskflow = True + else: + self._task_path = os.path.join( + PPNLP_HOME, "pipelines", "unsupervised_question_answering", + self.model) + self._check_task_files() + self.batch_size = batch_size + self.max_answer_candidates = max_answer_candidates + self.schema = schema + self.answer_generator = Taskflow( + "information_extraction", + model=self.model if self._from_taskflow else "uie-base", + schema=schema, + task_path=self._task_path, + batch_size=batch_size, + position_prob=position_prob) + + def _check_task_files(self): + """ + Check files required by the task. 
+ """ + for file_id, file_name in self.resource_files_names.items(): + path = os.path.join(self._task_path, file_name) + url = self.resource_files_urls[self.model][file_id][0] + md5 = self.resource_files_urls[self.model][file_id][1] + + downloaded = True + if not os.path.exists(path): + downloaded = False + else: + if not self._custom_model: + if os.path.exists(path): + # Check whether the file is updated + if not md5file(path) == md5: + downloaded = False + if file_id == "model_state": + self._param_updated = True + else: + downloaded = False + if not downloaded: + download_file(self._task_path, file_name, url, md5) + + def answer_generation_from_paragraphs(self, + paragraphs, + batch_size=16, + model=None, + max_answer_candidates=5, + schema=None, + wf=None): + """Generate answer from given paragraphs.""" + result = [] + buffer = [] + i = 0 + len_paragraphs = len(paragraphs) + for paragraph_tobe in tqdm(paragraphs): + buffer.append(paragraph_tobe) + if len(buffer) == batch_size or (i + 1) == len_paragraphs: + predicts = model(buffer) + paragraph_list = buffer + buffer = [] + for predict_dict, paragraph in zip(predicts, paragraph_list): + answers = [] + probabilitys = [] + for prompt in schema: + if prompt in predict_dict: + answer_dicts = predict_dict[prompt] + answers += [ + answer_dict['text'] + for answer_dict in answer_dicts + ] + probabilitys += [ + answer_dict['probability'] + for answer_dict in answer_dicts + ] + else: + answers += [] + probabilitys += [] + candidates = sorted(list( + set([(a, p) for a, p in zip(answers, probabilitys)])), + key=lambda x: -x[1]) + if len(candidates) > max_answer_candidates: + candidates = candidates[:max_answer_candidates] + outdict = { + 'context': paragraph, + 'answer_candidates': candidates, + } + if wf: + wf.write(json.dumps(outdict, ensure_ascii=False) + "\n") + result.append(outdict) + i += 1 + return result + + def run(self, meta): + print('createing synthetic answers...') + synthetic_context_answer_pairs = self.answer_generation_from_paragraphs( + meta, + batch_size=self.batch_size, + model=self.answer_generator, + max_answer_candidates=self.max_answer_candidates, + schema=self.schema, + wf=None) + results = {"ca_pairs": synthetic_context_answer_pairs} + return results, "output_1" diff --git a/pipelines/pipelines/nodes/answer_extractor/answer_extractor_preprocessor.py b/pipelines/pipelines/nodes/answer_extractor/answer_extractor_preprocessor.py new file mode 100644 index 000000000000..b5cec953a86e --- /dev/null +++ b/pipelines/pipelines/nodes/answer_extractor/answer_extractor_preprocessor.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 deepset GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pipelines.nodes.base import BaseComponent +import paddle + + +class AnswerExtractorPreprocessor(BaseComponent): + """ + Answer Extractor Preprocessor used to preprocess the result of textconvert. 
+ """ + return_no_answers: bool + outgoing_edges = 1 + query_count = 0 + query_time = 0 + + def __init__(self, device="gpu"): + paddle.set_device(device) + + def run(self, documents): + results = {"meta": [document['content'] for document in documents]} + return results, "output_1" diff --git a/pipelines/pipelines/nodes/answer_extractor/qa_filter.py b/pipelines/pipelines/nodes/answer_extractor/qa_filter.py new file mode 100644 index 000000000000..6a74dabc2d70 --- /dev/null +++ b/pipelines/pipelines/nodes/answer_extractor/qa_filter.py @@ -0,0 +1,227 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 deepset GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import json +import sys +import argparse +import re +from tqdm import tqdm + +import paddle +from paddlenlp import Taskflow +from pipelines.nodes.base import BaseComponent +from paddlenlp.utils.env import PPNLP_HOME +from paddlenlp.taskflow.utils import download_file +from paddle.dataset.common import md5file + + +class QAFilter(BaseComponent): + """ + Question Answer Pairs Filter based on Universal Information Extraction. + """ + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json" + } + + resource_files_urls = { + "uie-base-qa-filter": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/pipelines/qa_filter/uie-base-qa-filter-v1/model_state.pdparams", + "feb2d076fa2f78a0d3c3e3d20e9d5dc5" + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/pipelines/qa_filter/uie-base-qa-filter-v1/model_config.json", + "74f033ab874a1acddb3aec9b9c4d9cde" + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/pipelines/qa_filter/uie-base-qa-filter-v1/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8" + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/pipelines/qa_filter/uie-base-qa-filter-v1/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec" + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/pipelines/qa_filter/uie-base-qa-filter-v1/tokenizer_config.json", + "3e623b57084882fd73e17f544bdda47d" + ] + }, + } + + return_no_answers: bool + outgoing_edges = 1 + query_count = 0 + query_time = 0 + + def __init__( + self, + model='uie-base-qa-filter', + schema=['答案'], + task_path=None, + device="gpu", + batch_size=64, + position_prob=0.1, + ): + paddle.set_device(device) + self.model = model + self._custom_model = False + self._from_taskflow = False + if task_path: + self._task_path = task_path + self._custom_model = True + else: + if model in ["uie-base"]: + self._task_path = None + self._from_taskflow = True + else: + self._task_path = os.path.join( + PPNLP_HOME, "pipelines", "unsupervised_question_answering", + self.model) + self._check_task_files() + self.batch_size = batch_size + self.schema = schema + self.filtration_model = Taskflow( + "information_extraction", + 
model=self.model if self._from_taskflow else "uie-base", + schema=schema, + task_path=self._task_path, + batch_size=batch_size, + position_prob=position_prob) + + def _check_task_files(self): + """ + Check files required by the task. + """ + for file_id, file_name in self.resource_files_names.items(): + path = os.path.join(self._task_path, file_name) + url = self.resource_files_urls[self.model][file_id][0] + md5 = self.resource_files_urls[self.model][file_id][1] + + downloaded = True + if not os.path.exists(path): + downloaded = False + else: + if not self._custom_model: + if os.path.exists(path): + # Check whether the file is updated + if not md5file(path) == md5: + downloaded = False + if file_id == "model_state": + self._param_updated = True + else: + downloaded = False + if not downloaded: + download_file(self._task_path, file_name, url, md5) + + def filtration(self, + paragraphs, + batch_size=16, + model=None, + schema=None, + wf=None, + wf_debug=None): + result = [] + buffer = [] + valid_num, invalid_num = 0, 0 + i = 0 + len_paragraphs = len(paragraphs) + for paragraph_tobe in tqdm(paragraphs): + buffer.append(paragraph_tobe) + if len(buffer) == batch_size or (i + 1) == len_paragraphs: + model_inputs = [] + for d in buffer: + context = d['context'] + synthetic_question = d['synthetic_question'] + prefix = '问题:' + synthetic_question + '上下文:' + content = prefix + context + model_inputs.append(content) + predicts = model(model_inputs) + paragraph_list = buffer + buffer = [] + for predict_dict, paragraph in zip(predicts, paragraph_list): + context = paragraph['context'] + synthetic_question = paragraph['synthetic_question'] + synthetic_question_probability = paragraph[ + 'synthetic_question_probability'] + synthetic_answer = paragraph['synthetic_answer'] + synthetic_answer_probability = paragraph[ + 'synthetic_answer_probability'] + + answers = [] + probabilitys = [] + for prompt in schema: + if prompt in predict_dict: + answer_dicts = predict_dict[prompt] + answers += [ + answer_dict['text'] + for answer_dict in answer_dicts + ] + probabilitys += [ + answer_dict['probability'] + for answer_dict in answer_dicts + ] + else: + answers += [] + probabilitys += [] + candidates = [ + an for an, pro in sorted([( + a, p) for a, p in zip(answers, probabilitys)], + key=lambda x: -x[1]) + ] + out_dict = { + 'context': + context, + 'synthetic_answer': + synthetic_answer, + 'synthetic_answer_probability': + synthetic_answer_probability, + 'synthetic_question': + synthetic_question, + 'synthetic_question_probability': + synthetic_question_probability, + } + if synthetic_answer in candidates: + if wf: + wf.write( + json.dumps(out_dict, ensure_ascii=False) + "\n") + result.append(out_dict) + valid_num += 1 + else: + if wf_debug: + wf_debug.write( + json.dumps(out_dict, ensure_ascii=False) + "\n") + invalid_num += 1 + i += 1 + print('valid synthetic question-answer pairs number:', valid_num) + print('invalid sythetic question-answer pairs numbewr:', invalid_num) + return result + + def run(self, cqa_triples, is_filter=True): + if is_filter: + print('filtering synthetic question-answer pairs...') + filtered_cqa_triples = self.filtration(cqa_triples, + batch_size=self.batch_size, + model=self.filtration_model, + schema=self.schema) + print('filter synthetic question-answer pairs successfully!') + else: + filtered_cqa_triples = cqa_triples + + results = {"filtered_cqa_triples": filtered_cqa_triples} + return results, "output_1" diff --git 
a/pipelines/pipelines/nodes/answer_extractor/qa_filter_postprocessor.py b/pipelines/pipelines/nodes/answer_extractor/qa_filter_postprocessor.py new file mode 100644 index 000000000000..4177870c2fd5 --- /dev/null +++ b/pipelines/pipelines/nodes/answer_extractor/qa_filter_postprocessor.py @@ -0,0 +1,44 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pipelines.nodes.base import BaseComponent +import paddle + + +class QAFilterPostprocessor(BaseComponent): + """ + QA Filter Postprocessor used to postprocess the result of qa filter. + """ + + return_no_answers: bool + outgoing_edges = 1 + query_count = 0 + query_time = 0 + + def __init__(self, device="gpu"): + paddle.set_device(device) + + def run(self, filtered_cqa_triples): + results = { + "documents": [{ + 'content': triple['synthetic_question'], + 'content_type': 'text', + 'meta': { + 'answer': triple['synthetic_answer'], + '_split_id': 0 + } + } for triple in filtered_cqa_triples] + } + return results, "output_1" diff --git a/pipelines/pipelines/nodes/question_generator/__init__.py b/pipelines/pipelines/nodes/question_generator/__init__.py new file mode 100644 index 000000000000..9d52d4007895 --- /dev/null +++ b/pipelines/pipelines/nodes/question_generator/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pipelines.nodes.question_generator.question_generator import QuestionGenerator diff --git a/pipelines/pipelines/nodes/question_generator/question_generator.py b/pipelines/pipelines/nodes/question_generator/question_generator.py new file mode 100644 index 000000000000..b50c60ad5235 --- /dev/null +++ b/pipelines/pipelines/nodes/question_generator/question_generator.py @@ -0,0 +1,279 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
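The answer-extraction and filtering nodes above both delegate to the `information_extraction` Taskflow: the schema entries (here `['答案']`) act as extraction prompts, and `position_prob` controls how permissively candidate spans are kept. A minimal sketch of that underlying call, assuming the stock `uie-base` weights can be downloaded (the sample paragraph is illustrative):

```python
# Minimal sketch of the extraction call that AnswerExtractor and QAFilter build on.
# Assumes paddlenlp is installed and the "uie-base" weights can be downloaded.
from paddlenlp import Taskflow

extractor = Taskflow("information_extraction",
                     model="uie-base",
                     schema=["答案"],       # prompt used for answer-span extraction
                     position_prob=0.01,    # keep low-probability spans as candidates
                     batch_size=16)

paragraphs = ["深度学习是机器学习的一个分支,它使用多层神经网络从数据中学习特征。"]
for predict_dict in extractor(paragraphs):
    # Each prediction maps the prompt to a list of {"text": ..., "probability": ...}
    for span in predict_dict.get("答案", []):
        print(span["text"], span["probability"])
```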
+ +import os +import json +import sys +import argparse +import re +from tqdm import tqdm + +import paddle +from paddlenlp import Taskflow +from pipelines.nodes.base import BaseComponent +from paddlenlp.utils.env import PPNLP_HOME +from paddlenlp.taskflow.utils import download_file +from paddle.dataset.common import md5file + + +class QuestionGenerator(BaseComponent): + """ + Question Generator based on Unimo Text. + """ + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json" + } + + resource_files_urls = { + "unimo-text-1.0-question-generator": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/model_state.pdparams", + "856a2980f83dc227a8fed4ecd730696d" + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/model_config.json", + "b5bab534683d9f0ef82fc84803ee6f3d" + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/vocab.txt", + "ea3f8a8cc03937a8df165d2b507c551e" + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec" + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/tokenizer_config.json", + "ef261f5d413a46ed1d6f071aed6fb345" + ] + }, + } + + return_no_answers: bool + outgoing_edges = 1 + query_count = 0 + query_time = 0 + + def __init__(self, + model='unimo-text-1.0-question-generation', + task_path=None, + device="gpu", + batch_size=16, + output_scores=True, + is_select_from_num_return_sequences=False, + max_length=50, + decode_strategy="sampling", + temperature=1.0, + top_k=5, + top_p=1.0, + num_beams=6, + num_beam_groups=1, + diversity_rate=0.0, + num_return_sequences=1, + template=1): + paddle.set_device(device) + self.model = model + self._from_taskflow = False + self._custom_model = False + if task_path: + self._task_path = task_path + self._custom_model = True + else: + if model in [ + "unimo-text-1.0", "unimo-text-1.0-dureader_qg", + "unimo-text-1.0-question-generation", + "unimo-text-1.0-question-generation-dureader_qg" + ]: + self._task_path = None + self._from_taskflow = True + else: + self._task_path = os.path.join( + PPNLP_HOME, "pipelines", "unsupervised_question_answering", + self.model) + self._check_task_files() + self.model = "unimo-text-1.0" + self.num_return_sequences = num_return_sequences + self.batch_size = batch_size + if self._from_taskflow: + self.question_generation = Taskflow( + "question_generation", + model=self.model if self._from_taskflow else "unimo-text-1.0", + output_scores=True, + max_length=max_length, + is_select_from_num_return_sequences= + is_select_from_num_return_sequences, + num_return_sequences=num_return_sequences, + batch_size=batch_size, + decode_strategy=decode_strategy, + num_beams=num_beams, + num_beam_groups=num_beam_groups, + diversity_rate=diversity_rate, + top_k=top_k, + top_p=top_p, + temperature=temperature, + template=1) + else: + self.question_generation = Taskflow( + "question_generation", + model=self.model if self._from_taskflow else "unimo-text-1.0", + task_path=self._task_path, + output_scores=True, + max_length=max_length, + 
is_select_from_num_return_sequences= + is_select_from_num_return_sequences, + num_return_sequences=num_return_sequences, + batch_size=batch_size, + decode_strategy=decode_strategy, + num_beams=num_beams, + num_beam_groups=num_beam_groups, + diversity_rate=diversity_rate, + top_k=top_k, + top_p=top_p, + temperature=temperature, + template=1) + + def _check_task_files(self): + """ + Check files required by the task. + """ + for file_id, file_name in self.resource_files_names.items(): + path = os.path.join(self._task_path, file_name) + url = self.resource_files_urls[self.model][file_id][0] + md5 = self.resource_files_urls[self.model][file_id][1] + + downloaded = True + if not os.path.exists(path): + downloaded = False + else: + if not self._custom_model: + if os.path.exists(path): + # Check whether the file is updated + if not md5file(path) == md5: + downloaded = False + if file_id == "model_state": + self._param_updated = True + else: + downloaded = False + if not downloaded: + download_file(self._task_path, file_name, url, md5) + + def create_question(self, + json_file_or_pair_list, + out_json=None, + num_return_sequences=1, + all_sample_num=None, + batch_size=8): + if out_json: + wf = open(out_json, 'w', encoding='utf-8') + if isinstance(json_file_or_pair_list, list): + all_lines = json_file_or_pair_list + else: + rf = open(json_file_or_pair_list, 'r', encoding='utf-8') + all_lines = [] + for json_line in rf: + line_dict = json.loads(json_line) + all_lines.append(line_dict) + rf.close() + num_all_lines = len(all_lines) + output = [] + context_buffer = [] + answer_buffer = [] + answer_probability_buffer = [] + true_question_buffer = [] + i = 0 + for index, line_dict in enumerate(tqdm(all_lines)): + if 'question' in line_dict: + q = line_dict['question'] + else: + q = '' + c = line_dict['context'] + assert 'answer_candidates' in line_dict + answers = line_dict['answer_candidates'] + if not answers: + continue + for j, pair in enumerate(answers): + a, p = pair + context_buffer += [c] + answer_buffer += [a] + answer_probability_buffer += [p] + true_question_buffer += [q] + if (i + 1) % batch_size == 0 or ( + all_sample_num and + (i + 1) == all_sample_num) or ((index + 1) == num_all_lines + and j == len(answers) - 1): + result_buffer = self.question_generation([{ + 'context': context, + 'answer': answer + } for context, answer in zip(context_buffer, answer_buffer) + ]) + context_buffer_temp, answer_buffer_temp, answer_probability_buffer_temp, true_question_buffer_temp = [], [], [], [] + for context, answer, answer_probability, true_question in zip( + context_buffer, answer_buffer, + answer_probability_buffer, true_question_buffer): + context_buffer_temp += [context] * num_return_sequences + answer_buffer_temp += [answer] * num_return_sequences + answer_probability_buffer_temp += [ + answer_probability + ] * num_return_sequences + true_question_buffer_temp += [true_question + ] * num_return_sequences + result_one_two_buffer = [ + (one, two) + for one, two in zip(result_buffer[0], result_buffer[1]) + ] + for context, answer, answer_probability, true_question, result in zip( + context_buffer_temp, answer_buffer_temp, + answer_probability_buffer_temp, + true_question_buffer_temp, result_one_two_buffer): + fake_quesitons_tokens = [result[0]] + fake_quesitons_scores = [result[1]] + for fake_quesitons_token, fake_quesitons_score in zip( + fake_quesitons_tokens, fake_quesitons_scores): + out_dict = { + 'context': context, + 'synthetic_answer': answer, + 'synthetic_answer_probability': + 
answer_probability, + 'synthetic_question': fake_quesitons_token, + 'synthetic_question_probability': + fake_quesitons_score, + 'true_question': true_question, + } + if out_json: + wf.write( + json.dumps(out_dict, ensure_ascii=False) + + "\n") + output.append(out_dict) + context_buffer = [] + answer_buffer = [] + true_question_buffer = [] + if all_sample_num and (i + 1) >= all_sample_num: + break + i += 1 + if out_json: + wf.close() + return output + + def run(self, ca_pairs): + print('createing synthetic question-answer pairs...') + synthetic_context_answer_question_triples = self.create_question( + ca_pairs, None, self.num_return_sequences, None, self.batch_size) + print('create synthetic question-answer pairs successfully!') + results = {"cqa_triples": synthetic_context_answer_question_triples} + return results, "output_1" diff --git a/pipelines/pipelines/nodes/ranker/ernie_ranker.py b/pipelines/pipelines/nodes/ranker/ernie_ranker.py index 15651d30414d..c9904ede4544 100644 --- a/pipelines/pipelines/nodes/ranker/ernie_ranker.py +++ b/pipelines/pipelines/nodes/ranker/ernie_ranker.py @@ -197,8 +197,7 @@ def predict_batch( right_idx = 0 for number in number_of_docs: right_idx = left_idx + number - grouped_predictions.append( - similarity_scores[left_idx:right_idx]) + grouped_predictions.append(preds[left_idx:right_idx]) left_idx = right_idx result = [] for pred_group, doc_group in zip(grouped_predictions, documents): @@ -212,7 +211,6 @@ def predict_batch( doc for _, doc in sorted_scores_and_documents ] result.append(sorted_documents[:top_k]) - return result def _preprocess_batch_queries_and_docs( diff --git a/pipelines/pipelines/pipelines/__init__.py b/pipelines/pipelines/pipelines/__init__.py index 40f7714d6b5c..3929f9061228 100644 --- a/pipelines/pipelines/pipelines/__init__.py +++ b/pipelines/pipelines/pipelines/__init__.py @@ -13,8 +13,12 @@ # limitations under the License. from pipelines.pipelines.base import Pipeline, RootNode -from pipelines.pipelines.standard_pipelines import (BaseStandardPipeline, - ExtractiveQAPipeline, - SemanticSearchPipeline, - DocPipeline, - TextToImagePipeline) \ No newline at end of file + +from pipelines.pipelines.standard_pipelines import ( + BaseStandardPipeline, + ExtractiveQAPipeline, + SemanticSearchPipeline, + DocPipeline, + TextToImagePipeline, + QAGenerationPipeline, +) diff --git a/pipelines/pipelines/pipelines/base.py b/pipelines/pipelines/pipelines/base.py index 2ef81c4d5f49..63bad9ad17c4 100644 --- a/pipelines/pipelines/pipelines/base.py +++ b/pipelines/pipelines/pipelines/base.py @@ -262,8 +262,9 @@ def load_from_yaml(cls, variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an `_` sign must be used to specify nested hierarchical properties. """ - pipeline_config = read_pipeline_config_from_yaml(path) + print(pipeline_config) + print(pipeline_name) if pipeline_config["version"] != __version__: logger.warning( f"YAML version ({pipeline_config['version']}) does not match with pipelines version ({__version__}). " @@ -823,10 +824,13 @@ def load_from_config(cls, overwrite_with_env_variables=overwrite_with_env_variables) pipeline = cls() - + print(pipeline_definition) components: dict = {} # instances of component objects. 
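`QuestionGenerator.create_question` above batches `{'context': ..., 'answer': ...}` dicts and forwards them to the `question_generation` Taskflow; with `output_scores=True` the call returns the generated questions together with their scores, which is what `result_buffer[0]` and `result_buffer[1]` unpack. A small sketch of that call, assuming the `unimo-text-1.0-question-generation` weights can be downloaded (the sample pair is illustrative):

```python
# Sketch of the generation call made inside QuestionGenerator.create_question.
# Assumes the "unimo-text-1.0-question-generation" weights can be downloaded.
from paddlenlp import Taskflow

question_generation = Taskflow("question_generation",
                               model="unimo-text-1.0-question-generation",
                               output_scores=True,
                               num_return_sequences=1,
                               batch_size=4)

pairs = [{"context": "巴黎是法国的首都,也是法国最大的城市。", "answer": "巴黎"}]
questions, scores = question_generation(pairs)
print(questions[0], scores[0])
```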
for node in pipeline_definition["nodes"]:
+            print('node', node)
            name = node["name"]
+            if name == 'QAFilterPostprocessor':
+                print('exit')
            component = cls._load_or_get_component(
                name=name,
                definitions=component_definitions,
diff --git a/pipelines/pipelines/pipelines/standard_pipelines.py b/pipelines/pipelines/pipelines/standard_pipelines.py
index 91964e3f1c0f..7f8ed86fbb1c 100644
--- a/pipelines/pipelines/pipelines/standard_pipelines.py
+++ b/pipelines/pipelines/pipelines/standard_pipelines.py
@@ -25,6 +25,8 @@
from pipelines.nodes.retriever import BaseRetriever
from pipelines.document_stores import BaseDocumentStore
from pipelines.nodes.text_to_image_generator import ErnieTextToImageGenerator
+from pipelines.nodes.answer_extractor import AnswerExtractor, QAFilter
+from pipelines.nodes.question_generator import QuestionGenerator
from pipelines.pipelines import Pipeline
from pipelines.nodes.base import BaseComponent
@@ -272,17 +274,17 @@ class DocPipeline(BaseStandardPipeline):
    Pipeline for document intelligence.
    """
-    def __init__(self, preprocessor: BaseComponent, modelrunner: BaseComponent):
+    def __init__(self, preprocessor: BaseComponent, docreader: BaseComponent):
        """
        :param preprocessor: file/image preprocessor instance
-        :param modelrunner: document model runner instance
+        :param docreader: document model runner instance
        """
        self.pipeline = Pipeline()
        self.pipeline.add_node(component=preprocessor,
                               name="PreProcessor",
                               inputs=["Query"])
-        self.pipeline.add_node(component=modelrunner,
-                               name="Runner",
+        self.pipeline.add_node(component=docreader,
+                               name="Reader",
                               inputs=["PreProcessor"])

    def run(self,
@@ -331,3 +333,41 @@ def run_batch(
            params=params,
            debug=debug)
        return output
+
+
+class QAGenerationPipeline(BaseStandardPipeline):
+    """
+    Pipeline for unsupervised question-answer pair generation.
+    """
+
+    def __init__(self, answer_extractor: AnswerExtractor,
+                 question_generator: QuestionGenerator, qa_filter: QAFilter):
+        """
+        :param answer_extractor: AnswerExtractor instance
+        :param question_generator: QuestionGenerator instance
+        :param qa_filter: QAFilter instance
+        """
+        self.pipeline = Pipeline()
+        self.pipeline.add_node(component=answer_extractor,
+                               name="AnswerExtractor",
+                               inputs=["Query"])
+        self.pipeline.add_node(component=question_generator,
+                               name="QuestionGenerator",
+                               inputs=["AnswerExtractor"])
+        self.pipeline.add_node(component=qa_filter,
+                               name="QAFilter",
+                               inputs=["QuestionGenerator"])
+
+    def run(self,
+            meta: List[str],
+            params: Optional[dict] = None,
+            debug: Optional[bool] = None):
+        """
+        :param meta: list of raw document texts to generate question-answer pairs from.
+        :param params: params for the pipeline nodes. For instance, params={"QAFilter": {"is_filter": True}}
+        :param debug: Whether the pipeline should instruct nodes to collect debug information
+              about their execution. By default these include the input parameters
+              they received and the output they generated.
+ All debug information can then be found in the dict returned + by this method under the key "_debug" + """ + output = self.pipeline.run(meta=meta, params=params, debug=debug) + return output diff --git a/pipelines/rest_api/config.py b/pipelines/rest_api/config.py index ab86d26d9855..d2b968c6ad42 100644 --- a/pipelines/rest_api/config.py +++ b/pipelines/rest_api/config.py @@ -20,6 +20,8 @@ str((Path(__file__).parent / "pipeline" / "pipelines.yaml").absolute())) QUERY_PIPELINE_NAME = os.getenv("QUERY_PIPELINE_NAME", "query") INDEXING_PIPELINE_NAME = os.getenv("INDEXING_PIPELINE_NAME", "indexing") +INDEXING_QA_GENERATING_PIPELINE_NAME = os.getenv( + "INDEXING_QA_GENERATING_PIPELINE_NAME", "indexing_qa_generating") FILE_UPLOAD_PATH = os.getenv( "FILE_UPLOAD_PATH", str((Path(__file__).parent / "file-upload").absolute())) diff --git a/pipelines/rest_api/controller/file_upload.py b/pipelines/rest_api/controller/file_upload.py index 56f36c5ea260..3af543fab41f 100644 --- a/pipelines/rest_api/controller/file_upload.py +++ b/pipelines/rest_api/controller/file_upload.py @@ -28,7 +28,7 @@ from pipelines.pipelines.base import Pipeline from pipelines.pipelines.config import get_component_definitions, get_pipeline_definition, read_pipeline_config_from_yaml -from rest_api.config import PIPELINE_YAML_PATH, FILE_UPLOAD_PATH, INDEXING_PIPELINE_NAME, FILE_PARSE_PATH +from rest_api.config import PIPELINE_YAML_PATH, FILE_UPLOAD_PATH, INDEXING_PIPELINE_NAME, INDEXING_QA_GENERATING_PIPELINE_NAME, FILE_PARSE_PATH from rest_api.controller.utils import as_form logger = logging.getLogger(__name__) @@ -55,11 +55,17 @@ "Indexing Pipeline with FAISSDocumentStore or InMemoryDocumentStore is not supported with the REST APIs." ) INDEXING_PIPELINE = None + INDEXING_QA_GENERATING_PIPELINE = None else: + INDEXING_QA_GENERATING_PIPELINE = Pipeline.load_from_yaml( + Path(PIPELINE_YAML_PATH), + pipeline_name=INDEXING_QA_GENERATING_PIPELINE_NAME) INDEXING_PIPELINE = Pipeline.load_from_yaml( Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) + except KeyError: INDEXING_PIPELINE = None + INDEXING_QA_GENERATING_PIPELINE = None logger.warning( "Indexing Pipeline not found in the YAML configuration. File Upload API will not be available." 
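Wiring the three nodes together by hand mirrors what `QAGenerationPipeline` sets up above; a sketch of programmatic usage, where the node arguments follow the defaults from `unsupervised_qa.yaml` and the input text is illustrative:

```python
# Sketch: assemble the QA-generation pipeline in Python instead of YAML.
# Node arguments mirror the defaults from unsupervised_qa.yaml; the input text is illustrative.
from pipelines.nodes.answer_extractor import AnswerExtractor, QAFilter
from pipelines.nodes.question_generator import QuestionGenerator
from pipelines.pipelines import QAGenerationPipeline

answer_extractor = AnswerExtractor(model="uie-base-answer-extractor",
                                   schema=["答案"],
                                   position_prob=0.01,
                                   max_answer_candidates=3)
question_generator = QuestionGenerator(model="unimo-text-1.0-question-generation",
                                       num_return_sequences=2)
qa_filter = QAFilter(model="uie-base-qa-filter",
                     schema=["答案"],
                     position_prob=0.1)

pipe = QAGenerationPipeline(answer_extractor=answer_extractor,
                            question_generator=question_generator,
                            qa_filter=qa_filter)
result = pipe.run(meta=["巴黎是法国的首都,也是法国最大的城市。"],
                  params={"QAFilter": {"is_filter": True}})
print(result["filtered_cqa_triples"])
```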
) @@ -89,6 +95,55 @@ class Response(BaseModel): file_id: str +@router.post("/file-upload-qa-generate") +def upload_file( + files: List[UploadFile] = File(...), + # JSON serialized string + meta: Optional[str] = Form("null"), # type: ignore + fileconverter_params: FileConverterParams = Depends( + FileConverterParams.as_form), # type: ignore +): + """ + You can use this endpoint to upload a file for indexing + """ + if not INDEXING_QA_GENERATING_PIPELINE: + raise HTTPException( + status_code=501, + detail="INDEXING_QA_GENERATING_PIPELINE is not configured.") + + file_paths: list = [] + file_metas: list = [] + meta_form = json.loads(meta) or {} # type: ignore + if not isinstance(meta_form, dict): + raise HTTPException( + status_code=500, + detail= + f"The meta field must be a dict or None, not {type(meta_form)}") + + for file in files: + try: + file_path = Path( + FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}" + with file_path.open("wb") as buffer: + shutil.copyfileobj(file.file, buffer) + + file_paths.append(file_path) + meta_form["name"] = file.filename + file_metas.append(meta_form) + finally: + file.file.close() + + result = INDEXING_QA_GENERATING_PIPELINE.run( + file_paths=file_paths, + meta=file_metas, + params={ + "TextFileConverter": fileconverter_params.dict(), + "PDFFileConverter": fileconverter_params.dict(), + }, + ) + return {'message': "OK"} + + @router.post("/file-upload") def upload_file( files: List[UploadFile] = File(...), diff --git a/pipelines/rest_api/controller/search.py b/pipelines/rest_api/controller/search.py index bb2d010f50f3..4561b199a7b6 100644 --- a/pipelines/rest_api/controller/search.py +++ b/pipelines/rest_api/controller/search.py @@ -27,7 +27,7 @@ from pipelines.pipelines.base import Pipeline from rest_api.config import PIPELINE_YAML_PATH, QUERY_PIPELINE_NAME from rest_api.config import LOG_LEVEL, CONCURRENT_REQUEST_PER_WORKER -from rest_api.schema import QueryRequest, QueryResponse, DocumentRequest, DocumentResponse, QueryImageResponse +from rest_api.schema import QueryRequest, QueryResponse, DocumentRequest, DocumentResponse, QueryImageResponse, QueryQAPairResponse, QueryQAPairRequest from rest_api.controller.utils import RequestLimiter logging.getLogger("pipelines").setLevel(LOG_LEVEL) @@ -41,6 +41,9 @@ PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=QUERY_PIPELINE_NAME) + +QA_PAIR_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), + pipeline_name="query_qa_pairs") DOCUMENT_STORE = PIPELINE.get_document_store() logging.info(f"Loaded pipeline nodes: {PIPELINE.graph.nodes.keys()}") @@ -76,6 +79,7 @@ def query(request: QueryRequest): This endpoint receives the question as a string and allows the requester to set additional parameters that will be passed on to the pipelines pipeline. """ + print('query', request) with concurrency_limiter.run(): result = _process_request(PIPELINE, request) return result @@ -118,6 +122,25 @@ def query_documents(request: DocumentRequest): return result +@router.post("/query_qa_pairs", + response_model=QueryQAPairResponse, + response_model_exclude_none=True) +def query_qa_pairs(request: QueryQAPairRequest): + """ + This endpoint receives the question as a string and allows the requester to set + additional parameters that will be passed on to the pipelines pipeline. 
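The request and response shapes of this endpoint match what the UI helpers send: `meta` carries the raw texts and `params` is forwarded to the pipeline nodes. A sketch of a direct call, assuming the REST service is reachable at the address below (adjust it to your deployment):

```python
# Sketch: call the new /query_qa_pairs endpoint directly.
# API_ENDPOINT is an assumption -- point it at your running rest_api service.
import requests

API_ENDPOINT = "http://127.0.0.1:8891"
req = {
    "meta": ["巴黎是法国的首都,也是法国最大的城市。"],
    "params": {"QAFilter": {"is_filter": True}},
}
response = requests.post(f"{API_ENDPOINT}/query_qa_pairs", json=req).json()
for triple in response["filtered_cqa_triples"]:
    print(triple["synthetic_question"], "->", triple["synthetic_answer"])
```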
+ """ + print('request', request) + result = {} + result['meta'] = request.meta + params = request.params or {} + res = QA_PAIR_PIPELINE.run(meta=request.meta, + params=params, + debug=request.debug) + result['filtered_cqa_triples'] = res['filtered_cqa_triples'] + return result + + def _process_request(pipeline, request) -> Dict[str, Any]: start_time = time.time() diff --git a/pipelines/rest_api/pipeline/docprompt.yaml b/pipelines/rest_api/pipeline/docprompt.yaml index 3d556b512b44..18d455cd325c 100644 --- a/pipelines/rest_api/pipeline/docprompt.yaml +++ b/pipelines/rest_api/pipeline/docprompt.yaml @@ -6,7 +6,7 @@ components: use_gpu: True lang: ch type: DocOCRProcessor - - name: Runner + - name: Reader params: topn: 1 use_gpu: True @@ -23,7 +23,7 @@ pipelines: nodes: - name: PreProcessor inputs: [Query] - - name: Runner + - name: Reader inputs: [PreProcessor] diff --git a/pipelines/rest_api/pipeline/pipelines.yaml b/pipelines/rest_api/pipeline/pipelines.yaml index c2243a90093b..d844a1187d48 100644 --- a/pipelines/rest_api/pipeline/pipelines.yaml +++ b/pipelines/rest_api/pipeline/pipelines.yaml @@ -29,6 +29,10 @@ components: # define all the building-blocks for Pipeline type: PDFToTextConverter - name: DocxFileConverter type: DocxToTextConverter + - name: AnswerExtractorPreprocessor + type: AnswerExtractorPreprocessor + - name: QAFilterPostprocessor + type: QAFilterPostprocessor - name: Preprocessor type: PreProcessor params: @@ -64,3 +68,31 @@ pipelines: inputs: [Preprocessor] - name: DocumentStore inputs: [Retriever] + + - name: indexing_qa_generating + type: Indexing_qa_generating + nodes: + - name: FileTypeClassifier + inputs: [File] + - name: TextFileConverter + inputs: [FileTypeClassifier.output_1] + - name: PDFFileConverter + inputs: [FileTypeClassifier.output_2] + - name: DocxFileConverter + inputs: [FileTypeClassifier.output_4] + - name: ImageFileConverter + inputs: [FileTypeClassifier.output_6] + - name: AnswerExtractorPreprocessor + inputs: [PDFFileConverter, TextFileConverter, DocxFileConverter, ImageFileConverter] + - name: AnswerExtractor + inputs: [AnswerExtractorPreprocessor] + - name: QuestionGenerator + inputs: [AnswerExtractor] + - name: QAFilter + inputs: [QuestionGenerator] + - name: QAFilterPostprocessor + inputs: [QAFilter] + - name: Retriever + inputs: [QAFilterPostprocessor] + - name: DocumentStore + inputs: [Retriever] diff --git a/pipelines/rest_api/pipeline/unsupervised_qa.yaml b/pipelines/rest_api/pipeline/unsupervised_qa.yaml new file mode 100644 index 000000000000..40d799d85627 --- /dev/null +++ b/pipelines/rest_api/pipeline/unsupervised_qa.yaml @@ -0,0 +1,106 @@ +version: '1.1.0' + +components: # define all the building-blocks for Pipeline + - name: DocumentStore + type: ElasticsearchDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents + params: + host: localhost + port: 9200 + index: my_data + embedding_dim: 312 + - name: Retriever + type: DensePassageRetriever + params: + document_store: DocumentStore # params can reference other components defined in the YAML + top_k: 10 + query_embedding_model: rocketqa-zh-nano-query-encoder + passage_embedding_model: rocketqa-zh-nano-para-encoder + embed_title: False + - name: Ranker # custom-name for the component; helpful for visualization & debugging + type: ErnieRanker # pipelines Class name for the component + params: + model_name_or_path: rocketqa-nano-cross-encoder + top_k: 3 + - name: TextFileConverter + type: TextConverter + - name: 
ImageFileConverter + type: ImageToTextConverter + - name: PDFFileConverter + type: PDFToTextConverter + - name: DocxFileConverter + type: DocxToTextConverter + - name: AnswerExtractorPreprocessor + type: AnswerExtractorPreprocessor + - name: QAFilterPostprocessor + type: QAFilterPostprocessor + - name: Preprocessor + type: PreProcessor + params: + split_by: passage + split_respect_sentence_boundary: False + split_answers: True + - name: FileTypeClassifier + type: FileTypeClassifier + - name: AnswerExtractor + type: AnswerExtractor + params: + model: uie-base-answer-extractor + schema: ['答案'] + position_prob: 0.01 + max_answer_candidates: 3 + - name: QuestionGenerator + type: QuestionGenerator + params: + model: unimo-text-1.0-question-generation + num_return_sequences: 2 + - name: QAFilter + type: QAFilter + params: + model: uie-base-qa-filter + schema: ['答案'] + position_prob: 0.1 + +pipelines: + - name: query # a sample extractive-qa Pipeline + type: Query + nodes: + - name: Retriever + inputs: [Query] + - name: Ranker + inputs: [Retriever] + - name: indexing_qa_generating + type: Indexing_qa_generating + nodes: + - name: FileTypeClassifier + inputs: [File] + - name: TextFileConverter + inputs: [FileTypeClassifier.output_1] + - name: PDFFileConverter + inputs: [FileTypeClassifier.output_2] + - name: DocxFileConverter + inputs: [FileTypeClassifier.output_4] + - name: ImageFileConverter + inputs: [FileTypeClassifier.output_6] + - name: AnswerExtractorPreprocessor + inputs: [PDFFileConverter, TextFileConverter, DocxFileConverter, ImageFileConverter] + - name: AnswerExtractor + inputs: [AnswerExtractorPreprocessor] + - name: QuestionGenerator + inputs: [AnswerExtractor] + - name: QAFilter + inputs: [QuestionGenerator] + - name: QAFilterPostprocessor + inputs: [QAFilter] + - name: Retriever + inputs: [QAFilterPostprocessor] + - name: DocumentStore + inputs: [Retriever] + - name: query_qa_pairs + type: Query + nodes: + - name: AnswerExtractor + inputs: [Query] + - name: QuestionGenerator + inputs: [AnswerExtractor] + - name: QAFilter + inputs: [QuestionGenerator] diff --git a/pipelines/rest_api/schema.py b/pipelines/rest_api/schema.py index 486078554a94..c0f823707f7c 100644 --- a/pipelines/rest_api/schema.py +++ b/pipelines/rest_api/schema.py @@ -106,3 +106,19 @@ class QueryImageResponse(BaseModel): answers: List[str] = [] documents: List[DocumentSerialized] = [] debug: Optional[Dict] = Field(None, alias="_debug") + + +class QueryQAPairRequest(BaseModel): + meta: List[str] + params: Optional[dict] = None + debug: Optional[bool] = False + + class Config: + # Forbid any extra fields in the request to avoid silent failures + extra = Extra.forbid + + +class QueryQAPairResponse(BaseModel): + meta: List[str] + filtered_cqa_triples: List[dict] = [] + debug: Optional[Dict] = Field(None, alias="_debug") \ No newline at end of file diff --git a/pipelines/ui/utils.py b/pipelines/ui/utils.py index 540c44cc2247..a66106c84624 100644 --- a/pipelines/ui/utils.py +++ b/pipelines/ui/utils.py @@ -24,6 +24,12 @@ import streamlit as st from io import StringIO +import paddle +from pipelines.utils import convert_files_to_dicts, fetch_archive_from_http +from pipelines.document_stores import ElasticsearchDocumentStore, MilvusDocumentStore +from pipelines.nodes import DensePassageRetriever +from pipelines.utils import launch_es + API_ENDPOINT = os.getenv("API_ENDPOINT") STATUS = "initialized" HS_VERSION = "hs_version" @@ -32,6 +38,8 @@ DOC_UPLOAD = "file-upload" DOC_PARSE = 'files' IMAGE_REQUEST = 'query_text_to_images' 
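The `indexing_qa_generating` pipeline defined in the YAML above can also be driven without the REST layer by loading it straight from the configuration file; a sketch, assuming the Elasticsearch document store from the YAML is reachable and with illustrative file paths:

```python
# Sketch: run the QA-generating indexing pipeline without the REST layer.
# Assumes the Elasticsearch document store from the YAML is reachable; paths are illustrative.
from pathlib import Path
from pipelines.pipelines.base import Pipeline

indexing_pipe = Pipeline.load_from_yaml(
    Path("rest_api/pipeline/unsupervised_qa.yaml"),
    pipeline_name="indexing_qa_generating")

indexing_pipe.run(file_paths=["data/my_data/report.pdf"],
                  meta=[{"name": "report.pdf"}])
```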
+QA_PAIR_REQUEST = 'query_qa_pairs' +FILE_UPLOAD_QA_GENERATE = 'file-upload-qa-generate' def pipelines_is_ready(): @@ -214,6 +222,31 @@ def text_to_image_search( return results, response +def text_to_qa_pair_search(query, + is_filter=True + ) -> Tuple[List[Dict[str, Any]], Dict[str, str]]: + """ + Send a prompt text and corresponding parameters to the REST API + """ + url = f"{API_ENDPOINT}/{QA_PAIR_REQUEST}" + params = { + "QAFilter": { + "is_filter": is_filter, + }, + } + + req = {"meta": [query], "params": params} + response_raw = requests.post(url, json=req) + if response_raw.status_code >= 400 and response_raw.status_code != 503: + raise Exception(f"{vars(response_raw)}") + + response = response_raw.json() + if "errors" in response: + raise Exception(", ".join(response["errors"])) + results = response["filtered_cqa_triples"] + return results, response + + def send_feedback(query, answer_obj, is_correct_answer, is_correct_document, document) -> None: """ @@ -242,6 +275,13 @@ def upload_doc(file): return response +def file_upload_qa_generate(file): + url = f"{API_ENDPOINT}/{FILE_UPLOAD_QA_GENERATE}" + files = [("files", file)] + response = requests.post(url, files=files).json() + return response + + def get_backlink(result) -> Tuple[Optional[str], Optional[str]]: if result.get("document", None): doc = result["document"] @@ -252,3 +292,60 @@ def get_backlink(result) -> Tuple[Optional[str], Optional[str]]: "title", None): return doc["meta"]["url"], doc["meta"]["title"] return None, None + + +def offline_ann(index_name, + doc_dir, + search_engine="elastic", + host="127.0.0.1", + port="9200", + query_embedding_model="rocketqa-zh-nano-query-encoder", + passage_embedding_model="rocketqa-zh-nano-para-encoder", + params_path="checkpoints/model_40/model_state.pdparams", + embedding_dim=312, + split_answers=True): + if (search_engine == "milvus"): + document_store = MilvusDocumentStore(embedding_dim=embedding_dim, + host=host, + index=index_name, + port=port, + index_param={ + "M": 16, + "efConstruction": 50 + }, + index_type="HNSW") + else: + launch_es() + document_store = ElasticsearchDocumentStore(host=host, + port=port, + username="", + password="", + embedding_dim=embedding_dim, + index=index_name) + # 将每篇文档按照段落进行切分 + dicts = convert_files_to_dicts(dir_path=doc_dir, + split_paragraphs=True, + split_answers=split_answers, + encoding='utf-8') + + print(dicts[:3]) + + # 文档数据写入数据库 + document_store.write_documents(dicts) + + ### 语义索引模型 + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model=query_embedding_model, + passage_embedding_model=passage_embedding_model, + params_path=params_path, + output_emb_size=embedding_dim, + max_seq_len_query=64, + max_seq_len_passage=256, + batch_size=1, + use_gpu=True, + embed_title=False, + ) + + # 建立索引库 + document_store.update_embeddings(retriever) diff --git a/pipelines/ui/webapp_unsupervised_question_answering.py b/pipelines/ui/webapp_unsupervised_question_answering.py new file mode 100644 index 000000000000..c288f316d406 --- /dev/null +++ b/pipelines/ui/webapp_unsupervised_question_answering.py @@ -0,0 +1,340 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 deepset GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import logging +import pandas as pd +from json import JSONDecodeError +from pathlib import Path +import streamlit as st +from annotated_text import annotation +from markdown import markdown + +from ui.utils import pipelines_is_ready, semantic_search, send_feedback, upload_doc, file_upload_qa_generate, pipelines_version, get_backlink, text_to_qa_pair_search, offline_ann + +# Adjust to a question that you would like users to see in the search bar when they load the UI: +# DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "如何办理企业养老保险?") +DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "") +# Sliders +DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", + "30")) +DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3")) +# Labels for the evaluation +EVAL_LABELS = os.getenv("EVAL_FILE", + str(Path(__file__).parent / "insurance_faq.csv")) +# Corpus dir for ANN +CORPUS_DIR = os.getenv("CORPUS_DIR", str('data/my_data')) +# QA pairs file to be saved +UPDATE_FILE = os.getenv("UPDATE_FILE", str('data/my_data/custom_qa_pairs.txt')) +# Whether the file upload should be enabled or not +DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD")) + +DEFAULT_NUMBER_OF_FILTER_STRENGTH = int( + os.getenv("DEFAULT_NUMBER_OF_FILTER_STRENGTH", "10")) + + +def set_state_if_absent(key, value): + if key not in st.session_state: + st.session_state[key] = value + + +def on_change_text(): + st.session_state.question = st.session_state.quest + st.session_state.answer = None + st.session_state.results = None + st.session_state.raw_json = None + + +def on_change_text_qag(): + st.session_state.qag_question = st.session_state.qag_quest + st.session_state.answer = None + st.session_state.qag_results = None + st.session_state.qag_raw_json = None + + +def upload(): + data_files = st.session_state.upload_files['files'] + for data_file in data_files: + # Upload file + if data_file and data_file.name not in st.session_state.upload_files[ + 'uploaded_files']: + # raw_json = upload_doc(data_file) + raw_json = file_upload_qa_generate(data_file) + st.session_state.upload_files['uploaded_files'].append( + data_file.name) + # Save the uploaded files + st.session_state.upload_files['uploaded_files'] = list( + set(st.session_state.upload_files['uploaded_files'])) + + +def main(): + + st.set_page_config(page_title="PaddleNLP无监督智能检索问答", page_icon='🐮') + # page_icon="https://github.com/PaddlePaddle/Paddle/blob/develop/doc/imgs/logo.png") + + # Persistent state + set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP) + set_state_if_absent("qag_question", DEFAULT_QUESTION_AT_STARTUP) + set_state_if_absent("results", None) + set_state_if_absent("qag_results", None) + set_state_if_absent("raw_json", None) + set_state_if_absent("qag_raw_json", None) + set_state_if_absent("random_question_requested", False) + set_state_if_absent("upload_files", {'uploaded_files': [], 'files': []}) + + # Small callback to reset the interface in case the text of the question changes + def reset_results(*args): + st.session_state.answer = 
None + st.session_state.results = None + st.session_state.raw_json = None + + def reset_results_qag(*args): + st.session_state.answer = None + st.session_state.qag_results = None + st.session_state.qag_raw_json = None + + # Title + st.write("## 无监督智能检索问答") + # Sidebar + st.sidebar.header("选项") + st.sidebar.write("### 问答对生成:") + is_filter = st.sidebar.selectbox( + "是否进行自动过滤", + ('是', '否'), + on_change=reset_results, + ) + st.sidebar.write("### 问答检索:") + top_k_reader = st.sidebar.slider( + "返回答案数量", + min_value=1, + max_value=30, + value=DEFAULT_NUMBER_OF_ANSWERS, + step=1, + on_change=reset_results, + ) + top_k_retriever = st.sidebar.slider( + "最大检索数量", + min_value=1, + max_value=100, + value=DEFAULT_DOCS_FROM_RETRIEVER, + step=1, + on_change=reset_results, + ) + + if not DISABLE_FILE_UPLOAD: + st.sidebar.write("### 文件上传:") + data_files = st.sidebar.file_uploader( + "", + type=["pdf", "txt", "docx", "png"], + help="选择多个文件", + accept_multiple_files=True) + st.session_state.upload_files['files'] = data_files + st.sidebar.button("文件上传并自动生成载入问答对", on_click=upload) + for data_file in st.session_state.upload_files['uploaded_files']: + st.sidebar.write(str(data_file) + "    ✅ ") + + hs_version = "" + try: + hs_version = f" (v{pipelines_version()})" + except Exception: + pass + # Load csv into pandas dataframe + try: + df = pd.read_csv(EVAL_LABELS, sep=";") + except Exception: + st.error(f"The eval file was not found.") + sys.exit(f"The eval file was not found under `{EVAL_LABELS}`.") + + ## QA pairs generation + # Search bar + st.write("### 问答对生成:") + context = st.text_input("", + value=st.session_state.qag_question, + key="qag_quest", + on_change=on_change_text_qag, + max_chars=350, + placeholder='请输入要抽取问答对的文本') + qag_col1, qag_col2 = st.columns(2) + qag_col1.markdown("", + unsafe_allow_html=True) + qag_col2.markdown("", + unsafe_allow_html=True) + + # Run button + qag_run_pressed = qag_col1.button("开始生成") + + # Get next random question from the CSV + if qag_col2.button("存入数据库"): + with open(UPDATE_FILE, 'a', encoding='utf-8') as wf: + for count, result in enumerate(st.session_state.qag_results): + context = result["context"] + synthetic_answer = result["synthetic_answer"] + synthetic_question = result["synthetic_question"] + wf.write(synthetic_question.strip() + '\t' + + synthetic_answer.strip() + '\n') + offline_ann('my_data', CORPUS_DIR) + reset_results_qag() + + # st.session_state.random_question_requested = False + qag_run_query = (qag_run_pressed or context != st.session_state.qag_question + ) and not st.session_state.random_question_requested + # qag_run_query = qag_run_pressed + + # Check the connection + with st.spinner("⌛️    pipelines is starting..."): + if not pipelines_is_ready(): + st.error("🚫    Connection Error. Is pipelines running?") + run_query = False + reset_results_qag() + # Get results for query + if (qag_run_query or st.session_state.qag_results is None) and context: + reset_results_qag() + st.session_state.qag_question = context + with st.spinner( + "🧠    Performing neural search on documents... \n " + "Do you want to optimize speed or accuracy? \n"): + try: + st.session_state.qag_results, st.session_state.qag_raw_json = text_to_qa_pair_search( + context, is_filter=True if is_filter == "是" else False) + except JSONDecodeError as je: + st.error( + "👓    An error occurred reading the results. Is the document store working?" 
+ ) + return + except Exception as e: + logging.exception(e) + if "The server is busy processing requests" in str( + e) or "503" in str(e): + st.error( + "🧑‍🌾    All our workers are busy! Try again later." + ) + else: + st.error( + "🐞    An error occurred during the request.") + return + + if st.session_state.qag_results: + st.write("#### 返回结果:") + for count, result in enumerate(st.session_state.qag_results): + context = result["context"] + synthetic_answer = result["synthetic_answer"] + synthetic_answer_probability = result[ + "synthetic_answer_probability"] + synthetic_question = result["synthetic_question"] + synthetic_question_probability = result[ + "synthetic_question_probability"] + st.write( + markdown(context), + unsafe_allow_html=True, + ) + st.write( + markdown('**问题:**' + synthetic_question), + unsafe_allow_html=True, + ) + st.write( + markdown('**答案:**' + synthetic_answer), + unsafe_allow_html=True, + ) + + st.write("___") + + ## QA search + # Search bar + st.write("### 问答检索:") + question = st.text_input("", + value=st.session_state.question, + key="quest", + on_change=on_change_text, + max_chars=100, + placeholder='请输入您的问题') + col1, col2 = st.columns(2) + col1.markdown("", + unsafe_allow_html=True) + col2.markdown("", + unsafe_allow_html=True) + + # Run button + run_pressed = col1.button("运行") + + # Get next random question from the CSV + if col2.button("随机提问"): + reset_results() + new_row = df.sample(1) + while ( + new_row["Question Text"].values[0] == st.session_state.question + ): # Avoid picking the same question twice (the change is not visible on the UI) + new_row = df.sample(1) + st.session_state.question = new_row["Question Text"].values[0] + st.session_state.random_question_requested = True + # Re-runs the script setting the random question as the textbox value + # Unfortunately necessary as the Random Question button is _below_ the textbox + st.experimental_rerun() + + st.session_state.random_question_requested = False + + run_query = (run_pressed or question != st.session_state.question + ) and not st.session_state.random_question_requested + + # Check the connection + with st.spinner("⌛️    pipelines is starting..."): + if not pipelines_is_ready(): + st.error("🚫    Connection Error. Is pipelines running?") + run_query = False + reset_results() + # Get results for query + if (run_query or st.session_state.results is None) and question: + reset_results() + st.session_state.question = question + with st.spinner( + "🧠    Performing neural search on documents... \n " + "Do you want to optimize speed or accuracy? \n"): + try: + st.session_state.results, st.session_state.raw_json = semantic_search( + question, + top_k_reader=top_k_reader, + top_k_retriever=top_k_retriever) + except JSONDecodeError as je: + st.error( + "👓    An error occurred reading the results. Is the document store working?" + ) + return + except Exception as e: + logging.exception(e) + if "The server is busy processing requests" in str( + e) or "503" in str(e): + st.error( + "🧑‍🌾    All our workers are busy! Try again later." 
+ ) + else: + st.error( + "🐞    An error occurred during the request.") + return + + if st.session_state.results: + + st.write("#### 返回结果:") + for count, result in enumerate(st.session_state.results): + context = result["context"] + st.write( + markdown(context), + unsafe_allow_html=True, + ) + st.write("**答案:** ", result["answer"]) + st.write("**Relevance:** ", result["relevance"]) + st.write("___") + + +main() diff --git a/pipelines/utils/__init__.py b/pipelines/utils/__init__.py new file mode 100644 index 000000000000..97043fd7ba68 --- /dev/null +++ b/pipelines/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdiffusers/README.md b/ppdiffusers/README.md index f105a99bd183..ff5b5d11b169 100644 --- a/ppdiffusers/README.md +++ b/ppdiffusers/README.md @@ -1,18 +1,29 @@ # PPDiffusers: Diffusers toolbox implemented based on PaddlePaddle - **PPDiffusers**是一款支持**跨模态**(如图像与语音)训练和推理的**扩散模型**(Diffusion Model)工具箱,我们借鉴了🤗 Huggingface团队的[**Diffusers**](https://github.com/huggingface/diffusers)的优秀设计,并且依托[**PaddlePaddle**](https://www.paddlepaddle.org.cn/)框架和[**PaddleNLP**](https://github.com/PaddlePaddle/PaddleNLP)自然语言处理库,打造了一款国产化的工具箱。 ## 1. News 📢 - -* 🔥 **2022.11.04 支持 IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 和 IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 中文权重** -* 🔥 **2022.10.27 发布 PPDiffusers仓库** +* 🔥 **2022.11.11 发布 0.6.2 版本,支持[StableDiffusion模型导出](./scripts/export/README.md)及[FastDeploy Diffusion模型高性能部署 +](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/multimodal/stable_diffusion#%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C)、支持[Diffusers或原版模型->PPDiffusers权重转换](./scripts/convert_diffusers_model/README.md)、支持EulerAncestralDiscreteScheduler、支持[图片超分Pipeline](https://github.com/PaddlePaddle/PaddleNLP/pull/3710);** +* 🔥 **2022.11.04 支持 IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 和 IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 中文权重**; +* 🔥 **2022.10.27 发布 PPDiffusers仓库**。 ## 2. 安装 + +**环境依赖** +- paddlepaddle-gpu>=2.4.0 +- paddlenlp>=2.4.1 +- ftfy +- regex +- Pillow + +**Tips:** 为了能够复现Pytorch的结果,请使用大于**2.4.0**的**paddlepaddle**。 + **使用 `pip` 安装** ```bash +# 请使用最新版的ppdiffusers pip install --upgrade ppdiffusers ``` @@ -155,7 +166,13 @@ image.save("cat_on_bench_new.png") ``` image -## 5. Credits +## 5. 模型部署 + +StableDiffusion模型除了支持动态图运行,还支持将模型导出并使用推理引擎运行。我们提供在[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)上的StableDiffusion模型文生图任务的部署示例,用户可以按照我们提供[StableDiffusion模型导出教程](./scripts/export/README.md)将模型导出,并参考[FastDeploy Diffusion模型高性能部署 +](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/multimodal/stable_diffusion#%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C)完成部署。 + + +## 6. Credits This library concretizes previous work by many different authors and would not have been possible without their great research and implementations. 
We'd like to thank, in particular, the following implementations which have helped us in our development and without which the API could not have been as polished today: - @huggingface' diffusers library, available [here](https://github.com/huggingface/diffusers) @@ -166,7 +183,7 @@ This library concretizes previous work by many different authors and would not h We also want to thank @heejkoo for the very helpful overview of papers, code and resources on diffusion models, available [here](https://github.com/heejkoo/Awesome-Diffusion-Models) as well as @crowsonkb and @rromb for useful discussions and insights. -## 6. Citation +## 7. Citation ```bibtex @misc{von-platen-etal-2022-diffusers, @@ -179,6 +196,6 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and } ``` -## 7. License +## 8. License PPDiffusers遵循[Apache-2.0开源协议](./LICENSE)。 diff --git a/ppdiffusers/VERSION b/ppdiffusers/VERSION index 7ceb04048e8a..b1d7abc0dbab 100644 --- a/ppdiffusers/VERSION +++ b/ppdiffusers/VERSION @@ -1 +1 @@ -0.6.1 \ No newline at end of file +0.6.2 \ No newline at end of file diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/ldm_trainer.py index 63f24fc57c88..1e0d697581b1 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm_trainer.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm_trainer.py @@ -352,8 +352,10 @@ def train( self.control = self.callback_handler.on_step_end( args, self.state, self.control) # TODO junnyu - self._maybe_log_save_evaluate(inputs, tr_loss, model, epoch, - ignore_keys_for_eval) + with self.autocast_smart_context_manager(): + self._maybe_log_save_evaluate(inputs, tr_loss, model, + epoch, + ignore_keys_for_eval) else: self.control = self.callback_handler.on_substep_end( args, self.state, self.control) diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py index 5f88b3772fb3..d3d04bbb33c9 100644 --- a/ppdiffusers/ppdiffusers/__init__.py +++ b/ppdiffusers/ppdiffusers/__init__.py @@ -44,7 +44,7 @@ get_scheduler, ) from .pipeline_utils import DiffusionPipeline - from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline + from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, LDMSuperResolutionPipeline, PNDMPipeline, ScoreSdeVePipeline from .schedulers import ( EulerAncestralDiscreteScheduler, DDIMScheduler, @@ -64,10 +64,6 @@ from .utils.dummy_paddle_and_scipy_objects import * # noqa F403 if is_paddle_available() and is_paddlenlp_available(): - # NEG_INF = float("-inf") - # use -1e9 as NEG_INF - import paddlenlp.transformers.clip.modeling - paddlenlp.transformers.clip.modeling.NEG_INF = -1e9 from .pipelines import (LDMBertModel, LDMTextToImagePipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, diff --git a/ppdiffusers/ppdiffusers/models/ema.py b/ppdiffusers/ppdiffusers/models/ema.py new file mode 100644 index 000000000000..3585177e00d4 --- /dev/null +++ b/ppdiffusers/ppdiffusers/models/ema.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn + + +class LitEma(nn.Layer): + """ + Exponential Moving Average (EMA) of model updates + + Parameters: + model: The model architecture for apply EMA. + decay: The exponential decay. Default 0.9999. + use_num_updates: Whether to use number of updates when computing + averages. + """ + + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError('Decay must be between 0 and 1') + + self.m_name2s_name = {} + self.register_buffer('decay', + paddle.to_tensor(decay, dtype=paddle.float32)) + self.register_buffer( + 'num_updates', + paddle.to_tensor(0, dtype=paddle.int64) + if use_num_upates else paddle.to_tensor(-1, dtype=paddle.int64)) + + for name, p in model.named_parameters(): + if not p.stop_gradient: + #remove as '.'-character is not allowed in buffers + s_name = name.replace('.', '') + self.m_name2s_name.update({name: s_name}) + self.register_buffer(s_name, p.clone().detach()) + + self.collected_params = [] + + def forward(self, model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + decay = min(self.decay, + (1 + self.num_updates) / (10 + self.num_updates)) + + one_minus_decay = 1.0 - decay + + with paddle.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + + for key in m_param: + if not m_param[key].stop_gradient: + sname = self.m_name2s_name[key] + shadow_params[sname].scale_(decay) + shadow_params[sname].add_(m_param[key] * one_minus_decay) + else: + assert not key in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if not m_param[key].stop_gradient: + m_param[key].copy_(shadow_params[self.m_name2s_name[key]], True) + else: + assert not key in self.m_name2s_name + + def store(self, parameters): + """ + Save the current parameters for restoring later. + Args: + parameters: Iterable of `paddle.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `paddle.nn.Parameter`; the parameters to be + updated with the stored parameters. 
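`LitEma` follows the usual EMA training loop: update the shadow weights after each optimizer step, swap them in for evaluation, then restore the raw weights. A minimal sketch of that workflow, where the model, data, and optimizer are placeholders:

```python
# Sketch of the intended LitEma workflow; the model, data, and optimizer are placeholders.
import paddle
from ppdiffusers.models.ema import LitEma

model = paddle.nn.Linear(8, 8)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())
ema = LitEma(model, decay=0.9999)

for _ in range(10):
    loss = model(paddle.randn([4, 8])).mean()
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    ema(model)  # update the shadow weights after each optimizer step

# Evaluate with the EMA weights, then switch back to the raw weights.
ema.store(model.parameters())
ema.copy_to(model)
# ... run validation here ...
ema.restore(model.parameters())
```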
+ """ + for c_param, param in zip(self.collected_params, parameters): + param.copy_(c_param, True) diff --git a/ppdiffusers/ppdiffusers/pipelines/README.md b/ppdiffusers/ppdiffusers/pipelines/README.md new file mode 100644 index 000000000000..b4f249144f1b --- /dev/null +++ b/ppdiffusers/ppdiffusers/pipelines/README.md @@ -0,0 +1,54 @@ +# PPDiffusers Pipelines + +Pipelines提供了一种对各种SOTA扩散模型进行各种下游任务推理的简单方式。 +大多数扩散模型系统由多个独立训练的模型和高度自适应的调度器(scheduler)组成,通过pipeline我们可以很方便的对这些扩散模型系统进行端到端的推理。 + +举例来说, [Stable Diffusion](https://huggingface.co/blog/stable_diffusion)由以下组件构成: +- Autoencoder +- Conditional Unet +- CLIP text encoder +- scheduler +- CLIPFeatureExtractor +- safety checker + +这些组件之间是独立训练或创建的,同时在Stable Diffusion的推理运行中也是必需的,我们可以通过pipelines来对整个系统进行封装,从而提供一个简洁的推理接口。 + +我们通过pipelines在统一的API下提供所有开源且SOTA的扩散模型系统的推理能力。具体来说,我们的pipelines能够提供以下功能: +1. 可以加载官方发布的权重,并根据相应的论文复现出与原始实现相同的输出 +2. 提供一个简单的用户界面来推理运行扩散模型系统,参见[Pipelines API](#pipelines-api)部分 +3. 提供易于理解的代码实现,可以与官方文档一起阅读,参见[Pipelines汇总](#Pipelines汇总)部分 +4. 可以很容易地由社区贡献 + +**【注意】** Pipelines不(也不应该)提供任何训练功能。 +如果您正在寻找训练的相关示例,请查看[examples](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples). + +## Pipelines汇总 + +下表总结了所有支持的Pipelines,以及相应的论文、任务、推理脚本。 + +| Pipeline | Source | Tasks | Inference +|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:---:|:---:| +| [ddpm](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | *Unconditional Image Generation* | [link](../../scripts/inference/unconditional_image_generation-ddpm.py) +| [ddim](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | *Unconditional Image Generation* | [link](../../scripts/inference/unconditional_image_generation-ddim.py) +| [latent_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Text-to-Image Generation* | [link](../../scripts/inference/text_to_image_generation-latent_diffusion.py) +| [latent_diffusion_uncond](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* | [link](../../scripts/inference/unconditional_image_generation-latent_diffusion_uncond.py) +| [pndm](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* | [link](../../scripts/inference/unconditional_image_generation-pndm.py) +| [score_sde_ve](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | [link](../../scripts/inference/unconditional_image_generation-score_sde_ve.py) +| 
[stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-to-Image Generation* | [link](../../scripts/inference/text_to_image_generation-stable_diffusion.py) +| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Image-to-Image Text-Guided Generation* | [link](../../scripts/inference/image_to_image_text_guided_generation-stable_diffusion.py) +| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-Guided Image Inpainting* | [link](../../scripts/inference/text_guided_image_inpainting-stable_diffusion.py) + +**【注意】** Pipelines可以端到端的展示相应论文中描述的扩散模型系统。然而,大多数Pipelines可以使用不同的调度器组件,甚至不同的模型组件。 + +## Pipelines API + +扩散模型系统通常由多个独立训练的模型以及调度器等其他组件构成。 +其中每个模型都是在不同的任务上独立训练的,调度器可以很容易地进行替换。 +然而,在推理过程中,我们希望能够轻松地加载所有组件并在推理中使用它们,即使某个组件来自不同的库, 为此,所有pipeline都提供以下功能: + + +- `from_pretrained` 该方法接收PaddleNLP模型库id(例如`runwayml/stable-diffusion-v1-5`)或本地目录路径。为了能够准确加载相应的模型和组件,相应目录下必须提供`model_index.json`文件。 + +- `save_pretrained` 该方法接受一个本地目录路径,Pipelines的所有模型或组件都将被保存到该目录下。对于每个模型或组件,都会在给定目录下创建一个子文件夹。同时`model_index.json`文件将会创建在本地目录路径的根目录下,以便可以再次从本地路径实例化整个Pipelines。 + +- `__call__` Pipelines在推理时将调用该方法。该方法定义了Pipelines的推理逻辑,它应该包括预处理、张量在不同模型之间的前向传播、后处理等整个推理流程。 diff --git a/ppdiffusers/ppdiffusers/pipelines/__init__.py b/ppdiffusers/ppdiffusers/pipelines/__init__.py index 96ffc029d3fe..d25f92ca86b9 100644 --- a/ppdiffusers/ppdiffusers/pipelines/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/__init__.py @@ -26,7 +26,7 @@ from ..utils.dummy_paddle_objects import * # noqa F403 if is_paddle_available() and is_paddlenlp_available(): - from .latent_diffusion import LDMTextToImagePipeline, LDMBertModel + from .latent_diffusion import LDMTextToImagePipeline, LDMBertModel, LDMSuperResolutionPipeline from .stable_diffusion import (StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py index 4f96c1d6138c..eec46d29285f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py @@ -18,3 +18,4 @@ if is_paddlenlp_available(): from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline + from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py new file mode 100644 index 000000000000..6c4afff0573c --- /dev/null +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Optional, Tuple, Union +import numpy as np +import PIL + +import paddle +import paddle.nn as nn +from ...models import UNet2DModel, VQModel +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ...schedulers import ( + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from paddlenlp.utils.tools import compare_version +if compare_version(PIL.__version__, "9.1.0") >= 0: + Resampling = PIL.Image.Resampling +else: + Resampling = PIL.Image + + +def preprocess(image): + w, h = image.size + w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=Resampling.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + image = paddle.to_tensor(image) + return 2.0 * image - 1.0 + + +class LDMSuperResolutionPipeline(DiffusionPipeline): + r""" + A pipeline for image super-resolution using Latent + This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations. + unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`],[`PNDMScheduler`]. + """ + + def __init__( + self, + vqvae: VQModel, + unet: UNet2DModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, ], + ): + super().__init__() + self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) + + @paddle.no_grad() + def __call__( + self, + init_image: Union[paddle.Tensor, PIL.Image.Image], + batch_size: Optional[int] = 1, + num_inference_steps: Optional[int] = 100, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[Tuple, ImagePipelineOutput]: + r""" + Args: + init_image (`paddle.Tensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + batch_size (`int`, *optional*, defaults to 1): + Number of images to generate. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + seed (`int`, *optional*): + The seed used by paddle.randn(). + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. + + Returns: + [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if + `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the + generated images. + """ + + if isinstance(init_image, PIL.Image.Image): + batch_size = 1 + elif isinstance(init_image, paddle.Tensor): + batch_size = init_image.shape[0] + else: + raise ValueError( + f"`init_image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(init_image)}" + ) + + if isinstance(init_image, PIL.Image.Image): + init_image = preprocess(init_image) + + height, width = init_image.shape[-2:] + + # in_channels should be 6: 3 for latents, 3 for low resolution image + latents_shape = (batch_size, self.unet.in_channels // 2, height, width) + latents_dtype = self.unet.dtype + + if seed is not None: paddle.seed(seed) + latents = paddle.randn(latents_shape, dtype=latents_dtype) + + init_image = init_image.astype(latents_dtype) + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + # Some schedulers like PNDM have timesteps as arrays + # It's more optimized to move all timesteps to correct device beforehand + timesteps_tensor = self.scheduler.timesteps + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature. + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys()) + extra_kwargs = {} + if accepts_eta: + extra_kwargs["eta"] = eta + + for t in self.progress_bar(timesteps_tensor): + # concat latents and low resolution image in the channel dimension. 
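+            # latents has self.unet.in_channels // 2 channels and init_image has 3, so the
+            # concatenated tensor gives the 6-channel input the super-resolution unet expects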
+ latents_input = paddle.concat([latents, init_image], axis=1) + latents_input = self.scheduler.scale_model_input(latents_input, t) + # predict the noise residual + noise_pred = self.unet(latents_input, t).sample + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, + **extra_kwargs).prev_sample + + # decode the image latents with the VQVAE + image = self.vqvae.decode(latents).sample + image = paddle.clip(image, -1.0, 1.0) + image = image / 2 + 0.5 + image = image.transpose([0, 2, 3, 1]).numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, ) + + return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/ppnlp_patch_utils.py b/ppdiffusers/ppdiffusers/ppnlp_patch_utils.py index cd88c27e3606..8a419af35e87 100644 --- a/ppdiffusers/ppdiffusers/ppnlp_patch_utils.py +++ b/ppdiffusers/ppdiffusers/ppnlp_patch_utils.py @@ -55,6 +55,10 @@ def _inner(f): if is_paddle_available() and is_paddlenlp_available(): import paddle from paddlenlp.transformers import PretrainedModel + # NEG_INF = float("-inf") + # use -1e4 as NEG_INF + import paddlenlp.transformers.clip.modeling + paddlenlp.transformers.clip.modeling.NEG_INF = -1e4 @patch_to(PretrainedModel, as_prop=True) def dtype(self): diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py index e7e5d00acf33..bafcd79ae884 100644 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py +++ b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py @@ -198,6 +198,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["paddle"]) +class LDMSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["paddle"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["paddle"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["paddle"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["paddle"]) + + class PNDMPipeline(metaclass=DummyObject): _backends = ["paddle"] diff --git a/ppdiffusers/scripts/convert_diffusers_model/README.md b/ppdiffusers/scripts/convert_diffusers_model/README.md new file mode 100644 index 000000000000..268710608c76 --- /dev/null +++ b/ppdiffusers/scripts/convert_diffusers_model/README.md @@ -0,0 +1,326 @@ +# Stable Diffusion模型转换教程(Pytorch -> Paddle) + +本教程支持将Huggingface的[Diffusers](https://github.com/huggingface/diffusers)版本的Stable Diffusion权重转换成[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers)版本的Stable Diffusion权重。 + +Tips: +如果我们想要将原版的权重转换为[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers)的权重,我们可以首先使用 +Huggingface提供的转换脚本[convert_original_stable_diffusion_to_diffusers.py](https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py)将原版权重转换为[Diffusers](https://github.com/huggingface/diffusers)版本的权重。 + +## 1 Diffusers 权重转换为 PPDiffusers权重 + +### 1.1 依赖安装 + +模型权重转换需要依赖`torch`, `diffusers`, `transformers`, `paddlepaddle`, `paddlenlp`以及`ppdiffusers`,我可使用`pip`执行下面的命令进行快速安装。 + +```shell +pip install -r requirements.txt +``` + +### 1.2 模型权重转换 + +___注意:模型权重转换过程中,需要下载Stable Diffusion模型。为了使用该模型与权重,你必须接受该模型所要求的License,并且获取HF Hub授予的Token。请访问HuggingFace的[model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), 仔细阅读里面的License,然后签署该协议。___ + +___Tips: Stable Diffusion是基于以下的License: The CreativeML OpenRAIL M license is an Open RAIL M license, 
adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which this license is based.___ + +若第一次权重转换模型,需要先登录HuggingFace客户端。执行以下命令进行登录: + +```shell +# 安装huggingface_hub +pip install huggingface_hub +# 登录huggingface_hub +huggingface-cli login +``` + +登录成功后,可执行以下命令行完成模型权重转换。 + +```shell +python convert_diffusers_stable_diffusion_to_ppdiffusers.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion-v1-5-ppdiffusers +``` + +输出的模型目录结构如下: +```shell +├── stable-diffusion-v1-5-ppdiffusers # 我们指定的输出文件路径 + ├── model_index.json # 模型index文件 + ├── vae # vae权重文件夹 + ├── model_state.pdparams + ├── config.json + ├── text_encoder # text_encoder权重文件夹 + ├── model_config.json + ├── model_state.pdparams + ├── unet # unet权重文件夹 + ├── model_state.pdparams + ├── config.json + ├── scheduler # scheduler文件夹 + ├── scheduler_config.json + ├── feature_extractor # feature_extractor文件夹 + ├── preprocessor_config.json + ├── safety_checker # safety_checker文件夹 + ├── model_config.json + ├── model_state.pdparams + ├── tokenizer # tokenizer文件夹 + ├── tokenizer_config.json + ├── merges.txt + ├── special_tokens_map.json + ├── vocab.json +``` + +#### 1.3 参数说明 + +`convert_diffusers_stable_diffusion_to_ppdiffusers.py` 各命令行参数的说明。 + +| 参数 |参数说明 | +|----------|--------------| +|
--pretrained_model_name_or_path
| Huggingface上提供的diffuers版本的diffusion预训练模型。默认为:"runwayml/stable-diffusion-v1-5"。更多diffusion预训练模型可参考[CompVis模型列表](https://huggingface.co/CompVis)及[runwayml模型列表](https://huggingface.co/runwayml),目前仅支持SD版模型。| +|--output_path | 转换后的模型目录。 | + + +## 2 原版Stable Diffusion模型权重转换为PPDiffusers权重 + +总共分为2个步骤 +- Step1 原版ckpt权重转换为Diffusers权重; +- Step2 Diffusers权重转换为PPDiffusers权重。 + +### 2.1 依赖安装 + +模型权重转换需要依赖`omegaconf`, `torch`, `diffusers`, `transformers`, `paddlepaddle`, `paddlenlp`以及`ppdiffusers`,我可使用`pip`执行下面的命令进行快速安装。 + +```shell +pip install -r requirements.txt +``` + +### 2.2 模型权重转换 + +#### Step1 原版ckpt权重转换为Diffusers权重 +在开始之前我们需要准备如下的文件: +- Huggingface提供的转换脚本, https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py; +- 原版的权重文件, https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.ckpt; +- yaml配置文件, https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml. + +所需的文件目录如下所示: +```shell +├── convert_original_stable_diffusion_to_diffusers.py # Huggingface的转换脚本 +├── v1-5-pruned.ckpt # 原版v1-5模型权重文件 +├── v1-inference.yaml # yaml配置文件 +``` + +```shell +python convert_original_stable_diffusion_to_diffusers.py --checkpoint_path v1-5-pruned.ckpt --original_config_file v1-inference.yaml --dump_path stable-diffusion-v1-5-diffusers +``` + +输出的模型目录结构如下: + +```shell +├── stable-diffusion-v1-5-diffusers # 我们指定的输出文件路径 + ├── model_index.json # 模型index文件 + ├── vae # vae权重文件夹 + ├── diffusion_pytorch_model.bin + ├── config.json + ├── text_encoder # text_encoder权重文件夹 + ├── config.json + ├── pytorch_model.bin + ├── unet # unet权重文件夹 + ├── diffusion_pytorch_model.bin + ├── config.json + ├── scheduler # scheduler文件夹 + ├── scheduler_config.json + ├── feature_extractor # feature_extractor文件夹 + ├── preprocessor_config.json + ├── safety_checker # safety_checker文件夹 + ├── config.json + ├── pytorch_model.bin + ├── tokenizer # tokenizer文件夹 + ├── tokenizer_config.json + ├── merges.txt + ├── special_tokens_map.json + ├── vocab.json +``` +#### 参数说明 + +`convert_original_stable_diffusion_to_diffusers.py` 各命令行参数的说明。 + +| 参数 |参数说明 | +|----------|--------------| +|
--checkpoint_path
| 原版Stable Diffusion模型ckpt后缀的权重文件。默认为:"v1-5-pruned.ckpt"。更多原版的预训练模型可在[HuggingFace上搜索](https://huggingface.co/)。| +|--original_config_file | 该权重文件所使用的配置文件,默认为"v1-inference.yaml"。 | +|--dump_path | 转换后的Diffusers版本模型目录。 | + +#### Step2 Diffusers权重转换为PPDiffusers权重 +由于我们已经得到了Huggingface的[Diffusers](https://github.com/huggingface/diffusers)版本的权重,因此我们可以参考第1部分进行权重转换。 + +我们仅需要运行下面的代码即可成功转换[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers)版本的权重。 + +```shell +python convert_diffusers_stable_diffusion_to_ppdiffusers.py --pretrained_model_name_or_path stable-diffusion-v1-5-diffusers --output_path stable-diffusion-v1-5-ppdiffusers +``` + +脚本运行完成后,输出的模型目录结构如下: +```shell +├── stable-diffusion-v1-5-ppdiffusers # 我们指定的输出文件路径 + ├── model_index.json # 模型index文件 + ├── vae # vae权重文件夹 + ├── model_state.pdparams + ├── config.json + ├── text_encoder # text_encoder权重文件夹 + ├── model_config.json + ├── model_state.pdparams + ├── unet # unet权重文件夹 + ├── model_state.pdparams + ├── config.json + ├── scheduler # scheduler文件夹 + ├── scheduler_config.json + ├── feature_extractor # feature_extractor文件夹 + ├── preprocessor_config.json + ├── safety_checker # safety_checker文件夹 + ├── model_config.json + ├── model_state.pdparams + ├── tokenizer # tokenizer文件夹 + ├── tokenizer_config.json + ├── merges.txt + ├── special_tokens_map.json + ├── vocab.json +``` + + + + + +## 3 转换后的权重效果对比 + +### 3.1 Text-to-Image效果对比 +```python +import torch +from diffusers import StableDiffusionPipeline as DiffusersStableDiffusionPipeline +pipe = DiffusersStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") +pipe = pipe.to("cuda") +seed = 1024 +generator = torch.Generator("cuda").manual_seed(seed) +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt, generator=generator).images[0] +image.save("diffusers_astronaut_rides_horse.png") +``` +![diffusers_astronaut_rides_horse](https://user-images.githubusercontent.com/50394665/201277740-c9b37d59-4ec0-4b3d-8118-bd7f0dfaf352.png) + +```python +import paddle +from ppdiffusers import StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline +pipe = PPDiffusersStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") +prompt = "a photo of an astronaut riding a horse on mars" +seed = 1024 +paddle.seed(seed) +image = pipe(prompt).images[0] +image.save("ppdiffusers_astronaut_rides_horse.png") +``` +![ppdiffusers_astronaut_rides_horse](https://user-images.githubusercontent.com/50394665/201277735-fafa458a-9409-4795-887a-897a2851753d.png) + +### 3.2 Image-to-Image text-guided generation效果对比 +```python +import requests +import torch +from PIL import Image +from io import BytesIO + +from diffusers import StableDiffusionImg2ImgPipeline as DiffusersStableDiffusionImg2ImgPipeline + +pipe = DiffusersStableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") +pipe = pipe.to("cuda") + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((768, 512)) + +prompt = "A fantasy landscape, trending on artstation" +seed = 1024 +generator = torch.Generator("cuda").manual_seed(seed) +image = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5, generator=generator).images[0] + +image.save("diffusers_fantasy_landscape.png") +``` 
+![diffusers_fantasy_landscape](https://user-images.githubusercontent.com/50394665/201277726-2c2f2fc8-dbfe-4b38-9940-9000bb6c8333.png) + +```python +import requests +import paddle +from PIL import Image +from io import BytesIO + +from ppdiffusers import StableDiffusionImg2ImgPipeline as PPDiffusersStableDiffusionImg2ImgPipeline + +pipe = PPDiffusersStableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((768, 512)) + +prompt = "A fantasy landscape, trending on artstation" +seed = 1024 +paddle.seed(seed) +image = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images[0] + +image.save("ppdiffusers_fantasy_landscape.png") +``` +![ppdiffusers_fantasy_landscape](https://user-images.githubusercontent.com/50394665/201277718-f01e8f8d-b560-442f-bf93-c026285c337e.png) +### 3.3 In-painting效果对比 +```python +import torch +import PIL +import requests +from io import BytesIO + +from diffusers import StableDiffusionInpaintPipeline as DiffusersStableDiffusionInpaintPipeline, EulerAncestralDiscreteScheduler as DiffusersEulerAncestralDiscreteScheduler + +def download_image(url): + response = requests.get(url) + return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + +init_image = download_image(img_url).resize((512, 512)) +mask_image = download_image(mask_url).resize((512, 512)) +scheduler = DiffusersEulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") +pipe = DiffusersStableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", scheduler=scheduler) +pipe.to("cuda") + +prompt = "Face of a yellow cat, high resolution, sitting on a park bench" +seed = 1024 +generator = torch.Generator("cuda").manual_seed(seed) +image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] + +image.save("diffusers_cat_on_bench.png") +``` +![diffusers_cat_on_bench](https://user-images.githubusercontent.com/50394665/201277724-76145ee6-a3ef-49e7-a1e9-8ccd3c9eb39e.png) + +```python +import paddle +import PIL +import requests +from io import BytesIO + +from ppdiffusers import StableDiffusionInpaintPipeline as PPDiffusersStableDiffusionInpaintPipeline, EulerAncestralDiscreteScheduler as PPDiffusersEulerAncestralDiscreteScheduler + +def download_image(url): + response = requests.get(url) + return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + +init_image = download_image(img_url).resize((512, 512)) +mask_image = download_image(mask_url).resize((512, 512)) +scheduler = PPDiffusersEulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") +pipe = PPDiffusersStableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", scheduler=scheduler) + +prompt = "Face of a yellow cat, 
high resolution, sitting on a park bench" +seed = 1024 +paddle.seed(seed) +image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + +image.save("ppdiffusers_cat_on_bench.png") +``` +![ppdiffusers_cat_on_bench](https://user-images.githubusercontent.com/50394665/201277712-2e10c188-e1ca-44f5-b963-657e9d51cc95.png) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py new file mode 100644 index 000000000000..6315b1f73256 --- /dev/null +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import tempfile +import paddle + +paddle.set_device("cpu") +import argparse +import torch +from collections import OrderedDict +from diffusers import LDMTextToImagePipeline as DiffusersLDMTextToImagePipeline +from ppdiffusers import LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline, LDMBertModel, AutoencoderKL, UNet2DConditionModel, PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler +from paddlenlp.transformers import BertTokenizer + + +def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): + need_transpose = [] + for k, v in vae_or_unet.named_modules(): + if isinstance(v, torch.nn.Linear): + need_transpose.append(k + ".weight") + new_vae_or_unet = OrderedDict() + for k, v in vae_or_unet.state_dict().items(): + if k not in need_transpose: + new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) + else: + new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) + return new_vae_or_unet + + +def convert_hf_ldmbert_to_ppnlp_ldmbert(ldmbert, dtype="float32"): + transformers2ppnlp = { + "model.embed_tokens.weight": "embeddings.word_embeddings.weight", + "model.embed_positions.weight": "embeddings.position_embeddings.weight", + "model.layer_norm.": "final_layer_norm.", + "model.layers": "encoder.layers", + ".self_attn_layer_norm.": ".norm1.", + ".final_layer_norm.": ".norm2.", + ".fc1.": ".linear1.", + ".fc2.": ".linear2.", + } + ignore_value = ["to_logits"] + donot_transpose = ["embed_tokens", "embed_positions", "norm"] + new_model_state = OrderedDict() + for name, value in ldmbert.state_dict().items(): + # step1: ignore to_logits + if any(i in name for i in ignore_value): + continue + # step2: transpose nn.Linear weight + if value.ndim == 2 and not any(i in name for i in donot_transpose): + value = value.t() + # step3: hf_name -> ppnlp_name mapping + for hf_name, ppnlp_name in transformers2ppnlp.items(): + name = name.replace(hf_name, ppnlp_name) + new_model_state[name] = value.cpu().numpy().astype(dtype) + + new_config = { + "vocab_size": ldmbert.config.vocab_size, + "max_position_embeddings": ldmbert.config.max_position_embeddings, + "encoder_layers": ldmbert.config.encoder_layers, + "encoder_ffn_dim": ldmbert.config.encoder_ffn_dim, + 
"encoder_attention_heads": ldmbert.config.encoder_attention_heads, + "head_dim": ldmbert.config.head_dim, + "activation_function": ldmbert.config.activation_function, + "d_model": ldmbert.config.d_model, + "dropout": 0.0, # we do not use dropout in original ldmbert + "attention_dropout": ldmbert.config.attention_dropout, + "activation_dropout": ldmbert.config.activation_dropout, + "init_std": ldmbert.config.init_std, + "pad_token_id": ldmbert.config.pad_token_id + } + return new_model_state, new_config + + +def convert_diffusers_stable_diffusion_to_ppdiffusers( + pretrained_model_name_or_path, output_path=None): + # 0. load diffusers pipe and convert to ppdiffusers weights format + diffusers_pipe = DiffusersLDMTextToImagePipeline.from_pretrained( + pretrained_model_name_or_path, use_auth_token=True) + vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae) + unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) + bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert( + diffusers_pipe.bert) + + # 1. vqvae + pp_vqvae = AutoencoderKL(**diffusers_pipe.vqvae.config) + pp_vqvae.set_dict(vqvae_state_dict) + + # 2. unet + pp_unet = UNet2DConditionModel(**diffusers_pipe.unet.config) + pp_unet.set_dict(unet_state_dict) + + # 3. bert + pp_bert = LDMBertModel(**bert_config) + pp_bert.set_dict(bert_state_dict) + + # 4. scheduler + beta_start = diffusers_pipe.scheduler.beta_start + beta_end = diffusers_pipe.scheduler.beta_end + num_train_timesteps = diffusers_pipe.scheduler.num_train_timesteps + scheduler_type = diffusers_pipe.scheduler._class_name.lower() + if "pndm" in scheduler_type: + pp_scheduler = PNDMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + skip_prk_steps=True, + ) + elif "lms" in scheduler_type: + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, + beta_end=beta_end, + beta_schedule="scaled_linear") + elif "ddim" in scheduler_type: + pp_scheduler = DDIMScheduler( + beta_start=beta_start, + beta_end=beta_end, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + + with tempfile.TemporaryDirectory() as tmpdirname: + # 5. tokenizer + diffusers_pipe.tokenizer.save_pretrained(tmpdirname) + pp_tokenizer = BertTokenizer.from_pretrained(tmpdirname, + model_max_length=77) + + # 6. create ppdiffusers pipe + paddle_pipe = PPDiffusersLDMTextToImagePipeline(vqvae=pp_vqvae, + bert=pp_bert, + tokenizer=pp_tokenizer, + unet=pp_unet, + scheduler=pp_scheduler) + + # 7. 
save_pretrained + paddle_pipe.save_pretrained(output_path) + return paddle_pipe + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pytorch model weights to Paddle model weights.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="CompVis/ldm-text2im-large-256", + help= + "Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--output_path", + type=str, + default="ldm-text2im-large-256-ppdiffusers", + help="The model output path.", + ) + args = parser.parse_args() + ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( + args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py new file mode 100644 index 000000000000..fb7ca24514ea --- /dev/null +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import tempfile +import paddle + +paddle.set_device("cpu") +import argparse +import torch +from collections import OrderedDict +from diffusers import StableDiffusionPipeline as DiffusersStableDiffusionPipeline +from ppdiffusers.configuration_utils import FrozenDict +from ppdiffusers import StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline, AutoencoderKL, UNet2DConditionModel, PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker +from paddlenlp.transformers import CLIPTextModel, CLIPVisionModel, CLIPTokenizer, CLIPFeatureExtractor + + +def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): + need_transpose = [] + for k, v in vae_or_unet.named_modules(): + if isinstance(v, torch.nn.Linear): + need_transpose.append(k + ".weight") + new_vae_or_unet = OrderedDict() + for k, v in vae_or_unet.state_dict().items(): + if k not in need_transpose: + new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) + else: + new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) + return new_vae_or_unet + + +def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): + new_model_state = {} + transformers2ppnlp = { + ".encoder.": ".transformer.", + ".layer_norm": ".norm", + ".mlp.": ".", + ".fc1.": ".linear1.", + ".fc2.": ".linear2.", + ".final_layer_norm.": ".ln_final.", + ".embeddings.": ".", + ".position_embedding.": ".positional_embedding.", + ".patch_embedding.": ".conv1.", + "visual_projection.weight": "vision_projection", + "text_projection.weight": "text_projection", + ".pre_layrnorm.": ".ln_pre.", + ".post_layernorm.": ".ln_post.", + ".vision_model.": "." 
+ } + ignore_value = ["position_ids"] + donot_transpose = [ + "embeddings", "norm", "concept_embeds", "special_care_embeds" + ] + + for name, value in clip.state_dict().items(): + # step1: ignore position_ids + if any(i in name for i in ignore_value): + continue + # step2: transpose nn.Linear weight + if value.ndim == 2 and not any(i in name for i in donot_transpose): + value = value.t() + # step3: hf_name -> ppnlp_name mapping + for hf_name, ppnlp_name in transformers2ppnlp.items(): + name = name.replace(hf_name, ppnlp_name) + # step4: 0d tensor -> 1d tensor + if name == "logit_scale": value = value.reshape((1, )) + # step5: safety_checker need prefix "clip." + if "vision_model" in name: name = "clip." + name + new_model_state[name] = value.cpu().numpy().astype(dtype) + + if is_text_encoder: + new_config = { + 'max_text_length': clip.config.max_position_embeddings, + 'vocab_size': clip.config.vocab_size, + 'text_embed_dim': clip.config.hidden_size, + 'text_heads': clip.config.num_attention_heads, + 'text_layers': clip.config.num_hidden_layers, + 'text_hidden_act': clip.config.hidden_act, + 'projection_dim': clip.config.projection_dim, + 'initializer_range': clip.config.initializer_range, + 'initializer_factor': clip.config.initializer_factor, + } + else: + new_config = { + 'image_resolution': + clip.config.vision_config.image_size, + 'vision_layers': + clip.config.vision_config.num_hidden_layers, + 'vision_heads': + clip.config.vision_config.num_attention_heads, + 'vision_embed_dim': + clip.config.vision_config.hidden_size, + 'vision_patch_size': + clip.config.vision_config.patch_size, + 'vision_mlp_ratio': + clip.config.vision_config.intermediate_size // + clip.config.vision_config.hidden_size, + 'vision_hidden_act': + clip.config.vision_config.hidden_act, + 'projection_dim': + clip.config.projection_dim, + } + return new_model_state, new_config + + +def convert_diffusers_stable_diffusion_to_ppdiffusers( + pretrained_model_name_or_path, output_path=None): + # 0. load diffusers pipe and convert to ppdiffusers weights format + diffusers_pipe = DiffusersStableDiffusionPipeline.from_pretrained( + pretrained_model_name_or_path, use_auth_token=True) + vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) + unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) + text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( + diffusers_pipe.text_encoder, is_text_encoder=True) + safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( + diffusers_pipe.safety_checker, is_text_encoder=False) + + # 1. vae + pp_vae = AutoencoderKL(**diffusers_pipe.vae.config) + pp_vae.set_dict(vae_state_dict) + + # 2. unet + pp_unet = UNet2DConditionModel(**diffusers_pipe.unet.config) + pp_unet.set_dict(unet_state_dict) + + # 3. text_encoder + pp_text_encoder = CLIPTextModel(**text_encoder_config) + pp_text_encoder.set_dict(text_encoder_state_dict) + + # 4. safety_checker + pp_safety_checker = StableDiffusionSafetyChecker( + CLIPVisionModel(**safety_checker_config)) + pp_safety_checker.set_dict(safety_checker_state_dict) + + # 5. 
scheduler + beta_start = diffusers_pipe.scheduler.beta_start + beta_end = diffusers_pipe.scheduler.beta_end + num_train_timesteps = diffusers_pipe.scheduler.num_train_timesteps + scheduler_type = diffusers_pipe.scheduler._class_name.lower() + if "pndm" in scheduler_type: + pp_scheduler = PNDMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + skip_prk_steps=True, + ) + elif "lms" in scheduler_type: + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, + beta_end=beta_end, + beta_schedule="scaled_linear") + elif "ddim" in scheduler_type: + pp_scheduler = DDIMScheduler( + beta_start=beta_start, + beta_end=beta_end, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + + with tempfile.TemporaryDirectory() as tmpdirname: + # 6. feature_extractor + diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) + pp_feature_extractor = CLIPFeatureExtractor.from_pretrained(tmpdirname) + + # 7. tokenizer + diffusers_pipe.tokenizer.save_pretrained(tmpdirname) + pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) + + # 8. create ppdiffusers pipe + paddle_pipe = PPDiffusersStableDiffusionPipeline( + vae=pp_vae, + text_encoder=pp_text_encoder, + tokenizer=pp_tokenizer, + unet=pp_unet, + safety_checker=pp_safety_checker, + feature_extractor=pp_feature_extractor, + scheduler=pp_scheduler) + if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path: + _internal_dict = dict(paddle_pipe._internal_dict) + if _internal_dict["_ppdiffusers_version"] == "0.0.0": + _internal_dict.update({"_ppdiffusers_version": "0.6.0"}) + paddle_pipe._internal_dict = FrozenDict(_internal_dict) + # 9. save_pretrained + paddle_pipe.save_pretrained(output_path) + return paddle_pipe + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pytorch model weights to Paddle model weights.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="runwayml/stable-diffusion-v1-5", + help= + "Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--output_path", + type=str, + default="stable-diffusion-v1-5-ppdiffusers", + help="The model output path.", + ) + args = parser.parse_args() + ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( + args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/requirements.txt b/ppdiffusers/scripts/convert_diffusers_model/requirements.txt new file mode 100644 index 000000000000..cca5a8ae5b22 --- /dev/null +++ b/ppdiffusers/scripts/convert_diffusers_model/requirements.txt @@ -0,0 +1,7 @@ +ppdiffusers +paddlenlp +paddlepaddle-gpu +torch +diffusers +transformers +omegaconf \ No newline at end of file diff --git a/ppdiffusers/scripts/convert_diffusers_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_to_ppdiffusers.py deleted file mode 100644 index 5c29c59fc958..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_to_ppdiffusers.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -paddle.set_device("cpu") -from collections import OrderedDict -import copy -import torch -from ppdiffusers import StableDiffusionPipeline as PaddleStableDiffusionPipeline, DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler -# pip install diffusers -from diffusers import StableDiffusionPipeline as PytorchStableDiffusionPipeline - - -def convert_vae_to_paddlenlp(vae, dtype="float32"): - need_transpose = [] - for k, v in vae.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae = OrderedDict() - for k, v in vae.state_dict().items(): - if k not in need_transpose: - new_vae[k] = v.numpy().astype(dtype) - else: - new_vae[k] = v.t().numpy().astype(dtype) - return new_vae - - -def convert_unet_to_paddlenlp(unet, dtype="float32"): - need_transpose = [] - for k, v in unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_unet = OrderedDict() - for k, v in unet.state_dict().items(): - if k not in need_transpose: - new_unet[k] = v.numpy().astype(dtype) - else: - new_unet[k] = v.t().numpy().astype(dtype) - return new_unet - - -def convert_hf_clip_to_paddlenlp(clip, dtype="float32"): - new_model_state = OrderedDict() - old2new = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post." - } - ignore = ["position_ids"] - - for k, v in clip.state_dict().items(): - # 过滤掉ignore - if any(i in k for i in ignore): - continue - oldk = copy.deepcopy(k) - # 批量替换名字 - is_transpose = False - if v.ndim == 2: - if "embeddings" in oldk or "norm" in oldk or 'concept_embeds' in oldk or 'special_care_embeds' in oldk: - pass - else: - v = v.t() - is_transpose = True - for oldname, newname in old2new.items(): - k = k.replace(oldname, newname).replace(".vision_model.", ".") - - # pytorch的是0d的tensor,paddle的是1d tensor所以要reshape。这要注意。 - if k == "logit_scale": v = v.reshape((1, )) - if "vision_model" in k: k = "clip." + k - # if "text_model" in k: k = "clip." 
+ k - new_model_state[k] = v.numpy().astype(dtype) - print(f"Convert {oldk} -> {k} | {v.shape}, is_transpose {is_transpose}") - return new_model_state - - -def convert_model(model_name): - pytorch_pipe = PytorchStableDiffusionPipeline.from_pretrained( - model_name, use_auth_token=True) - new_vae = convert_vae_to_paddlenlp(pytorch_pipe.vae) - new_unet = convert_unet_to_paddlenlp(pytorch_pipe.unet) - new_text_encoder = convert_hf_clip_to_paddlenlp(pytorch_pipe.text_encoder) - new_safety_checker = convert_hf_clip_to_paddlenlp( - pytorch_pipe.safety_checker) - - paddle_pipe = PaddleStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") - paddle_pipe.vae.set_dict(new_vae) - paddle_pipe.unet.set_dict(new_unet) - paddle_pipe.text_encoder.set_dict(new_text_encoder) - paddle_pipe.safety_checker.set_dict(new_safety_checker) - return paddle_pipe - - -if __name__ == "__main__": - # model_name为Huggingface.co上diffusers权重地址。 - paddle_pipe = convert_model(model_name="CompVis/stable-diffusion-v1-4") - paddle_pipe.save_pretrained("./stable-diffusion-v1-4-paddle") diff --git a/ppdiffusers/scripts/export/README.md b/ppdiffusers/scripts/export/README.md new file mode 100644 index 000000000000..2c64d370d484 --- /dev/null +++ b/ppdiffusers/scripts/export/README.md @@ -0,0 +1,72 @@ +# Stable Diffusion模型导出教程 + +- [注意事项](#注意事项) +- [环境依赖](#环境依赖) +- [模型导出](#模型导出) + - [参数说明](#参数说明) +- [推理部署](#推理部署) + +## 注意事项 + +___注意:模型导出过程中,需要下载StableDiffusion模型。为了使用该模型与权重,你必须接受该模型所要求的License,请访问HuggingFace的[model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), 仔细阅读里面的License,然后签署该协议。___ + +___Tips: Stable Diffusion是基于以下的License: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which this license is based.___ + +## 环境依赖 + +- paddlepaddle >= 2.4.0 +- paddlenlp >= 2.4.1 +- ppdiffusers >= 0.6.2 + +可执行以下命令行安装环境依赖包。 + +```shell +pip install --upgrade ppdiffusers paddlepaddle-gpu paddlenlp +``` + + +## 模型导出 + +可执行以下命令行完成模型导出。 + +```shell +python export_model.py --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 --output_path stable-diffusion-v1-4 +``` + +如需导出stable-diffusion-v1-5,可执行以下命令: + +```shell +python export_model.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion-v1-5 +``` + + +输出的模型目录结构如下: +```shell +stable-diffusion-v1-4/ +├── text_encoder +│ ├── inference.pdiparams +│ ├── inference.pdiparams.info +│ └── inference.pdmodel +├── unet +│ ├── inference.pdiparams +│ ├── inference.pdiparams.info +│ └── inference.pdmodel +└── vae_decoder + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +### 参数说明 + +`export_model.py` 各命令行参数的说明。 + +| 参数 |参数说明 | +|----------|--------------| +|
--pretrained_model_name_or_path
| ppdiffuers提供的diffusion预训练模型名称以及用户自行训练的模型目录。默认为:"CompVis/stable-diffusion-v1-4 "。更多diffusion预训练模型可参考[ppdiffuser模型列表](../examples/textual_inversion)。| +|--output_path | 导出的模型目录。 | + + +## 推理部署 + +完成模型导出后,可以加载导出后的模型,完成StableDiffusion的模型部署。我们提供在[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)上的StableDiffusion模型文生图任务的部署示例,可参考[FastDeploy Diffusion模型高性能部署](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/multimodal/stable_diffusion#%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C)完成部署。 diff --git a/ppdiffusers/scripts/export/export_model.py b/ppdiffusers/scripts/export/export_model.py new file mode 100644 index 000000000000..df13dee54105 --- /dev/null +++ b/ppdiffusers/scripts/export/export_model.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import paddle +import paddlenlp + +from ppdiffusers import UNet2DConditionModel, AutoencoderKL +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker +from paddlenlp.transformers import CLIPTextModel + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument("--pretrained_model_name_or_path", + default='CompVis/stable-diffusion-v1-4', + help="The pretrained diffusion model.") + parser.add_argument("--output_path", + type=str, + required=True, + help="The pretrained diffusion model.") + return parser.parse_args() + + +class VAEDecoder(AutoencoderKL): + + def forward(self, z): + return self.decode(z, True).sample + + +if __name__ == "__main__": + paddle.set_device('cpu') + args = parse_arguments() + # Load models and create wrapper for stable diffusion + text_encoder = CLIPTextModel.from_pretrained( + os.path.join(args.pretrained_model_name_or_path, "text_encoder")) + vae_decoder = VAEDecoder.from_pretrained(args.pretrained_model_name_or_path, + subfolder="vae") + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet") + + # Convert to static graph with specific input description + text_encoder = paddle.jit.to_static( + text_encoder, + input_spec=[ + paddle.static.InputSpec(shape=[None, None], + dtype="int64", + name="input_ids") # input_ids + ]) + + # Save text_encoder in static graph model. + save_path = os.path.join(args.output_path, "text_encoder", "inference") + paddle.jit.save(text_encoder, save_path) + print(f"Save text_encoder model in {save_path} successfully.") + + # Convert to static graph with specific input description + vae_decoder = paddle.jit.to_static( + vae_decoder, + input_spec=[ + paddle.static.InputSpec(shape=[None, 4, 64, 64], + dtype="float32", + name="latent"), # latent + ]) + # Save vae_decoder in static graph model. 
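+    # paddle.jit.save writes inference.pdmodel, inference.pdiparams and inference.pdiparams.info
+    # under this path prefix, matching the exported directory layout shown in the README above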
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference") + paddle.jit.save(vae_decoder, save_path) + print(f"Save vae_decoder model in {save_path} successfully.") + + # Convert to static graph with specific input description + unet = paddle.jit.to_static( + unet, + input_spec=[ + paddle.static.InputSpec(shape=[None, 4, None, None], + dtype="float32", + name="latent_input"), # latent + paddle.static.InputSpec(shape=[1], dtype="int64", + name="timestep"), # timesteps + paddle.static.InputSpec( + shape=[None, None, 768], + dtype="float32", + name="encoder_embedding") # encoder_embedding + ]) + save_path = os.path.join(args.output_path, "unet", "inference") + paddle.jit.save(unet, save_path) + print(f"Save unet model in {save_path} successfully.") diff --git a/ppdiffusers/scripts/fid/README.md b/ppdiffusers/scripts/fid/README.md new file mode 100644 index 000000000000..d66e477bea60 --- /dev/null +++ b/ppdiffusers/scripts/fid/README.md @@ -0,0 +1,34 @@ +# FID score for PaddlePaddle + +FID(Frechet Inception Distance score,FID)是计算真实图像和生成图像的特征向量之间距离的一种度量,最常用于评估生成性对抗网络样本的质量。FID 从原始图像的计算机视觉特征的统计方面的相似度来衡量两组图像的相似度,这种视觉特征是使用 `Inception v3` 图像分类模型计算的得到的。分数越低代表两组图像越相似,或者说二者的统计量越相似,FID 在最佳情况下的得分为 0.0,表示两组图像相同。 + + +## 依赖 + +- PaddlePaddle +- Pillow +- Numpy +- Scipy + +## 快速使用 + +计算两个图片数据集的FID,`path/to/dataset1`/`path/to/dataset2`为图片文件夹 +``` +python fid_score.py path/to/dataset1 path/to/dataset2 +``` + +使用CPU计算 +``` +python fid_score.py path/to/dataset1 path/to/dataset2 --device cpu +``` + +参数说明 +- `batch-size`:使用批次的大小,默认为50 +- `num-workers`: 用于加载数据的子进程个数,默认为`min(8, num_cpus)`。 +- `device`:使用设备,支持GPU、CPU。 +- `dims`:要使用的Inception特征的维度。默认使用2048. + +## 参考 + +- [https://github.com/mseitzer/pytorch-fid](https://github.com/mseitzer/pytorch-fid) +- [https://github.com/bioinf-jku/TTUR](https://github.com/bioinf-jku/TTUR) diff --git a/ppdiffusers/scripts/fid/fid_score.py b/ppdiffusers/scripts/fid/fid_score.py new file mode 100755 index 000000000000..5644e86106fe --- /dev/null +++ b/ppdiffusers/scripts/fid/fid_score.py @@ -0,0 +1,308 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) mseitzer Author. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculates the Frechet Inception Distance (FID) to evalulate GANs + +The FID metric calculates the distance between two distributions of images. +Typically, we have summary statistics (mean & covariance matrix) of one +of these distributions, while the 2nd distribution is given by a GAN. + +When run as a stand-alone program, it compares the distribution of +images that are stored as PNG/JPEG at a specified location with a +distribution given by summary statistics (in pickle format). + +The FID is calculated by assuming that X_1 and X_2 are the activations of +the pool_3 layer of the inception net for generated samples and real world +samples respectively. + +See --help to see further details. 
+ +Code apapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead +of Tensorflow + +Copyright 2018 Institute of Bioinformatics, JKU Linz + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import os +import pathlib +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +import numpy as np +from PIL import Image +from scipy import linalg +import paddle +import paddle.vision.transforms as TF +from paddle.nn.functional import adaptive_avg_pool2d + +try: + from tqdm import tqdm +except ImportError: + # If tqdm is not available, provide a mock version of it + def tqdm(x): + return x + + +from inception import InceptionV3 + +parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) +parser.add_argument('--batch-size', + type=int, + default=50, + help='Batch size to use') +parser.add_argument('--num-workers', + type=int, + help=('Number of processes to use for data loading. ' + 'Defaults to `min(8, num_cpus)`')) +parser.add_argument('--device', + type=str, + default=None, + help='Device to use. Like gpu, gpu:0 or cpu') +parser.add_argument('--dims', + type=int, + default=2048, + choices=list(InceptionV3.BLOCK_INDEX_BY_DIM), + help=('Dimensionality of Inception features to use. ' + 'By default, uses pool3 features')) +parser.add_argument('path', + type=str, + nargs=2, + help=('Paths to the generated images or ' + 'to .npz statistic files')) + +IMAGE_EXTENSIONS = { + 'bmp', 'jpg', 'jpeg', 'pgm', 'png', 'ppm', 'tif', 'tiff', 'webp' +} + + +class ImagePathDataset(paddle.io.Dataset): + + def __init__(self, files, transforms=None): + self.files = files + self.transforms = transforms + + def __len__(self): + return len(self.files) + + def __getitem__(self, i): + path = self.files[i] + img = Image.open(path).convert('RGB') + if self.transforms is not None: + img = self.transforms(img) + return {'img': img} + + +def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): + """Calculates the activations of the pool_3 layer for all images. + + Params: + -- files : List of image files paths + -- model : Instance of inception model + -- batch_size : Batch size of images for the model to process at once. + Make sure that the number of samples is a multiple of + the batch size, otherwise some samples are ignored. This + behavior is retained to match the original FID score + implementation. + -- dims : Dimensionality of features returned by Inception + -- num_workers : Number of parallel dataloader workers + + Returns: + -- A numpy array of dimension (num images, dims) that contains the + activations of the given tensor when feeding inception with the + query tensor. + """ + model.eval() + + if batch_size > len(files): + print(('Warning: batch size is bigger than the data size. 
' + 'Setting batch size to data size')) + batch_size = len(files) + + dataset = ImagePathDataset(files, transforms=TF.ToTensor()) + dataloader = paddle.io.DataLoader(dataset, + batch_size=batch_size, + shuffle=False, + drop_last=False, + num_workers=num_workers) + + pred_arr = np.empty((len(files), dims)) + + start_idx = 0 + + for batch in tqdm(dataloader): + batch = batch['img'] + with paddle.no_grad(): + pred = model(batch)[0] + + # If model output is not scalar, apply global spatial average pooling. + # This happens if you choose a dimensionality not equal 2048. + # import pdb;pdb.set_trace() + if pred.shape[2] != 1 or pred.shape[3] != 1: + pred = adaptive_avg_pool2d(pred, output_size=(1, 1)) + + pred = pred.squeeze(3).squeeze(2).cpu().numpy() + + pred_arr[start_idx:start_idx + pred.shape[0]] = pred + + start_idx = start_idx + pred.shape[0] + + return pred_arr + + +def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): + """Numpy implementation of the Frechet Distance. + The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) + and X_2 ~ N(mu_2, C_2) is + d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). + + Stable version by Dougal J. Sutherland. + + Params: + -- mu1 : Numpy array containing the activations of a layer of the + inception net (like returned by the function 'get_predictions') + for generated samples. + -- mu2 : The sample mean over activations, precalculated on an + representative data set. + -- sigma1: The covariance matrix over activations for generated samples. + -- sigma2: The covariance matrix over activations, precalculated on an + representative data set. + + Returns: + -- : The Frechet Distance. + """ + + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, \ + 'Training and test mean vectors have different lengths' + assert sigma1.shape == sigma2.shape, \ + 'Training and test covariances have different dimensions' + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ('fid calculation produces singular product; ' + 'adding %s to diagonal of cov estimates') % eps + print(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - + 2 * tr_covmean) + + +def calculate_activation_statistics(files, + model, + batch_size=50, + dims=2048, + num_workers=1): + """Calculation of the statistics used by the FID. + Params: + -- files : List of image files paths + -- model : Instance of inception model + -- batch_size : The images numpy array is split into batches with + batch size batch_size. A reasonable batch size + depends on the hardware. + -- dims : Dimensionality of features returned by Inception + -- num_workers : Number of parallel dataloader workers + + Returns: + -- mu : The mean over samples of the activations of the pool_3 layer of + the inception model. + -- sigma : The covariance matrix of the activations of the pool_3 layer of + the inception model. 
+ """ + act = get_activations(files, model, batch_size, dims, num_workers) + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + + +def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1): + if path.endswith('.npz'): + with np.load(path) as f: + m, s = f['mu'][:], f['sigma'][:] + else: + path = pathlib.Path(path) + files = sorted([ + file for ext in IMAGE_EXTENSIONS + for file in path.glob('*.{}'.format(ext)) + ]) + m, s = calculate_activation_statistics(files, model, batch_size, dims, + num_workers) + + return m, s + + +def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1): + """Calculates the FID of two paths""" + for p in paths: + if not os.path.exists(p): + raise RuntimeError('Invalid path: %s' % p) + + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + + model = InceptionV3([block_idx]) + + m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, + num_workers) + m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, + num_workers) + fid_value = calculate_frechet_distance(m1, s1, m2, s2) + + return fid_value + + +def main(): + args = parser.parse_args() + if args.device is not None: + paddle.set_device(args.device) + + if args.num_workers is None: + num_avail_cpus = len(os.sched_getaffinity(0)) + num_workers = min(num_avail_cpus, 8) + else: + num_workers = args.num_workers + + fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims, + num_workers) + print('FID: ', fid_value) + + +if __name__ == '__main__': + main() diff --git a/ppdiffusers/scripts/fid/inception.py b/ppdiffusers/scripts/fid/inception.py new file mode 100644 index 000000000000..386d90ff500d --- /dev/null +++ b/ppdiffusers/scripts/fid/inception.py @@ -0,0 +1,539 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) mseitzer Author. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.utils.download import get_weights_path_from_url + +# Inception weights ported to Pytorch from +# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz +FID_WEIGHTS_URL = ( + 'https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams', + '8e2ae24c34c5c8b81d45167bb9361f4c') +WEIGHTS_PATH = 'pp_inception-2015-12-05-6726825d.pdparams' + + +class ConvNormActivation(nn.Sequential): + """ + Configurable block used for Convolution-Normalzation-Activation blocks. + This code is based on the torchvision code with modifications. + You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L68 + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block + kernel_size: (int|list|tuple, optional): Size of the convolving kernel. Default: 3 + stride (int|list|tuple, optional): Stride of the convolution. 
Default: 1 + padding (int|str|tuple|list, optional): Padding added to all four sides of the input. Default: None, + in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolutiuon layer. + If ``None`` this layer wont be used. Default: ``paddle.nn.BatchNorm2D`` + activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization + layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``paddle.nn.ReLU`` + dilation (int): Spacing between kernel elements. Default: 1 + bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=nn.BatchNorm2D, + activation_layer=nn.ReLU, + dilation=1, + bias=None): + if padding is None: + padding = (kernel_size - 1) // 2 * dilation + if bias is None: + bias = norm_layer is None + layers = [ + nn.Conv2D(in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=dilation, + groups=groups, + bias_attr=bias) + ] + if norm_layer is not None: + # The hyperparameter of BatchNorm2D is different from PaddlePaddle. + layers.append(norm_layer(out_channels, momentum=0.1, epsilon=0.001)) + if activation_layer is not None: + layers.append(activation_layer()) + super().__init__(*layers) + + +class InceptionV3(nn.Layer): + """Pretrained InceptionV3 network returning feature maps""" + + # Index of default block of inception to return, + # corresponds to output of final average pooling + DEFAULT_BLOCK_INDEX = 3 + + # Maps feature dimensionality to their output blocks indices + BLOCK_INDEX_BY_DIM = { + 64: 0, # First max pooling features + 192: 1, # Second max pooling featurs + 768: 2, # Pre-aux classifier features + 2048: 3 # Final average pooling features + } + + def __init__(self, + output_blocks=(DEFAULT_BLOCK_INDEX, ), + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True): + """Build pretrained InceptionV3 + + Parameters + ---------- + output_blocks : list of int + Indices of blocks to return features of. Possible values are: + - 0: corresponds to output of first max pooling + - 1: corresponds to output of second max pooling + - 2: corresponds to output which is fed to aux classifier + - 3: corresponds to output of final average pooling + resize_input : bool + If true, bilinearly resizes input to width and height 299 before + feeding input to model. As the network without fully connected + layers is fully convolutional, it should be able to handle inputs + of arbitrary size, so resizing might not be strictly needed + normalize_input : bool + If true, scales the input from range (0, 1) to the range the + pretrained Inception network expects, namely (-1, 1) + requires_grad : bool + If true, parameters of the model require gradients. Possibly useful + for finetuning the network + use_fid_inception : bool + If true, uses the pretrained Inception model used in Tensorflow's + FID implementation. If false, uses the pretrained Inception model + available in paddle.vision. 
The FID Inception model has different
+            weights and a slightly different structure from paddle.vision's
+            Inception model. If you want to compute FID scores, you are
+            strongly advised to set this parameter to true to get comparable
+            results.
+        """
+        super(InceptionV3, self).__init__()
+
+        self.resize_input = resize_input
+        self.normalize_input = normalize_input
+        self.output_blocks = sorted(output_blocks)
+        self.last_needed_block = max(output_blocks)
+
+        assert self.last_needed_block <= 3, \
+            'Last possible output block index is 3'
+
+        self.blocks = nn.LayerList()
+
+        if use_fid_inception:
+            inception = fid_inception_v3()
+        else:
+            inception = _inception_v3(pretrained=True)
+
+        # Block 0: input to maxpool1
+        block0 = [
+            inception.inception_stem.conv_1a_3x3,
+            inception.inception_stem.conv_2a_3x3,
+            inception.inception_stem.conv_2b_3x3,
+            inception.inception_stem.max_pool
+        ]
+        self.blocks.append(nn.Sequential(*block0))
+
+        # Block 1: maxpool1 to maxpool2
+        if self.last_needed_block >= 1:
+            block1 = [
+                inception.inception_stem.conv_3b_1x1,
+                inception.inception_stem.conv_4a_3x3,
+                inception.inception_stem.max_pool
+            ]
+            self.blocks.append(nn.Sequential(*block1))
+
+        # Block 2: maxpool2 to aux classifier
+        if self.last_needed_block >= 2:
+            block2 = [
+                inception.inception_block_list[0],
+                inception.inception_block_list[1],
+                inception.inception_block_list[2],
+                inception.inception_block_list[3],
+                inception.inception_block_list[4],
+                inception.inception_block_list[5],
+                inception.inception_block_list[6],
+                inception.inception_block_list[7],
+            ]
+            self.blocks.append(nn.Sequential(*block2))
+
+        # Block 3: aux classifier to final avgpool
+        if self.last_needed_block >= 3:
+            block3 = [
+                inception.inception_block_list[8],
+                inception.inception_block_list[9],
+                inception.inception_block_list[10], inception.avg_pool
+            ]
+            self.blocks.append(nn.Sequential(*block3))
+
+        for param in self.parameters():
+            # stop_gradient is the inverse of requires_grad: freeze the
+            # parameters unless gradients were explicitly requested.
+            param.stop_gradient = not requires_grad
+
+    def forward(self, inp):
+        """Get Inception feature maps
+
+        Parameters
+        ----------
+        inp : paddle.Tensor
+            Input tensor of shape Bx3xHxW. Values are expected to be in
+            range (0, 1)
+
+        Returns
+        -------
+        List of paddle.Tensor, corresponding to the selected output
+        blocks, sorted ascending by index
+        """
+        outp = []
+        x = inp
+        if self.resize_input:
+            x = F.interpolate(x,
+                              size=(299, 299),
+                              mode='bilinear',
+                              align_corners=False)
+
+        if self.normalize_input:
+            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)
+        for idx, block in enumerate(self.blocks):
+            x = block(x)
+            if idx in self.output_blocks:
+                outp.append(x)
+
+            if idx == self.last_needed_block:
+                break
+
+        return outp
+
+
+def hack_bn_layer(layer):
+    # Align BatchNorm2D's momentum and epsilon with the reference weights.
+    if isinstance(layer, nn.BatchNorm2D):
+        layer._momentum = 0.1
+        layer._epsilon = 0.001
+
+
+def _inception_v3(*args, **kwargs):
+    """Wraps `paddle.vision.models.inception_v3`
+    """
+    return paddle.vision.models.inception_v3(*args,
+                                              **kwargs).apply(hack_bn_layer)
+
+
+def fid_inception_v3():
+    """Build pretrained Inception model for FID computation
+
+    The Inception model for FID computation uses a different set of weights
+    and has a slightly different structure than paddle.vision's Inception.
+
+    This method first constructs paddle.vision's Inception and then patches the
+    necessary parts that are different in the FID Inception model.
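+    Specifically, the InceptionA blocks (list indices 0-2), the InceptionC
+    blocks (indices 4-7) and the two InceptionE blocks (indices 9 and 10)
+    are swapped for the patched variants defined below, and the ported FID
+    weights are loaded on top.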
+ """ + inception = _inception_v3(num_classes=1008, + with_pool=True, + pretrained=False) + inception.inception_block_list[0] = InceptionA(192, pool_features=32) + inception.inception_block_list[1] = InceptionA(256, pool_features=64) + inception.inception_block_list[2] = InceptionA(288, pool_features=64) + inception.inception_block_list[4] = InceptionC(768, channels_7x7=128) + inception.inception_block_list[5] = InceptionC(768, channels_7x7=160) + inception.inception_block_list[6] = InceptionC(768, channels_7x7=160) + inception.inception_block_list[7] = InceptionC(768, channels_7x7=192) + inception.inception_block_list[9] = InceptionE_1(1280) + inception.inception_block_list[10] = InceptionE_2(2048) + + weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], + FID_WEIGHTS_URL[1]) + state_dict = paddle.load(weight_path) + inception.set_state_dict(state_dict) + return inception + + +class InceptionA(nn.Layer): + + def __init__(self, num_channels, pool_features): + super().__init__() + self.branch1x1 = ConvNormActivation(in_channels=num_channels, + out_channels=64, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + + self.branch5x5_1 = ConvNormActivation(in_channels=num_channels, + out_channels=48, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + self.branch5x5_2 = ConvNormActivation(in_channels=48, + out_channels=64, + kernel_size=5, + padding=2, + activation_layer=nn.ReLU) + + self.branch3x3dbl_1 = ConvNormActivation(in_channels=num_channels, + out_channels=64, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + self.branch3x3dbl_2 = ConvNormActivation(in_channels=64, + out_channels=96, + kernel_size=3, + padding=1, + activation_layer=nn.ReLU) + self.branch3x3dbl_3 = ConvNormActivation(in_channels=96, + out_channels=96, + kernel_size=3, + padding=1, + activation_layer=nn.ReLU) + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + self.branch_pool = nn.AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool_conv = ConvNormActivation(in_channels=num_channels, + out_channels=pool_features, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], + axis=1) + return x + + +class InceptionC(nn.Layer): + + def __init__(self, num_channels, channels_7x7): + super().__init__() + self.branch1x1 = ConvNormActivation(in_channels=num_channels, + out_channels=192, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + + self.branch7x7_1 = ConvNormActivation(in_channels=num_channels, + out_channels=channels_7x7, + kernel_size=1, + stride=1, + padding=0, + activation_layer=nn.ReLU) + self.branch7x7_2 = ConvNormActivation(in_channels=channels_7x7, + out_channels=channels_7x7, + kernel_size=(1, 7), + stride=1, + padding=(0, 3), + activation_layer=nn.ReLU) + self.branch7x7_3 = ConvNormActivation(in_channels=channels_7x7, + out_channels=192, + kernel_size=(7, 1), + stride=1, + padding=(3, 0), + activation_layer=nn.ReLU) + + self.branch7x7dbl_1 = ConvNormActivation(in_channels=num_channels, + out_channels=channels_7x7, + kernel_size=1, + padding=0, + 
activation_layer=nn.ReLU) + self.branch7x7dbl_2 = ConvNormActivation(in_channels=channels_7x7, + out_channels=channels_7x7, + kernel_size=(7, 1), + padding=(3, 0), + activation_layer=nn.ReLU) + self.branch7x7dbl_3 = ConvNormActivation(in_channels=channels_7x7, + out_channels=channels_7x7, + kernel_size=(1, 7), + padding=(0, 3), + activation_layer=nn.ReLU) + self.branch7x7dbl_4 = ConvNormActivation(in_channels=channels_7x7, + out_channels=channels_7x7, + kernel_size=(7, 1), + padding=(3, 0), + activation_layer=nn.ReLU) + self.branch7x7dbl_5 = ConvNormActivation(in_channels=channels_7x7, + out_channels=192, + kernel_size=(1, 7), + padding=(0, 3), + activation_layer=nn.ReLU) + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + self.branch_pool = nn.AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool_conv = ConvNormActivation(in_channels=num_channels, + out_channels=192, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + + x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], + axis=1) + + return x + + +class InceptionE_1(nn.Layer): + + def __init__(self, num_channels): + super().__init__() + self.branch1x1 = ConvNormActivation(in_channels=num_channels, + out_channels=320, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + self.branch3x3_1 = ConvNormActivation(in_channels=num_channels, + out_channels=384, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + self.branch3x3_2a = ConvNormActivation(in_channels=384, + out_channels=384, + kernel_size=(1, 3), + padding=(0, 1), + activation_layer=nn.ReLU) + self.branch3x3_2b = ConvNormActivation(in_channels=384, + out_channels=384, + kernel_size=(3, 1), + padding=(1, 0), + activation_layer=nn.ReLU) + + self.branch3x3dbl_1 = ConvNormActivation(in_channels=num_channels, + out_channels=448, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + self.branch3x3dbl_2 = ConvNormActivation(in_channels=448, + out_channels=384, + kernel_size=3, + padding=1, + activation_layer=nn.ReLU) + self.branch3x3dbl_3a = ConvNormActivation(in_channels=384, + out_channels=384, + kernel_size=(1, 3), + padding=(0, 1), + activation_layer=nn.ReLU) + self.branch3x3dbl_3b = ConvNormActivation(in_channels=384, + out_channels=384, + kernel_size=(3, 1), + padding=(1, 0), + activation_layer=nn.ReLU) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + self.branch_pool = nn.AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool_conv = ConvNormActivation(in_channels=num_channels, + out_channels=192, + kernel_size=1, + padding=0, + activation_layer=nn.ReLU) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = paddle.concat(branch3x3, axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + 
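+        # Mirror the branch3x3 path above: fan the double-3x3 branch out into
+        # parallel 1x3 and 3x1 convolutions and concatenate them channel-wise.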
branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], + axis=1) + return x + + +class InceptionE_2(InceptionE_1): + + def __init__(self, num_channels): + super(InceptionE_2, self).__init__(num_channels) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = paddle.concat(branch3x3, axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) + + # Patch: The FID Inception model uses max pooling instead of average + # pooling. This is likely an error in this specific Inception + # implementation, as other Inception models use average pooling here + # (which matches the description in the paper). + branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool_conv(branch_pool) + + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], + axis=1) + return x diff --git a/ppdiffusers/scripts/inference/image_to_image_text_guided_generation-stable_diffusion.py b/ppdiffusers/scripts/inference/image_to_image_text_guided_generation-stable_diffusion.py new file mode 100644 index 000000000000..dc213ef57ca0 --- /dev/null +++ b/ppdiffusers/scripts/inference/image_to_image_text_guided_generation-stable_diffusion.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import paddle +from PIL import Image +from io import BytesIO + +from ppdiffusers import StableDiffusionImg2ImgPipeline + +# 加载pipeline +pipe = StableDiffusionImg2ImgPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5") + +# 下载初始图片 +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((768, 512)) + +prompt = "A fantasy landscape, trending on artstation" +# 使用fp16加快生成速度 +with paddle.amp.auto_cast(True): + image = pipe(prompt=prompt, + init_image=init_image, + strength=0.75, + guidance_scale=7.5).images[0] + +image.save("fantasy_landscape.png") diff --git a/ppdiffusers/scripts/inference/super_resolution-latent_diffusion.py b/ppdiffusers/scripts/inference/super_resolution-latent_diffusion.py new file mode 100644 index 000000000000..d07ef2813e31 --- /dev/null +++ b/ppdiffusers/scripts/inference/super_resolution-latent_diffusion.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import paddle +from PIL import Image +from io import BytesIO + +from ppdiffusers import LDMSuperResolutionPipeline + +# 加载pipeline +pipe = LDMSuperResolutionPipeline.from_pretrained( + 'CompVis/ldm-super-resolution-4x-openimages') + +# 下载初始图片 +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((128, 128)) +init_image.save("original-image.png") + +# 使用fp16加快生成速度 +with paddle.amp.auto_cast(True): + image = pipe(init_image, num_inference_steps=100, eta=1).images[0] + +image.save("super-resolution-image.png") diff --git a/ppdiffusers/scripts/inference/text_guided_image_inpainting-stable_diffusion.py b/ppdiffusers/scripts/inference/text_guided_image_inpainting-stable_diffusion.py new file mode 100644 index 000000000000..8030c4a86dec --- /dev/null +++ b/ppdiffusers/scripts/inference/text_guided_image_inpainting-stable_diffusion.py @@ -0,0 +1,45 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from io import BytesIO + +import requests +import PIL + +from ppdiffusers import StableDiffusionInpaintPipeline + + +def download_image(url): + response = requests.get(url) + return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + +init_image = download_image(img_url).resize((512, 512)) +mask_image = download_image(mask_url).resize((512, 512)) + +pipe = StableDiffusionInpaintPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5") + +prompt = "a cat sitting on a bench" +with paddle.amp.auto_cast(True): + image = pipe(prompt=prompt, + init_image=init_image, + mask_image=mask_image, + strength=0.75).images[0] + +image.save("cat_on_bench.png") diff --git a/ppdiffusers/scripts/inference/text_to_image_generation-latent_diffusion.py b/ppdiffusers/scripts/inference/text_to_image_generation-latent_diffusion.py new file mode 100644 index 000000000000..8bc8585bff26 --- /dev/null +++ b/ppdiffusers/scripts/inference/text_to_image_generation-latent_diffusion.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppdiffusers import LDMTextToImagePipeline + +# 加载模型和scheduler +pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") + +# 执行pipeline进行推理 +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] + +# 保存图片 +image.save("astronaut_rides_horse.png") diff --git a/ppdiffusers/scripts/inference/text_to_image_generation-stable_diffusion.py b/ppdiffusers/scripts/inference/text_to_image_generation-stable_diffusion.py new file mode 100644 index 000000000000..812d8c982432 --- /dev/null +++ b/ppdiffusers/scripts/inference/text_to_image_generation-stable_diffusion.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppdiffusers import StableDiffusionPipeline + +# 加载模型和scheduler +pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + +# 执行pipeline进行推理 +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] + +# 保存图片 +image.save("astronaut_rides_horse.png") diff --git a/ppdiffusers/scripts/inference/unconditional_image_generation-ddim.py b/ppdiffusers/scripts/inference/unconditional_image_generation-ddim.py new file mode 100644 index 000000000000..5e47b754ce6d --- /dev/null +++ b/ppdiffusers/scripts/inference/unconditional_image_generation-ddim.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
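+#
+# Example script: unconditional image generation with DDIM. It loads a
+# pretrained pipeline, runs 25 denoising steps and saves the sampled image.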
+ +from ppdiffusers import DDIMPipeline + +# 加载模型和scheduler +pipe = DDIMPipeline.from_pretrained("dboshardy/ddim-butterflies-128") + +# 执行pipeline进行推理 +image = pipe(num_inference_steps=25).images[0] + +# 保存图片 +image.save("ddim_generated_image.png") diff --git a/ppdiffusers/scripts/inference/unconditional_image_generation-ddpm.py b/ppdiffusers/scripts/inference/unconditional_image_generation-ddpm.py new file mode 100644 index 000000000000..88508e38ac35 --- /dev/null +++ b/ppdiffusers/scripts/inference/unconditional_image_generation-ddpm.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppdiffusers import DDPMPipeline + +# 加载模型和scheduler +pipe = DDPMPipeline.from_pretrained("google/ddpm-celebahq-256") + +# 执行pipeline进行推理 +image = pipe().images[0] + +# 保存图片 +image.save("ddpm_generated_image.png") diff --git a/ppdiffusers/scripts/inference/unconditional_image_generation-latent_diffusion_uncond.py b/ppdiffusers/scripts/inference/unconditional_image_generation-latent_diffusion_uncond.py new file mode 100644 index 000000000000..3d579a73b66b --- /dev/null +++ b/ppdiffusers/scripts/inference/unconditional_image_generation-latent_diffusion_uncond.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppdiffusers import DiffusionPipeline, LDMPipeline + +# 加载模型和scheduler +pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") + +# 执行pipeline进行推理 +image = pipe(num_inference_steps=200).images[0] + +# 保存图片 +image.save("ldm_generated_image.png") diff --git a/ppdiffusers/scripts/inference/unconditional_image_generation-pndm.py b/ppdiffusers/scripts/inference/unconditional_image_generation-pndm.py new file mode 100644 index 000000000000..68833313e903 --- /dev/null +++ b/ppdiffusers/scripts/inference/unconditional_image_generation-pndm.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ppdiffusers import PNDMPipeline
+
+# Load the model and scheduler
+pipe = PNDMPipeline.from_pretrained("google/ddpm-celebahq-256")
+
+# Run the pipeline for inference
+image = pipe(num_inference_steps=1000).images[0]
+
+# Save the image
+image.save("pndm_generated_image.png")
diff --git a/ppdiffusers/scripts/inference/unconditional_image_generation-score_sde_ve.py b/ppdiffusers/scripts/inference/unconditional_image_generation-score_sde_ve.py
new file mode 100644
index 000000000000..a4935c46e16f
--- /dev/null
+++ b/ppdiffusers/scripts/inference/unconditional_image_generation-score_sde_ve.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ppdiffusers import DiffusionPipeline, ScoreSdeVePipeline
+
+# Load the model and scheduler
+sde_ve = ScoreSdeVePipeline.from_pretrained("google/ncsnpp-ffhq-1024")
+
+# Run the pipeline for inference
+image = sde_ve().images[0]
+
+# Save the image
+image.save("sde_ve_generated_image.png")
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 40e53822b6c3..51d288671ff7 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -18,10 +18,17 @@
 import inspect
 from distutils.util import strtobool
 from collections.abc import Mapping
+import gc
 
 __all__ = ['get_vocab_list', 'stable_softmax', 'cross_entropy']
 
 
+class PaddleNLPModelTest(unittest.TestCase):
+
+    def tearDown(self):
+        gc.collect()
+
+
 def get_vocab_list(vocab_path):
     with open(vocab_path, "r", encoding="utf-8") as f:
         vocab_list = [
diff --git a/tests/transformers/bart/test_modeling.py b/tests/transformers/bart/test_modeling.py
index 44ef76a186bd..763d51096a61 100644
--- a/tests/transformers/bart/test_modeling.py
+++ b/tests/transformers/bart/test_modeling.py
@@ -18,6 +18,7 @@
 import unittest
 import numpy as np
 import random
+from parameterized import parameterized_class
 
 from tests.testing_utils import slow
 
@@ -90,7 +91,6 @@ def __init__(
         self.batch_size = batch_size
         self.seq_length = seq_length
         self.is_training = is_training
-        self.use_labels = use_labels
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
@@ -168,15 +168,18 @@ def create_and_check_decoder_model_past_large_inputs(
         decoder_attention_mask = paddle.zeros([input_ids.shape[0], 1, 1, 1],
                                               dtype=paddle.get_default_dtype())
 
-        encoder_output = encoder(input_ids, attention_mask)
+        encoder_output = encoder(input_ids,
+                                 attention_mask,
+                                 return_dict=self.parent.return_dict)
         origin_cache = decoder.decoder.gen_cache(encoder_output)
         outputs = decoder(decoder_input_ids,
                           decoder_attention_mask,
                           encoder_output,
                           attention_mask,
-                          cache=origin_cache)
+                          cache=origin_cache,
+                          return_dict=self.parent.return_dict)
 
-        output, cache = outputs
+        output, cache = outputs[:2]
 
         # create hypothetical multiple next token and extent to next_input_ids
         next_tokens = ids_tensor((self.batch_size, 3),
@@ -191,16 +194,19 @@ def create_and_check_decoder_model_past_large_inputs(
next_attention_mask = paddle.concat( [decoder_attention_mask, next_attn_mask], axis=-1) - output_from_no_past, _ = decoder(next_input_ids, - next_attention_mask, - encoder_output, - attention_mask, - cache=origin_cache) + output_from_no_past = decoder(next_input_ids, + next_attention_mask, + encoder_output, + attention_mask, + return_dict=self.parent.return_dict) + if self.parent.return_dict: + output_from_no_past = output_from_no_past[0] output_from_past, _ = decoder(next_tokens, next_attention_mask, encoder_output, attention_mask, - cache=cache) + cache=cache, + return_dict=self.parent.return_dict)[:2] # select random slice random_slice_idx = ids_tensor((1, ), @@ -222,8 +228,16 @@ def create_and_check_decoder_model_past_large_inputs( atol=1e-3)) +@parameterized_class(("return_dict", "use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class BartHeadTests(unittest.TestCase): vocab_size = 99 + use_labels = False + return_dict = False def _get_config_and_data(self): input_ids = paddle.to_tensor( @@ -266,27 +280,57 @@ def test_sequence_classification_forward(self): config, input_ids, batch_size = self._get_config_and_data() bart_model = BartModel(**config) num_labels = 2 + labels = _long_tensor([1] * batch_size) if self.use_labels else None model = BartForSequenceClassification(bart_model, num_labels=num_labels) - outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) + outputs = model(input_ids=input_ids, + decoder_input_ids=input_ids, + labels=labels, + return_dict=self.return_dict) expected_shape = [batch_size, num_labels] - self.assertEqual(outputs.shape, expected_shape) + if self.use_labels: + self.assertIsInstance(outputs[0].item(), float) # test loss + self.assertEqual(outputs[1].shape, expected_shape) # test logits + elif isinstance(outputs, paddle.Tensor): + self.assertEqual(outputs.shape, expected_shape) + else: + self.assertEqual(outputs[0].shape, expected_shape) def test_question_answering_forward(self): config, input_ids, batch_size = self._get_config_and_data() + sequence_labels = ids_tensor([batch_size], + 2) if self.use_labels else None bart_model = BartModel(**config) model = BartForQuestionAnswering(bart_model) - start_logits, end_logits = model(input_ids=input_ids) - + outputs = model(input_ids=input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + return_dict=self.return_dict) + + if self.use_labels: + loss, start_logits, end_logits = outputs[:3] + self.assertIsInstance(loss.item(), float) + else: + start_logits, end_logits = outputs[:2] self.assertEqual(start_logits.shape, input_ids.shape) self.assertEqual(end_logits.shape, input_ids.shape) def test_lm_forward(self): config, input_ids, batch_size = self._get_config_and_data() bart_model = BartModel(**config) + lm_labels = ids_tensor([batch_size, input_ids.shape[1]], + self.vocab_size) if self.use_labels else None lm_model = BartForConditionalGeneration(bart_model) - outputs = lm_model(input_ids=input_ids) + outputs = lm_model(input_ids=input_ids, + labels=lm_labels, + return_dict=self.return_dict) expected_shape = [batch_size, input_ids.shape[1], config["vocab_size"]] - self.assertEqual(outputs.shape, expected_shape) + if self.use_labels: + self.assertIsInstance(outputs[0].item(), float) + self.assertEqual(outputs[1].shape, expected_shape) + elif isinstance(outputs, paddle.Tensor): + self.assertEqual(outputs.shape, expected_shape) + else: + self.assertEqual(outputs[0].shape, expected_shape) def test_lm_uneven_forward(self): config = { @@ -307,10 
+351,18 @@ def test_lm_uneven_forward(self): dtype="int64") summary = paddle.to_tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype="int64") - outputs = lm_model(input_ids=context, decoder_input_ids=summary) + outputs = lm_model(input_ids=context, + decoder_input_ids=summary, + labels=summary if self.use_labels else None, + return_dict=self.return_dict) expected_shape = summary.shape expected_shape.append(config["vocab_size"]) - self.assertEqual(outputs.shape, expected_shape) + if self.use_labels: + self.assertIsInstance(outputs[0].item(), float) + elif isinstance(outputs, paddle.Tensor): + self.assertEqual(outputs.shape, expected_shape) + else: + self.assertEqual(outputs[0].shape, expected_shape) def test_generate_beam_search(self): input_ids = paddle.to_tensor([[71, 82, 2], [68, 34, 2]], dtype="int64") @@ -374,6 +426,7 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (BartModel, BartForConditionalGeneration, BartForSequenceClassification, BartForQuestionAnswering) + all_generative_model_classes = { BartForConditionalGeneration: (BartModel, "bart") } @@ -381,6 +434,8 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): fx_compatible = True test_pruning = False test_missing_keys = False + use_labels = False + return_dict = False def setUp(self): self.model_tester = BartModelTester(self) @@ -403,7 +458,7 @@ def assert_tensors_close(a, b, atol=1e-12, prefix=""): return True raise except Exception: - pct_different = (paddle.gt((a - b).abs(), atol)).float().mean().item() + pct_different = ((a - b).abs() > atol).astype("float").mean().item() if a.numel() > 100: msg = f"tensor values are {pct_different:.1%} percent different." else: @@ -480,11 +535,10 @@ def test_bart_base_generation(self): decode_strategy="beam_search", max_length=1024) result = tok.batch_decode(generated_ids, skip_special_tokens=True)[0] - assert EXPECTED == result + assert EXPECTED == result, f"{EXPECTED}\n{result}" def test_xsum_1_1_batch_generation(self): # test batch - batch = self.tok()( [ "The Palestinian Authority officially became the 123rd member of the International Criminal Court on" @@ -639,7 +693,7 @@ def test_inference_no_head(self): with paddle.no_grad(): output = model(input_ids=input_ids, attention_mask=attention_mask) expected_shape = [1, 11, 1024] - self.assertEqual(output.shape, expected_shape) + self.assertEqual(output[0].shape, expected_shape) @slow def test_cnn_summarization_same_as_fairseq(self): diff --git a/tests/transformers/mbart/test_modeling.py b/tests/transformers/mbart/test_modeling.py index f914099532b8..e24c8fecd158 100644 --- a/tests/transformers/mbart/test_modeling.py +++ b/tests/transformers/mbart/test_modeling.py @@ -13,14 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import copy import tempfile -import unittest -from tests.testing_utils import slow +from tests.testing_utils import slow, PaddleNLPModelTest from ..test_generation_utils import GenerationTesterMixin from ..test_modeling_common import ModelTesterMixin, ids_tensor +from parameterized import parameterized_class import paddle @@ -85,7 +84,6 @@ def __init__( self.batch_size = batch_size self.seq_length = seq_length self.is_training = is_training - self.use_labels = use_labels self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -170,9 +168,10 @@ def create_and_check_decoder_model_past_large_inputs( # first forward pass outputs = model(input_ids, decoder_attention_mask=attention_mask, - cache=cache) + cache=cache, + return_dict=self.parent.return_dict) - output, past_key_values = outputs + output, past_key_values = outputs[:2] # create hypothetical multiple next token and extent to next_input_ids next_tokens = ids_tensor((self.batch_size, 3), @@ -189,7 +188,10 @@ def create_and_check_decoder_model_past_large_inputs( output_from_no_past = model(next_input_ids, decoder_attention_mask=next_attention_mask, - cache=None) + cache=None, + return_dict=self.parent.return_dict) + if self.parent.return_dict: + output_from_no_past = output_from_no_past[0] output_from_past = model(next_tokens, decoder_attention_mask=next_attention_mask, cache=past_key_values)[0] @@ -214,8 +216,12 @@ def create_and_check_decoder_model_past_large_inputs( atol=1e-3)) +@parameterized_class(("return_dict", ), [ + [False], + [True], +]) class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, - unittest.TestCase): + PaddleNLPModelTest): base_model_class = MBartModel all_model_classes = (MBartModel, MBartForConditionalGeneration, @@ -227,6 +233,7 @@ class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, } is_encoder_decoder = True test_missing_keys = False + return_dict = False def setUp(self): self.model_tester = MBartModelTester(self) @@ -270,7 +277,7 @@ def _long_tensor(tok_lst): return paddle.to_tensor(tok_lst, dtype="int64") -class AbstractSeq2SeqIntegrationTest(unittest.TestCase): +class AbstractSeq2SeqIntegrationTest(PaddleNLPModelTest): maxDiff = 1000 # longer string compare tracebacks checkpoint_name = None @@ -287,6 +294,10 @@ def model(self): return model +@parameterized_class(("return_dict", ), [ + [False], + [True], +]) class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): checkpoint_name = "mbart-large-en-ro" src_text = [ @@ -294,15 +305,14 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", ] tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei" - ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor' - " face decât să înrăutăţească violenţa şi mizeria pentru milioane de oameni.", + 'Şeful ONU declară că nu există o soluţie militară în Siria', + 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar acordat de Rusia Siriei este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească 
violenţele şi mizeria a milioane de oameni.', ] expected_src_tokens = [ 8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, 250004 ] + return_dict = False @slow def test_enro_generate_one(self): @@ -324,10 +334,14 @@ def test_enro_generate_batch(self): truncation=True, return_token_type_ids=False) model = self.model() - translated_tokens = model.generate(**batch, max_length=128)[0] + translated_tokens = model.generate(**batch, + max_length=128, + decode_strategy="greedy_search")[0] decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - assert self.tgt_text == decoded + + for i in range(len(self.tgt_text)): + assert str(self.tgt_text[i]) == str(decoded[i]), f"{i}" def test_mbart_fast_forward(self): config = { @@ -348,8 +362,12 @@ def test_mbart_fast_forward(self): dtype="int64") summary = paddle.to_tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype="int64") - logits = lm_model(input_ids=context, decoder_input_ids=summary) + loss, logits = lm_model(input_ids=context, + decoder_input_ids=summary, + labels=summary, + return_dict=self.return_dict)[:2] expected_shape = [*summary.shape, config["vocab_size"]] + self.assertIsInstance(loss.item(), float) self.assertEqual(logits.shape, expected_shape) @@ -368,7 +386,8 @@ class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): def test_fill_mask(self): inputs = self.tokenizer(["One of the best I ever read!"], return_tensors="pd") - outputs = self.model.generate( + model = self.model() + outputs = model.generate( inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"])[0] prediction = self.tokenizer.batch_decode( @@ -411,7 +430,6 @@ def __init__( self.seq_length = self.decoder_seq_length self.is_training = is_training self.use_attention_mask = use_attention_mask - self.use_labels = use_labels self.vocab_size = vocab_size self.d_model = d_model @@ -449,7 +467,7 @@ def prepare_config_and_inputs(self): dtype="int64") lm_labels = None - if self.use_labels: + if self.parent.use_labels: lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size, dtype="int64") @@ -491,12 +509,20 @@ def create_and_check_decoder_model_past( origin_cache = model.decoder.gen_cache(encoder_output) # first forward pass - outputs = model(input_ids, cache=origin_cache) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, cache=None) - - self.parent.assertTrue(len(outputs[0]) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs[0]) == len(outputs_no_past)) + outputs = model(input_ids, + cache=origin_cache, + return_dict=self.parent.return_dict) + outputs_use_cache_conf = model(input_ids, + return_dict=self.parent.return_dict) + outputs_no_past = model(input_ids, + cache=None, + return_dict=self.parent.return_dict) + + # self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) # didn't support using cache by config yet + if not self.parent.return_dict: + self.parent.assertTrue(len(outputs) == len((outputs_no_past, )) + 1) + else: + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) past_key_values = outputs[1] @@ -507,9 +533,13 @@ def create_and_check_decoder_model_past( # append to next input_ids and next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - - output_from_no_past = model(next_input_ids) - output_from_past = model(next_tokens, cache=past_key_values)[0] + output_from_no_past = model(next_input_ids, + return_dict=self.parent.return_dict) + if self.parent.return_dict: + 
output_from_no_past = output_from_no_past[0] + output_from_past = model(next_tokens, + cache=past_key_values, + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor((1, ), @@ -545,10 +575,18 @@ def create_and_check_decoder_model_attention_mask_past( encoder_output = paddle.randn(shape=input_ids.shape + [self.d_model]) origin_cache = model.decoder.gen_cache(encoder_output) + cache = model.decoder.gen_cache( + paddle.randn(shape=[ + input_ids.shape[0], input_ids.shape[1], config["d_model"] + ])) # first forward pass - past_key_values = model(input_ids, - decoder_attention_mask=attn_mask, - cache=origin_cache)[1] + + past_key_values = model( + input_ids, + # attention_mask=attn_mask, + decoder_attention_mask=attn_mask, + cache=origin_cache, + return_dict=self.parent.return_dict)[1] # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor((self.batch_size, 1), @@ -572,13 +610,16 @@ def create_and_check_decoder_model_attention_mask_past( ], axis=-1, ) - # get two different outputs output_from_no_past = model(next_input_ids, - decoder_attention_mask=attn_mask) + decoder_attention_mask=attn_mask, + return_dict=self.parent.return_dict) + if self.parent.return_dict: + output_from_no_past = output_from_no_past[0] output_from_past = model(next_tokens, decoder_attention_mask=attn_mask, - cache=past_key_values)[0] + cache=past_key_values, + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor((1, ), @@ -610,14 +651,21 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict +@parameterized_class(("return_dict", "use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, - unittest.TestCase): + PaddleNLPModelTest): base_model_class = MBartModel all_model_classes = () - + use_test_model_name_list = False all_generative_model_classes = {} is_encoder_decoder = False + use_labels = False def setUp(self): self.model_tester = MBartStandaloneDecoderModelTester(self, diff --git a/tests/transformers/t5/test_modeling.py b/tests/transformers/t5/test_modeling.py index 32959ce49b21..cac88f8a0e56 100644 --- a/tests/transformers/t5/test_modeling.py +++ b/tests/transformers/t5/test_modeling.py @@ -537,6 +537,7 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = True test_model_parallel = True + use_test_inputs_embeds = True is_encoder_decoder = True # The small T5 model needs higher percentages for CPU/MP tests model_split_percents = [0.8, 0.9] diff --git a/tests/transformers/test_generation_utils.py b/tests/transformers/test_generation_utils.py index b6b6f7baaeb1..2f76d2ed681f 100644 --- a/tests/transformers/test_generation_utils.py +++ b/tests/transformers/test_generation_utils.py @@ -433,6 +433,7 @@ def test_greedy_generate(self): ) pretrained_model = self.all_generative_model_classes[model_class][ 0](**config) + paddle.seed(128) model = model_class(pretrained_model) model.eval() @@ -446,15 +447,13 @@ def test_greedy_generate(self): output_generate[0].tolist()) def test_sample_generate(self): - random.seed(128) - np.random.seed(128) - paddle.seed(128) for model_class in self.all_generative_model_classes.keys(): config, input_ids, attention_mask, max_length = self._get_input_ids_and_config( ) pretrained_model = self.all_generative_model_classes[model_class][ 0](**config) + paddle.seed(128) model = 
model_class(pretrained_model) model.eval() diff --git a/tests/transformers/test_modeling.py b/tests/transformers/test_modeling.py new file mode 100644 index 000000000000..faf8d96ceb4a --- /dev/null +++ b/tests/transformers/test_modeling.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from paddlenlp.transformers import TinyBertModel + + +class TestModeling(unittest.TestCase): + """Test PretrainedModel single time, not in Transformer models""" + + def test_from_pretrained_with_load_as_state_np_params(self): + """init model with `load_state_as_np` params""" + TinyBertModel.from_pretrained("tinybert-4l-312d", load_state_as_np=True) diff --git a/tests/transformers/test_modeling_common.py b/tests/transformers/test_modeling_common.py index 8a79d96bfc4b..bbd239de3814 100644 --- a/tests/transformers/test_modeling_common.py +++ b/tests/transformers/test_modeling_common.py @@ -67,6 +67,7 @@ class ModelTesterMixin: test_resize_position_embeddings = False test_mismatched_shapes = True test_missing_keys = True + use_test_inputs_embeds = False use_test_model_name_list = True is_encoder_decoder = False has_attentions = True @@ -508,6 +509,48 @@ def test_resize_tokens_embeddings(self): self.assertTrue(models_equal) + def test_inputs_embeds(self): + # pass the test if don't need to test inputs embeddings + if not self.use_test_inputs_embeds: + return + # get config for model and inputs_dict for model forward + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + # test all model classes + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + model.eval() + + inputs = copy.deepcopy( + self._prepare_for_class(inputs_dict, model_class)) + + with paddle.no_grad(): + ids_output = model(**inputs) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", + encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with paddle.no_grad(): + embeds_output = model(**inputs) + + self.assertTrue( + paddle.allclose(ids_output, embeds_output, rtol=1e-4, + atol=1e-4)) + def test_model_name_list(self): if not self.use_test_model_name_list: return diff --git a/tests/transformers/unified_transformer/test_modeling.py b/tests/transformers/unified_transformer/test_modeling.py index 21fa477bfc2e..dae6ef41fa92 100644 --- a/tests/transformers/unified_transformer/test_modeling.py +++ b/tests/transformers/unified_transformer/test_modeling.py @@ -19,6 +19,7 @@ import random from tests.testing_utils import slow +from parameterized import parameterized_class 
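+# parameterized_class (applied below) re-runs the whole test class once per
+# (return_dict, use_labels) combination.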
from ..test_generation_utils import GenerationTesterMixin from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -150,9 +151,15 @@ def prepare_config_and_inputs(self): paddle.arange(end=self.seq_length, dtype="int64").reshape([1, -1]), [self.batch_size, 1]) + lm_labels = None + if self.parent.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) + config = self.get_config() - return (config, input_ids, input_mask, token_type_ids, position_ids) + return (config, input_ids, input_mask, token_type_ids, position_ids, + lm_labels) def get_config(self): return { @@ -177,9 +184,10 @@ def get_config(self): } def prepare_config_and_inputs_for_decoder(self): - (config, input_ids, input_mask, token_type_ids, - position_ids) = self.prepare_config_and_inputs() - return (config, input_ids, input_mask, token_type_ids, position_ids) + (config, input_ids, input_mask, token_type_ids, position_ids, + lm_labels) = self.prepare_config_and_inputs() + return (config, input_ids, input_mask, token_type_ids, position_ids, + lm_labels) def create_and_check_unified_transformer_model(self, config, input_ids, input_mask, token_type_ids, @@ -191,7 +199,8 @@ def create_and_check_unified_transformer_model(self, config, input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=input_mask, - use_cache=True) + use_cache=True, + return_dict=self.parent.return_dict)[:2] self.parent.assertEqual( result.shape, [self.batch_size, self.seq_length, self.hidden_size]) @@ -209,23 +218,24 @@ def create_and_check_unified_transformer_model_past(self, config, input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=input_mask, - use_cache=True) - outputs_use_cache_conf = model( - input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - ) + use_cache=True, + return_dict=self.parent.return_dict) + outputs_use_cache_conf = model(input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + return_dict=self.parent.return_dict) outputs_no_past = model(input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=input_mask, - use_cache=False) + use_cache=False, + return_dict=self.parent.return_dict) self.parent.assertTrue( len(outputs_no_past) == len(outputs_use_cache_conf)) - output, past = outputs + output, past = outputs[:2] # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor((self.batch_size, 1), @@ -252,18 +262,21 @@ def create_and_check_unified_transformer_model_past(self, config, input_ids, value=0)(next_attention_mask) next_attention_mask[:, :, -1, -1] = 1 - output_from_no_past, cache = model(next_input_ids, - token_type_ids=next_token_type_ids, - position_ids=next_position_ids, - attention_mask=next_attention_mask, - use_cache=True) + output_from_no_past, cache = model( + next_input_ids, + token_type_ids=next_token_type_ids, + position_ids=next_position_ids, + attention_mask=next_attention_mask, + use_cache=True, + return_dict=self.parent.return_dict)[:2] output_from_past = model(next_tokens, token_type_ids=next_token_types, position_ids=next_position, attention_mask=next_attention_mask[:, :, -1:, :], use_cache=True, - cache=past)[0] + cache=past, + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor((1, ), @@ -292,7 +305,8 @@ def create_and_check_unified_transformer_model_past_large_inputs( 
token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=input_mask, - use_cache=True) + use_cache=True, + return_dict=self.parent.return_dict)[:2] # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor((self.batch_size, 3), @@ -324,21 +338,22 @@ next_attention_mask[:, :, -3, -1] = 1 next_attention_mask[:, :, -3, -2] = 1 - output_from_no_past = model( - next_input_ids, - token_type_ids=next_token_type_ids, - attention_mask=next_attention_mask, - position_ids=next_position_ids, - use_cache=False, - ) - output_from_past = model( - next_tokens, - token_type_ids=next_token_types, - attention_mask=next_attention_mask[:, :, -3:, :], - position_ids=next_position, - cache=past, - use_cache=True, - )[0] + output_from_no_past = model(next_input_ids, + token_type_ids=next_token_type_ids, + attention_mask=next_attention_mask, + position_ids=next_position_ids, + use_cache=False, + return_dict=self.parent.return_dict) + if self.parent.return_dict: + output_from_no_past = output_from_no_past[0] + output_from_past = model(next_tokens, + token_type_ids=next_token_types, + attention_mask=next_attention_mask[:, :, + -3:, :], + position_ids=next_position, + cache=past, + use_cache=True, + return_dict=self.parent.return_dict)[0] self.parent.assertTrue( output_from_past.shape[1] == next_tokens.shape[1]) @@ -359,37 +374,48 @@ def create_and_check_unified_transformer_model_past_large_inputs( atol=1e-3)) def create_and_check_lm_head_model(self, config, input_ids, input_mask, - token_type_ids, position_ids, *args): + token_type_ids, position_ids, lm_labels, + *args): base_model = UnifiedTransformerModel(**config) model = UnifiedTransformerLMHeadModel(base_model) model.eval() - result = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask) + outputs = model(input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + labels=lm_labels, + return_dict=self.parent.return_dict) + if self.parent.use_labels: + loss, result = outputs[:2] + self.parent.assertIsInstance(loss.item(), float) + else: + result = outputs[0] if self.parent.return_dict else outputs self.parent.assertEqual( result.shape, [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_forward_and_backwards(self, config, input_ids, input_mask, token_type_ids, - position_ids, *args): + position_ids, lm_labels, *args): base_model = UnifiedTransformerModel(**config) model = UnifiedTransformerLMHeadModel(base_model) - model.eval() - logits = model(input_ids, - token_type_ids=token_type_ids, - attention_mask=input_mask, - position_ids=position_ids) + loss, logits = model(input_ids, + token_type_ids=token_type_ids, + attention_mask=input_mask, + position_ids=position_ids, + labels=input_ids, + return_dict=self.parent.return_dict)[:2] + self.parent.assertIsInstance(loss.item(), float) self.parent.assertEqual( logits.shape, [self.batch_size, self.seq_length, self.vocab_size]) + loss.backward() def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, token_type_ids, - position_ids) = config_and_inputs + (config, input_ids, input_mask, token_type_ids, position_ids, + lm_labels) = config_and_inputs inputs_dict = { "input_ids": input_ids, @@ -401,6 +427,12 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict +@parameterized_class(("return_dict",
"use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class UnifiedTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = UnifiedTransformerModel @@ -412,15 +444,19 @@ class UnifiedTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, } test_missing_keys = False + use_labels = False + return_dict = False + # special case for DoubleHeads model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class) return inputs_dict def setUp(self): - random.seed(128) - np.random.seed(128) - paddle.seed(128) + seed = 1028 + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) self.model_tester = UnifiedTransformerModelTester(self)