From 1fc23a88887aab85b563bb3a9a8e844bc5dca497 Mon Sep 17 00:00:00 2001 From: ymyjl <113601649+ymyjl@users.noreply.github.com> Date: Thu, 3 Nov 2022 10:42:47 +0800 Subject: [PATCH] [Tutorial] Add torch migration tutorial (#3641) Co-authored-by: Zhong Hui --- examples/torch_migration/README.md | 62 ++ .../docs/ThesisReproduction_NLP.md | 928 ++++++++++++++++++ .../torch_migration/pipeline/Step1/README.md | 86 ++ .../pipeline/Step1/check_step1.py | 23 + .../pipeline/Step1/pd_forward_bert.py | 50 + .../pipeline/Step1/pt_forward_bert.py | 48 + .../pipeline/Step1/torch2paddle.py | 114 +++ .../torch_migration/pipeline/Step2/README.md | 131 +++ .../pipeline/Step2/accuracy.py | 96 ++ .../pipeline/Step2/check_step2.py | 24 + .../Step2/demo_sst2_sentence/demo.tsv | 33 + .../torch_migration/pipeline/Step2/predict.py | 94 ++ .../pipeline/Step2/test_data.py | 145 +++ .../pipeline/Step2/test_metric.py | 50 + .../torch_migration/pipeline/Step3/README.md | 67 ++ .../pipeline/Step3/check_step3.py | 24 + .../pipeline/Step3/paddle_loss.py | 59 ++ .../pipeline/Step3/torch_loss.py | 58 ++ .../torch_migration/pipeline/Step4/README.md | 136 +++ .../pipeline/Step4/check_step4.py | 23 + .../torch_migration/pipeline/Step4/test_bp.py | 141 +++ .../pipeline/Step4/test_lr_scheduler.py | 102 ++ .../torch_migration/pipeline/Step5/README.md | 29 + .../pipeline/Step5/bert_paddle/train.py | 342 +++++++ .../pipeline/Step5/bert_paddle/train.sh | 20 + .../pipeline/Step5/bert_paddle/utils.py | 211 ++++ .../pipeline/Step5/bert_torch/accuracy.py | 96 ++ .../pipeline/Step5/bert_torch/glue.py | 633 ++++++++++++ .../pipeline/Step5/bert_torch/train.py | 373 +++++++ .../pipeline/Step5/bert_torch/train.sh | 19 + .../pipeline/Step5/bert_torch/utils.py | 200 ++++ .../pipeline/Step5/check_step5.py | 24 + .../generate_classifier_weights.py | 37 + .../pipeline/fake_data/gen_fake_data.py | 26 + .../pipeline/models/pd_bert.py | 454 +++++++++ .../pipeline/models/pt_bert.py | 456 +++++++++ .../reprod_log_demo/check_log_diff.py | 28 + .../pipeline/reprod_log_demo/write_log.py | 31 + .../pipeline/weights/torch2paddle.py | 116 +++ .../pipeline/weights/torch_bert_weight.py | 21 + examples/torch_migration/requirements.txt | 5 + 41 files changed, 5615 insertions(+) create mode 100644 examples/torch_migration/README.md create mode 100644 examples/torch_migration/docs/ThesisReproduction_NLP.md create mode 100644 examples/torch_migration/pipeline/Step1/README.md create mode 100644 examples/torch_migration/pipeline/Step1/check_step1.py create mode 100644 examples/torch_migration/pipeline/Step1/pd_forward_bert.py create mode 100644 examples/torch_migration/pipeline/Step1/pt_forward_bert.py create mode 100644 examples/torch_migration/pipeline/Step1/torch2paddle.py create mode 100644 examples/torch_migration/pipeline/Step2/README.md create mode 100644 examples/torch_migration/pipeline/Step2/accuracy.py create mode 100644 examples/torch_migration/pipeline/Step2/check_step2.py create mode 100644 examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv create mode 100644 examples/torch_migration/pipeline/Step2/predict.py create mode 100644 examples/torch_migration/pipeline/Step2/test_data.py create mode 100644 examples/torch_migration/pipeline/Step2/test_metric.py create mode 100644 examples/torch_migration/pipeline/Step3/README.md create mode 100644 examples/torch_migration/pipeline/Step3/check_step3.py create mode 100644 examples/torch_migration/pipeline/Step3/paddle_loss.py create mode 100644 examples/torch_migration/pipeline/Step3/torch_loss.py 
create mode 100644 examples/torch_migration/pipeline/Step4/README.md create mode 100644 examples/torch_migration/pipeline/Step4/check_step4.py create mode 100644 examples/torch_migration/pipeline/Step4/test_bp.py create mode 100644 examples/torch_migration/pipeline/Step4/test_lr_scheduler.py create mode 100644 examples/torch_migration/pipeline/Step5/README.md create mode 100644 examples/torch_migration/pipeline/Step5/bert_paddle/train.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_paddle/train.sh create mode 100644 examples/torch_migration/pipeline/Step5/bert_paddle/utils.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/glue.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/train.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/train.sh create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/utils.py create mode 100644 examples/torch_migration/pipeline/Step5/check_step5.py create mode 100644 examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py create mode 100644 examples/torch_migration/pipeline/fake_data/gen_fake_data.py create mode 100644 examples/torch_migration/pipeline/models/pd_bert.py create mode 100644 examples/torch_migration/pipeline/models/pt_bert.py create mode 100644 examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py create mode 100644 examples/torch_migration/pipeline/reprod_log_demo/write_log.py create mode 100644 examples/torch_migration/pipeline/weights/torch2paddle.py create mode 100644 examples/torch_migration/pipeline/weights/torch_bert_weight.py create mode 100644 examples/torch_migration/requirements.txt diff --git a/examples/torch_migration/README.md b/examples/torch_migration/README.md new file mode 100644 index 000000000000..603f040ed114 --- /dev/null +++ b/examples/torch_migration/README.md @@ -0,0 +1,62 @@ +# BERT-SST2-Prod +Reproduction process of BERT on SST2 dataset + +# 安装说明 + +* 下载代码库 + +```shell +git clone https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/torch_migration +``` + +* 进入文件夹,安装requirements + +```shell +pip install -r requirements.txt +``` + +* 安装PaddlePaddle与PyTorch + +```shell +# CPU版本的PaddlePaddle +pip install paddlepaddle==2.2.0 -i https://mirror.baidu.com/pypi/simple +# 如果希望安装GPU版本的PaddlePaddle,可以使用下面的命令 +# pip install paddlepaddle-gpu==2.2.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +# 安装PyTorch +pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html +``` + +**注意**: 本项目依赖于paddlepaddle-2.2.0版本,安装时需要注意。 + +* 验证PaddlePaddle是否安装成功 + +运行python,输入下面的命令。 + +```shell +import paddle +paddle.utils.run_check() +print(paddle.__version__) +``` + +如果输出下面的内容,则说明PaddlePaddle安装成功。 + +``` +PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now. 
+2.2.0 +``` + + +* 验证PyTorch是否安装成功 + +运行python,输入下面的命令,如果可以正常输出,则说明torch安装成功。 + +```shell +import torch +print(torch.__version__) +# 如果安装的是cpu版本,可以按照下面的命令确认torch是否安装成功 +# 期望输出为 tensor([1.]) +print(torch.Tensor([1.0])) +# 如果安装的是gpu版本,可以按照下面的命令确认torch是否安装成功 +# 期望输出为 tensor([1.], device='cuda:0') +print(torch.Tensor([1.0]).cuda()) +``` diff --git a/examples/torch_migration/docs/ThesisReproduction_NLP.md b/examples/torch_migration/docs/ThesisReproduction_NLP.md new file mode 100644 index 000000000000..eee175d34a28 --- /dev/null +++ b/examples/torch_migration/docs/ThesisReproduction_NLP.md @@ -0,0 +1,928 @@ +# 论文复现指南 + +## 目录 + +- [1. 总览](#1) + - [1.1 背景](#1.1) + - [1.2 前序工作](#1.2) +- [2. 整体框图](#2) + - [2.1 流程概览](#2.1) + - [2.2 reprod_log whl包](#2.2) +- [3. 论文复现理论知识及实战](#3) + - [3.1 模型结构对齐](#3.1) + - [3.2 验证/测试集数据读取对齐](#3.2) + - [3.3 评估指标对齐](#3.3) + - [3.4 损失函数对齐](#3.4) + - [3.5 优化器对齐](#3.5) + - [3.6 学习率对齐](#3.6) + - [3.7 正则化策略对齐](#3.7) + - [3.8 反向对齐](#3.8) + - [3.9 训练集数据读取对齐](#3.9) + - [3.10 网络初始化对齐](#3.10) + - [3.11 模型训练对齐](#3.11) + - [3.12 单机多卡训练](#3.12) +- [4. 论文复现注意事项与FAQ](#4) + - [4.0 通用注意事项](#4.0) + - [4.1 模型结构对齐](#4.1) + - [4.2 验证/测试集数据读取对齐](#4.2) + - [4.3 评估指标对齐](#4.3) + - [4.4 损失函数对齐](#4.4) + - [4.5 优化器对齐](#4.5) + - [4.6 学习率对齐](#4.6) + - [4.7 正则化策略对齐](#4.7) + - [4.8 反向对齐](#4.8) + - [4.9 训练集数据读取对齐](#4.9) + - [4.10 网络初始化对齐](#4.10) + - [4.11 模型训练对齐](#4.11) + + +## 1. 总览 + + +### 1.1 背景 + +* 以深度学习为核心的人工智能技术仍在高速发展,通过论文复现,开发者可以获得 + * 学习成长:自我能力提升 + * 技术积累:对科研或工作有所帮助和启发 + * 社区荣誉:成果被开发者广泛使用 + + +### 1.2 前序工作 + +基于本指南复现论文过程中,建议开发者准备以下内容。 + +* 了解该模型输入输出格式。以BERT的情感分类任务为例,通过阅读论文与参考代码,了解到模型输入为`[batch_size, sequence_length]`的tensor,类型为`int64`,label为`[batch, ]`的label,类型为`int64`。 +* 准备好训练/验证数据集,用于模型训练与评估 +* 准备好fake input data以及label,与模型输入shape、type等保持一致,用于后续模型前向对齐。 + * 在对齐模型前向过程中,我们不需要考虑数据集模块等其他模块,此时使用fake data是将模型结构和数据部分解耦非常合适的一种方式。 + * 将fake data以文件的形式存储下来,也可以保证PaddlePaddle与参考代码的模型结构输入是完全一致的,更便于排查问题。 + * 在该步骤中,以BERT为例,生成fake data的脚本可以参考:[gen_fake_data.py](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/fake_data/gen_fake_data.py)。 +* 在特定设备(CPU/GPU)上,跑通参考代码的预测过程(前向)以及至少2轮(iteration)迭代过程,保证后续基于PaddlePaddle复现论文过程中可对比。 +* 本文档基于 `BERT-SST2-Prod` 代码以及`reprod_log` whl包进行说明与测试。如果希望体验,建议参考[BERT-SST2-Prod文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/README.md)进行安装与测试。 +* 在复现的过程中,只需要将PaddlePaddle的复现代码以及打卡日志上传至github,不能在其中添加参考代码的实现,在验收通过之后,需要删除打卡日志。建议在初期复现的时候,就将复现代码与参考代码分成2个文件夹进行管理。 + + +## 2. 
整体框图 + + +### 2.1 流程概览 + +面对一篇自然语言处理的论文,复现该论文的整体流程如下图所示。 + +![图片](https://user-images.githubusercontent.com/16911935/199389647-b000a7b1-28d1-485e-8ec0-3e7e2c05884a.png) + +总共包含11个步骤。为了高效复现论文,设置了5个验收节点。如上图中黄色框所示。后续章节会详细介绍上述步骤和验收节点,具体内容安排如下: + +* 第3章:介绍11个复现步骤的理论知识、实战以及验收流程。 +* 第4章:针对复现流程过程中每个步骤可能出现的问题,本章会进行详细介绍。如果还是不能解决问题,可以提ISSUE进行讨论,提ISSUE地址:[https://github.com/PaddlePaddle/Paddle/issues/new/choose](https://github.com/PaddlePaddle/Paddle/issues/new/choose) + + +### 2.2 reprod_log whl包 + +#### 2.2.1 reprod_log工具简介 +`reprod_log`是用于论文复现赛中辅助自查和验收工具。该工具源代码地址在:[https://github.com/WenmuZhou/reprod_log](https://github.com/WenmuZhou/reprod_log)。主要功能如下: + +* 存取指定节点的输入输出tensor +* 基于文件的tensor读写 +* 2个字典的对比验证 +* 对比结果的输出与记录 + +更多API与使用方法可以参考:[reprod_log API使用说明](https://github.com/WenmuZhou/reprod_log/blob/master/README.md)。 + +#### 2.2.2 reprod_log使用demo + +下面基于代码:[https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/reprod_log_demo](https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/reprod_log_demo),给出如何使用该工具。 + +文件夹中包含`write_log.py`和`check_log_diff.py`文件,其中`write_log.py`中给出了`ReprodLogger`类的使用方法,`check_log_diff.py`给出了`ReprodDiffHelper`类的使用方法,依次运行两个python文件,使用下面的方式运行代码。 + +```shell +# 进入文件夹 +cd pipeline/reprod_log_demo +# 随机生成矩阵,写入文件中 +python write_log.py +# 进行文件对比,输出日志 +python check_log_diff.py +``` + +最终会输出以下内容 + +``` +[2021/11/18 09:29:31] root INFO: demo_test_1: +[2021/11/18 09:29:31] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/18 09:29:31] root INFO: demo_test_2: +[2021/11/18 09:29:31] root INFO: mean diff: check passed: False, value: 0.33387675881385803 +[2021/11/18 09:29:31] root INFO: diff check failed +``` + +可以看出:对于key为`demo_test_1`的矩阵,由于diff为0,小于设置的阈值`1e-6`,核验成功;对于key为`demo_test_2`的矩阵,由于diff为0.33,大于设置的阈值`1e-6`,核验失败。 + +#### 2.2.3 reprod_log在论文复现中应用 + +在论文复现中,基于reprod_log的结果记录模块,产出下面若干文件 +``` +log_reprod +├── forward_paddle.npy +├── forward_torch.npy # 与forward_paddle.npy作为一并核查的文件对 +├── metric_paddle.npy +├── metric_torch.npy # 与metric_paddle.npy作为一并核查的文件对 +├── loss_paddle.npy +├── loss_torch.npy # 与loss_paddle.npy作为一并核查的文件对 +├── bp_align_paddle.npy +├── bp_align_torch.npy # 与bp_align_paddle.npy作为一并核查的文件对 +├── train_align_paddle.npy +├── train_align_torch.npy # pytorch运行得到的参考评估指标 +``` + +基于reprod_log的`ReprodDiffHelper`模块,产出下面5个日志文件。 + +``` +├── forward_diff.log # forward_paddle.npy与forward_torch.npy生成的diff结果文件 +├── metric_diff.log # metric_paddle.npy与metric_torch.npy生成的diff结果文件 +├── loss_diff.log # loss_paddle.npy与loss_torch.npy生成的diff结果文件 +├── bp_align_diff.log # bp_align_paddle.npy与bp_align_torch.npy生成的diff结果文件 +├── train_align_diff.log # train_align_paddle.train_align_torch.npy生成的diff结果文件 +``` + +上述文件的生成代码都需要开发者进行开发,验收时需要提供上面罗列的所有文件(不需要提供产生这些文件的可运行程序)以及完整的模型训练评估程序和日志。 +BERT-SST2-Prod项目提供了基于reprod_log的5个验收点对齐验收示例,具体代码地址为:[https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline](https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline), +每个文件夹中的README.md文档提供了使用说明。 + + +## 3. 
论文复现理论知识及实战 + + +### 3.1 模型结构对齐 + +对齐模型结构时,一般有3个主要步骤: + +* 网络结构代码转换 +* 权重转换 +* 模型组网正确性验证 + +下面详细介绍这3个部分。 + +#### 3.1.1 网络结构代码转换 + +**【基本流程】** + +由于PyTorch的API和PaddlePaddle的API非常相似,可以参考[PyTorch-PaddlePaddle API映射表](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_api_mapping/pytorch_api_mapping_cn.html) +,组网部分代码直接进行手动转换即可。 + +**【注意事项】** + +如果遇到PaddlePaddle没有的API,可以尝试用多种API来组合,也可以给PaddlePaddle团队提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues),获得支持。 + +**【实战】** + +BERT网络结构的PyTorch实现: [transformers-bert](https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/modeling_bert.py) + +对应转换后的PaddlePaddle实现: [paddlenlp-bert](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/bert/modeling.py) + + +#### 3.1.2 权重转换 + +**【基本流程】** + +组网代码转换完成之后,需要对模型权重进行转换,如果PyTorch repo中已经提供权重,那么可以直接下载并进行后续的转换;如果没有提供,则可以基于PyTorch代码,随机生成一个初始化权重(定义完model以后,使用`torch.save()` API保存模型权重),然后进行权重转换。 + +**【注意事项】** + +在权重转换的时候,需要注意`paddle.nn.Linear`等API的权重保存格式和名称等与PyTorch稍有diff,具体内容可以参考`4.1章节`。 + +**【实战】** + +BERT的代码转换脚本可以在这里查看:[https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py), + +注意:运行该代码需要首先下载Huggingface的BERT预训练模型到该目录下,下载地址为:[https://huggingface.co/bert-base-uncased/blob/main/pytorch_model.bin](https://huggingface.co/bert-base-uncased/blob/main/pytorch_model.bin) + +```python +# https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py + +from collections import OrderedDict + +import numpy as np +import paddle +import torch +from paddlenlp.transformers import BertForPretraining as PDBertForMaskedLM +from transformers import BertForMaskedLM as PTBertForMaskedLM + + +def convert_pytorch_checkpoint_to_paddle( + pytorch_checkpoint_path="pytorch_model.bin", + paddle_dump_path="model_state.pdparams", + version="old", ): + hf_to_paddle = { + "embeddings.LayerNorm": "embeddings.layer_norm", + "encoder.layer": "encoder.layers", + "attention.self.query": "self_attn.q_proj", + "attention.self.key": "self_attn.k_proj", + "attention.self.value": "self_attn.v_proj", + "attention.output.dense": "self_attn.out_proj", + "intermediate.dense": "linear1", + "output.dense": "linear2", + "attention.output.LayerNorm": "norm1", + "output.LayerNorm": "norm2", + "predictions.decoder.": "predictions.decoder_", + "predictions.transform.dense": "predictions.transform", + "predictions.transform.LayerNorm": "predictions.layer_norm", + } + do_not_transpose = [] + if version == "old": + hf_to_paddle.update({ + "predictions.bias": "predictions.decoder_bias", + ".gamma": ".weight", + ".beta": ".bias", + }) + do_not_transpose = do_not_transpose + ["predictions.decoder.weight"] + + pytorch_state_dict = torch.load( + pytorch_checkpoint_path, map_location="cpu") + paddle_state_dict = OrderedDict() + for k, v in pytorch_state_dict.items(): + is_transpose = False + if k[-7:] == ".weight": + # embeddings.weight and LayerNorm.weight do not transpose + if all(d not in k for d in do_not_transpose): + if ".embeddings." not in k and ".LayerNorm." not in k: + if v.ndim == 2: + v = v.transpose(0, 1) + is_transpose = True + oldk = k + for hf_name, pd_name in hf_to_paddle.items(): + k = k.replace(hf_name, pd_name) + + # add prefix `bert.` + if "bert." not in k and "cls." not in k and "classifier" not in k: + k = "bert." 
+ k + + print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}") + paddle_state_dict[k] = v.data.numpy() + + paddle.save(paddle_state_dict, paddle_dump_path) + + +def compare(out_torch, out_paddle): + out_torch = out_torch.detach().numpy() + out_paddle = out_paddle.detach().numpy() + assert out_torch.shape == out_paddle.shape + abs_dif = np.abs(out_torch - out_paddle) + mean_dif = np.mean(abs_dif) + max_dif = np.max(abs_dif) + min_dif = np.min(abs_dif) + print("mean_dif:{}".format(mean_dif)) + print("max_dif:{}".format(max_dif)) + print("min_dif:{}".format(min_dif)) + + +def test_forward(): + paddle.set_device("cpu") + model_torch = PTBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_paddle = PDBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_torch.eval() + model_paddle.eval() + np.random.seed(42) + x = np.random.randint( + 1, model_paddle.bert.config["vocab_size"], size=(4, 64)) + input_torch = torch.tensor(x, dtype=torch.int64) + out_torch = model_torch(input_torch)[0] + + input_paddle = paddle.to_tensor(x, dtype=paddle.int64) + out_paddle = model_paddle(input_paddle)[0] + + print("torch result shape:{}".format(out_torch.shape)) + print("paddle result shape:{}".format(out_paddle.shape)) + compare(out_torch, out_paddle) + + +if __name__ == "__main__": + convert_pytorch_checkpoint_to_paddle( + "./bert-base-uncased/pytorch_model.bin", + "./bert-base-uncased/model_state.pdparams") + test_forward() + # torch result shape:torch.Size([4, 64, 30522]) + # paddle result shape:[4, 64, 30522] + # mean_dif:1.666686512180604e-05 + # max_dif:0.00015211105346679688 + # min_dif:0.0 +``` + +运行完成之后,会在当前目录生成`model_state.pdparams`文件,即为转换后的PaddlePaddle预训练模型。 +**Tips**: 由于paddlenlp中已有转换后的bert-base-uncased模型,因此可以一键加载,程序会自动下载对应权重! + + +#### 3.1.3 模型组网正确性验证 + +**【基本流程】** + +1. 定义PyTorch模型,加载权重,固定seed,基于numpy生成随机数,转换为PyTorch可以处理的tensor,送入网络,获取输出,使用reprod_log保存结果。 +2. 定义PaddlePaddle模型,加载权重,固定seed,基于numpy生成随机数,转换为PaddlePaddle可以处理的tensor,送入网络,获取输出,使用reprod_log保存结果。 +3. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +* 模型在前向对齐验证时,需要调用`model.eval()`方法,保证组网中的随机量被关闭,比如BatchNorm、Dropout等。 +* 给定相同的输入数据,为保证可复现性,如果有随机数生成,固定相关的随机种子。 +* 输出diff可以使用`np.mean(np.abs(o1 - o2))`进行计算,一般小于1e-6的话,可以认为前向没有问题。如果最终输出结果diff较大,可以使用二分的方法进行排查,比如说BERT,包含1个embdding层、12个transformer-block以及最后的MLM head层,那么完成模型组网和权重转换之后,如果模型输出没有对齐,可以尝试输出中间某一个transformer-block的tensor进行对比,如果相同,则向后进行排查;如果不同,则继续向前进行排查,以此类推,直到找到导致没有对齐的操作。 + +**【实战】** + +BERT模型组网正确性验证可以参考如下示例代码: +[https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/Step1](https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/Step1 + +**【验收】** + +对于待复现的项目,前向对齐验收流程如下。 + +1. 准备输入:fake data + * 使用参考代码的dataloader,生成一个batch的数据,保存下来,在前向对齐时,直接从文件中读入。 + * 固定随机数种子,生成numpy随机矩阵,转化tensor +2. 保存输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为tensor的值。最后将dict保存到文件中。建议命名为`forward_paddle.npy`和`forward_torch.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`forward_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 提交内容:新建文件夹,将`forward_paddle.npy`、`forward_torch.npy`与`forward_diff_log.txt`文件放在文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 +5. 
注意: + * PaddlePaddle与PyTorch保存的dict的key需要保持相同,否则report过程可能会提示key无法对应,从而导致report失败,之后的`【验收】`环节也是如此。 + * 如果是固定随机数种子,建议将fake data保存到dict中,方便check参考代码和PaddlePaddle的输入是否一致。 + + +### 3.2 验证/测试集数据读取对齐 + +**【基本流程】** + +对于一个数据集,一般有以下一些信息需要重点关注 + +* 数据集名称、下载地址 +* 训练集/验证集/测试集 + +PaddlePaddle中数据集相关的API为`paddle.io.Dataset`,PyTorch中对应为`torch.utils.data.Dataset`,二者功能一致,在绝大多数情况下,可以使用该类构建数据集。它是描述Dataset方法和行为的抽象类,在具体实现的时候,需要继承这个基类,实现其中的`__getitem__`和`__len__`方法。除了参考代码中相关实现,也可以参考待复现论文中的说明。 + +复现完Dataset之后,可以构建Dataloader,对数据进行组batch、批处理,送进网络进行计算。 + +`paddle.io.DataLoader`可以进行数据加载,将数据分成批数据,并提供加载过程中的采样。PyTorch对应的实现为`torch.utils.data.DataLoader`,二者在功能上一致,只是在参数方面稍有diff:(1)PaddlePaddle缺少对`pin_memory`等参数的支持;(2)PaddlePaddle增加了`use_shared_memory`参数来选择是否使用共享内存加速数据加载过程。 + +**【注意事项】** + +论文中一般会提供数据集的名称以及基本信息。复现过程中,我们在下载完数据之后,建议先检查下是否和论文中描述一致,否则可能存在的问题有: + +* 数据集版本不同,比如论文中使用了cnn_dailymail的v3.0.0版本数据集,但是我们下载的是cnn_dailymail的v1.0.0版本数据集,如果不对其进行检查,可能会导致我们最终训练的数据量等与论文中有diff +* 数据集使用方式不同,有些论文中,可能只是抽取了该数据集的子集进行方法验证,此时需要注意抽取方法,需要保证抽取出的子集完全相同。 +* 在评估指标对齐时,我们可以固定batch size,关闭Dataloader的shuffle操作。 + +构建数据集时,可以使用paddlenlp中的数据集加载方式,具体可以参考:[如何自定义数据集](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html)。对应地,PyTorch中的数据处理api可以参考:[huggingface的datasets自定义数据集](https://huggingface.co/docs/datasets/about_dataset_load.html#building-a-dataset)。对于其中之一,可以找到另一个平台的实现。 + +此外, +* 有些自定义的数据处理方法,如果不涉及到深度学习框架的部分,可以直接复用。 +* 对于特定任务中的数据预处理方法,比如说Tokenizer,如果没有现成的API可以调用,可以参考官方模型套件中的一些实现方法,比如PaddleClas、PaddleDetection、PaddleSeg等。 + +**【实战】** + +BERT模型复现过程中,数据预处理和Dataset、Dataloader的检查可以参考该文件: +[https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/test_data.py](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/test_data.py) + + +使用方法可以参考[数据检查文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/README.md)。 + + +### 3.3 评估指标对齐 + +**【基本流程】** + +PaddlePaddle提供了一系列Metric计算类,比如说`Accuracy`, `Auc`, `Precision`, `Recall`等,而PyTorch中,目前可以通过组合的方式实现metric计算,或者调用[huggingface-datasets](https://huggingface.co/docs/datasets/about_metrics.html?highlight=metric),在论文复现的过程中,需要注意保证对于该模块,给定相同的输入,二者输出完全一致。具体流程如下。 + +1. 构建fake数据 +1. 使用PyTorch的指标获取评估结果,使用reprod_log保存结果。 +2. 使用PaddlePaddle的指标获取评估结果,使用reprod_log保存结果。 +3. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +在评估指标对齐之前,需要注意保证对于该模块,给定相同的输入,二者输出完全一致。 + + +**【实战】** + +评估指标对齐检查方法可以参考文档:[评估指标对齐检查方法文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/README.md#%E6%95%B0%E6%8D%AE%E8%AF%84%E4%BC%B0%E5%AF%B9%E9%BD%90%E6%B5%81%E7%A8%8B) + + +**【验收】** + +对于待复现的项目,评估指标对齐验收流程如下。 + +1. 输入:dataloader, model +2. 输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为具体评估指标的值。最后将dict使用reprod_log保存到各自的文件中,建议命名为`metric_paddle.npy`和`metric_torch.npy`。 + * 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`metric_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +3. 提交内容:将`metric_paddle.npy`、`metric_torch.npy`与`metric_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 +4. 注意: + * 数据需要是真实数据 + * 需要检查论文是否只是抽取了验证集/测试集中的部分文件,如果是的话,则需要保证PaddlePaddle和参考代码中dataset使用的数据集一致。 + + + +### 3.4 损失函数对齐 + +**【基本流程】** + +PaddlePaddle与PyTorch均提供了很多loss function,用于模型训练,具体的API映射表可以参考:[Loss类API映射列表](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_api_mapping/pytorch_api_mapping_cn.html#lossapi)。以CrossEntropyLoss为例,主要区别为: +* PaddlePaddle提供了对软标签、指定softmax计算纬度的支持。 + +如果论文中使用的loss function没有指定的API,则可以尝试通过组合API的方式,实现自定义的loss function。 + +具体流程如下。 + +1. 
定义PyTorch模型,加载权重,加载fake data 和 fake label(或者固定seed,基于numpy生成随机数),转换为PyTorch可以处理的tensor,送入网络,获取loss结果,使用reprod_log保存结果。 +2. 定义PaddlePaddle模型,加载fake data 和 fake label(或者固定seed,基于numpy生成随机数),转换为PaddlePaddle可以处理的tensor,送入网络,获取loss结果,使用reprod_log保存结果。 +3. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +* 计算loss的时候,建议设置`model.eval()`,避免模型中随机量的问题。 + +**【实战】** + +本部分可以参考文档:[https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step3/README.md](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step3/README.md)。 + +**【验收】** + +对于待复现的项目,损失函数对齐验收流程如下。 + +1. 输入:fake data & label +2. 输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为具体评估指标的值。最后将dict使用reprod_log保存到各自的文件中,建议命名为`loss_paddle.npy`和`loss_torch.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`loss_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 提交内容:将`loss_paddle.npy`、`loss_torch.npy`与`loss_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 + + +### 3.5 优化器对齐 + +**【基本流程】** + +PaddlePaddle中的optimizer有`paddle.optimizer`等一系列实现,PyTorch中则有`torch.Optim`等一系列实现。 + +**【注意事项】** + +以SGD等优化器为例,PaddlePaddle与Pytorch的优化器区别主要如下。 + +* PaddlePaddle在优化器中增加了对梯度裁剪的支持,在训练GAN或者一些NLP、多模态任务中,这个用到的比较多。 +* PaddlePaddle的SGD不支持动量更新、动量衰减和Nesterov动量,这里需要使用`paddle.optimizer.Momentum` API实现这些功能。 + +**【实战】** + +本部分对齐建议对照[PaddlePaddle优化器API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/Overview_cn.html)与参考代码的优化器实现进行对齐,用之后的反向对齐统一验证该模块的正确性。 + + + +### 3.6 学习率对齐 + +**【基本流程】** + +* 学习率策略主要用于指定训练过程中的学习率变化曲线,这里可以将定义好的学习率策略,不断step,即可得到对应的学习率值,可以将学习率值保存在列表或者矩阵中,使用`reprod_log`工具判断二者是否对齐。 + +**【注意事项】** + +PaddlePaddle中,需要首先构建学习率策略,再传入优化器对象中;对于PyTorch,如果希望使用更丰富的学习率策略,需要先构建优化器,再传入学习率策略类API。 + +**【实战】** + +学习率复现对齐,可以参考代码:[学习率对齐验证文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step4/README.md#%E5%AD%A6%E4%B9%A0%E7%8E%87%E5%AF%B9%E9%BD%90%E9%AA%8C%E8%AF%81)。 + + +### 3.7 正则化策略对齐 + +**【基本流程】** + +L2正则化策略用于模型训练,可以防止模型对训练数据过拟合,L1正则化可以用于得到稀疏化的权重矩阵,PaddlePaddle中有`paddle.regularizer.L1Decay`与`paddle.regularizer.L2Decay` API。PyTorch中,torch.optim集成的优化器只有L2正则化方法,直接在构建optimizer的时候,传入`weight_decay`参数即可。 + +**【注意事项】** + +* PaddlePaddle的optimizer中支持L1Decat/L2Decay。 +* PyTorch的optimizer支持不同参数列表的学习率分别设置,params传入字典即可,而PaddlePaddle的2.1.0版本目前尚未支持这种行为,可以通过设置`ParamAttr`的`learning_rate`参数,来确定相对学习率倍数。PaddlePaddle的2.2.0版本中虽然实现该功能,但是模型收敛速度较慢,不建议使用。[优化器收敛速度慢](https://github.com/PaddlePaddle/Paddle/issues/36915) + +**【实战】** + +本部分对齐建议对照[PaddlePaddle正则化API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/regularizer/L2Decay_cn.html)与参考代码的优化器实现进行对齐,用之后的反向对齐统一验证该模块的正确性。 + + +### 3.8 反向对齐 + +**【基本流程】** + +此处可以通过numpy生成假的数据和label(推荐),也可以准备固定的真实数据。具体流程如下。 + +1. 检查两个代码的训练超参数全部一致,如优化器及其超参数、学习率、LayerNorm中的eps等。 +2. 将PaddlePaddle与PyTorch网络中涉及的所有随机操作全部关闭,如dropout、drop_path等,推荐将模型设置为eval模式(`model.eval()`) +3. 加载相同的weight dict(可以通过PyTorch来存储随机的权重),将准备好的数据分别传入网络并迭代,观察二者loss是否一致(此处batch-size要一致,如果使用多个真实数据,要保证传入网络的顺序一致) +4. 
如果经过2轮以上,loss均可以对齐,则基本可以认为反向对齐。 + + +**【注意事项】** + +* 如果第一轮loss就没有对齐,则需要仔细排查一下模型前向部分。 +* 如果第二轮开始,loss开始无法对齐,则首先需要排查下超参数的差异,没问题的话,在`loss.backward()`方法之后,使用`tensor.grad`获取梯度值,二分的方法查找diff,定位出PaddlePaddle与PyTorch梯度无法对齐的API或者操作,然后进一步验证并反馈。 + +梯度的打印方法示例代码如下所示,注释掉的内容即为打印网络中所有参数的梯度shape。 + +```python + # 代码地址:https://github.com/JunnYu/BERT-SST2-Prod/blob/2c372656bb1b077b0073c50161771d9fa9d8de5a/pipeline/Step4/test_bp.py#L12 + def pd_train_some_iters(model, + criterion, + optimizer, + fake_data, + fake_label, + max_iter=2): + model = PDBertForSequenceClassification.from_pretrained("bert-base-uncased", num_classes=2) + classifier_weights = paddle.load("../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + criterion = paddle.nn.CrossEntropy() + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW(learning_rate=3e-5, parameters=model.parameters(), + weight_decay=1e-2, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params) + loss_list = [] + for idx in range(max_iter): + input_ids = paddle.to_tensor(fake_data) + labels = paddle.to_tensor(fake_label) + + output = model(input_ids) + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + loss_list.append(loss) + return loss_list +``` + + + + +**【实战】** + +本部分可以参考文档:[反向对齐操作文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step4/README.md#%E5%8F%8D%E5%90%91%E5%AF%B9%E9%BD%90%E6%93%8D%E4%BD%9C%E6%96%B9%E6%B3%95)。 + +**【验收】** + +对于待复现的项目,反向对齐验收流程如下。 + +1. 输入:fake data & label +2. 输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为具体loss的值。最后将dict使用reprod_log保存到各自的文件中,建议命名为`bp_align_paddle.npy`和`bp_align_torch.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`bp_align_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 提交内容:将`bp_align_paddle.npy`、`bp_align_torch.npy`与`bp_align_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 +5. 
注意: + * loss需要保存至少2轮以上。 + * 在迭代的过程中,需要保证模型的batch size等超参数完全相同 + * 在迭代的过程中,需要设置`model.eval()`,使用固定的假数据,同时加载相同权重的预训练模型。 + + +### 3.9 训练集数据读取对齐 + +**【基本流程】** + +该部分内容与3.2节内容基本一致,参考PyTorch的代码,实现训练集数据读取与预处理模块即可。 + +**【注意事项】** + +该部分内容,可以参考3.8节的自测方法,将输入的`fake data & label`替换为训练的dataloader,但是需要注意的是: +* 在使用train dataloader的时候,建议设置random seed,对于PyTorch来说 + +```python +#initialize random seed +torch.manual_seed(config.SEED) +torch.cuda.manual_seed_all(config.SEED) +np.random.seed(config.SEED) +random.seed(config.SEED) +``` + +对于PaddlePaddle来说 + +```python +paddle.seed(config.SEED) +np.random.seed(config.SEED) +random.seed(config.SEED) +``` + + + +### 3.10 网络初始化对齐 + +**【基本流程】** + +* 下面给出了部分初始化API的映射表。 + +|PaddlePaddle API | PyTorch API | +|---|---| +| paddle.nn.initializer.KaimingNormal | torch.nn.init.kaiming_normal_ | +| paddle.nn.initializer.KaimingUniform | torch.nn.init.kaiming_uniform_ | +| paddle.nn.initializer.XavierNormal | torch.nn.init.xavier_normal_ | +| paddle.nn.initializer.XavierUniform | torch.nn.init.xavier_uniform_ | + +**【注意事项】** + +* 更多初始化API可以参考[PyTorch初始化API文档](https://pytorch.org/docs/stable/nn.init.html)以及[PaddlePaddle初始化API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/Overview_cn.html#chushihuaxiangguan)。 + +**【实战】** + +本部分对齐建议对照[PaddlePaddle 初始化API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/Overview_cn.html#chushihuaxiangguan)与参考代码的初始化实现对齐。 + + +### 3.11 模型训练对齐 + +**【基本流程】** + +完成前面的步骤之后,就可以开始全量数据的训练对齐任务了。按照下面的步骤进行训练对齐。 + +1. 准备train/eval data, loader, model +2. 对model按照论文所述进行初始化(如果论文中提到加载了预训练模型,则按需加载pretrained model) +3. 加载配置,开始训练,迭代得到最终模型与评估指标,将评估指标使用reprod_log保存到文件中。 +4. 将PaddlePaddle提供的参考指标使用reprod_log提交到另一个文件中。 +5. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +* 【强烈】建议先做完反向对齐之后再进行模型训练对齐,二者之间的不确定量包括:数据集、PaddlePaddle与参考代码在模型training mode下的区别,初始化参数。 +* 在训练对齐过程中,受到较多随机量的影响,精度有少量diff是正常的,以SST-2数据集的分类为例,diff在0.15%以内可以认为是正常的,这里可以根据不同的任务,适当调整对齐检查的阈值(`ReprodDiffHelper.report`函数中的`diff_threshold`参数)。 +* 训练过程中的波动是正常的,如果最终收敛结果不一致,可以 + * 仔细排查Dropout、BatchNorm以及其他组网模块及超参是否无误。 + * 基于参考代码随机生成一份预训练模型,转化为PaddlePaddle的模型,并使用PaddlePaddle加载训练,对比二者的收敛曲线与最终结果,排查初始化影响。 + * 使用参考代码的Dataloader生成的数据,进行模型训练,排查train dataloader的影响。 + +**【实战】** + +本部分可以参考文档:[训练对齐操作文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step5/README.md)。 + +**【验收】** + +对于待复现的项目,训练对齐验收流程如下。 + +1. 输入:train/eval dataloader, model +2. 输出: + * PaddlePaddle:dict,key为保存值的name(自定义),value为具体评估指标的值。最后将dict使用reprod_log保存到文件中,建议命名为`train_align_paddle.npy`。 + * benchmark:dict,key为保存值的name(自定义),value为论文复现赛的评估指标要求的值。最后将dict使用reprod_log保存到文件中,建议命名为`train_align_benchmark.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`train_align_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 
提交内容:将`train_align_paddle.npy`、`train_align_benchmark.npy`与`train_align_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,最终一并打包上传即可。 + + +### 3.12 单机多卡训练 + +如果希望使用单机多卡提升训练效率,可以从以下几个过程对代码进行修改。 + +#### 3.12.1 数据读取 + +对于PaddlePaddle来说,多卡数据读取这块主要的变化在sampler + +对于单机单卡,sampler实现方式如下所示。 + +```python +train_sampler = paddle.io.RandomSampler(dataset) +train_batch_sampler = paddle.io.BatchSampler( + sampler=train_sampler, batch_size=args.batch_size) +``` + +对于单机多卡任务,sampler实现方式如下所示。 + +```python +train_batch_sampler = paddle.io.DistributedBatchSampler( + dataset=dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=False + ) +``` + +注意:在这种情况下,单机多卡的代码仍然能够以单机单卡的方式运行,因此建议以这种sampler方式进行论文复现。 + + +#### 3.12.2 多卡模型初始化 + +如果以多卡的方式运行,需要初始化并行训练环境,代码如下所示。 + +```python +if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() +``` + +在模型组网并初始化参数之后,需要使用`paddle.DataParallel()`对模型进行封装,使得模型可以通过数据并行的模式被执行。代码如下所示。 + +```python +if paddle.distributed.get_world_size() > 1: + model = paddle.DataParallel(model) +``` + + +#### 3.12.3 模型保存、日志保存等其他模块 + +以模型保存为例,我们只需要在0号卡上保存即可,否则多个trainer同时保存的话,可能会造成写冲突,导致最终保存的模型不可用。 + + +#### 3.12.4 程序启动方式 + +对于单机单卡,启动脚本如下所示。[https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/bert](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/bert) + +```shell +unset CUDA_VISIBLE_DEVICES +python -m paddle.distributed.launch --gpus "0" run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --task_name SST-2 \ + --max_seq_length 128 \ + --batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --logging_steps 1 \ + --save_steps 500 \ + --output_dir ./tmp/ \ + --device gpu \ + --use_amp False +``` + + +对于单机多卡(示例中为4卡训练),启动脚本如下所示。 + +```shell +unset CUDA_VISIBLE_DEVICES +python -m paddle.distributed.launch --gpus "0,1,2,3" run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --task_name SST-2 \ + --max_seq_length 128 \ + --batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --logging_steps 1 \ + --save_steps 500 \ + --output_dir ./tmp/ \ + --device gpu \ + --use_amp False +``` + +注意:这里8卡训练时,虽然单卡的batch size没有变化(32),但是总卡的batch size相当于是单卡的8倍,因此学习率也设置为了单卡时的8倍。 + + +**【实战】** + +本部分可以参考paddlenlp库中的例子:[单机多卡训练](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/bert)。 + + +## 4. 
论文复现注意事项与FAQ + +本部分主要总结大家在论文复现赛过程中遇到的问题,如果本章内容没有能够解决你的问题,欢迎给该文档提出优化建议或者给Paddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose)。 + + +### 4.0 通用注意事项 + +* 需要仔细对照PaddlePaddle与参考代码的优化器参数实现,确保优化器参数严格对齐。 +* 如果遇到一些Paddle不支持的API操作,可以尝试使用替代实现进行复现。如下面的PyTorch代码,PaddlePaddle中可以通过slice + concat API的组合形式进行功能实现。同时,对于这个问题,建议优先给Paddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose),列出Paddle不支持的实现,开发人员会根据优先级进行开发。 + +```python +torch.stack([ + per_locations[:, 0] - per_box_regression[:, 0], + per_locations[:, 1] - per_box_regression[:, 1], + per_locations[:, 0] + per_box_regression[:, 2], + per_locations[:, 1] + per_box_regression[:, 3], +], dim=1) +``` +* 如果遇到Paddle不包含的OP或者API,比如(1) 如果是某些算法实现存在调用了外部OP,而且Paddle也不包含该OP实现;(2) 其他框架存在的API或者OP,但是Paddle中没有这些OP。此时: + * 对于Paddle资深用户来说,可以尝试使用Paddle的自定义算子功能,存在一定的代码开发量。 + * 对于初学者来说,可以给Paddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose),列出Paddle不支持的实现,Paddle开发人员会根据优先级进行实现。 +* PaddlePaddle与PyTorch对于不同名称的API,实现的功能可能是相同的,复现的时候注意,比如[paddle.optimizer.lr.StepDecay](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/lr/StepDecay_cn.html#stepdecay)与[torch.optim.lr_scheduler.StepLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html#torch.optim.lr_scheduler.StepLR) 。 +* 对于PaddlePaddle来说,通过`paddle.set_device`函数(全局)来确定模型结构是运行在什么设备上,对于torch来说,是通过`model.to("device")` (局部)来确定模型结构的运行设备,这块在复现的时候需要注意。 + + + +### 4.1 模型结构对齐 + +#### 4.1.1 API +* 对于 `paddle.nn.Linear` 层的weight参数,PaddlePaddle与PyTorch的保存方式不同,在转换时需要进行转置,示例代码可以参考[BERT权重转换脚本](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py)。 +* `torch.masked_fill`函数的功能目前可以使用`paddle.where`进行实现,可以参考:[链接](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/faq/train_cn.html#paddletorch-masked-fillapi)。 +* `pack_padded_sequence`和`pad_packed_sequence`这两个API目前PaddlePaddle中没有实现,可以直接在RNN或者LSTM的输入中传入`sequence_length`来实现等价的功能。 + + +#### 4.1.2 权重转换 + +* 在权重转换的时候,不能只关注参数的名称,比如说有些`paddle.nn.Linear`层,但是定义的变量名称为`conv`,这种也是需要进行权重转置的。 +* 权重转换时,建议同时打印 Paddle 和 PyTorch 对应权重的shape,以防止名称相似但是shape不同的参数权重转换报错。 + +#### 4.1.3 模型组网正确性验证 + +* 在论文复现的过程中,可能会遇到一些经典的模型结构,比如Transformer等,Paddle官方也提供了Transformer的实现,但是这里建议自己根据PyTorch代码重新实现一遍,一方面是对整体的模型结构更加熟悉,另一方面也保证模型结构和权重完全对齐。 +* 在复杂的网络结构中,如果前向结果对不齐,可以按照模块排查问题,比如依次获取embedding、transformer-block、mlm-head输出等,看下问题具体出现在哪个子模块,再进到子模块详细排查。 +* 网络结构对齐后,尽量使用训练好的预训练模型和真实的数据进行前向diff计算,这样更准确。 + + +### 4.2 验证/测试集数据读取对齐 + +* 需要仔细排查数据预处理,不仅包含的预处理方法相同,也需要保证预处理的流程相同,比如先padding策略不同和截断策略的不同会导致得到最终的结果是不同的。 + + +### 4.3 评估指标对齐 + +* 真实数据评估时,需要注意评估时 `paddle.io.DataLoader` 的 ``drop_last`` 参数是否打开(文档[链接](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/DataLoader_cn.html#dataloader)),复现代码需要与参考代码保持一致,否则最后不够batch-size的数据的评估会有diff。 +* 在识别或者检索过程中,为了加速评估过程,往往会将评估函数由CPU实现改为GPU实现,由此会带来评估函数输出的不一致。这是由于sort函数对于相同值的排序结果不同带来的。在复现的过程中,如果可以接受轻微的指标不稳定,可以使用PaddlePaddle的sort函数,如果对于指标非常敏感,同时对速度性能要求很高,可以给PaddlePaddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose),由研发人员高优开发。 + + + +### 4.4 损失函数对齐 + +* 部分算法的损失函数中会用到 bool 索引,这时候可以使用[paddle.where](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/where_cn.html#where) 代替。 +* `paddle.nn.CrossEntropyLoss` 默认是在最后一维(axis=-1)计算损失函数,而 `torch.nn.CrossEntropyLoss` 是在axis=1的地方计算损失函数,因此如果输入的维度大于2,这里需要保证计算的维(axis)相同,否则可能会出错。 +* 在生成模型中会遇到梯度损失,需要对模型中的算子求二次梯度,目前`MaxPooling`暂时不支持二次梯度,如果复现的过程中遇到了需要对`MaxPooling`求二次梯度的情况,可以和Paddle官方开发同学反馈,进一步确认解决方案。 +* 在保存损失函数值的时候,注意要使用`paddle.no_grad`,或者仅仅保存转换成 numpy 的数组,避免损失没有析构导致内存泄漏问题。 + +```python +# 
错误示范 +loss = celoss(pred, label) +avg_loss += loss +# 正确示范1 +loss = celoss(pred, label) +avg_loss += loss.numpy() +# 正确示范2 +loss = celoss(pred, label) +with paddle.no_grad() + avg_loss += loss +``` + + +### 4.5 优化器对齐 + +* Paddle目前支持在 ``optimizer`` 中通过设置 ``params_groups`` 的方式设置不同参数的更新方式,可以参考[代码示例](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/optimizer/optimizer.py#L107) 。 +* 有些模型训练时,会使用梯度累加策略,即累加到一定step数量之后才进行参数更新,这时在实现上需要注意对齐。 +* 在某些任务中,比如说深度学习可视化、可解释性等任务中,一般只要求模型前向过程,不需要训练,此时优化器、学习率等用于模型训练的模块对于该类论文复现是不需要的。 +* 在文本分类领域,大多数Transformer模型都采用了AdamW优化器,并且会设置weigh decay,同时部分参数设置为no weight decay,例如位置编码的参数通常设置为no weight decay,no weight decay参数设置不正确,最终会有明显的精度损失,需要特别注意。一般可以通过分析模型权重来发现该问题,分别计算官方模型和复现模型每层参数权重的平均值、方差,对每一层依次对比,有显著差异的层可能存在问题,因为在weight decay的作用下,参数权重数值会相对较小,而未正确设置no weight decay,则会造成该层参数权重数值异常偏小。 + + + +### 4.6 学习率对齐 + +* PaddlePaddle 中参数的学习率受到优化器学习率和`ParamAttr`中设置的学习率影响,因此跟踪学习率需要将二者结合进行跟踪。 +* 对于复现代码和参考代码,学习率在整个训练过程中在相同的轮数相同的iter下应该保持一致,可以通过`reprod_log`工具、打印学习率值或者可视化二者学习率的log来查看diff。 +* 有些网络的学习率策略比较细致,比如带warmup的学习率策略,这里需要保证起始学习率等参数都完全一致。 + + + +### 4.7 正则化策略对齐 + +* 在如Transformer或者少部分CNN模型中,存在一些参数不做正则化(正则化系数为0)的情况。这里需要找到这些参数并对齐取消实施正则化策略,可以参考[这里](https://github.com/PaddlePaddle/PaddleClas/blob/release%2F2.3/ppcls/arch/backbone/model_zoo/resnest.py#L72),对特定参数进行修改。 + + +### 4.8 反向对齐 + +* 反向对齐时,如果第二轮开始,loss开始无法对齐,则首先需要排查下超参数的差异,没问题的话,在`loss.backward()`方法之后,使用`tensor.grad`获取梯度值,二分的方法查找diff,定位出PaddlePaddle与PyTorch梯度无法对齐的API或者操作,然后进一步验证。第3章中给出了获取所有参数的梯度方法,如果只希望打印特定参数的梯度,可以用下面的方式。 + + +```python +import paddle + +def print_hook_fn(grad): + print(grad) + +x = paddle.to_tensor([0., 1., 2., 3.], stop_gradient=False) +h = x.register_hook(print_hook_fn) +w = x * 4 +w.backward() +# backward之后会输出下面的内容 +# Tensor(shape=[4], dtype=float32, place=CPUPlace, stop_gradient=False, +# [4., 4., 4., 4.]) +``` + + + +### 4.9 训练集数据读取对齐 + +#### 4.9.1 API + +* 在前向过程中,如果数据预处理过程运行出错,请先将 ``paddle.io.DataLoader`` 的 ``num_workers`` 参数设为0,然后根据单个进程下的报错日志定位出具体的bug。 + +#### 4.9.2 数据预处理 + + +* 如果数据处理过程中涉及到随机数生成,建议固定seed (`np.random.seed(0)`, `random.seed(0)`),查看复现代码和参考代码处理后的数据是否有diff。 +* 对文本进行tokenizer处理时,需要确定文本的截断策略,padding策略。 + + +### 4.10 网络初始化对齐 + +* 对于不同的深度学习框架,网络初始化在大多情况下,即使值的分布完全一致,也无法保证值完全一致,这里也是论文复现中不确定性比较大的地方。如果十分怀疑初始化导致的问题,建议将参考的初始化权重转成paddle模型,加载该初始化模型训练,看下收敛精度。 +* CNN对于模型初始化相对来说没有那么敏感,在迭代轮数与数据集足够的情况下,最终精度指标基本接近;而transformer系列模型对于初始化比较敏感,在transformer系列模型训练对齐过程中,建议对这一块进行重点检查。 + + + +### 4.11 模型训练对齐 + +#### 4.11.1 训练对齐通用问题 + +* 有条件的话,复现工作之前最好先基于官方代码完成训练,保证与官方指标能够对齐,并且将训练策略和训练过程中的关键指标记录保存下来,比如每个epoch的学习率、Train Loss、Eval Loss、Eval Acc等,在复现网络的训练过程中,将关键指标保存下来,这样可以将两次训练中关键指标的变化曲线绘制出来,能够很方便的进行对比。 +* 训练过程中可以对loss或者acc进行可视化,和竞品loss或者acc进行直观的对比;如果训练较大的数据集,1次完整训练的成本比较高,此时可以隔一段时间查看一下,如果精度差异比较大,建议先停掉实验,排查原因。 +* 如果训练的过程中出nan,一般是因为除0或者log0的情况, 可以着重看下几个部分: + * 如果有预训练模型的话,可以确认下是否加载正确 + * 模型结构中计算loss的部分是否有考虑到正样本为0的情况 + * 也可能是某个API的数值越界导致的,可以测试较小的输入是否还会出现nan。 +* 如果训练过程中如果出现不收敛的情况,可以 + * 简化网络和数据,实验是否收敛; + * 如果是基于原有实现进行改动,可以尝试控制变量法,每次做一个改动,逐个排查; + * 检查学习率是否过大、优化器设置是否合理,排查下weight decay是否设置正确; + * 保存不同step之间的模型参数,观察模型参数是否更新。 diff --git a/examples/torch_migration/pipeline/Step1/README.md b/examples/torch_migration/pipeline/Step1/README.md new file mode 100644 index 000000000000..b3db11238110 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/README.md @@ -0,0 +1,86 @@ +# 使用方法 + + +本部分内容以前向对齐为例,介绍基于`repord_log`工具对齐的检查流程。其中与`reprod_log`工具有关的部分都是需要开发者需要添加的部分。 + + +```shell +# 进入文件夹并生成torch的bert模型权重 +cd pipeline/weights/ && python torch_bert_weights.py +# 进入文件夹并将torch的bert模型权重转换为paddle +cd pipeline/weights/ && python 
torch2paddle.py +# 进入文件夹并生成classifier权重 +cd pipeline/classifier_weights/ && python generate_classifier_weights.py +# 进入Step1文件夹 +cd pipeline/Step1/ +# 生成paddle的前向数据 +python pd_forward_bert.py +# 生成torch的前向数据 +python pt_forward_bert.py +# 对比生成log +python check_step1.py +``` + +具体地,以PaddlePaddle为例,`pd_forward_bert.py`的具体代码如下所示。 + +```python +import numpy as np +import paddle +from reprod_log import ReprodLogger +import sys +import os +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +config_path = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(config_path) +from models.pd_bert import * + +# 导入reprod_log中的ReprodLogger类 +from reprod_log import ReprodLogger + +reprod_logger = ReprodLogger() + +# 组网初始化加载BertModel权重 +paddle_dump_path = '../weights/paddle_weight.pdparams' +config = BertConfig() +model = BertForSequenceClassification(config) +checkpoint = paddle.load(paddle_dump_path) +model.bert.load_dict(checkpoint) + +# 加载分类权重 +classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") +model.load_dict(classifier_weights) +model.eval() +# 读入fake data并转换为tensor,这里也可以固定seed在线生成fake data +fake_data = np.load("../fake_data/fake_data.npy") +fake_data = paddle.to_tensor(fake_data) +# 模型前向 +out = model(fake_data) +# 保存前向结果,对于不同的任务,需要开发者添加。 +reprod_logger.add("logits", out.cpu().detach().numpy()) +reprod_logger.save("forward_paddle.npy") +``` + +diff检查的代码可以参考:[check_step1.py](./check_step1.py),具体代码如下所示。 + +```python +# https://github.com/littletomatodonkey/AlexNet-Prod/blob/master/pipeline/Step1/check_step1.py +# 使用reprod_log排查diff +from reprod_log import ReprodDiffHelper +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./forward_torch.npy") + paddle_info = diff_helper.load_info("./forward_paddle.npy") + diff_helper.compare_info(torch_info, paddle_info) + diff_helper.report(path="forward_diff.log") +``` + +产出日志如下,同时会将check的结果保存在`forward_diff.log`文件中。 + +``` +[2021/11/17 20:15:50] root INFO: logits: +[2021/11/17 20:15:50] root INFO: mean diff: check passed: True, value: 1.30385160446167e-07 +[2021/11/17 20:15:50] root INFO: diff check passed +``` + +平均绝对误差为1.3e-7,测试通过。 diff --git a/examples/torch_migration/pipeline/Step1/check_step1.py b/examples/torch_migration/pipeline/Step1/check_step1.py new file mode 100644 index 000000000000..6dbb247cf179 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/check_step1.py @@ -0,0 +1,23 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./forward_torch.npy") + paddle_info = diff_helper.load_info("./forward_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + diff_helper.report(path="forward_diff.log") diff --git a/examples/torch_migration/pipeline/Step1/pd_forward_bert.py b/examples/torch_migration/pipeline/Step1/pd_forward_bert.py new file mode 100644 index 000000000000..260386973b20 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/pd_forward_bert.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os + +import numpy as np +import paddle +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + paddle.set_device("cpu") + + # def logger + reprod_logger = ReprodLogger() + + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + # read or gen fake data + + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = paddle.to_tensor(fake_data) + # forward + out = model(fake_data)[0] + reprod_logger.add("logits", out.cpu().detach().numpy()) + reprod_logger.save("forward_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step1/pt_forward_bert.py b/examples/torch_migration/pipeline/Step1/pt_forward_bert.py new file mode 100644 index 000000000000..c2dd64965c99 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/pt_forward_bert.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys +import os + +import numpy as np +from reprod_log import ReprodLogger +import torch + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pt_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + # def logger + reprod_logger = ReprodLogger() + + pytorch_dump_path = '../weights/torch_weight.bin' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + + classifier_weights = torch.load( + "../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.eval() + + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = torch.from_numpy(fake_data) + # forward + out = model(fake_data)[0] + reprod_logger.add("logits", out.cpu().detach().numpy()) + reprod_logger.save("forward_torch.npy") diff --git a/examples/torch_migration/pipeline/Step1/torch2paddle.py b/examples/torch_migration/pipeline/Step1/torch2paddle.py new file mode 100644 index 000000000000..07e6edc6e4de --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/torch2paddle.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import paddle +import torch +from paddlenlp.transformers import BertForPretraining as PDBertForMaskedLM +from transformers import BertForMaskedLM as PTBertForMaskedLM + + +def convert_pytorch_checkpoint_to_paddle( + pytorch_checkpoint_path="pytorch_model.bin", + paddle_dump_path="model_state.pdparams", + version="old", +): + hf_to_paddle = { + "embeddings.LayerNorm": "embeddings.layer_norm", + "encoder.layer": "encoder.layers", + "attention.self.query": "self_attn.q_proj", + "attention.self.key": "self_attn.k_proj", + "attention.self.value": "self_attn.v_proj", + "attention.output.dense": "self_attn.out_proj", + "intermediate.dense": "linear1", + "output.dense": "linear2", + "attention.output.LayerNorm": "norm1", + "output.LayerNorm": "norm2", + "predictions.decoder.": "predictions.decoder_", + "predictions.transform.dense": "predictions.transform", + "predictions.transform.LayerNorm": "predictions.layer_norm", + } + do_not_transpose = [] + if version == "old": + hf_to_paddle.update({ + "predictions.bias": "predictions.decoder_bias", + ".gamma": ".weight", + ".beta": ".bias", + }) + do_not_transpose = do_not_transpose + ["predictions.decoder.weight"] + + pytorch_state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") + paddle_state_dict = OrderedDict() + for k, v in pytorch_state_dict.items(): + is_transpose = False + if k[-7:] == ".weight": + # embeddings.weight and LayerNorm.weight do not transpose + if all(d not in k for d in do_not_transpose): + if ".embeddings." not in k and ".LayerNorm." 
not in k: + if v.ndim == 2: + if 'embeddings' not in k: + v = v.transpose(0, 1) + is_transpose = True + is_transpose = False + oldk = k + print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}") + paddle_state_dict[k] = v.data.numpy() + + paddle.save(paddle_state_dict, paddle_dump_path) + + +def compare(out_torch, out_paddle): + out_torch = out_torch.detach().numpy() + out_paddle = out_paddle.detach().numpy() + assert out_torch.shape == out_paddle.shape + abs_dif = np.abs(out_torch - out_paddle) + mean_dif = np.mean(abs_dif) + max_dif = np.max(abs_dif) + min_dif = np.min(abs_dif) + print("mean_dif:{}".format(mean_dif)) + print("max_dif:{}".format(max_dif)) + print("min_dif:{}".format(min_dif)) + + +def test_forward(): + paddle.set_device("cpu") + model_torch = PTBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_paddle = PDBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_torch.eval() + model_paddle.eval() + np.random.seed(42) + x = np.random.randint(1, + model_paddle.bert.config["vocab_size"], + size=(4, 64)) + input_torch = torch.tensor(x, dtype=torch.int64) + out_torch = model_torch(input_torch)[0] + + input_paddle = paddle.to_tensor(x, dtype=paddle.int64) + out_paddle = model_paddle(input_paddle)[0] + + print("torch result shape:{}".format(out_torch.shape)) + print("paddle result shape:{}".format(out_paddle.shape)) + compare(out_torch, out_paddle) + + +if __name__ == "__main__": + convert_pytorch_checkpoint_to_paddle("test.bin", "test_paddle.pdparams") +# test_forward() +# torch result shape:torch.Size([4, 64, 30522]) +# paddle result shape:[4, 64, 30522] +# mean_dif:1.666686512180604e-05 +# max_dif:0.00015211105346679688 +# min_dif:0.0 diff --git a/examples/torch_migration/pipeline/Step2/README.md b/examples/torch_migration/pipeline/Step2/README.md new file mode 100644 index 000000000000..029761c85e47 --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/README.md @@ -0,0 +1,131 @@ +# 使用方法 + +## 数据集和数据加载对齐步骤 + +* 使用下面的命令,判断数据预处理以及数据集是否构建正确。 + +```shell +python test_data.py +``` + +显示出以下内容,Dataset以及Dataloader的长度和内容diff均满足小于指定阈值,可以认为复现成功。 + +``` +[2021/11/17 20:57:06] root INFO: length: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_0_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_0_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_0_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_1_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_1_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_1_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_2_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_2_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_2_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_3_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 
0.0 +[2021/11/17 20:57:06] root INFO: dataset_3_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_3_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_4_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_4_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_4_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_0_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_0_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_0_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_1_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_1_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_1_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_2_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_2_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_2_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_3_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_3_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_3_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_4_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_4_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_4_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: diff check passed +``` + + +## 数据评估对齐流程 + +### 评估代码和修改内容说明 + +Pytorch准确率评估指标使用的是huggingface的datasets库。 + +```python +import torch +import numpy as np +from datasets import load_metric +hf_metric = load_metric("accuracy.py") +logits = np.random.normal(0, 1, size=(64, 2)).astype("float32") +labels = np.random.randint(0, 2, size=(64,)).astype("int64") +hf_metric.add_batch(predictions=torch.from_numpy(logits).argmax(dim=-1), references=torch.from_numpy(labels)) +hf_accuracy = hf_metric.compute()["accuracy"] +print(hf_accuracy) +``` + +对应地,PaddlePaddle评估指标代码如下 + +```python +import paddle +import numpy as np +from paddle.metric import Accuracy +pd_metric = Accuracy() +pd_metric.reset() +logits = np.random.normal(0, 1, size=(64, 2)).astype("float32") +labels = np.random.randint(0, 2, size=(64,)).astype("int64") +correct = 
pd_metric.compute(paddle.to_tensor(logits), paddle.to_tensor(labels)) +pd_metric.update(correct) +pd_accuracy = pd_metric.accumulate() +print(pd_accuracy) +``` + +### 操作步骤 + +运行下面的命令,验证数据集评估是否正常。 + +```shell +# 生成paddle和pytorch指标 +python test_metric.py +# 对比生成log +python check_step2.py +``` + +最终结果输出如下,accuracy精度diff为0,小于阈值,结果前向验证, +``` +[2021/11/17 21:15:05] root INFO: accuracy: +[2021/11/17 21:15:05] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:15:05] root INFO: diff check passed + +``` diff --git a/examples/torch_migration/pipeline/Step2/accuracy.py b/examples/torch_migration/pipeline/Step2/accuracy.py new file mode 100644 index 000000000000..ae447e4a398a --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/accuracy.py @@ -0,0 +1,96 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Accuracy metric.""" + +import datasets +from sklearn.metrics import accuracy_score + +_DESCRIPTION = """ +Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: +Accuracy = (TP + TN) / (TP + TN + FP + FN) +TP: True positive +TN: True negative +FP: False positive +FN: False negative +""" + +_KWARGS_DESCRIPTION = """ +Args: + predictions: Predicted labels, as returned by a model. + references: Ground truth labels. + normalize: If False, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + sample_weight: Sample weights. +Returns: + accuracy: Accuracy score. +Examples: + + >>> accuracy_metric = datasets.load_metric("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1]) + >>> print(results) + {'accuracy': 1.0} +""" + +_CITATION = """\ +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, + _KWARGS_DESCRIPTION) +class Accuracy(datasets.Metric): + + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Sequence(datasets.Value("int32")), + "references": datasets.Sequence(datasets.Value("int32")), + } if self.config_name == "multilabel" else { + "predictions": datasets.Value("int32"), + "references": datasets.Value("int32"), + }), + reference_urls=[ + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html" + ], + ) + + def _compute(self, + predictions, + references, + normalize=True, + sample_weight=None): + return { + "accuracy": + accuracy_score( + references, + predictions, + normalize=normalize, + sample_weight=sample_weight, + ).item(), + } diff --git a/examples/torch_migration/pipeline/Step2/check_step2.py b/examples/torch_migration/pipeline/Step2/check_step2.py new file mode 100644 index 000000000000..ac74370e6a99 --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/check_step2.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./metric_torch.npy") + paddle_info = diff_helper.load_info("./metric_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="metric_diff.log") diff --git a/examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv b/examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv new file mode 100644 index 000000000000..fdc6b82affef --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv @@ -0,0 +1,33 @@ +sentence label +it 's a charming and often affecting journey . 1 +unflinchingly bleak and desperate 0 +allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 1 +the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 1 +it 's slow -- very , very slow . 0 +although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . 1 +a sometimes tedious film . 0 +or doing last year 's taxes with your ex-wife . 0 +you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . 1 +in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey . 0 +the mesmerizing performances of the leads keep the film grounded and keep the audience riveted . 
1 +it takes a strange kind of laziness to waste the talents of robert forster , anne meara , eugene levy , and reginald veljohnson all in the same movie . 0 +... the film suffers from a lack of humor ( something needed to balance out the violence ) ... 0 +we root for ( clara and paul ) , even like them , though perhaps it 's an emotion closer to pity . 1 +even horror fans will most likely not find what they 're seeking with trouble every day ; the movie lacks both thrills and humor . 0 +a gorgeous , high-spirited musical from india that exquisitely blends music , dance , song , and high drama . 1 +the emotions are raw and will strike a nerve with anyone who 's ever had family trauma . 1 +audrey tatou has a knack for picking roles that magnify her outrageous charm , and in this literate french comedy , she 's as morning-glory exuberant as she was in amélie . 1 +... the movie is just a plain old monster . 0 +in its best moments , resembles a bad high school production of grease , without benefit of song . 0 +pumpkin takes an admirable look at the hypocrisy of political correctness , but it does so with such an uneven tone that you never know when humor ends and tragedy begins . 0 +the iditarod lasts for days - this just felt like it did . 0 +holden caulfield did it better . 0 +a delectable and intriguing thriller filled with surprises , read my lips is an original . 1 +seldom has a movie so closely matched the spirit of a man and his work . 1 +nicks , seemingly uncertain what 's going to make people laugh , runs the gamut from stale parody to raunchy sex gags to formula romantic comedy . 0 +the action switches between past and present , but the material link is too tenuous to anchor the emotional connections that purport to span a 125-year divide . 0 +it 's an offbeat treat that pokes fun at the democratic exercise while also examining its significance for those who take part . 1 +it 's a cookie-cutter movie , a cut-and-paste job . 0 +i had to look away - this was god awful . 0 +thanks to scott 's charismatic roger and eisenberg 's sweet nephew , roger dodger is one of the most compelling variations on in the company of men . 1 +... designed to provide a mix of smiles and tears , `` crossroads '' instead provokes a handful of unintentional howlers and numerous yawns . 0 diff --git a/examples/torch_migration/pipeline/Step2/predict.py b/examples/torch_migration/pipeline/Step2/predict.py new file mode 100644 index 000000000000..93079f6ed8a1 --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/predict.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
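+"""Run the PaddlePaddle BERT classifier on one demo SST-2 sentence.
+
+Added descriptive docstring: the script loads the converted backbone weights and
+the shared classifier weights, tokenizes the first sentence in
+demo_sst2_sentence/demo.tsv, and prints the predicted class id with its
+softmax probability.
+"""
+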
+from functools import partial +import sys +import os + +import paddle +import paddle.nn as nn +from datasets import Dataset +from paddlenlp.data import Dict, Pad, Stack +from paddlenlp.datasets import load_dataset as ppnlp_load_dataset +from paddlenlp.transformers import BertTokenizer as PPNLPBertTokenizer +from reprod_log import ReprodDiffHelper, ReprodLogger +from transformers import BertTokenizer as HFBertTokenizer +import functools +import pandas as pd + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) +from models.pd_bert import BertConfig, BertForSequenceClassification + + +def get_data(): + + def read(data_path): + df = pd.read_csv(data_path, sep="\t") + for _, row in df.iterrows(): + yield {"sentence": row["sentence"], "labels": row["label"]} + + def convert_example(example, tokenizer, max_length=128): + labels = [example["labels"]] + #labels = np.array([example["labels"]], dtype="int64") + example = tokenizer(example["sentence"], max_seq_len=max_length) + return example + + tokenizer = PPNLPBertTokenizer.from_pretrained("bert-base-uncased") + dataset_test = ppnlp_load_dataset(read, + data_path='demo_sst2_sentence/demo.tsv', + lazy=False) + trans_func = partial(convert_example, tokenizer=tokenizer, max_length=128) + + dataset_test = dataset_test.map(trans_func, lazy=False) + one_sentence = dataset_test.new_data[0] + + for k in ["input_ids", "token_type_ids"]: + one_sentence[k] = paddle.to_tensor(one_sentence[k], dtype='int64') + one_sentence[k] = paddle.unsqueeze(one_sentence[k], axis=0) + + return one_sentence + + +@paddle.no_grad() +def main(): + # 模型定义 + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + + model.eval() + tokenizer = PPNLPBertTokenizer.from_pretrained("bert-base-uncased") + # 要预测的句子 + data = get_data() + softmax = nn.Softmax() + # 预测的各类别的概率值 + output = softmax(model(**data)[0]).numpy() + + # 概率值最大的类别 + class_id = output.argmax() + # 对应的概率值 + prob = output[0][class_id] + print(f"class_id: {class_id}, prob: {prob}") + return output + + +if __name__ == "__main__": + main() diff --git a/examples/torch_migration/pipeline/Step2/test_data.py b/examples/torch_migration/pipeline/Step2/test_data.py new file mode 100644 index 000000000000..37ed96699e3b --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/test_data.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
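+"""Check that the PaddlePaddle and PyTorch data pipelines produce the same batches.
+
+Added descriptive docstring: both pipelines read demo_sst2_sentence/demo.tsv,
+tokenize it with the respective bert-base-uncased tokenizer, and batch it with
+padding. Five randomly chosen dataset samples and the first five batches are
+recorded with ReprodLogger and compared through ReprodDiffHelper.
+"""
+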
+import functools +from functools import partial + +import numpy as np +import paddle +import pandas as pd +import torch +from datasets import Dataset +from paddlenlp.data import Dict, Pad, Stack +from paddlenlp.datasets import load_dataset as ppnlp_load_dataset +from paddlenlp.transformers import BertTokenizer as PPNLPBertTokenizer +from reprod_log import ReprodDiffHelper, ReprodLogger +from transformers import BertTokenizer as HFBertTokenizer + + +def build_paddle_data_pipeline(): + from paddlenlp.data import DataCollatorWithPadding + + def read(data_path): + df = pd.read_csv(data_path, sep="\t") + for _, row in df.iterrows(): + yield {"sentence": row["sentence"], "labels": row["label"]} + + def convert_example(example, tokenizer, max_length=128): + labels = [example["labels"]] + example = tokenizer(example["sentence"], max_seq_len=max_length) + + example["labels"] = labels + return example + + # load tokenizer + tokenizer = PPNLPBertTokenizer.from_pretrained("bert-base-uncased") + # load data + dataset_test = ppnlp_load_dataset(read, + data_path='demo_sst2_sentence/demo.tsv', + lazy=False) + trans_func = partial(convert_example, tokenizer=tokenizer, max_length=128) + + # tokenize data + dataset_test = dataset_test.map(trans_func, lazy=False) + + test_sampler = paddle.io.SequenceSampler(dataset_test) + test_batch_sampler = paddle.io.BatchSampler(sampler=test_sampler, + batch_size=4) + data_collator = DataCollatorWithPadding(tokenizer) + data_loader_test = paddle.io.DataLoader( + dataset_test, + batch_sampler=test_batch_sampler, + num_workers=0, + collate_fn=data_collator, + ) + + return dataset_test, data_loader_test + + +def build_torch_data_pipeline(): + from transformers import DataCollatorWithPadding + tokenizer = HFBertTokenizer.from_pretrained("bert-base-uncased") + + def preprocess_function(examples): + result = tokenizer( + examples["sentence"], + padding=False, + max_length=128, + truncation=True, + return_token_type_ids=True, + ) + if "label" in examples: + result["labels"] = [examples["label"]] + return result + + # load data + dataset_test = Dataset.from_csv("demo_sst2_sentence/demo.tsv", sep="\t") + dataset_test = dataset_test.map( + preprocess_function, + batched=False, + remove_columns=dataset_test.column_names, + desc="Running tokenizer on dataset", + ) + dataset_test.set_format("np", + columns=["input_ids", "token_type_ids", "labels"]) + test_sampler = torch.utils.data.SequentialSampler(dataset_test) + collate_fn = DataCollatorWithPadding(tokenizer) + data_loader_test = torch.utils.data.DataLoader( + dataset_test, + batch_size=4, + sampler=test_sampler, + num_workers=0, + collate_fn=collate_fn, + ) + return dataset_test, data_loader_test + + +def test_data_pipeline(): + diff_helper = ReprodDiffHelper() + paddle_dataset, paddle_dataloader = build_paddle_data_pipeline() + torch_dataset, torch_dataloader = build_torch_data_pipeline() + + logger_paddle_data = ReprodLogger() + logger_torch_data = ReprodLogger() + + logger_paddle_data.add("length", np.array(len(paddle_dataset))) + logger_torch_data.add("length", np.array(len(torch_dataset))) + + # random choose 5 images and check + for idx in range(5): + rnd_idx = np.random.randint(0, len(paddle_dataset)) + for k in ["input_ids", "token_type_ids", "labels"]: + + logger_paddle_data.add(f"dataset_{idx}_{k}", + np.array(paddle_dataset[rnd_idx][k])) + + logger_torch_data.add(f"dataset_{idx}_{k}", + np.array(torch_dataset[rnd_idx][k])) + + for idx, (paddle_batch, + torch_batch) in enumerate(zip(paddle_dataloader, + 
torch_dataloader)): + if idx >= 5: + break + for i, k in enumerate(["input_ids", "token_type_ids", "labels"]): + logger_paddle_data.add(f"dataloader_{idx}_{k}", + paddle_batch[k].numpy()) + logger_torch_data.add(f"dataloader_{idx}_{k}", + torch_batch[k].cpu().numpy()) + + diff_helper.compare_info(logger_paddle_data.data, logger_torch_data.data) + diff_helper.report() + + +if __name__ == "__main__": + test_data_pipeline() diff --git a/examples/torch_migration/pipeline/Step2/test_metric.py b/examples/torch_migration/pipeline/Step2/test_metric.py new file mode 100644 index 000000000000..408ffb0e840b --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/test_metric.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import torch +from datasets import load_metric +from paddle.metric import Accuracy +from reprod_log import ReprodLogger + + +def generate(): + pd_metric = Accuracy() + pd_metric.reset() + hf_metric = load_metric("accuracy.py") + for i in range(4): + logits = np.random.normal(0, 1, size=(64, 2)).astype("float32") + labels = np.random.randint(0, 2, size=(64, )).astype("int64") + # paddle metric + correct = pd_metric.compute(paddle.to_tensor(logits), + paddle.to_tensor(labels)) + pd_metric.update(correct) + # hf metric + hf_metric.add_batch( + predictions=torch.from_numpy(logits).argmax(dim=-1), + references=torch.from_numpy(labels), + ) + pd_accuracy = pd_metric.accumulate() + hf_accuracy = hf_metric.compute()["accuracy"] + reprod_logger = ReprodLogger() + reprod_logger.add("accuracy", np.array([pd_accuracy])) + reprod_logger.save("metric_paddle.npy") + reprod_logger = ReprodLogger() + reprod_logger.add("accuracy", np.array([hf_accuracy])) + reprod_logger.save("metric_torch.npy") + + +if __name__ == "__main__": + generate() diff --git a/examples/torch_migration/pipeline/Step3/README.md b/examples/torch_migration/pipeline/Step3/README.md new file mode 100644 index 000000000000..4e6e79ae1bf1 --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/README.md @@ -0,0 +1,67 @@ +# 使用方法 + +## 代码解析 + +以PaddlePaddle为例,下面为定义模型、计算loss并保存的代码。 + +```python +# paddle_loss.py +if __name__ == "__main__": + paddle.set_device("cpu") + + # def logger + reprod_logger = ReprodLogger() + + model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_classes=2) + classifier_weights = paddle.load("../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + + criterion = nn.CrossEntropyLoss() + + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = paddle.to_tensor(fake_data) + + fake_label = np.load("../fake_data/fake_label.npy") + fake_label = paddle.to_tensor(fake_label) + + # forward + out = model(fake_data) + + loss = criterion(out, fake_label) + # + reprod_logger.add("loss", loss.cpu().detach().numpy()) + reprod_logger.save("loss_paddle.npy") + +``` + 
+记录loss并保存在`loss_paddle.npy`文件中。 + + +## 操作步骤 + +* 具体操作步骤如下所示。 + + +```shell +# 生成paddle的前向loss结果 +python paddle_loss.py + +# 生成torch的前向loss结果 +python torch_loss.py + +# 对比生成log +python check_step3.py +``` + +`check_step3.py`的输出结果如下所示,同时也会保存在`loss_diff.log`文件中。 + +``` +[2021/11/17 21:27:35] root INFO: loss: +[2021/11/17 21:27:35] root INFO: mean diff: check passed: True, value: 5.960464477539063e-08 +[2021/11/17 21:27:35] root INFO: diff check passed + +``` + +diff为5.96e-8,check通过。 diff --git a/examples/torch_migration/pipeline/Step3/check_step3.py b/examples/torch_migration/pipeline/Step3/check_step3.py new file mode 100644 index 000000000000..546233dade0e --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/check_step3.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./loss_torch.npy") + paddle_info = diff_helper.load_info("./loss_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="loss_diff.log") diff --git a/examples/torch_migration/pipeline/Step3/paddle_loss.py b/examples/torch_migration/pipeline/Step3/paddle_loss.py new file mode 100644 index 000000000000..fd10a4fc32d2 --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/paddle_loss.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
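+"""Forward-and-loss check for the PaddlePaddle BERT classifier.
+
+Added descriptive docstring: loads the converted backbone and classifier
+weights, runs a forward pass on the fake data, computes the cross-entropy loss
+against the fake labels, and saves it to loss_paddle.npy for comparison.
+"""
+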
+import sys +import os + +import numpy as np +import paddle +import paddle.nn as nn +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + paddle.set_device("cpu") + + # def logger + reprod_logger = ReprodLogger() + + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + + criterion = nn.CrossEntropyLoss() + + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = paddle.to_tensor(fake_data) + + fake_label = np.load("../fake_data/fake_label.npy") + fake_label = paddle.to_tensor(fake_label) + + # forward + out = model(fake_data)[0] + + loss = criterion(out, fake_label) + reprod_logger.add("loss", loss.cpu().detach().numpy()) + reprod_logger.save("loss_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step3/torch_loss.py b/examples/torch_migration/pipeline/Step3/torch_loss.py new file mode 100644 index 000000000000..302520581023 --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/torch_loss.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
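+"""Forward-and-loss check for the PyTorch BERT classifier.
+
+Added descriptive docstring: mirrors paddle_loss.py on the PyTorch side,
+loading the reference weights, running a forward pass on the same fake data,
+computing the cross-entropy loss, and saving it to loss_torch.npy.
+"""
+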
+import sys +import os + +import numpy as np +import paddle +import torch +import torch.nn as nn +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pt_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + + # def logger + reprod_logger = ReprodLogger() + + criterion = nn.CrossEntropyLoss() + + pytorch_dump_path = '../weights/torch_weight.bin' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + + classifier_weights = torch.load( + "../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.eval() + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = torch.from_numpy(fake_data) + + fake_label = np.load("../fake_data/fake_label.npy") + fake_label = torch.from_numpy(fake_label) + + # forward + out = model(fake_data)[0] + + loss = criterion(out, fake_label) + reprod_logger.add("loss", loss.cpu().detach().numpy()) + reprod_logger.save("loss_torch.npy") diff --git a/examples/torch_migration/pipeline/Step4/README.md b/examples/torch_migration/pipeline/Step4/README.md new file mode 100644 index 000000000000..695b0728a773 --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/README.md @@ -0,0 +1,136 @@ +# 使用方法 + +### 学习率对齐验证 + +运行下面的命令,检查学习率模块设置是否正确。 + +```shell +python test_lr_scheduler.py +``` + +最终输出内容如下。 + +``` +[2021/11/17 21:44:19] root INFO: step_100_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_300_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_500_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_700_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_900_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_100_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_300_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_500_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: False, value: 9.35605818719964e-06 +[2021/11/17 21:44:19] root INFO: step_700_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: False, value: 1.3681476625617212e-05 +[2021/11/17 21:44:19] root INFO: step_900_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: False, value: 1.8924391285779562e-05 +[2021/11/17 21:44:19] root INFO: step_100_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_300_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_500_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_700_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: 
step_900_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: diff check failed + +``` + +linear和polynomial方式衰减的学习率diff为0,check通过,cosine方式衰减学习率可能由于计算误差未通过。 + + +### 反向对齐操作方法 + +#### 代码讲解 + +以PaddlePaddle为例,训练流程核心代码如下所示。每个iter中输入相同的fake data与fake label,计算loss,进行梯度反传与参数更新,将loss批量返回,用于后续的验证。 + +```python +def pd_train_some_iters(model, + criterion, + optimizer, + fake_data, + fake_label, + max_iter=2): + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = PDBertConfig() + model = PDBertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + criterion = paddle.nn.CrossEntropy() + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW(learning_rate=3e-5, parameters=model.parameters(), + weight_decay=1e-2, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params) + loss_list = [] + for idx in range(max_iter): + input_ids = paddle.to_tensor(fake_data) + labels = paddle.to_tensor(fake_label) + + output = model(input_ids) + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + loss_list.append(loss) + return loss_list +``` + + +#### 操作方法 + +运行下面的命令,基于fake data与fake label,依次生成若干轮loss数据并保存,使用`reprod_log`工具进行diff排查。 + +```shell +# 生成paddle和torch的前向数据 +python test_bp.py + +# 对比生成log +python check_step4.py +``` + +最终输出结果如下,同时会保存在文件`bp_align_diff.log`中。 + +``` +[2021/11/17 22:08:30] root INFO: loss_0: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_1: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_2: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_3: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_4: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_5: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_6: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_7: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_8: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_9: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: diff check passed + +``` + +前面10轮的loss diff均等于0,check通过。 diff --git a/examples/torch_migration/pipeline/Step4/check_step4.py b/examples/torch_migration/pipeline/Step4/check_step4.py new file mode 100644 index 000000000000..751be400682b --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/check_step4.py @@ -0,0 +1,23 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./bp_align_torch.npy") + paddle_info = diff_helper.load_info("./bp_align_paddle.npy") + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="bp_align_diff.log") diff --git a/examples/torch_migration/pipeline/Step4/test_bp.py b/examples/torch_migration/pipeline/Step4/test_bp.py new file mode 100644 index 000000000000..c584ef60ee50 --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/test_bp.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os + +import numpy as np +import paddle +import torch +from reprod_log import ReprodLogger +from transformers import AdamW + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import ( + BertForSequenceClassification as PDBertForSequenceClassification, ) +from models.pd_bert import ( + BertConfig as PDBertConfig, ) +from models.pt_bert import ( + BertForSequenceClassification as HFBertForSequenceClassification, ) +from models.pt_bert import ( + BertConfig as HFBertConfig, ) + + +def pd_train_some_iters(model, + criterion, + optimizer, + fake_data, + fake_label, + max_iter=2): + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = PDBertConfig() + model = PDBertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + criterion = paddle.nn.CrossEntropy() + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW( + learning_rate=3e-5, + parameters=model.parameters(), + weight_decay=1e-2, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params, + ) + loss_list = [] + for idx in range(max_iter): + input_ids = paddle.to_tensor(fake_data) + labels = paddle.to_tensor(fake_label) + + output = model(input_ids) + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + loss_list.append(loss) + return loss_list + + +def hf_train_some_iters(fake_data, fake_label, max_iter=2): + + pytorch_dump_path = '../weights/torch_weight.bin' + config = HFBertConfig() + model = HFBertForSequenceClassification(config) + 
checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + classifier_weights = torch.load( + "../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.eval() + criterion = torch.nn.CrossEntropyLoss() + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": + 1e-2, + }, + { + "params": [ + p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": + 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5) + + loss_list = [] + for idx in range(max_iter): + input_ids = torch.from_numpy(fake_data) + labels = torch.from_numpy(fake_label) + + output = model(input_ids)[0] + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.zero_grad() + loss_list.append(loss) + return loss_list + + +if __name__ == "__main__": + print("Start training") + paddle.set_device("cpu") + fake_data = np.load("../fake_data/fake_data.npy") + fake_label = np.load("../fake_data/fake_label.npy") + hf_reprod_logger = ReprodLogger() + hf_loss_list = hf_train_some_iters(fake_data, fake_label, 10) + for idx, loss in enumerate(hf_loss_list): + hf_reprod_logger.add(f"loss_{idx}", loss.detach().cpu().numpy()) + hf_reprod_logger.save("bp_align_torch.npy") + + pd_reprod_logger = ReprodLogger() + pd_loss_list = hf_train_some_iters(fake_data, fake_label, 10) + for idx, loss in enumerate(pd_loss_list): + pd_reprod_logger.add(f"loss_{idx}", loss.detach().cpu().numpy()) + pd_reprod_logger.save("bp_align_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step4/test_lr_scheduler.py b/examples/torch_migration/pipeline/Step4/test_lr_scheduler.py new file mode 100644 index 000000000000..9e5f86520658 --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/test_lr_scheduler.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
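+"""Compare HuggingFace and PaddleNLP learning-rate schedulers step by step.
+
+Added descriptive docstring: for the linear, cosine, and polynomial schedules,
+the learning rate is logged at steps 100/300/500/700/900 with ReprodLogger and
+the two frameworks are compared through ReprodDiffHelper.
+"""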
+ +import numpy as np +import paddle +import torch + +# define paddle scheduler +from paddlenlp.transformers import ( + CosineDecayWithWarmup, + LinearDecayWithWarmup, + PolyDecayWithWarmup, +) +from reprod_log import ReprodDiffHelper, ReprodLogger +from torch.optim import AdamW +from transformers.optimization import get_scheduler as get_hf_scheduler + +scheduler_type2cls = { + "linear": LinearDecayWithWarmup, + "cosine": CosineDecayWithWarmup, + "polynomial": PolyDecayWithWarmup, +} + + +def get_paddle_scheduler( + learning_rate, + scheduler_type, + num_warmup_steps=None, + num_training_steps=None, + **scheduler_kwargs, +): + if scheduler_type not in scheduler_type2cls.keys(): + data = " ".join(scheduler_type2cls.keys()) + raise ValueError(f"scheduler_type must be choson from {data}") + + if num_warmup_steps is None: + raise ValueError( + f"requires `num_warmup_steps`, please provide that argument.") + + if num_training_steps is None: + raise ValueError( + f"requires `num_training_steps`, please provide that argument.") + + return scheduler_type2cls[scheduler_type]( + learning_rate=learning_rate, + total_steps=num_training_steps, + warmup=num_warmup_steps, + **scheduler_kwargs, + ) + + +def test_lr(): + diff_helper = ReprodDiffHelper() + pd_reprod_logger = ReprodLogger() + hf_reprod_logger = ReprodLogger() + lr = 3e-5 + num_warmup_steps = 345 + num_training_steps = 1024 + milestone = [100, 300, 500, 700, 900] + for scheduler_type in ["linear", "cosine", "polynomial"]: + torch_optimizer = AdamW(torch.nn.Linear(1, 1).parameters(), lr=lr) + hf_scheduler = get_hf_scheduler( + name=scheduler_type, + optimizer=torch_optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + pd_scheduler = get_paddle_scheduler( + learning_rate=lr, + scheduler_type=scheduler_type, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + + for i in range(num_training_steps): + hf_scheduler.step() + pd_scheduler.step() + if i in milestone: + hf_reprod_logger.add( + f"step_{i}_{scheduler_type}_lr", + np.array([hf_scheduler.get_last_lr()[-1]]), + ) + pd_reprod_logger.add(f"step_{i}_{scheduler_type}_lr", + np.array([pd_scheduler.get_lr()])) + + diff_helper.compare_info(hf_reprod_logger.data, pd_reprod_logger.data) + diff_helper.report() + + +if __name__ == "__main__": + test_lr() diff --git a/examples/torch_migration/pipeline/Step5/README.md b/examples/torch_migration/pipeline/Step5/README.md new file mode 100644 index 000000000000..bab96301cac7 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/README.md @@ -0,0 +1,29 @@ +# 使用方法 + +首先运行下面的python代码,生成`train_align_torch.npy`和`train_align_paddle.npy`文件。 + +```python +# 运行生成paddle结果 +cd bert_paddle/ +sh train.sh +# 运行生成torch结果 +cd bert_torch/ +sh train.sh +``` + +然后运行下面的代码,运行训练脚本;之后使用`check_step5.py`进行精度diff验证。 + +```shell +# 对比生成log +python check_step5.py +``` + +这里需要注意的是,由于是精度对齐,SST-2数据集的精度diff在0.15%以内时,可以认为对齐,因此将`diff_threshold`参数修改为了`0.0015`。 + +``` +[2021/11/17 22:41:12] root INFO: acc: +[2021/11/17 22:41:12] root INFO: mean diff: check passed: True, value: 0.0011467889908256534 +[2021/11/17 22:41:12] root INFO: diff check passed +``` + +最终diff为`0.00114`,小于阈值标准,检查通过。 diff --git a/examples/torch_migration/pipeline/Step5/bert_paddle/train.py b/examples/torch_migration/pipeline/Step5/bert_paddle/train.py new file mode 100644 index 000000000000..ea0a2ec40302 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_paddle/train.py @@ -0,0 +1,342 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +import datetime +import random +import time +from functools import partial + +import numpy as np +import paddle +import paddle.nn as nn +import utils +from paddle.metric import Accuracy +from paddle.optimizer import AdamW +from paddlenlp.data import Dict, Pad, Stack +from paddlenlp.datasets import load_dataset +from paddlenlp.transformers import BertTokenizer +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 2)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import BertConfig, BertForSequenceClassification + + +def train_one_epoch( + model, + criterion, + optimizer, + lr_scheduler, + data_loader, + epoch, + print_freq, + scaler=None, +): + model.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter("lr", + utils.SmoothedValue(window_size=1, fmt="{value}")) + metric_logger.add_meter("sentence/s", + utils.SmoothedValue(window_size=10, fmt="{value}")) + + header = "Epoch: [{}]".format(epoch) + for batch in metric_logger.log_every(data_loader, print_freq, header): + inputs = {"input_ids": batch[0], "token_type_ids": batch[1]} + labels = batch[2] + start_time = time.time() + with paddle.amp.auto_cast( + enable=scaler is not None, + custom_white_list=["layer_norm", "softmax", "gelu"], + ): + logits = model(**inputs)[0] + loss = criterion( + logits.reshape([-1, 2]), + labels.reshape([ + -1, + ]), + ) + + optimizer.clear_grad() + if scaler is not None: + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + optimizer.step() + lr_scheduler.step() + batch_size = inputs["input_ids"].shape[0] + metric_logger.update(loss=loss.item(), lr=lr_scheduler.get_lr()) + metric_logger.meters["sentence/s"].update(batch_size / + (time.time() - start_time)) + + +def evaluate(model, criterion, data_loader, metric, print_freq=100): + model.eval() + metric.reset() + metric_logger = utils.MetricLogger(delimiter=" ") + header = "Test:" + with paddle.no_grad(): + for batch in metric_logger.log_every(data_loader, print_freq, header): + inputs = {"input_ids": batch[0], "token_type_ids": batch[1]} + labels = batch[2] + logits = model(**inputs)[0] + loss = criterion( + logits.reshape([-1, 2]), + labels.reshape([ + -1, + ]), + ) + metric_logger.update(loss=loss.item()) + correct = metric.compute(logits, labels) + metric.update(correct) + acc_global_avg = metric.accumulate() + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print(" * Accuracy {acc_global_avg:.6f}".format( + acc_global_avg=acc_global_avg)) + return acc_global_avg + + +def set_seed(seed=42): + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + +def convert_example(example, tokenizer, max_length=128): + labels = np.array([example["labels"]], dtype="int64") + example = tokenizer(example["sentence"], max_seq_len=max_length) + return { + 
"input_ids": example["input_ids"], + "token_type_ids": example["token_type_ids"], + "labels": labels, + } + + +def load_data(args, tokenizer): + print("Loading data") + train_ds = load_dataset("glue", args.task_name, splits="train") + validation_ds = load_dataset("glue", args.task_name, splits="dev") + + trans_func = partial(convert_example, + tokenizer=tokenizer, + max_length=args.max_length) + train_ds = train_ds.map(trans_func, lazy=False) + validation_ds = validation_ds.map(trans_func, lazy=False) + + train_sampler = paddle.io.BatchSampler(train_ds, + batch_size=args.batch_size, + shuffle=False) + validation_sampler = paddle.io.BatchSampler(validation_ds, + batch_size=args.batch_size, + shuffle=False) + + return train_ds, validation_ds, train_sampler, validation_sampler + + +def main(args): + if args.output_dir: + pass + # utils.mkdir(args.output_dir) + print(args) + scaler = None + # if args.fp16: + # scaler = paddle.amp.GradScaler() + paddle.set_device(args.device) + + if args.seed is not None: + set_seed(args.seed) + + tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), + "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), + "labels": Stack(dtype="int64"), + }): fn(samples) + train_dataset, validation_dataset, train_sampler, validation_sampler = load_data( + args, tokenizer) + + train_data_loader = paddle.io.DataLoader( + train_dataset, + batch_sampler=train_sampler, + num_workers=args.workers, + collate_fn=batchify_fn, + ) + validation_data_loader = paddle.io.DataLoader( + validation_dataset, + batch_sampler=validation_sampler, + num_workers=args.workers, + collate_fn=batchify_fn, + ) + + print("Creating model") + paddle_dump_path = '../../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + + print("Creating criterion") + criterion = nn.CrossEntropyLoss() + + print("Creating lr_scheduler") + lr_scheduler = utils.get_scheduler( + learning_rate=args.lr, + scheduler_type=args.lr_scheduler_type, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.num_train_epochs * len(train_data_loader), + ) + + print("Creating optimizer") + # Split weights in two groups, one with weight decay and the other not. 
+ decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = AdamW( + learning_rate=lr_scheduler, + parameters=model.parameters(), + weight_decay=args.weight_decay, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params, + ) + metric = Accuracy() + + if args.test_only: + evaluate(model, criterion, validation_data_loader, metric) + return + + print("Start training") + start_time = time.time() + best_accuracy = 0.0 + for epoch in range(args.num_train_epochs): + + train_one_epoch( + model, + criterion, + optimizer, + lr_scheduler, + train_data_loader, + epoch, + args.print_freq, + scaler, + ) + acc = evaluate(model, criterion, validation_data_loader, metric) + best_accuracy = max(best_accuracy, acc) + if args.output_dir: + pass + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("Training time {}".format(total_time_str)) + return best_accuracy + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser( + description="Paddle SST-2 Classification Training", add_help=add_help) + parser.add_argument("--task_name", + default="sst-2", + help="the name of the glue task to train on.") + parser.add_argument( + "--model_name_or_path", + default="bert-base-uncased", + help= + "path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument("--device", default="gpu", help="device") + parser.add_argument("--batch_size", default=32, type=int) + parser.add_argument( + "--max_length", + type=int, + default=128, + help= + ("The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," + ), + ) + parser.add_argument("--num_train_epochs", + default=3, + type=int, + help="number of total epochs to run") + parser.add_argument( + "--workers", + default=0, + type=int, + help="number of data loading workers (default: 16)", + ) + parser.add_argument("--lr", + default=3e-5, + type=float, + help="initial learning rate") + parser.add_argument( + "--weight_decay", + default=1e-2, + type=float, + help="weight decay (default: 1e-2)", + dest="weight_decay", + ) + parser.add_argument( + "--lr_scheduler_type", + default="linear", + help="the scheduler type to use.", + choices=["linear", "cosine", "polynomial"], + ) + parser.add_argument( + "--num_warmup_steps", + default=0, + type=int, + help="number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument("--print_freq", + default=10, + type=int, + help="print frequency") + parser.add_argument("--output_dir", + default="outputs", + help="path where to save") + parser.add_argument( + "--test_only", + help="only test the model", + action="store_true", + ) + parser.add_argument("--seed", + default=42, + type=int, + help="a seed for reproducible training.") + # Mixed precision training parameters + parser.add_argument("--fp16", + action="store_true", + help="whether or not mixed precision training") + + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + acc = main(args) + reprod_logger = ReprodLogger() + reprod_logger.add("acc", np.array([acc])) + reprod_logger.save("train_align_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step5/bert_paddle/train.sh b/examples/torch_migration/pipeline/Step5/bert_paddle/train.sh new file mode 100644 index 000000000000..5c5e367f6404 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_paddle/train.sh @@ 
-0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python -m paddle.distributed.launch --gpus "1" train.py \ + --model_name_or_path bert-base-uncased \ + --batch_size 128 \ + --num_warmup_steps 158 \ + --output_dir paddle_outputs + diff --git a/examples/torch_migration/pipeline/Step5/bert_paddle/utils.py b/examples/torch_migration/pipeline/Step5/bert_paddle/utils.py new file mode 100644 index 000000000000..faf5fbe0e374 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_paddle/utils.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import errno +import os +import time +from collections import OrderedDict, defaultdict, deque + +import paddle +from paddlenlp.transformers import ( + CosineDecayWithWarmup, + LinearDecayWithWarmup, + PolyDecayWithWarmup, +) + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + t = paddle.to_tensor([self.count, self.total], dtype="float64") + t = t.numpy().tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = paddle.to_tensor(list(self.deque)) + return d.median().numpy().item() + + @property + def avg(self): + d = paddle.to_tensor(list(self.deque), dtype="float32") + return d.mean().numpy().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) + + +class MetricLogger(object): + + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, paddle.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if paddle.device.is_compiled_with_cuda(): + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ]) + else: + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ]) + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + )) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("{} Total time: {}".format(header, total_time_str)) + + +scheduler_type2cls = { + "linear": LinearDecayWithWarmup, + "cosine": CosineDecayWithWarmup, + "polynomial": PolyDecayWithWarmup, +} + + +def get_scheduler( + learning_rate, + scheduler_type, + num_warmup_steps=None, + num_training_steps=None, + **scheduler_kwargs, +): + if scheduler_type not in scheduler_type2cls.keys(): + data = " ".join(scheduler_type2cls.keys()) + raise ValueError(f"scheduler_type must be choson from {data}") + + if num_warmup_steps is None: + raise ValueError( + f"requires `num_warmup_steps`, please provide that argument.") + + if num_training_steps is None: + raise ValueError( + f"requires `num_training_steps`, please provide that argument.") + + 
return scheduler_type2cls[scheduler_type]( + learning_rate=learning_rate, + total_steps=num_training_steps, + warmup=num_warmup_steps, + **scheduler_kwargs, + ) + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py b/examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py new file mode 100644 index 000000000000..ae447e4a398a --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py @@ -0,0 +1,96 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Accuracy metric.""" + +import datasets +from sklearn.metrics import accuracy_score + +_DESCRIPTION = """ +Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: +Accuracy = (TP + TN) / (TP + TN + FP + FN) +TP: True positive +TN: True negative +FP: False positive +FN: False negative +""" + +_KWARGS_DESCRIPTION = """ +Args: + predictions: Predicted labels, as returned by a model. + references: Ground truth labels. + normalize: If False, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + sample_weight: Sample weights. +Returns: + accuracy: Accuracy score. +Examples: + + >>> accuracy_metric = datasets.load_metric("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1]) + >>> print(results) + {'accuracy': 1.0} +""" + +_CITATION = """\ +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, + _KWARGS_DESCRIPTION) +class Accuracy(datasets.Metric): + + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Sequence(datasets.Value("int32")), + "references": datasets.Sequence(datasets.Value("int32")), + } if self.config_name == "multilabel" else { + "predictions": datasets.Value("int32"), + "references": datasets.Value("int32"), + }), + reference_urls=[ + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html" + ], + ) + + def _compute(self, + predictions, + references, + normalize=True, + sample_weight=None): + return { + "accuracy": + accuracy_score( + references, + predictions, + normalize=normalize, + sample_weight=sample_weight, + ).item(), + } diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/glue.py b/examples/torch_migration/pipeline/Step5/bert_torch/glue.py new file mode 100644 index 000000000000..028c09918f67 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/glue.py @@ -0,0 +1,633 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""The General Language Understanding Evaluation (GLUE) benchmark.""" + +import csv +import os +import textwrap + +import datasets +import numpy as np + +_GLUE_CITATION = """\ +@inproceedings{wang2019glue, + title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, + author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, + note={In the Proceedings of ICLR.}, + year={2019} +} +""" + +_GLUE_DESCRIPTION = """\ +GLUE, the General Language Understanding Evaluation benchmark +(https://gluebenchmark.com/) is a collection of resources for training, +evaluating, and analyzing natural language understanding systems. 
+ +""" + +_MRPC_DEV_IDS = "https://dl.fbaipublicfiles.com/glue/data/mrpc_dev_ids.tsv" +_MRPC_TRAIN = ( + "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt" +) +_MRPC_TEST = ( + "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt" +) + +_MNLI_BASE_KWARGS = dict( + text_features={ + "premise": "sentence1", + "hypothesis": "sentence2", + }, + label_classes=["entailment", "neutral", "contradiction"], + label_column="gold_label", + data_url="https://dl.fbaipublicfiles.com/glue/data/MNLI.zip", + data_dir="MNLI", + citation=textwrap.dedent("""\ + @InProceedings{N18-1101, + author = "Williams, Adina + and Nangia, Nikita + and Bowman, Samuel", + title = "A Broad-Coverage Challenge Corpus for + Sentence Understanding through Inference", + booktitle = "Proceedings of the 2018 Conference of + the North American Chapter of the + Association for Computational Linguistics: + Human Language Technologies, Volume 1 (Long + Papers)", + year = "2018", + publisher = "Association for Computational Linguistics", + pages = "1112--1122", + location = "New Orleans, Louisiana", + url = "http://aclweb.org/anthology/N18-1101" + } + @article{bowman2015large, + title={A large annotated corpus for learning natural language inference}, + author={Bowman, Samuel R and Angeli, Gabor and Potts, Christopher and Manning, Christopher D}, + journal={arXiv preprint arXiv:1508.05326}, + year={2015} + }"""), + url="http://www.nyu.edu/projects/bowman/multinli/", +) + + +class GlueConfig(datasets.BuilderConfig): + """BuilderConfig for GLUE.""" + + def __init__( + self, + text_features, + label_column, + data_url, + data_dir, + citation, + url, + label_classes=None, + process_label=lambda x: x, + **kwargs, + ): + """BuilderConfig for GLUE. + + Args: + text_features: `dict[string, string]`, map from the name of the feature + dict for each text field to the name of the column in the tsv file + label_column: `string`, name of the column in the tsv file corresponding + to the label + data_url: `string`, url to download the zip file from + data_dir: `string`, the path to the folder containing the tsv files in the + downloaded zip + citation: `string`, citation for the data set + url: `string`, url for information about the data set + label_classes: `list[string]`, the list of classes if the label is + categorical. If not provided, then the label will be of type + `datasets.Value('float32')`. + process_label: `Function[string, any]`, function taking in the raw value + of the label and processing it to the form required by the label feature + **kwargs: keyword arguments forwarded to super. + """ + super(GlueConfig, self).__init__(version=datasets.Version("1.0.0", ""), + **kwargs) + self.text_features = text_features + self.label_column = label_column + self.label_classes = label_classes + self.data_url = data_url + self.data_dir = data_dir + self.citation = citation + self.url = url + self.process_label = process_label + + +class Glue(datasets.GeneratorBasedBuilder): + """The General Language Understanding Evaluation (GLUE) benchmark.""" + + BUILDER_CONFIGS = [ + GlueConfig( + name="cola", + description=textwrap.dedent("""\ + The Corpus of Linguistic Acceptability consists of English + acceptability judgments drawn from books and journal articles on + linguistic theory. 
Each example is a sequence of words annotated + with whether it is a grammatical English sentence."""), + text_features={"sentence": "sentence"}, + label_classes=["unacceptable", "acceptable"], + label_column="is_acceptable", + data_url="https://dl.fbaipublicfiles.com/glue/data/CoLA.zip", + data_dir="CoLA", + citation=textwrap.dedent("""\ + @article{warstadt2018neural, + title={Neural Network Acceptability Judgments}, + author={Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R}, + journal={arXiv preprint arXiv:1805.12471}, + year={2018} + }"""), + url="https://nyu-mll.github.io/CoLA/", + ), + GlueConfig( + name="sst2", + description=textwrap.dedent("""\ + The Stanford Sentiment Treebank consists of sentences from movie reviews and + human annotations of their sentiment. The task is to predict the sentiment of a + given sentence. We use the two-way (positive/negative) class split, and use only + sentence-level labels."""), + text_features={"sentence": "sentence"}, + label_classes=["negative", "positive"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip", + data_dir="SST-2", + citation=textwrap.dedent("""\ + @inproceedings{socher2013recursive, + title={Recursive deep models for semantic compositionality over a sentiment treebank}, + author={Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D and Ng, Andrew and Potts, Christopher}, + booktitle={Proceedings of the 2013 conference on empirical methods in natural language processing}, + pages={1631--1642}, + year={2013} + }"""), + url="https://datasets.stanford.edu/sentiment/index.html", + ), + GlueConfig( + name="mrpc", + description=textwrap.dedent("""\ + The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of + sentence pairs automatically extracted from online news sources, with human annotations + for whether the sentences in the pair are semantically equivalent.""" + ), # pylint: disable=line-too-long + text_features={ + "sentence1": "", + "sentence2": "" + }, + label_classes=["not_equivalent", "equivalent"], + label_column="Quality", + data_url="", # MRPC isn't hosted by GLUE. + data_dir="MRPC", + citation=textwrap.dedent("""\ + @inproceedings{dolan2005automatically, + title={Automatically constructing a corpus of sentential paraphrases}, + author={Dolan, William B and Brockett, Chris}, + booktitle={Proceedings of the Third International Workshop on Paraphrasing (IWP2005)}, + year={2005} + }"""), + url="https://www.microsoft.com/en-us/download/details.aspx?id=52398", + ), + GlueConfig( + name="qqp", + description=textwrap.dedent("""\ + The Quora Question Pairs2 dataset is a collection of question pairs from the + community question-answering website Quora. 
The task is to determine whether a + pair of questions are semantically equivalent."""), + text_features={ + "question1": "question1", + "question2": "question2", + }, + label_classes=["not_duplicate", "duplicate"], + label_column="is_duplicate", + data_url="https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip", + data_dir="QQP", + citation=textwrap.dedent("""\ + @online{WinNT, + author = {Iyer, Shankar and Dandekar, Nikhil and Csernai, Kornel}, + title = {First Quora Dataset Release: Question Pairs}, + year = {2017}, + url = {https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs}, + urldate = {2019-04-03} + }"""), + url= + "https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs", + ), + GlueConfig( + name="stsb", + description=textwrap.dedent("""\ + The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of + sentence pairs drawn from news headlines, video and image captions, and natural + language inference data. Each pair is human-annotated with a similarity score + from 1 to 5."""), + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_column="score", + data_url="https://dl.fbaipublicfiles.com/glue/data/STS-B.zip", + data_dir="STS-B", + citation=textwrap.dedent("""\ + @article{cer2017semeval, + title={Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation}, + author={Cer, Daniel and Diab, Mona and Agirre, Eneko and Lopez-Gazpio, Inigo and Specia, Lucia}, + journal={arXiv preprint arXiv:1708.00055}, + year={2017} + }"""), + url="http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark", + process_label=np.float32, + ), + GlueConfig( + name="mnli", + description=textwrap.dedent("""\ + The Multi-Genre Natural Language Inference Corpus is a crowdsourced + collection of sentence pairs with textual entailment annotations. Given a premise sentence + and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis + (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The premise sentences are + gathered from ten different sources, including transcribed speech, fiction, and government reports. + We use the standard test set, for which we obtained private labels from the authors, and evaluate + on both the matched (in-domain) and mismatched (cross-domain) section. We also use and recommend + the SNLI corpus as 550k examples of auxiliary training data."""), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="mnli_mismatched", + description=textwrap.dedent("""\ + The mismatched validation and test splits from MNLI. + See the "mnli" BuilderConfig for additional information."""), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="mnli_matched", + description=textwrap.dedent("""\ + The matched validation and test splits from MNLI. + See the "mnli" BuilderConfig for additional information."""), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="qnli", + description=textwrap.dedent("""\ + The Stanford Question Answering Dataset is a question-answering + dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn + from Wikipedia) contains the answer to the corresponding question (written by an annotator). We + convert the task into sentence pair classification by forming a pair between each question and each + sentence in the corresponding context, and filtering out pairs with low lexical overlap between the + question and the context sentence. 
The task is to determine whether the context sentence contains + the answer to the question. This modified version of the original task removes the requirement that + the model select the exact answer, but also removes the simplifying assumptions that the answer + is always present in the input and that lexical overlap is a reliable cue.""" + ), # pylint: disable=line-too-long + text_features={ + "question": "question", + "sentence": "sentence", + }, + label_classes=["entailment", "not_entailment"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip", + data_dir="QNLI", + citation=textwrap.dedent("""\ + @article{rajpurkar2016squad, + title={Squad: 100,000+ questions for machine comprehension of text}, + author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy}, + journal={arXiv preprint arXiv:1606.05250}, + year={2016} + }"""), + url="https://rajpurkar.github.io/SQuAD-explorer/", + ), + GlueConfig( + name="rte", + description=textwrap.dedent("""\ + The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual + entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim + et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are + constructed based on news and Wikipedia text. We convert all datasets to a two-class split, where + for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.""" + ), # pylint: disable=line-too-long + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_classes=["entailment", "not_entailment"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/RTE.zip", + data_dir="RTE", + citation=textwrap.dedent("""\ + @inproceedings{dagan2005pascal, + title={The PASCAL recognising textual entailment challenge}, + author={Dagan, Ido and Glickman, Oren and Magnini, Bernardo}, + booktitle={Machine Learning Challenges Workshop}, + pages={177--190}, + year={2005}, + organization={Springer} + } + @inproceedings{bar2006second, + title={The second pascal recognising textual entailment challenge}, + author={Bar-Haim, Roy and Dagan, Ido and Dolan, Bill and Ferro, Lisa and Giampiccolo, Danilo and Magnini, Bernardo and Szpektor, Idan}, + booktitle={Proceedings of the second PASCAL challenges workshop on recognising textual entailment}, + volume={6}, + number={1}, + pages={6--4}, + year={2006}, + organization={Venice} + } + @inproceedings{giampiccolo2007third, + title={The third pascal recognizing textual entailment challenge}, + author={Giampiccolo, Danilo and Magnini, Bernardo and Dagan, Ido and Dolan, Bill}, + booktitle={Proceedings of the ACL-PASCAL workshop on textual entailment and paraphrasing}, + pages={1--9}, + year={2007}, + organization={Association for Computational Linguistics} + } + @inproceedings{bentivogli2009fifth, + title={The Fifth PASCAL Recognizing Textual Entailment Challenge.}, + author={Bentivogli, Luisa and Clark, Peter and Dagan, Ido and Giampiccolo, Danilo}, + booktitle={TAC}, + year={2009} + }"""), + url="https://aclweb.org/aclwiki/Recognizing_Textual_Entailment", + ), + GlueConfig( + name="wnli", + description=textwrap.dedent("""\ + The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task + in which a system must read a sentence with a pronoun and select the referent of that pronoun from + a list of choices. 
The examples are manually constructed to foil simple statistical methods: Each + one is contingent on contextual information provided by a single word or phrase in the sentence. + To convert the problem into sentence pair classification, we construct sentence pairs by replacing + the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the + pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of + new examples derived from fiction books that was shared privately by the authors of the original + corpus. While the included training set is balanced between two classes, the test set is imbalanced + between them (65% not entailment). Also, due to a data quirk, the development set is adversarial: + hypotheses are sometimes shared between training and development examples, so if a model memorizes the + training examples, they will predict the wrong label on corresponding development set + example. As with QNLI, each example is evaluated separately, so there is not a systematic correspondence + between a model's score on this task and its score on the unconverted original task. We + call converted dataset WNLI (Winograd NLI)."""), + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_classes=["not_entailment", "entailment"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/WNLI.zip", + data_dir="WNLI", + citation=textwrap.dedent("""\ + @inproceedings{levesque2012winograd, + title={The winograd schema challenge}, + author={Levesque, Hector and Davis, Ernest and Morgenstern, Leora}, + booktitle={Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning}, + year={2012} + }"""), + url= + "https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html", + ), + GlueConfig( + name="ax", + description=textwrap.dedent("""\ + A manually-curated evaluation dataset for fine-grained analysis of + system performance on a broad range of linguistic phenomena. This + dataset evaluates sentence understanding through Natural Language + Inference (NLI) problems. Use a model trained on MulitNLI to produce + predictions for this dataset."""), + text_features={ + "premise": "sentence1", + "hypothesis": "sentence2", + }, + label_classes=["entailment", "neutral", "contradiction"], + label_column="", # No label since we only have test set. + # We must use a URL shortener since the URL from GLUE is very long and + # causes issues in TFDS. + data_url="https://dl.fbaipublicfiles.com/glue/data/AX.tsv", + data_dir="", # We are downloading a tsv. + citation="", # The GLUE citation is sufficient. 
+ url="https://gluebenchmark.com/diagnostics", + ), + ] + + def _info(self): + features = { + text_feature: datasets.Value("string") + for text_feature in self.config.text_features.keys() + } + if self.config.label_classes: + features["label"] = datasets.features.ClassLabel( + names=self.config.label_classes) + else: + features["label"] = datasets.Value("float32") + features["idx"] = datasets.Value("int32") + return datasets.DatasetInfo( + description=_GLUE_DESCRIPTION, + features=datasets.Features(features), + homepage=self.config.url, + citation=self.config.citation + "\n" + _GLUE_CITATION, + ) + + def _split_generators(self, dl_manager): + if self.config.name == "ax": + data_file = dl_manager.download(self.config.data_url) + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_file": data_file, + "split": "test", + }, + ) + ] + + if self.config.name == "mrpc": + data_dir = None + mrpc_files = dl_manager.download({ + "dev_ids": _MRPC_DEV_IDS, + "train": _MRPC_TRAIN, + "test": _MRPC_TEST, + }) + else: + dl_dir = dl_manager.download_and_extract(self.config.data_url) + data_dir = os.path.join(dl_dir, self.config.data_dir) + mrpc_files = None + train_split = datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "train.tsv"), + "split": "train", + "mrpc_files": mrpc_files, + }, + ) + if self.config.name == "mnli": + return [ + train_split, + _mnli_split_generator("validation_matched", + data_dir, + "dev", + matched=True), + _mnli_split_generator("validation_mismatched", + data_dir, + "dev", + matched=False), + _mnli_split_generator("test_matched", + data_dir, + "test", + matched=True), + _mnli_split_generator("test_mismatched", + data_dir, + "test", + matched=False), + ] + elif self.config.name == "mnli_matched": + return [ + _mnli_split_generator("validation", + data_dir, + "dev", + matched=True), + _mnli_split_generator("test", data_dir, "test", matched=True), + ] + elif self.config.name == "mnli_mismatched": + return [ + _mnli_split_generator("validation", + data_dir, + "dev", + matched=False), + _mnli_split_generator("test", data_dir, "test", matched=False), + ] + else: + return [ + train_split, + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "dev.tsv"), + "split": "dev", + "mrpc_files": mrpc_files, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "test.tsv"), + "split": "test", + "mrpc_files": mrpc_files, + }, + ), + ] + + def _generate_examples(self, data_file, split, mrpc_files=None): + if self.config.name == "mrpc": + # We have to prepare the MRPC dataset from the original sources ourselves. + examples = self._generate_example_mrpc_files(mrpc_files=mrpc_files, + split=split) + for example in examples: + yield example["idx"], example + else: + process_label = self.config.process_label + label_classes = self.config.label_classes + + # The train and dev files for CoLA are the only tsv files without a + # header. 
+ is_cola_non_test = self.config.name == "cola" and split != "test" + + with open(data_file, encoding="utf8") as f: + reader = csv.DictReader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + if is_cola_non_test: + reader = csv.reader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + + for n, row in enumerate(reader): + if is_cola_non_test: + row = { + "sentence": row[3], + "is_acceptable": row[1], + } + + example = { + feat: row[col] + for feat, col in self.config.text_features.items() + } + example["idx"] = n + + if self.config.label_column in row: + label = row[self.config.label_column] + # For some tasks, the label is represented as 0 and 1 in the tsv + # files and needs to be cast to integer to work with the feature. + if label_classes and label not in label_classes: + label = int(label) if label else None + example["label"] = process_label(label) + else: + example["label"] = process_label(-1) + + # Filter out corrupted rows. + for value in example.values(): + if value is None: + break + else: + yield example["idx"], example + + def _generate_example_mrpc_files(self, mrpc_files, split): + if split == "test": + with open(mrpc_files["test"], encoding="utf8") as f: + # The first 3 bytes are the utf-8 BOM \xef\xbb\xbf, which messes with + # the Quality key. + f.seek(3) + reader = csv.DictReader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + for n, row in enumerate(reader): + yield { + "sentence1": row["#1 String"], + "sentence2": row["#2 String"], + "label": int(row["Quality"]), + "idx": n, + } + else: + with open(mrpc_files["dev_ids"], encoding="utf8") as f: + reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + dev_ids = [[row[0], row[1]] for row in reader] + with open(mrpc_files["train"], encoding="utf8") as f: + # The first 3 bytes are the utf-8 BOM \xef\xbb\xbf, which messes with + # the Quality key. + f.seek(3) + reader = csv.DictReader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + for n, row in enumerate(reader): + is_row_in_dev = [row["#1 ID"], row["#2 ID"]] in dev_ids + if is_row_in_dev == (split == "dev"): + yield { + "sentence1": row["#1 String"], + "sentence2": row["#2 String"], + "label": int(row["Quality"]), + "idx": n, + } + + +def _mnli_split_generator(name, data_dir, split, matched): + return datasets.SplitGenerator( + name=name, + gen_kwargs={ + "data_file": + os.path.join( + data_dir, + "%s_%s.tsv" % (split, "matched" if matched else "mismatched")), + "split": + split, + "mrpc_files": + None, + }, + ) diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/train.py b/examples/torch_migration/pipeline/Step5/bert_torch/train.py new file mode 100644 index 000000000000..b42d360241ff --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/train.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
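Note: the training script that begins here consumes the two local builder scripts defined
above, loading them by file path rather than from the Hugging Face Hub. A minimal usage
sketch (illustrative only; it assumes the `datasets` package is installed and the working
directory is `Step5/bert_torch/`):

    from datasets import load_dataset, load_metric

    raw_datasets = load_dataset("glue.py", "sst2")  # local GLUE builder script
    metric = load_metric("accuracy.py")             # local accuracy metric script

    metric.add_batch(predictions=[0, 1, 1], references=[0, 1, 0])
    print(metric.compute())  # -> {'accuracy': 0.666...}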
+
+import sys
+import os
+import datetime
+import random
+import time
+
+import paddle
+import numpy as np
+import torch
+import torch.utils.data
+import utils
+from datasets import load_dataset, load_metric
+from reprod_log import ReprodLogger
+from torch import nn
+from transformers import AdamW, BertTokenizer, DataCollatorWithPadding, get_scheduler
+
+CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]  # current directory
+CONFIG_PATH = CURRENT_DIR.rsplit('/', 2)[0]
+sys.path.append(CONFIG_PATH)
+
+from models.pt_bert import BertConfig, BertForSequenceClassification
+
+task_to_keys = {
+    "cola": ("sentence", None),
+    "mnli": ("premise", "hypothesis"),
+    "mrpc": ("sentence1", "sentence2"),
+    "qnli": ("question", "sentence"),
+    "qqp": ("question1", "question2"),
+    "rte": ("sentence1", "sentence2"),
+    "sst2": ("sentence", None),
+    "stsb": ("sentence1", "sentence2"),
+    "wnli": ("sentence1", "sentence2"),
+}
+
+
+def train_one_epoch(
+    model,
+    criterion,
+    optimizer,
+    lr_scheduler,
+    data_loader,
+    device,
+    epoch,
+    print_freq,
+    scaler=None,
+):
+    model.train()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter("lr",
+                            utils.SmoothedValue(window_size=1, fmt="{value}"))
+    metric_logger.add_meter("sentence/s",
+                            utils.SmoothedValue(window_size=10, fmt="{value}"))
+
+    header = "Epoch: [{}]".format(epoch)
+    for batch in metric_logger.log_every(data_loader, print_freq, header):
+        start_time = time.time()
+        batch.to(device)
+        labels = batch.pop("labels")
+        with torch.cuda.amp.autocast(enabled=scaler is not None):
+            logits = model(**batch)[0]
+            loss = criterion(logits.reshape(-1, 2), labels.reshape(-1))
+
+        optimizer.zero_grad()
+        if scaler is not None:
+            scaler.scale(loss).backward()
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            loss.backward()
+            optimizer.step()
+        lr_scheduler.step()
+        batch_size = batch["input_ids"].shape[0]
+        metric_logger.update(loss=loss.item(),
+                             lr=lr_scheduler.get_last_lr()[-1])
+        metric_logger.meters["sentence/s"].update(batch_size /
+                                                  (time.time() - start_time))
+
+
+def evaluate(model, criterion, data_loader, device, metric, print_freq=100):
+    model.eval()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    header = "Test:"
+    with torch.no_grad():
+        for batch in metric_logger.log_every(data_loader, print_freq, header):
+            batch.to(device)
+            labels = batch.pop("labels")
+            logits = model(**batch)[0]
+            loss = criterion(logits.reshape(-1, 2), labels.reshape(-1))
+            metric_logger.update(loss=loss.item())
+            metric.add_batch(
+                predictions=logits.argmax(dim=-1),
+                references=labels,
+            )
+        acc_global_avg = metric.compute()["accuracy"]
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print(" * Accuracy {acc_global_avg:.6f}".format(
+        acc_global_avg=acc_global_avg))
+    return acc_global_avg
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def load_data(args, tokenizer):
+    print("Loading data")
+    raw_datasets = load_dataset("glue.py",
+                                args.task_name,
+                                cache_dir=args.data_cache_dir)
+    sentence1_key, sentence2_key = task_to_keys[args.task_name]
+
+    def preprocess_function(examples):
+        texts = ((examples[sentence1_key], ) if sentence2_key is None else
+                 (examples[sentence1_key], examples[sentence2_key]))
+        result = tokenizer(*texts,
+                           padding=False,
+                           max_length=args.max_length,
+                           truncation=True)
+
+        if "label" in examples:
+            result["labels"] = examples["label"]
+        return result
+
+    train_ds = raw_datasets["train"].map(
preprocess_function, + batched=True, + remove_columns=raw_datasets["train"].column_names, + desc="Running tokenizer on train dataset", + new_fingerprint=f"train_tokenized_dataset_{args.task_name}", + ) + validation_ds = raw_datasets["validation"].map( + preprocess_function, + batched=True, + remove_columns=raw_datasets["validation"].column_names, + desc="Running tokenizer on validation dataset", + new_fingerprint=f"validation_tokenized_dataset_{args.task_name}", + ) + train_sampler = torch.utils.data.SequentialSampler(train_ds) + validation_sampler = torch.utils.data.SequentialSampler(validation_ds) + + return train_ds, validation_ds, train_sampler, validation_sampler + + +def main(args): + if args.output_dir: + utils.mkdir(args.output_dir) + print(args) + scaler = None + if args.fp16: + scaler = torch.cuda.amp.GradScaler() + device = torch.device(args.device) + torch.backends.cudnn.benchmark = True + + if args.seed is not None: + set_seed(args.seed) + + tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) + data_collator = DataCollatorWithPadding( + tokenizer, pad_to_multiple_of=(8 if args.fp16 else None)) + train_dataset, validation_dataset, train_sampler, validation_sampler = load_data( + args, tokenizer) + train_data_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + sampler=train_sampler, + num_workers=args.workers, + collate_fn=data_collator, + ) + + validation_data_loader = torch.utils.data.DataLoader( + validation_dataset, + batch_size=args.batch_size, + sampler=validation_sampler, + num_workers=args.workers, + collate_fn=data_collator, + ) + + print("Creating model") + pytorch_dump_path = '../../weights/torch_weight.bin' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + + classifier_weights = torch.load( + "../../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.to(device) + + print("Creating criterion") + criterion = nn.CrossEntropyLoss() + + print("Creating optimizer") + # Split weights in two groups, one with weight decay and the other not. 
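+    # Names containing "bias" or "LayerNorm.weight" (all biases and LayerNorm scales)
+    # get a weight decay of 0.0 below; every other parameter uses args.weight_decay.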
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if not any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
+
+    print("Creating lr_scheduler")
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.num_train_epochs * len(train_data_loader),
+    )
+
+    metric = load_metric("accuracy.py")
+    if args.test_only:
+        evaluate(model,
+                 criterion,
+                 validation_data_loader,
+                 device=device,
+                 metric=metric)
+        return
+
+    print("Start training")
+    start_time = time.time()
+    best_accuracy = 0.0
+    for epoch in range(args.num_train_epochs):
+        train_one_epoch(
+            model,
+            criterion,
+            optimizer,
+            lr_scheduler,
+            train_data_loader,
+            device,
+            epoch,
+            args.print_freq,
+            scaler,
+        )
+        acc = evaluate(model,
+                       criterion,
+                       validation_data_loader,
+                       device=device,
+                       metric=metric)
+        best_accuracy = max(best_accuracy, acc)
+        if args.output_dir:
+            pass
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print("Training time {}".format(total_time_str))
+    return best_accuracy
+
+
+def get_args_parser(add_help=True):
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="PyTorch SST-2 Classification Training", add_help=add_help)
+    parser.add_argument("--data_cache_dir",
+                        default="data_caches",
+                        help="data cache dir.")
+    parser.add_argument("--task_name",
+                        default="sst2",
+                        help="the name of the glue task to train on.")
+    parser.add_argument(
+        "--model_name_or_path",
+        default="bert-base-uncased",
+        help="path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument("--device", default="cuda:2", help="device")
+    parser.add_argument("--batch_size", default=32, type=int)
+    parser.add_argument(
+        "--max_length",
+        type=int,
+        default=128,
+        help=
+        ("The maximum total input sequence length after tokenization. 
Sequences longer than this will be truncated," + ), + ) + parser.add_argument("--num_train_epochs", + default=3, + type=int, + help="number of total epochs to run") + parser.add_argument( + "--workers", + default=0, + type=int, + help="number of data loading workers (default: 16)", + ) + parser.add_argument("--lr", + default=3e-5, + type=float, + help="initial learning rate") + parser.add_argument( + "--weight_decay", + default=1e-2, + type=float, + help="weight decay (default: 1e-2)", + dest="weight_decay", + ) + parser.add_argument( + "--lr_scheduler_type", + default="linear", + help="the scheduler type to use.", + choices=[ + "linear", + "cosine", + "cosine_with_restarts", + "polynomial", + "constant", + "constant_with_warmup", + ], + ) + parser.add_argument( + "--num_warmup_steps", + default=0, + type=int, + help="number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument("--print_freq", + default=10, + type=int, + help="print frequency") + parser.add_argument("--output_dir", + default="outputs", + help="path where to save") + parser.add_argument( + "--test_only", + help="only test the model", + action="store_true", + ) + parser.add_argument("--seed", + default=42, + type=int, + help="a seed for reproducible training.") + # Mixed precision training parameters + parser.add_argument("--fp16", + action="store_true", + help="whether or not mixed precision training") + + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + acc = main(args) + reprod_logger = ReprodLogger() + reprod_logger.add("acc", np.array([acc])) + reprod_logger.save("train_align_benchmark.npy") diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/train.sh b/examples/torch_migration/pipeline/Step5/bert_torch/train.sh new file mode 100644 index 000000000000..1d26be50340b --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/train.sh @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python train.py \ + --model_name_or_path bert-base-uncased \ + --batch_size 128 \ + --num_warmup_steps 158 \ + --output_dir bert_outputs \ \ No newline at end of file diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/utils.py b/examples/torch_migration/pipeline/Step5/bert_torch/utils.py new file mode 100644 index 000000000000..834b061d7c1a --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/utils.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import errno +import os +import time +from collections import defaultdict, deque + +import torch + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + return + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) + + +class MetricLogger(object): + + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if torch.cuda.is_available(): + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ]) + else: + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + )) + else: + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + 
data=str(data_time), + )) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("{} Total time: {}".format(header, total_time_str)) + + +def accuracy(output, target, topk=(1, )): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target[None]) + + res = [] + for k in topk: + correct_k = correct[:k].flatten().sum(dtype=torch.float32) + res.append(correct_k * (100.0 / batch_size)) + return res + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/examples/torch_migration/pipeline/Step5/check_step5.py b/examples/torch_migration/pipeline/Step5/check_step5.py new file mode 100644 index 000000000000..79d3556a8ae0 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/check_step5.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("bert_torch/train_align_benchmark.npy") + paddle_info = diff_helper.load_info("bert_paddle/train_align_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="train_align_diff.log", diff_threshold=0.0025) diff --git a/examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py b/examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py new file mode 100644 index 000000000000..8d9a4f6de25b --- /dev/null +++ b/examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py @@ -0,0 +1,37 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
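The classifier weights written by the script below are shared by the Paddle and PyTorch
pipelines so that both models start from an identical classification head. A quick
consistency check, as a rough sketch (it assumes `generate()` has already been run in this
directory; `torch.nn.Linear` stores its weight as (out_features, in_features), which is why
`generate()` saves a transposed copy for torch):

    import numpy as np
    import paddle
    import torch

    pd_w = paddle.load("paddle_classifier_weights.bin")
    pt_w = torch.load("torch_classifier_weights.bin")

    # The torch copy was saved transposed, so transpose it back before comparing.
    assert np.allclose(np.asarray(pd_w["classifier.weight"]),
                       pt_w["classifier.weight"].t().numpy())
    assert np.allclose(np.asarray(pd_w["classifier.bias"]),
                       pt_w["classifier.bias"].numpy())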
+ +import numpy as np +import paddle +import torch + + +def generate(seed): + np.random.seed(seed) + weight = np.random.normal(0, 0.02, (768, 2)).astype("float32") + bias = np.zeros((2, )).astype("float32") + paddle_weights = { + "classifier.weight": weight, + "classifier.bias": bias, + } + torch_weights = { + "classifier.weight": torch.from_numpy(weight).t(), + "classifier.bias": torch.from_numpy(bias), + } + torch.save(torch_weights, "torch_classifier_weights.bin") + paddle.save(paddle_weights, "paddle_classifier_weights.bin") + + +if __name__ == "__main__": + generate(seed=42) diff --git a/examples/torch_migration/pipeline/fake_data/gen_fake_data.py b/examples/torch_migration/pipeline/fake_data/gen_fake_data.py new file mode 100644 index 000000000000..e083799c0484 --- /dev/null +++ b/examples/torch_migration/pipeline/fake_data/gen_fake_data.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def gen_fake_data(): + fake_data = np.random.randint(1, 30522, size=(4, 64)).astype(np.int64) + fake_label = np.array([0, 1, 1, 0]).astype(np.int64) + np.save("fake_data.npy", fake_data) + np.save("fake_label.npy", fake_label) + + +if __name__ == "__main__": + gen_fake_data() diff --git a/examples/torch_migration/pipeline/models/pd_bert.py b/examples/torch_migration/pipeline/models/pd_bert.py new file mode 100644 index 000000000000..ca1118e933e3 --- /dev/null +++ b/examples/torch_migration/pipeline/models/pd_bert.py @@ -0,0 +1,454 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
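The model file that begins here is a self-contained Paddle re-implementation of BERT. As a
rough sketch of how it connects to the fake data generated above (the import assumes the
`pipeline/` directory is on `sys.path`, as the training scripts arrange explicitly; the
weights here are randomly initialized, so the logits only become meaningful after loading
converted pretrained weights):

    import numpy as np
    import paddle

    from models.pd_bert import BertConfig, BertForSequenceClassification

    model = BertForSequenceClassification(BertConfig())
    model.eval()

    input_ids = paddle.to_tensor(np.load("fake_data.npy"))  # (4, 64) int64 token ids
    with paddle.no_grad():
        logits = model(input_ids)[0]
    print(logits.shape)  # [4, 2]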
+"""Paddle BERT model.""" + +import math +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +ACT2FN = { + "relu": F.relu, + "gelu": F.gelu, + "tanh": F.tanh, + "sigmoid": F.sigmoid, +} +NEG_INF = -1e4 + + +class BertConfig: + + def __init__(self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + layer_norm_eps: float = 1e-12, + output_attentions: bool = False, + output_hidden_states: bool = False, + num_labels=2, + **kwargs): + self.pad_token_id = pad_token_id + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.layer_norm_eps = layer_norm_eps + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.num_labels = num_labels + + +class BertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, + epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", + paddle.arange(config.max_position_embeddings).reshape((1, -1))) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + input_shape = input_ids.shape + seq_length = input_ids.shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Layer): + + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = 
nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: paddle.Tensor) -> paddle.Tensor: + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, self.attention_head_size + ] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + + # compute q,k,v + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, + key_layer, + transpose_y=True) + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + return outputs + + +class BertSelfOutput(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, + input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Layer): + + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + 
epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, + input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Layer): + + def __init__(self, config): + super().__init__() + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + # self attn + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + # ffn + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + + outputs = (layer_output, ) + outputs + + return outputs + + +class BertEncoder(nn.Layer): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for layer_module in self.layer: + # add hidden_states + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_outputs = layer_module( + hidden_states, + attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + + # add self attn + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + return tuple(v for v in [ + hidden_states, + all_hidden_states, + all_self_attentions, + ] if v is not None) + + +class BertPooler(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = ACT2FN[config.pool_act] + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPreTrainedModel(nn.Layer): + + def _init_weights(self, module): + """Initialize the weights""" + normal_init = nn.initializer.Normal(mean=0.0, + std=self.config.initializer_range) + zero_init = nn.initializer.Constant(0.) + one_init = nn.initializer.Constant(1.) 
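+        # Mirror the usual BERT scheme: Normal(0, initializer_range) for Linear and
+        # Embedding weights, zeros for biases and the padding embedding row, and a
+        # (weight=1, bias=0) LayerNorm initialization.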
+ if isinstance(module, nn.Linear): + normal_init(module.weight) + if module.bias is not None: + zero_init(module.bias) + elif isinstance(module, nn.Embedding): + normal_init(module.weight) + if module._padding_idx is not None: + with paddle.no_grad(): + module.weight[module._padding_idx] = 0 + elif isinstance(module, nn.LayerNorm): + zero_init(module.bias) + one_init(module.weight) + + +class BertModel(BertPreTrainedModel): + + def __init__(self, config, add_pooling_layer=True): + super().__init__() + self.config = config + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.apply(self._init_weights) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[paddle.Tensor]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_ids.shape, dtype=paddle.int64) + + if attention_mask is not None: + attention_mask = (1.0 - attention_mask[:, :, None, None]) * NEG_INF + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + ) + encoder_outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + return (sequence_output, pooled_output) + encoder_outputs[1:] + + +class BertForSequenceClassification(BertPreTrainedModel): + + def __init__(self, config): + super().__init__() + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.apply(self._init_weights) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[paddle.Tensor]: + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + output = (logits, ) + outputs[2:] + return output diff --git a/examples/torch_migration/pipeline/models/pt_bert.py b/examples/torch_migration/pipeline/models/pt_bert.py new file mode 100644 index 000000000000..c7eee9829cd6 --- /dev/null +++ b/examples/torch_migration/pipeline/models/pt_bert.py @@ -0,0 +1,456 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +import math +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +ACT2FN = { + "relu": F.relu, + "gelu": F.gelu, + "tanh": F.tanh, + "sigmoid": F.sigmoid, +} +NEG_INF = -1e4 + + +class BertConfig: + + def __init__(self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + layer_norm_eps: float = 1e-12, + output_attentions: bool = False, + output_hidden_states: bool = False, + num_labels=2, + **kwargs): + self.pad_token_id = pad_token_id + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.layer_norm_eps = layer_norm_eps + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.num_labels = num_labels + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", + torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + input_shape = input_ids.size() + seq_length = input_ids.shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, + dtype=torch.long, + device=self.position_ids.device) + + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + 
token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + + # compute q,k,v + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + 
if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # self attn + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + # ffn + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + + outputs = (layer_output, ) + outputs + + return outputs + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for layer_module in self.layer: + # add hidden_states + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_outputs = layer_module( + hidden_states, + attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + + # add self attn + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + return tuple(v for v in [ + hidden_states, + all_hidden_states, + all_self_attentions, + ] if v is not None) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = ACT2FN[config.pool_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPreTrainedModel(nn.Module): + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF 
version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0,
+                                       std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0,
+                                       std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class BertModel(BertPreTrainedModel):
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__()
+        self.config = config
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+
+        self.pooler = BertPooler(config) if add_pooling_layer else None
+
+        self.apply(self._init_weights)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Tuple[torch.Tensor]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else
+                                self.config.output_hidden_states)
+
+        device = input_ids.device
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_ids.shape,
+                                         dtype=torch.long,
+                                         device=device)
+
+        if attention_mask is not None:
+            # reshape the [batch_size, seq_len] padding mask to
+            # [batch_size, 1, 1, seq_len] so it broadcasts over heads and
+            # query positions, then turn it into an additive bias
+            attention_mask = (1.0 - attention_mask[:, None, None, :]) * NEG_INF
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(
+            sequence_output) if self.pooler is not None else None
+
+        return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+
+class BertForSequenceClassification(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.apply(self._init_weights)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Tuple[torch.Tensor]:
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        output = (logits, ) + outputs[2:]
+        return output
diff --git a/examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py b/examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py
new file mode 100644
index 000000000000..de23d245c6f6
--- /dev/null
+++ b/examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py
@@ -0,0 +1,28 
@@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + + info1 = diff_helper.load_info("./result_1.npy") + info2 = diff_helper.load_info("./result_2.npy") + + diff_helper.compare_info(info1, info2) + + diff_helper.report(diff_method="mean", + diff_threshold=1e-6, + path="./diff.txt") diff --git a/examples/torch_migration/pipeline/reprod_log_demo/write_log.py b/examples/torch_migration/pipeline/reprod_log_demo/write_log.py new file mode 100644 index 000000000000..b2985e3db724 --- /dev/null +++ b/examples/torch_migration/pipeline/reprod_log_demo/write_log.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from reprod_log import ReprodLogger + +if __name__ == "__main__": + reprod_log_1 = ReprodLogger() + reprod_log_2 = ReprodLogger() + + data_1 = np.random.rand(4, 64, 768).astype(np.float32) + data_2 = np.random.rand(4, 64, 768).astype(np.float32) + + reprod_log_1.add("demo_test_1", data_1) + reprod_log_1.add("demo_test_2", data_1) + reprod_log_1.save("result_1.npy") + + reprod_log_2.add("demo_test_1", data_1) + reprod_log_2.add("demo_test_2", data_2) + reprod_log_2.save("result_2.npy") diff --git a/examples/torch_migration/pipeline/weights/torch2paddle.py b/examples/torch_migration/pipeline/weights/torch2paddle.py new file mode 100644 index 000000000000..1af08c937769 --- /dev/null +++ b/examples/torch_migration/pipeline/weights/torch2paddle.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
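+
+"""
+Convert a Hugging Face PyTorch BERT checkpoint into a Paddle state dict.
+
+torch.nn.Linear stores its weight as [out_features, in_features] while
+paddle.nn.Linear stores it as [in_features, out_features], so 2-D Linear
+weights are transposed during conversion; embedding and LayerNorm weights
+are copied unchanged.
+"""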
+
+from collections import OrderedDict
+
+import numpy as np
+import paddle
+import torch
+from paddlenlp.transformers import BertForPretraining as PDBertForMaskedLM
+from transformers import BertForMaskedLM as PTBertForMaskedLM
+
+
+def convert_pytorch_checkpoint_to_paddle(
+    pytorch_checkpoint_path="pytorch_model.bin",
+    paddle_dump_path="model_state.pdparams",
+    version="old",
+):
+    hf_to_paddle = {
+        "embeddings.LayerNorm": "embeddings.layer_norm",
+        "encoder.layer": "encoder.layers",
+        "attention.self.query": "self_attn.q_proj",
+        "attention.self.key": "self_attn.k_proj",
+        "attention.self.value": "self_attn.v_proj",
+        "attention.output.dense": "self_attn.out_proj",
+        "intermediate.dense": "linear1",
+        "output.dense": "linear2",
+        "attention.output.LayerNorm": "norm1",
+        "output.LayerNorm": "norm2",
+        "predictions.decoder.": "predictions.decoder_",
+        "predictions.transform.dense": "predictions.transform",
+        "predictions.transform.LayerNorm": "predictions.layer_norm",
+    }
+    do_not_transpose = []
+    if version == "old":
+        hf_to_paddle.update({
+            "predictions.bias": "predictions.decoder_bias",
+            ".gamma": ".weight",
+            ".beta": ".bias",
+        })
+        do_not_transpose = do_not_transpose + ["predictions.decoder.weight"]
+
+    pytorch_state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
+    paddle_state_dict = OrderedDict()
+    for k, v in pytorch_state_dict.items():
+        is_transpose = False
+        if k[-7:] == ".weight":
+            # only 2-D Linear weights need transposing; embeddings.weight and
+            # LayerNorm.weight keep their layout
+            if all(d not in k for d in do_not_transpose):
+                if ".embeddings." not in k and ".LayerNorm." not in k:
+                    if v.ndim == 2 and "embeddings" not in k:
+                        v = v.transpose(0, 1)
+                        is_transpose = True
+        oldk = k
+        # The custom pd_bert model in this pipeline keeps the Hugging Face
+        # parameter names, so the hf_to_paddle renaming is only needed when
+        # targeting PaddleNLP's built-in BertModel and is left disabled here.
+        # for hf_name, pd_name in hf_to_paddle.items():
+        #     k = k.replace(hf_name, pd_name)
+
+        # add prefix `bert.`
+        if "bert." not in k and "cls." not in k and "classifier" not in k:
+            k = "bert." + k
+
+        print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}")
+        paddle_state_dict[k] = v.data.numpy()
+
+    paddle.save(paddle_state_dict, paddle_dump_path)
+
+
+def compare(out_torch, out_paddle):
+    out_torch = out_torch.detach().numpy()
+    out_paddle = out_paddle.detach().numpy()
+    assert out_torch.shape == out_paddle.shape
+    abs_dif = np.abs(out_torch - out_paddle)
+    mean_dif = np.mean(abs_dif)
+    max_dif = np.max(abs_dif)
+    min_dif = np.min(abs_dif)
+    print("mean_dif:{}".format(mean_dif))
+    print("max_dif:{}".format(max_dif))
+    print("min_dif:{}".format(min_dif))
+
+
+def test_forward():
+    paddle.set_device("cpu")
+    model_torch = PTBertForMaskedLM.from_pretrained("./bert-base-uncased")
+    model_paddle = PDBertForMaskedLM.from_pretrained("./bert-base-uncased")
+    model_torch.eval()
+    model_paddle.eval()
+    np.random.seed(42)
+    x = np.random.randint(1,
+                          model_paddle.bert.config["vocab_size"],
+                          size=(4, 64))
+    input_torch = torch.tensor(x, dtype=torch.int64)
+    out_torch = model_torch(input_torch)[0]
+
+    input_paddle = paddle.to_tensor(x, dtype=paddle.int64)
+    out_paddle = model_paddle(input_paddle)[0]
+
+    print("torch result shape:{}".format(out_torch.shape))
+    print("paddle result shape:{}".format(out_paddle.shape))
+    compare(out_torch, out_paddle)
+
+
+if __name__ == "__main__":
+    convert_pytorch_checkpoint_to_paddle("./torch_weight.bin",
+                                         "./paddle_weight.pdparams")
diff --git a/examples/torch_migration/pipeline/weights/torch_bert_weight.py b/examples/torch_migration/pipeline/weights/torch_bert_weight.py
new file mode 100644
index 000000000000..52d778b25ff7
--- /dev/null
+++ b/examples/torch_migration/pipeline/weights/torch_bert_weight.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import BertModel
+import torch
+
+hf_model = BertModel.from_pretrained("bert-base-uncased")
+hf_model.eval()
+PATH = './torch_weight.bin'
+torch.save(hf_model.state_dict(), PATH)
diff --git a/examples/torch_migration/requirements.txt b/examples/torch_migration/requirements.txt
new file mode 100644
index 000000000000..4d3875d03156
--- /dev/null
+++ b/examples/torch_migration/requirements.txt
@@ -0,0 +1,5 @@
+paddlepaddle-gpu==2.2.0
+torch>=1.7
+transformers
+paddlenlp
+git+https://github.com/WenmuZhou/reprod_log.git
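+# reprod_log (installed from source above) provides the ReprodLogger and
+# ReprodDiffHelper utilities used by the alignment checks in this pipeline.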