From 1fc23a88887aab85b563bb3a9a8e844bc5dca497 Mon Sep 17 00:00:00 2001 From: ymyjl <113601649+ymyjl@users.noreply.github.com> Date: Thu, 3 Nov 2022 10:42:47 +0800 Subject: [PATCH] [Tutorial] Add torch migration tutorial (#3641) Co-authored-by: Zhong Hui --- examples/torch_migration/README.md | 62 ++ .../docs/ThesisReproduction_NLP.md | 928 ++++++++++++++++++ .../torch_migration/pipeline/Step1/README.md | 86 ++ .../pipeline/Step1/check_step1.py | 23 + .../pipeline/Step1/pd_forward_bert.py | 50 + .../pipeline/Step1/pt_forward_bert.py | 48 + .../pipeline/Step1/torch2paddle.py | 114 +++ .../torch_migration/pipeline/Step2/README.md | 131 +++ .../pipeline/Step2/accuracy.py | 96 ++ .../pipeline/Step2/check_step2.py | 24 + .../Step2/demo_sst2_sentence/demo.tsv | 33 + .../torch_migration/pipeline/Step2/predict.py | 94 ++ .../pipeline/Step2/test_data.py | 145 +++ .../pipeline/Step2/test_metric.py | 50 + .../torch_migration/pipeline/Step3/README.md | 67 ++ .../pipeline/Step3/check_step3.py | 24 + .../pipeline/Step3/paddle_loss.py | 59 ++ .../pipeline/Step3/torch_loss.py | 58 ++ .../torch_migration/pipeline/Step4/README.md | 136 +++ .../pipeline/Step4/check_step4.py | 23 + .../torch_migration/pipeline/Step4/test_bp.py | 141 +++ .../pipeline/Step4/test_lr_scheduler.py | 102 ++ .../torch_migration/pipeline/Step5/README.md | 29 + .../pipeline/Step5/bert_paddle/train.py | 342 +++++++ .../pipeline/Step5/bert_paddle/train.sh | 20 + .../pipeline/Step5/bert_paddle/utils.py | 211 ++++ .../pipeline/Step5/bert_torch/accuracy.py | 96 ++ .../pipeline/Step5/bert_torch/glue.py | 633 ++++++++++++ .../pipeline/Step5/bert_torch/train.py | 373 +++++++ .../pipeline/Step5/bert_torch/train.sh | 19 + .../pipeline/Step5/bert_torch/utils.py | 200 ++++ .../pipeline/Step5/check_step5.py | 24 + .../generate_classifier_weights.py | 37 + .../pipeline/fake_data/gen_fake_data.py | 26 + .../pipeline/models/pd_bert.py | 454 +++++++++ .../pipeline/models/pt_bert.py | 456 +++++++++ .../reprod_log_demo/check_log_diff.py | 28 + .../pipeline/reprod_log_demo/write_log.py | 31 + .../pipeline/weights/torch2paddle.py | 116 +++ .../pipeline/weights/torch_bert_weight.py | 21 + examples/torch_migration/requirements.txt | 5 + 41 files changed, 5615 insertions(+) create mode 100644 examples/torch_migration/README.md create mode 100644 examples/torch_migration/docs/ThesisReproduction_NLP.md create mode 100644 examples/torch_migration/pipeline/Step1/README.md create mode 100644 examples/torch_migration/pipeline/Step1/check_step1.py create mode 100644 examples/torch_migration/pipeline/Step1/pd_forward_bert.py create mode 100644 examples/torch_migration/pipeline/Step1/pt_forward_bert.py create mode 100644 examples/torch_migration/pipeline/Step1/torch2paddle.py create mode 100644 examples/torch_migration/pipeline/Step2/README.md create mode 100644 examples/torch_migration/pipeline/Step2/accuracy.py create mode 100644 examples/torch_migration/pipeline/Step2/check_step2.py create mode 100644 examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv create mode 100644 examples/torch_migration/pipeline/Step2/predict.py create mode 100644 examples/torch_migration/pipeline/Step2/test_data.py create mode 100644 examples/torch_migration/pipeline/Step2/test_metric.py create mode 100644 examples/torch_migration/pipeline/Step3/README.md create mode 100644 examples/torch_migration/pipeline/Step3/check_step3.py create mode 100644 examples/torch_migration/pipeline/Step3/paddle_loss.py create mode 100644 examples/torch_migration/pipeline/Step3/torch_loss.py 
create mode 100644 examples/torch_migration/pipeline/Step4/README.md create mode 100644 examples/torch_migration/pipeline/Step4/check_step4.py create mode 100644 examples/torch_migration/pipeline/Step4/test_bp.py create mode 100644 examples/torch_migration/pipeline/Step4/test_lr_scheduler.py create mode 100644 examples/torch_migration/pipeline/Step5/README.md create mode 100644 examples/torch_migration/pipeline/Step5/bert_paddle/train.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_paddle/train.sh create mode 100644 examples/torch_migration/pipeline/Step5/bert_paddle/utils.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/glue.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/train.py create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/train.sh create mode 100644 examples/torch_migration/pipeline/Step5/bert_torch/utils.py create mode 100644 examples/torch_migration/pipeline/Step5/check_step5.py create mode 100644 examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py create mode 100644 examples/torch_migration/pipeline/fake_data/gen_fake_data.py create mode 100644 examples/torch_migration/pipeline/models/pd_bert.py create mode 100644 examples/torch_migration/pipeline/models/pt_bert.py create mode 100644 examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py create mode 100644 examples/torch_migration/pipeline/reprod_log_demo/write_log.py create mode 100644 examples/torch_migration/pipeline/weights/torch2paddle.py create mode 100644 examples/torch_migration/pipeline/weights/torch_bert_weight.py create mode 100644 examples/torch_migration/requirements.txt diff --git a/examples/torch_migration/README.md b/examples/torch_migration/README.md new file mode 100644 index 000000000000..603f040ed114 --- /dev/null +++ b/examples/torch_migration/README.md @@ -0,0 +1,62 @@ +# BERT-SST2-Prod +Reproduction process of BERT on SST2 dataset + +# 安装说明 + +* 下载代码库 + +```shell +git clone https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/torch_migration +``` + +* 进入文件夹,安装requirements + +```shell +pip install -r requirements.txt +``` + +* 安装PaddlePaddle与PyTorch + +```shell +# CPU版本的PaddlePaddle +pip install paddlepaddle==2.2.0 -i https://mirror.baidu.com/pypi/simple +# 如果希望安装GPU版本的PaddlePaddle,可以使用下面的命令 +# pip install paddlepaddle-gpu==2.2.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +# 安装PyTorch +pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html +``` + +**注意**: 本项目依赖于paddlepaddle-2.2.0版本,安装时需要注意。 + +* 验证PaddlePaddle是否安装成功 + +运行python,输入下面的命令。 + +```shell +import paddle +paddle.utils.run_check() +print(paddle.__version__) +``` + +如果输出下面的内容,则说明PaddlePaddle安装成功。 + +``` +PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now. 
+2.2.0 +``` + + +* 验证PyTorch是否安装成功 + +运行python,输入下面的命令,如果可以正常输出,则说明torch安装成功。 + +```shell +import torch +print(torch.__version__) +# 如果安装的是cpu版本,可以按照下面的命令确认torch是否安装成功 +# 期望输出为 tensor([1.]) +print(torch.Tensor([1.0])) +# 如果安装的是gpu版本,可以按照下面的命令确认torch是否安装成功 +# 期望输出为 tensor([1.], device='cuda:0') +print(torch.Tensor([1.0]).cuda()) +``` diff --git a/examples/torch_migration/docs/ThesisReproduction_NLP.md b/examples/torch_migration/docs/ThesisReproduction_NLP.md new file mode 100644 index 000000000000..eee175d34a28 --- /dev/null +++ b/examples/torch_migration/docs/ThesisReproduction_NLP.md @@ -0,0 +1,928 @@ +# 论文复现指南 + +## 目录 + +- [1. 总览](#1) + - [1.1 背景](#1.1) + - [1.2 前序工作](#1.2) +- [2. 整体框图](#2) + - [2.1 流程概览](#2.1) + - [2.2 reprod_log whl包](#2.2) +- [3. 论文复现理论知识及实战](#3) + - [3.1 模型结构对齐](#3.1) + - [3.2 验证/测试集数据读取对齐](#3.2) + - [3.3 评估指标对齐](#3.3) + - [3.4 损失函数对齐](#3.4) + - [3.5 优化器对齐](#3.5) + - [3.6 学习率对齐](#3.6) + - [3.7 正则化策略对齐](#3.7) + - [3.8 反向对齐](#3.8) + - [3.9 训练集数据读取对齐](#3.9) + - [3.10 网络初始化对齐](#3.10) + - [3.11 模型训练对齐](#3.11) + - [3.12 单机多卡训练](#3.12) +- [4. 论文复现注意事项与FAQ](#4) + - [4.0 通用注意事项](#4.0) + - [4.1 模型结构对齐](#4.1) + - [4.2 验证/测试集数据读取对齐](#4.2) + - [4.3 评估指标对齐](#4.3) + - [4.4 损失函数对齐](#4.4) + - [4.5 优化器对齐](#4.5) + - [4.6 学习率对齐](#4.6) + - [4.7 正则化策略对齐](#4.7) + - [4.8 反向对齐](#4.8) + - [4.9 训练集数据读取对齐](#4.9) + - [4.10 网络初始化对齐](#4.10) + - [4.11 模型训练对齐](#4.11) + + +## 1. 总览 + + +### 1.1 背景 + +* 以深度学习为核心的人工智能技术仍在高速发展,通过论文复现,开发者可以获得 + * 学习成长:自我能力提升 + * 技术积累:对科研或工作有所帮助和启发 + * 社区荣誉:成果被开发者广泛使用 + + +### 1.2 前序工作 + +基于本指南复现论文过程中,建议开发者准备以下内容。 + +* 了解该模型输入输出格式。以BERT的情感分类任务为例,通过阅读论文与参考代码,了解到模型输入为`[batch_size, sequence_length]`的tensor,类型为`int64`,label为`[batch, ]`的label,类型为`int64`。 +* 准备好训练/验证数据集,用于模型训练与评估 +* 准备好fake input data以及label,与模型输入shape、type等保持一致,用于后续模型前向对齐。 + * 在对齐模型前向过程中,我们不需要考虑数据集模块等其他模块,此时使用fake data是将模型结构和数据部分解耦非常合适的一种方式。 + * 将fake data以文件的形式存储下来,也可以保证PaddlePaddle与参考代码的模型结构输入是完全一致的,更便于排查问题。 + * 在该步骤中,以BERT为例,生成fake data的脚本可以参考:[gen_fake_data.py](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/fake_data/gen_fake_data.py)。 +* 在特定设备(CPU/GPU)上,跑通参考代码的预测过程(前向)以及至少2轮(iteration)迭代过程,保证后续基于PaddlePaddle复现论文过程中可对比。 +* 本文档基于 `BERT-SST2-Prod` 代码以及`reprod_log` whl包进行说明与测试。如果希望体验,建议参考[BERT-SST2-Prod文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/README.md)进行安装与测试。 +* 在复现的过程中,只需要将PaddlePaddle的复现代码以及打卡日志上传至github,不能在其中添加参考代码的实现,在验收通过之后,需要删除打卡日志。建议在初期复现的时候,就将复现代码与参考代码分成2个文件夹进行管理。 + + +## 2. 
整体框图 + + +### 2.1 流程概览 + +面对一篇自然语言处理的论文,复现该论文的整体流程如下图所示。 + +![图片](https://user-images.githubusercontent.com/16911935/199389647-b000a7b1-28d1-485e-8ec0-3e7e2c05884a.png) + +总共包含11个步骤。为了高效复现论文,设置了5个验收节点。如上图中黄色框所示。后续章节会详细介绍上述步骤和验收节点,具体内容安排如下: + +* 第3章:介绍11个复现步骤的理论知识、实战以及验收流程。 +* 第4章:针对复现流程过程中每个步骤可能出现的问题,本章会进行详细介绍。如果还是不能解决问题,可以提ISSUE进行讨论,提ISSUE地址:[https://github.com/PaddlePaddle/Paddle/issues/new/choose](https://github.com/PaddlePaddle/Paddle/issues/new/choose) + + +### 2.2 reprod_log whl包 + +#### 2.2.1 reprod_log工具简介 +`reprod_log`是用于论文复现赛中辅助自查和验收工具。该工具源代码地址在:[https://github.com/WenmuZhou/reprod_log](https://github.com/WenmuZhou/reprod_log)。主要功能如下: + +* 存取指定节点的输入输出tensor +* 基于文件的tensor读写 +* 2个字典的对比验证 +* 对比结果的输出与记录 + +更多API与使用方法可以参考:[reprod_log API使用说明](https://github.com/WenmuZhou/reprod_log/blob/master/README.md)。 + +#### 2.2.2 reprod_log使用demo + +下面基于代码:[https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/reprod_log_demo](https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/reprod_log_demo),给出如何使用该工具。 + +文件夹中包含`write_log.py`和`check_log_diff.py`文件,其中`write_log.py`中给出了`ReprodLogger`类的使用方法,`check_log_diff.py`给出了`ReprodDiffHelper`类的使用方法,依次运行两个python文件,使用下面的方式运行代码。 + +```shell +# 进入文件夹 +cd pipeline/reprod_log_demo +# 随机生成矩阵,写入文件中 +python write_log.py +# 进行文件对比,输出日志 +python check_log_diff.py +``` + +最终会输出以下内容 + +``` +[2021/11/18 09:29:31] root INFO: demo_test_1: +[2021/11/18 09:29:31] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/18 09:29:31] root INFO: demo_test_2: +[2021/11/18 09:29:31] root INFO: mean diff: check passed: False, value: 0.33387675881385803 +[2021/11/18 09:29:31] root INFO: diff check failed +``` + +可以看出:对于key为`demo_test_1`的矩阵,由于diff为0,小于设置的阈值`1e-6`,核验成功;对于key为`demo_test_2`的矩阵,由于diff为0.33,大于设置的阈值`1e-6`,核验失败。 + +#### 2.2.3 reprod_log在论文复现中应用 + +在论文复现中,基于reprod_log的结果记录模块,产出下面若干文件 +``` +log_reprod +├── forward_paddle.npy +├── forward_torch.npy # 与forward_paddle.npy作为一并核查的文件对 +├── metric_paddle.npy +├── metric_torch.npy # 与metric_paddle.npy作为一并核查的文件对 +├── loss_paddle.npy +├── loss_torch.npy # 与loss_paddle.npy作为一并核查的文件对 +├── bp_align_paddle.npy +├── bp_align_torch.npy # 与bp_align_paddle.npy作为一并核查的文件对 +├── train_align_paddle.npy +├── train_align_torch.npy # pytorch运行得到的参考评估指标 +``` + +基于reprod_log的`ReprodDiffHelper`模块,产出下面5个日志文件。 + +``` +├── forward_diff.log # forward_paddle.npy与forward_torch.npy生成的diff结果文件 +├── metric_diff.log # metric_paddle.npy与metric_torch.npy生成的diff结果文件 +├── loss_diff.log # loss_paddle.npy与loss_torch.npy生成的diff结果文件 +├── bp_align_diff.log # bp_align_paddle.npy与bp_align_torch.npy生成的diff结果文件 +├── train_align_diff.log # train_align_paddle.train_align_torch.npy生成的diff结果文件 +``` + +上述文件的生成代码都需要开发者进行开发,验收时需要提供上面罗列的所有文件(不需要提供产生这些文件的可运行程序)以及完整的模型训练评估程序和日志。 +BERT-SST2-Prod项目提供了基于reprod_log的5个验收点对齐验收示例,具体代码地址为:[https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline](https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline), +每个文件夹中的README.md文档提供了使用说明。 + + +## 3. 
论文复现理论知识及实战 + + +### 3.1 模型结构对齐 + +对齐模型结构时,一般有3个主要步骤: + +* 网络结构代码转换 +* 权重转换 +* 模型组网正确性验证 + +下面详细介绍这3个部分。 + +#### 3.1.1 网络结构代码转换 + +**【基本流程】** + +由于PyTorch的API和PaddlePaddle的API非常相似,可以参考[PyTorch-PaddlePaddle API映射表](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_api_mapping/pytorch_api_mapping_cn.html) +,组网部分代码直接进行手动转换即可。 + +**【注意事项】** + +如果遇到PaddlePaddle没有的API,可以尝试用多种API来组合,也可以给PaddlePaddle团队提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues),获得支持。 + +**【实战】** + +BERT网络结构的PyTorch实现: [transformers-bert](https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/modeling_bert.py) + +对应转换后的PaddlePaddle实现: [paddlenlp-bert](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/bert/modeling.py) + + +#### 3.1.2 权重转换 + +**【基本流程】** + +组网代码转换完成之后,需要对模型权重进行转换,如果PyTorch repo中已经提供权重,那么可以直接下载并进行后续的转换;如果没有提供,则可以基于PyTorch代码,随机生成一个初始化权重(定义完model以后,使用`torch.save()` API保存模型权重),然后进行权重转换。 + +**【注意事项】** + +在权重转换的时候,需要注意`paddle.nn.Linear`等API的权重保存格式和名称等与PyTorch稍有diff,具体内容可以参考`4.1章节`。 + +**【实战】** + +BERT的代码转换脚本可以在这里查看:[https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py), + +注意:运行该代码需要首先下载Huggingface的BERT预训练模型到该目录下,下载地址为:[https://huggingface.co/bert-base-uncased/blob/main/pytorch_model.bin](https://huggingface.co/bert-base-uncased/blob/main/pytorch_model.bin) + +```python +# https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py + +from collections import OrderedDict + +import numpy as np +import paddle +import torch +from paddlenlp.transformers import BertForPretraining as PDBertForMaskedLM +from transformers import BertForMaskedLM as PTBertForMaskedLM + + +def convert_pytorch_checkpoint_to_paddle( + pytorch_checkpoint_path="pytorch_model.bin", + paddle_dump_path="model_state.pdparams", + version="old", ): + hf_to_paddle = { + "embeddings.LayerNorm": "embeddings.layer_norm", + "encoder.layer": "encoder.layers", + "attention.self.query": "self_attn.q_proj", + "attention.self.key": "self_attn.k_proj", + "attention.self.value": "self_attn.v_proj", + "attention.output.dense": "self_attn.out_proj", + "intermediate.dense": "linear1", + "output.dense": "linear2", + "attention.output.LayerNorm": "norm1", + "output.LayerNorm": "norm2", + "predictions.decoder.": "predictions.decoder_", + "predictions.transform.dense": "predictions.transform", + "predictions.transform.LayerNorm": "predictions.layer_norm", + } + do_not_transpose = [] + if version == "old": + hf_to_paddle.update({ + "predictions.bias": "predictions.decoder_bias", + ".gamma": ".weight", + ".beta": ".bias", + }) + do_not_transpose = do_not_transpose + ["predictions.decoder.weight"] + + pytorch_state_dict = torch.load( + pytorch_checkpoint_path, map_location="cpu") + paddle_state_dict = OrderedDict() + for k, v in pytorch_state_dict.items(): + is_transpose = False + if k[-7:] == ".weight": + # embeddings.weight and LayerNorm.weight do not transpose + if all(d not in k for d in do_not_transpose): + if ".embeddings." not in k and ".LayerNorm." not in k: + if v.ndim == 2: + v = v.transpose(0, 1) + is_transpose = True + oldk = k + for hf_name, pd_name in hf_to_paddle.items(): + k = k.replace(hf_name, pd_name) + + # add prefix `bert.` + if "bert." not in k and "cls." not in k and "classifier" not in k: + k = "bert." 
+ k + + print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}") + paddle_state_dict[k] = v.data.numpy() + + paddle.save(paddle_state_dict, paddle_dump_path) + + +def compare(out_torch, out_paddle): + out_torch = out_torch.detach().numpy() + out_paddle = out_paddle.detach().numpy() + assert out_torch.shape == out_paddle.shape + abs_dif = np.abs(out_torch - out_paddle) + mean_dif = np.mean(abs_dif) + max_dif = np.max(abs_dif) + min_dif = np.min(abs_dif) + print("mean_dif:{}".format(mean_dif)) + print("max_dif:{}".format(max_dif)) + print("min_dif:{}".format(min_dif)) + + +def test_forward(): + paddle.set_device("cpu") + model_torch = PTBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_paddle = PDBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_torch.eval() + model_paddle.eval() + np.random.seed(42) + x = np.random.randint( + 1, model_paddle.bert.config["vocab_size"], size=(4, 64)) + input_torch = torch.tensor(x, dtype=torch.int64) + out_torch = model_torch(input_torch)[0] + + input_paddle = paddle.to_tensor(x, dtype=paddle.int64) + out_paddle = model_paddle(input_paddle)[0] + + print("torch result shape:{}".format(out_torch.shape)) + print("paddle result shape:{}".format(out_paddle.shape)) + compare(out_torch, out_paddle) + + +if __name__ == "__main__": + convert_pytorch_checkpoint_to_paddle( + "./bert-base-uncased/pytorch_model.bin", + "./bert-base-uncased/model_state.pdparams") + test_forward() + # torch result shape:torch.Size([4, 64, 30522]) + # paddle result shape:[4, 64, 30522] + # mean_dif:1.666686512180604e-05 + # max_dif:0.00015211105346679688 + # min_dif:0.0 +``` + +运行完成之后,会在当前目录生成`model_state.pdparams`文件,即为转换后的PaddlePaddle预训练模型。 +**Tips**: 由于paddlenlp中已有转换后的bert-base-uncased模型,因此可以一键加载,程序会自动下载对应权重! + + +#### 3.1.3 模型组网正确性验证 + +**【基本流程】** + +1. 定义PyTorch模型,加载权重,固定seed,基于numpy生成随机数,转换为PyTorch可以处理的tensor,送入网络,获取输出,使用reprod_log保存结果。 +2. 定义PaddlePaddle模型,加载权重,固定seed,基于numpy生成随机数,转换为PaddlePaddle可以处理的tensor,送入网络,获取输出,使用reprod_log保存结果。 +3. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +* 模型在前向对齐验证时,需要调用`model.eval()`方法,保证组网中的随机量被关闭,比如BatchNorm、Dropout等。 +* 给定相同的输入数据,为保证可复现性,如果有随机数生成,固定相关的随机种子。 +* 输出diff可以使用`np.mean(np.abs(o1 - o2))`进行计算,一般小于1e-6的话,可以认为前向没有问题。如果最终输出结果diff较大,可以使用二分的方法进行排查,比如说BERT,包含1个embdding层、12个transformer-block以及最后的MLM head层,那么完成模型组网和权重转换之后,如果模型输出没有对齐,可以尝试输出中间某一个transformer-block的tensor进行对比,如果相同,则向后进行排查;如果不同,则继续向前进行排查,以此类推,直到找到导致没有对齐的操作。 + +**【实战】** + +BERT模型组网正确性验证可以参考如下示例代码: +[https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/Step1](https://github.com/JunnYu/BERT-SST2-Prod/tree/main/pipeline/Step1 + +**【验收】** + +对于待复现的项目,前向对齐验收流程如下。 + +1. 准备输入:fake data + * 使用参考代码的dataloader,生成一个batch的数据,保存下来,在前向对齐时,直接从文件中读入。 + * 固定随机数种子,生成numpy随机矩阵,转化tensor +2. 保存输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为tensor的值。最后将dict保存到文件中。建议命名为`forward_paddle.npy`和`forward_torch.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`forward_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 提交内容:新建文件夹,将`forward_paddle.npy`、`forward_torch.npy`与`forward_diff_log.txt`文件放在文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 +5. 
注意: + * PaddlePaddle与PyTorch保存的dict的key需要保持相同,否则report过程可能会提示key无法对应,从而导致report失败,之后的`【验收】`环节也是如此。 + * 如果是固定随机数种子,建议将fake data保存到dict中,方便check参考代码和PaddlePaddle的输入是否一致。 + + +### 3.2 验证/测试集数据读取对齐 + +**【基本流程】** + +对于一个数据集,一般有以下一些信息需要重点关注 + +* 数据集名称、下载地址 +* 训练集/验证集/测试集 + +PaddlePaddle中数据集相关的API为`paddle.io.Dataset`,PyTorch中对应为`torch.utils.data.Dataset`,二者功能一致,在绝大多数情况下,可以使用该类构建数据集。它是描述Dataset方法和行为的抽象类,在具体实现的时候,需要继承这个基类,实现其中的`__getitem__`和`__len__`方法。除了参考代码中相关实现,也可以参考待复现论文中的说明。 + +复现完Dataset之后,可以构建Dataloader,对数据进行组batch、批处理,送进网络进行计算。 + +`paddle.io.DataLoader`可以进行数据加载,将数据分成批数据,并提供加载过程中的采样。PyTorch对应的实现为`torch.utils.data.DataLoader`,二者在功能上一致,只是在参数方面稍有diff:(1)PaddlePaddle缺少对`pin_memory`等参数的支持;(2)PaddlePaddle增加了`use_shared_memory`参数来选择是否使用共享内存加速数据加载过程。 + +**【注意事项】** + +论文中一般会提供数据集的名称以及基本信息。复现过程中,我们在下载完数据之后,建议先检查下是否和论文中描述一致,否则可能存在的问题有: + +* 数据集版本不同,比如论文中使用了cnn_dailymail的v3.0.0版本数据集,但是我们下载的是cnn_dailymail的v1.0.0版本数据集,如果不对其进行检查,可能会导致我们最终训练的数据量等与论文中有diff +* 数据集使用方式不同,有些论文中,可能只是抽取了该数据集的子集进行方法验证,此时需要注意抽取方法,需要保证抽取出的子集完全相同。 +* 在评估指标对齐时,我们可以固定batch size,关闭Dataloader的shuffle操作。 + +构建数据集时,可以使用paddlenlp中的数据集加载方式,具体可以参考:[如何自定义数据集](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html)。对应地,PyTorch中的数据处理api可以参考:[huggingface的datasets自定义数据集](https://huggingface.co/docs/datasets/about_dataset_load.html#building-a-dataset)。对于其中之一,可以找到另一个平台的实现。 + +此外, +* 有些自定义的数据处理方法,如果不涉及到深度学习框架的部分,可以直接复用。 +* 对于特定任务中的数据预处理方法,比如说Tokenizer,如果没有现成的API可以调用,可以参考官方模型套件中的一些实现方法,比如PaddleClas、PaddleDetection、PaddleSeg等。 + +**【实战】** + +BERT模型复现过程中,数据预处理和Dataset、Dataloader的检查可以参考该文件: +[https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/test_data.py](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/test_data.py) + + +使用方法可以参考[数据检查文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/README.md)。 + + +### 3.3 评估指标对齐 + +**【基本流程】** + +PaddlePaddle提供了一系列Metric计算类,比如说`Accuracy`, `Auc`, `Precision`, `Recall`等,而PyTorch中,目前可以通过组合的方式实现metric计算,或者调用[huggingface-datasets](https://huggingface.co/docs/datasets/about_metrics.html?highlight=metric),在论文复现的过程中,需要注意保证对于该模块,给定相同的输入,二者输出完全一致。具体流程如下。 + +1. 构建fake数据 +1. 使用PyTorch的指标获取评估结果,使用reprod_log保存结果。 +2. 使用PaddlePaddle的指标获取评估结果,使用reprod_log保存结果。 +3. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +在评估指标对齐之前,需要注意保证对于该模块,给定相同的输入,二者输出完全一致。 + + +**【实战】** + +评估指标对齐检查方法可以参考文档:[评估指标对齐检查方法文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step2/README.md#%E6%95%B0%E6%8D%AE%E8%AF%84%E4%BC%B0%E5%AF%B9%E9%BD%90%E6%B5%81%E7%A8%8B) + + +**【验收】** + +对于待复现的项目,评估指标对齐验收流程如下。 + +1. 输入:dataloader, model +2. 输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为具体评估指标的值。最后将dict使用reprod_log保存到各自的文件中,建议命名为`metric_paddle.npy`和`metric_torch.npy`。 + * 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`metric_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +3. 提交内容:将`metric_paddle.npy`、`metric_torch.npy`与`metric_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 +4. 注意: + * 数据需要是真实数据 + * 需要检查论文是否只是抽取了验证集/测试集中的部分文件,如果是的话,则需要保证PaddlePaddle和参考代码中dataset使用的数据集一致。 + + + +### 3.4 损失函数对齐 + +**【基本流程】** + +PaddlePaddle与PyTorch均提供了很多loss function,用于模型训练,具体的API映射表可以参考:[Loss类API映射列表](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_api_mapping/pytorch_api_mapping_cn.html#lossapi)。以CrossEntropyLoss为例,主要区别为: +* PaddlePaddle提供了对软标签、指定softmax计算纬度的支持。 + +如果论文中使用的loss function没有指定的API,则可以尝试通过组合API的方式,实现自定义的loss function。 + +具体流程如下。 + +1. 
定义PyTorch模型,加载权重,加载fake data 和 fake label(或者固定seed,基于numpy生成随机数),转换为PyTorch可以处理的tensor,送入网络,获取loss结果,使用reprod_log保存结果。 +2. 定义PaddlePaddle模型,加载fake data 和 fake label(或者固定seed,基于numpy生成随机数),转换为PaddlePaddle可以处理的tensor,送入网络,获取loss结果,使用reprod_log保存结果。 +3. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +* 计算loss的时候,建议设置`model.eval()`,避免模型中随机量的问题。 + +**【实战】** + +本部分可以参考文档:[https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step3/README.md](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step3/README.md)。 + +**【验收】** + +对于待复现的项目,损失函数对齐验收流程如下。 + +1. 输入:fake data & label +2. 输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为具体评估指标的值。最后将dict使用reprod_log保存到各自的文件中,建议命名为`loss_paddle.npy`和`loss_torch.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`loss_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 提交内容:将`loss_paddle.npy`、`loss_torch.npy`与`loss_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 + + +### 3.5 优化器对齐 + +**【基本流程】** + +PaddlePaddle中的optimizer有`paddle.optimizer`等一系列实现,PyTorch中则有`torch.Optim`等一系列实现。 + +**【注意事项】** + +以SGD等优化器为例,PaddlePaddle与Pytorch的优化器区别主要如下。 + +* PaddlePaddle在优化器中增加了对梯度裁剪的支持,在训练GAN或者一些NLP、多模态任务中,这个用到的比较多。 +* PaddlePaddle的SGD不支持动量更新、动量衰减和Nesterov动量,这里需要使用`paddle.optimizer.Momentum` API实现这些功能。 + +**【实战】** + +本部分对齐建议对照[PaddlePaddle优化器API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/Overview_cn.html)与参考代码的优化器实现进行对齐,用之后的反向对齐统一验证该模块的正确性。 + + + +### 3.6 学习率对齐 + +**【基本流程】** + +* 学习率策略主要用于指定训练过程中的学习率变化曲线,这里可以将定义好的学习率策略,不断step,即可得到对应的学习率值,可以将学习率值保存在列表或者矩阵中,使用`reprod_log`工具判断二者是否对齐。 + +**【注意事项】** + +PaddlePaddle中,需要首先构建学习率策略,再传入优化器对象中;对于PyTorch,如果希望使用更丰富的学习率策略,需要先构建优化器,再传入学习率策略类API。 + +**【实战】** + +学习率复现对齐,可以参考代码:[学习率对齐验证文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step4/README.md#%E5%AD%A6%E4%B9%A0%E7%8E%87%E5%AF%B9%E9%BD%90%E9%AA%8C%E8%AF%81)。 + + +### 3.7 正则化策略对齐 + +**【基本流程】** + +L2正则化策略用于模型训练,可以防止模型对训练数据过拟合,L1正则化可以用于得到稀疏化的权重矩阵,PaddlePaddle中有`paddle.regularizer.L1Decay`与`paddle.regularizer.L2Decay` API。PyTorch中,torch.optim集成的优化器只有L2正则化方法,直接在构建optimizer的时候,传入`weight_decay`参数即可。 + +**【注意事项】** + +* PaddlePaddle的optimizer中支持L1Decat/L2Decay。 +* PyTorch的optimizer支持不同参数列表的学习率分别设置,params传入字典即可,而PaddlePaddle的2.1.0版本目前尚未支持这种行为,可以通过设置`ParamAttr`的`learning_rate`参数,来确定相对学习率倍数。PaddlePaddle的2.2.0版本中虽然实现该功能,但是模型收敛速度较慢,不建议使用。[优化器收敛速度慢](https://github.com/PaddlePaddle/Paddle/issues/36915) + +**【实战】** + +本部分对齐建议对照[PaddlePaddle正则化API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/regularizer/L2Decay_cn.html)与参考代码的优化器实现进行对齐,用之后的反向对齐统一验证该模块的正确性。 + + +### 3.8 反向对齐 + +**【基本流程】** + +此处可以通过numpy生成假的数据和label(推荐),也可以准备固定的真实数据。具体流程如下。 + +1. 检查两个代码的训练超参数全部一致,如优化器及其超参数、学习率、LayerNorm中的eps等。 +2. 将PaddlePaddle与PyTorch网络中涉及的所有随机操作全部关闭,如dropout、drop_path等,推荐将模型设置为eval模式(`model.eval()`) +3. 加载相同的weight dict(可以通过PyTorch来存储随机的权重),将准备好的数据分别传入网络并迭代,观察二者loss是否一致(此处batch-size要一致,如果使用多个真实数据,要保证传入网络的顺序一致) +4. 
如果经过2轮以上,loss均可以对齐,则基本可以认为反向对齐。 + + +**【注意事项】** + +* 如果第一轮loss就没有对齐,则需要仔细排查一下模型前向部分。 +* 如果第二轮开始,loss开始无法对齐,则首先需要排查下超参数的差异,没问题的话,在`loss.backward()`方法之后,使用`tensor.grad`获取梯度值,二分的方法查找diff,定位出PaddlePaddle与PyTorch梯度无法对齐的API或者操作,然后进一步验证并反馈。 + +梯度的打印方法示例代码如下所示,注释掉的内容即为打印网络中所有参数的梯度shape。 + +```python + # 代码地址:https://github.com/JunnYu/BERT-SST2-Prod/blob/2c372656bb1b077b0073c50161771d9fa9d8de5a/pipeline/Step4/test_bp.py#L12 + def pd_train_some_iters(model, + criterion, + optimizer, + fake_data, + fake_label, + max_iter=2): + model = PDBertForSequenceClassification.from_pretrained("bert-base-uncased", num_classes=2) + classifier_weights = paddle.load("../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + criterion = paddle.nn.CrossEntropy() + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW(learning_rate=3e-5, parameters=model.parameters(), + weight_decay=1e-2, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params) + loss_list = [] + for idx in range(max_iter): + input_ids = paddle.to_tensor(fake_data) + labels = paddle.to_tensor(fake_label) + + output = model(input_ids) + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + loss_list.append(loss) + return loss_list +``` + + + + +**【实战】** + +本部分可以参考文档:[反向对齐操作文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step4/README.md#%E5%8F%8D%E5%90%91%E5%AF%B9%E9%BD%90%E6%93%8D%E4%BD%9C%E6%96%B9%E6%B3%95)。 + +**【验收】** + +对于待复现的项目,反向对齐验收流程如下。 + +1. 输入:fake data & label +2. 输出: + * PaddlePaddle/PyTorch:dict,key为tensor的name(自定义),value为具体loss的值。最后将dict使用reprod_log保存到各自的文件中,建议命名为`bp_align_paddle.npy`和`bp_align_torch.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`bp_align_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 提交内容:将`bp_align_paddle.npy`、`bp_align_torch.npy`与`bp_align_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,后续的输出结果和自查日志也放在该文件夹中,一并打包上传即可。 +5. 
注意: + * loss需要保存至少2轮以上。 + * 在迭代的过程中,需要保证模型的batch size等超参数完全相同 + * 在迭代的过程中,需要设置`model.eval()`,使用固定的假数据,同时加载相同权重的预训练模型。 + + +### 3.9 训练集数据读取对齐 + +**【基本流程】** + +该部分内容与3.2节内容基本一致,参考PyTorch的代码,实现训练集数据读取与预处理模块即可。 + +**【注意事项】** + +该部分内容,可以参考3.8节的自测方法,将输入的`fake data & label`替换为训练的dataloader,但是需要注意的是: +* 在使用train dataloader的时候,建议设置random seed,对于PyTorch来说 + +```python +#initialize random seed +torch.manual_seed(config.SEED) +torch.cuda.manual_seed_all(config.SEED) +np.random.seed(config.SEED) +random.seed(config.SEED) +``` + +对于PaddlePaddle来说 + +```python +paddle.seed(config.SEED) +np.random.seed(config.SEED) +random.seed(config.SEED) +``` + + + +### 3.10 网络初始化对齐 + +**【基本流程】** + +* 下面给出了部分初始化API的映射表。 + +|PaddlePaddle API | PyTorch API | +|---|---| +| paddle.nn.initializer.KaimingNormal | torch.nn.init.kaiming_normal_ | +| paddle.nn.initializer.KaimingUniform | torch.nn.init.kaiming_uniform_ | +| paddle.nn.initializer.XavierNormal | torch.nn.init.xavier_normal_ | +| paddle.nn.initializer.XavierUniform | torch.nn.init.xavier_uniform_ | + +**【注意事项】** + +* 更多初始化API可以参考[PyTorch初始化API文档](https://pytorch.org/docs/stable/nn.init.html)以及[PaddlePaddle初始化API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/Overview_cn.html#chushihuaxiangguan)。 + +**【实战】** + +本部分对齐建议对照[PaddlePaddle 初始化API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/Overview_cn.html#chushihuaxiangguan)与参考代码的初始化实现对齐。 + + +### 3.11 模型训练对齐 + +**【基本流程】** + +完成前面的步骤之后,就可以开始全量数据的训练对齐任务了。按照下面的步骤进行训练对齐。 + +1. 准备train/eval data, loader, model +2. 对model按照论文所述进行初始化(如果论文中提到加载了预训练模型,则按需加载pretrained model) +3. 加载配置,开始训练,迭代得到最终模型与评估指标,将评估指标使用reprod_log保存到文件中。 +4. 将PaddlePaddle提供的参考指标使用reprod_log提交到另一个文件中。 +5. 使用reprod_log排查diff,小于阈值,即可完成自测。 + +**【注意事项】** + +* 【强烈】建议先做完反向对齐之后再进行模型训练对齐,二者之间的不确定量包括:数据集、PaddlePaddle与参考代码在模型training mode下的区别,初始化参数。 +* 在训练对齐过程中,受到较多随机量的影响,精度有少量diff是正常的,以SST-2数据集的分类为例,diff在0.15%以内可以认为是正常的,这里可以根据不同的任务,适当调整对齐检查的阈值(`ReprodDiffHelper.report`函数中的`diff_threshold`参数)。 +* 训练过程中的波动是正常的,如果最终收敛结果不一致,可以 + * 仔细排查Dropout、BatchNorm以及其他组网模块及超参是否无误。 + * 基于参考代码随机生成一份预训练模型,转化为PaddlePaddle的模型,并使用PaddlePaddle加载训练,对比二者的收敛曲线与最终结果,排查初始化影响。 + * 使用参考代码的Dataloader生成的数据,进行模型训练,排查train dataloader的影响。 + +**【实战】** + +本部分可以参考文档:[训练对齐操作文档](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/Step5/README.md)。 + +**【验收】** + +对于待复现的项目,训练对齐验收流程如下。 + +1. 输入:train/eval dataloader, model +2. 输出: + * PaddlePaddle:dict,key为保存值的name(自定义),value为具体评估指标的值。最后将dict使用reprod_log保存到文件中,建议命名为`train_align_paddle.npy`。 + * benchmark:dict,key为保存值的name(自定义),value为论文复现赛的评估指标要求的值。最后将dict使用reprod_log保存到文件中,建议命名为`train_align_benchmark.npy`。 +3. 自测:使用reprod_log加载2个文件,使用report功能,记录结果到日志文件中,建议命名为`train_align_diff_log.txt`,观察diff,二者diff小于特定的阈值即可。 +4. 
提交内容:将`train_align_paddle.npy`、`train_align_benchmark.npy`与`train_align_diff_log.txt`文件备份到`3.1节验收环节`新建的文件夹中,最终一并打包上传即可。 + + +### 3.12 单机多卡训练 + +如果希望使用单机多卡提升训练效率,可以从以下几个过程对代码进行修改。 + +#### 3.12.1 数据读取 + +对于PaddlePaddle来说,多卡数据读取这块主要的变化在sampler + +对于单机单卡,sampler实现方式如下所示。 + +```python +train_sampler = paddle.io.RandomSampler(dataset) +train_batch_sampler = paddle.io.BatchSampler( + sampler=train_sampler, batch_size=args.batch_size) +``` + +对于单机多卡任务,sampler实现方式如下所示。 + +```python +train_batch_sampler = paddle.io.DistributedBatchSampler( + dataset=dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=False + ) +``` + +注意:在这种情况下,单机多卡的代码仍然能够以单机单卡的方式运行,因此建议以这种sampler方式进行论文复现。 + + +#### 3.12.2 多卡模型初始化 + +如果以多卡的方式运行,需要初始化并行训练环境,代码如下所示。 + +```python +if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() +``` + +在模型组网并初始化参数之后,需要使用`paddle.DataParallel()`对模型进行封装,使得模型可以通过数据并行的模式被执行。代码如下所示。 + +```python +if paddle.distributed.get_world_size() > 1: + model = paddle.DataParallel(model) +``` + + +#### 3.12.3 模型保存、日志保存等其他模块 + +以模型保存为例,我们只需要在0号卡上保存即可,否则多个trainer同时保存的话,可能会造成写冲突,导致最终保存的模型不可用。 + + +#### 3.12.4 程序启动方式 + +对于单机单卡,启动脚本如下所示。[https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/bert](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/bert) + +```shell +unset CUDA_VISIBLE_DEVICES +python -m paddle.distributed.launch --gpus "0" run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --task_name SST-2 \ + --max_seq_length 128 \ + --batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --logging_steps 1 \ + --save_steps 500 \ + --output_dir ./tmp/ \ + --device gpu \ + --use_amp False +``` + + +对于单机多卡(示例中为4卡训练),启动脚本如下所示。 + +```shell +unset CUDA_VISIBLE_DEVICES +python -m paddle.distributed.launch --gpus "0,1,2,3" run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --task_name SST-2 \ + --max_seq_length 128 \ + --batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --logging_steps 1 \ + --save_steps 500 \ + --output_dir ./tmp/ \ + --device gpu \ + --use_amp False +``` + +注意:这里8卡训练时,虽然单卡的batch size没有变化(32),但是总卡的batch size相当于是单卡的8倍,因此学习率也设置为了单卡时的8倍。 + + +**【实战】** + +本部分可以参考paddlenlp库中的例子:[单机多卡训练](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/bert)。 + + +## 4. 
论文复现注意事项与FAQ + +本部分主要总结大家在论文复现赛过程中遇到的问题,如果本章内容没有能够解决你的问题,欢迎给该文档提出优化建议或者给Paddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose)。 + + +### 4.0 通用注意事项 + +* 需要仔细对照PaddlePaddle与参考代码的优化器参数实现,确保优化器参数严格对齐。 +* 如果遇到一些Paddle不支持的API操作,可以尝试使用替代实现进行复现。如下面的PyTorch代码,PaddlePaddle中可以通过slice + concat API的组合形式进行功能实现。同时,对于这个问题,建议优先给Paddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose),列出Paddle不支持的实现,开发人员会根据优先级进行开发。 + +```python +torch.stack([ + per_locations[:, 0] - per_box_regression[:, 0], + per_locations[:, 1] - per_box_regression[:, 1], + per_locations[:, 0] + per_box_regression[:, 2], + per_locations[:, 1] + per_box_regression[:, 3], +], dim=1) +``` +* 如果遇到Paddle不包含的OP或者API,比如(1) 如果是某些算法实现存在调用了外部OP,而且Paddle也不包含该OP实现;(2) 其他框架存在的API或者OP,但是Paddle中没有这些OP。此时: + * 对于Paddle资深用户来说,可以尝试使用Paddle的自定义算子功能,存在一定的代码开发量。 + * 对于初学者来说,可以给Paddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose),列出Paddle不支持的实现,Paddle开发人员会根据优先级进行实现。 +* PaddlePaddle与PyTorch对于不同名称的API,实现的功能可能是相同的,复现的时候注意,比如[paddle.optimizer.lr.StepDecay](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/lr/StepDecay_cn.html#stepdecay)与[torch.optim.lr_scheduler.StepLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html#torch.optim.lr_scheduler.StepLR) 。 +* 对于PaddlePaddle来说,通过`paddle.set_device`函数(全局)来确定模型结构是运行在什么设备上,对于torch来说,是通过`model.to("device")` (局部)来确定模型结构的运行设备,这块在复现的时候需要注意。 + + + +### 4.1 模型结构对齐 + +#### 4.1.1 API +* 对于 `paddle.nn.Linear` 层的weight参数,PaddlePaddle与PyTorch的保存方式不同,在转换时需要进行转置,示例代码可以参考[BERT权重转换脚本](https://github.com/JunnYu/BERT-SST2-Prod/blob/main/pipeline/weights/torch2paddle.py)。 +* `torch.masked_fill`函数的功能目前可以使用`paddle.where`进行实现,可以参考:[链接](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/faq/train_cn.html#paddletorch-masked-fillapi)。 +* `pack_padded_sequence`和`pad_packed_sequence`这两个API目前PaddlePaddle中没有实现,可以直接在RNN或者LSTM的输入中传入`sequence_length`来实现等价的功能。 + + +#### 4.1.2 权重转换 + +* 在权重转换的时候,不能只关注参数的名称,比如说有些`paddle.nn.Linear`层,但是定义的变量名称为`conv`,这种也是需要进行权重转置的。 +* 权重转换时,建议同时打印 Paddle 和 PyTorch 对应权重的shape,以防止名称相似但是shape不同的参数权重转换报错。 + +#### 4.1.3 模型组网正确性验证 + +* 在论文复现的过程中,可能会遇到一些经典的模型结构,比如Transformer等,Paddle官方也提供了Transformer的实现,但是这里建议自己根据PyTorch代码重新实现一遍,一方面是对整体的模型结构更加熟悉,另一方面也保证模型结构和权重完全对齐。 +* 在复杂的网络结构中,如果前向结果对不齐,可以按照模块排查问题,比如依次获取embedding、transformer-block、mlm-head输出等,看下问题具体出现在哪个子模块,再进到子模块详细排查。 +* 网络结构对齐后,尽量使用训练好的预训练模型和真实的数据进行前向diff计算,这样更准确。 + + +### 4.2 验证/测试集数据读取对齐 + +* 需要仔细排查数据预处理,不仅包含的预处理方法相同,也需要保证预处理的流程相同,比如先padding策略不同和截断策略的不同会导致得到最终的结果是不同的。 + + +### 4.3 评估指标对齐 + +* 真实数据评估时,需要注意评估时 `paddle.io.DataLoader` 的 ``drop_last`` 参数是否打开(文档[链接](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/DataLoader_cn.html#dataloader)),复现代码需要与参考代码保持一致,否则最后不够batch-size的数据的评估会有diff。 +* 在识别或者检索过程中,为了加速评估过程,往往会将评估函数由CPU实现改为GPU实现,由此会带来评估函数输出的不一致。这是由于sort函数对于相同值的排序结果不同带来的。在复现的过程中,如果可以接受轻微的指标不稳定,可以使用PaddlePaddle的sort函数,如果对于指标非常敏感,同时对速度性能要求很高,可以给PaddlePaddle提[ISSUE](https://github.com/PaddlePaddle/Paddle/issues/new/choose),由研发人员高优开发。 + + + +### 4.4 损失函数对齐 + +* 部分算法的损失函数中会用到 bool 索引,这时候可以使用[paddle.where](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/where_cn.html#where) 代替。 +* `paddle.nn.CrossEntropyLoss` 默认是在最后一维(axis=-1)计算损失函数,而 `torch.nn.CrossEntropyLoss` 是在axis=1的地方计算损失函数,因此如果输入的维度大于2,这里需要保证计算的维(axis)相同,否则可能会出错。 +* 在生成模型中会遇到梯度损失,需要对模型中的算子求二次梯度,目前`MaxPooling`暂时不支持二次梯度,如果复现的过程中遇到了需要对`MaxPooling`求二次梯度的情况,可以和Paddle官方开发同学反馈,进一步确认解决方案。 +* 在保存损失函数值的时候,注意要使用`paddle.no_grad`,或者仅仅保存转换成 numpy 的数组,避免损失没有析构导致内存泄漏问题。 + +```python +# 
错误示范 +loss = celoss(pred, label) +avg_loss += loss +# 正确示范1 +loss = celoss(pred, label) +avg_loss += loss.numpy() +# 正确示范2 +loss = celoss(pred, label) +with paddle.no_grad() + avg_loss += loss +``` + + +### 4.5 优化器对齐 + +* Paddle目前支持在 ``optimizer`` 中通过设置 ``params_groups`` 的方式设置不同参数的更新方式,可以参考[代码示例](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/optimizer/optimizer.py#L107) 。 +* 有些模型训练时,会使用梯度累加策略,即累加到一定step数量之后才进行参数更新,这时在实现上需要注意对齐。 +* 在某些任务中,比如说深度学习可视化、可解释性等任务中,一般只要求模型前向过程,不需要训练,此时优化器、学习率等用于模型训练的模块对于该类论文复现是不需要的。 +* 在文本分类领域,大多数Transformer模型都采用了AdamW优化器,并且会设置weigh decay,同时部分参数设置为no weight decay,例如位置编码的参数通常设置为no weight decay,no weight decay参数设置不正确,最终会有明显的精度损失,需要特别注意。一般可以通过分析模型权重来发现该问题,分别计算官方模型和复现模型每层参数权重的平均值、方差,对每一层依次对比,有显著差异的层可能存在问题,因为在weight decay的作用下,参数权重数值会相对较小,而未正确设置no weight decay,则会造成该层参数权重数值异常偏小。 + + + +### 4.6 学习率对齐 + +* PaddlePaddle 中参数的学习率受到优化器学习率和`ParamAttr`中设置的学习率影响,因此跟踪学习率需要将二者结合进行跟踪。 +* 对于复现代码和参考代码,学习率在整个训练过程中在相同的轮数相同的iter下应该保持一致,可以通过`reprod_log`工具、打印学习率值或者可视化二者学习率的log来查看diff。 +* 有些网络的学习率策略比较细致,比如带warmup的学习率策略,这里需要保证起始学习率等参数都完全一致。 + + + +### 4.7 正则化策略对齐 + +* 在如Transformer或者少部分CNN模型中,存在一些参数不做正则化(正则化系数为0)的情况。这里需要找到这些参数并对齐取消实施正则化策略,可以参考[这里](https://github.com/PaddlePaddle/PaddleClas/blob/release%2F2.3/ppcls/arch/backbone/model_zoo/resnest.py#L72),对特定参数进行修改。 + + +### 4.8 反向对齐 + +* 反向对齐时,如果第二轮开始,loss开始无法对齐,则首先需要排查下超参数的差异,没问题的话,在`loss.backward()`方法之后,使用`tensor.grad`获取梯度值,二分的方法查找diff,定位出PaddlePaddle与PyTorch梯度无法对齐的API或者操作,然后进一步验证。第3章中给出了获取所有参数的梯度方法,如果只希望打印特定参数的梯度,可以用下面的方式。 + + +```python +import paddle + +def print_hook_fn(grad): + print(grad) + +x = paddle.to_tensor([0., 1., 2., 3.], stop_gradient=False) +h = x.register_hook(print_hook_fn) +w = x * 4 +w.backward() +# backward之后会输出下面的内容 +# Tensor(shape=[4], dtype=float32, place=CPUPlace, stop_gradient=False, +# [4., 4., 4., 4.]) +``` + + + +### 4.9 训练集数据读取对齐 + +#### 4.9.1 API + +* 在前向过程中,如果数据预处理过程运行出错,请先将 ``paddle.io.DataLoader`` 的 ``num_workers`` 参数设为0,然后根据单个进程下的报错日志定位出具体的bug。 + +#### 4.9.2 数据预处理 + + +* 如果数据处理过程中涉及到随机数生成,建议固定seed (`np.random.seed(0)`, `random.seed(0)`),查看复现代码和参考代码处理后的数据是否有diff。 +* 对文本进行tokenizer处理时,需要确定文本的截断策略,padding策略。 + + +### 4.10 网络初始化对齐 + +* 对于不同的深度学习框架,网络初始化在大多情况下,即使值的分布完全一致,也无法保证值完全一致,这里也是论文复现中不确定性比较大的地方。如果十分怀疑初始化导致的问题,建议将参考的初始化权重转成paddle模型,加载该初始化模型训练,看下收敛精度。 +* CNN对于模型初始化相对来说没有那么敏感,在迭代轮数与数据集足够的情况下,最终精度指标基本接近;而transformer系列模型对于初始化比较敏感,在transformer系列模型训练对齐过程中,建议对这一块进行重点检查。 + + + +### 4.11 模型训练对齐 + +#### 4.11.1 训练对齐通用问题 + +* 有条件的话,复现工作之前最好先基于官方代码完成训练,保证与官方指标能够对齐,并且将训练策略和训练过程中的关键指标记录保存下来,比如每个epoch的学习率、Train Loss、Eval Loss、Eval Acc等,在复现网络的训练过程中,将关键指标保存下来,这样可以将两次训练中关键指标的变化曲线绘制出来,能够很方便的进行对比。 +* 训练过程中可以对loss或者acc进行可视化,和竞品loss或者acc进行直观的对比;如果训练较大的数据集,1次完整训练的成本比较高,此时可以隔一段时间查看一下,如果精度差异比较大,建议先停掉实验,排查原因。 +* 如果训练的过程中出nan,一般是因为除0或者log0的情况, 可以着重看下几个部分: + * 如果有预训练模型的话,可以确认下是否加载正确 + * 模型结构中计算loss的部分是否有考虑到正样本为0的情况 + * 也可能是某个API的数值越界导致的,可以测试较小的输入是否还会出现nan。 +* 如果训练过程中如果出现不收敛的情况,可以 + * 简化网络和数据,实验是否收敛; + * 如果是基于原有实现进行改动,可以尝试控制变量法,每次做一个改动,逐个排查; + * 检查学习率是否过大、优化器设置是否合理,排查下weight decay是否设置正确; + * 保存不同step之间的模型参数,观察模型参数是否更新。 diff --git a/examples/torch_migration/pipeline/Step1/README.md b/examples/torch_migration/pipeline/Step1/README.md new file mode 100644 index 000000000000..b3db11238110 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/README.md @@ -0,0 +1,86 @@ +# 使用方法 + + +本部分内容以前向对齐为例,介绍基于`repord_log`工具对齐的检查流程。其中与`reprod_log`工具有关的部分都是需要开发者需要添加的部分。 + + +```shell +# 进入文件夹并生成torch的bert模型权重 +cd pipeline/weights/ && python torch_bert_weights.py +# 进入文件夹并将torch的bert模型权重转换为paddle +cd pipeline/weights/ && python 
torch2paddle.py +# 进入文件夹并生成classifier权重 +cd pipeline/classifier_weights/ && python generate_classifier_weights.py +# 进入Step1文件夹 +cd pipeline/Step1/ +# 生成paddle的前向数据 +python pd_forward_bert.py +# 生成torch的前向数据 +python pt_forward_bert.py +# 对比生成log +python check_step1.py +``` + +具体地,以PaddlePaddle为例,`pd_forward_bert.py`的具体代码如下所示。 + +```python +import numpy as np +import paddle +from reprod_log import ReprodLogger +import sys +import os +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +config_path = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(config_path) +from models.pd_bert import * + +# 导入reprod_log中的ReprodLogger类 +from reprod_log import ReprodLogger + +reprod_logger = ReprodLogger() + +# 组网初始化加载BertModel权重 +paddle_dump_path = '../weights/paddle_weight.pdparams' +config = BertConfig() +model = BertForSequenceClassification(config) +checkpoint = paddle.load(paddle_dump_path) +model.bert.load_dict(checkpoint) + +# 加载分类权重 +classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") +model.load_dict(classifier_weights) +model.eval() +# 读入fake data并转换为tensor,这里也可以固定seed在线生成fake data +fake_data = np.load("../fake_data/fake_data.npy") +fake_data = paddle.to_tensor(fake_data) +# 模型前向 +out = model(fake_data) +# 保存前向结果,对于不同的任务,需要开发者添加。 +reprod_logger.add("logits", out.cpu().detach().numpy()) +reprod_logger.save("forward_paddle.npy") +``` + +diff检查的代码可以参考:[check_step1.py](./check_step1.py),具体代码如下所示。 + +```python +# https://github.com/littletomatodonkey/AlexNet-Prod/blob/master/pipeline/Step1/check_step1.py +# 使用reprod_log排查diff +from reprod_log import ReprodDiffHelper +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./forward_torch.npy") + paddle_info = diff_helper.load_info("./forward_paddle.npy") + diff_helper.compare_info(torch_info, paddle_info) + diff_helper.report(path="forward_diff.log") +``` + +产出日志如下,同时会将check的结果保存在`forward_diff.log`文件中。 + +``` +[2021/11/17 20:15:50] root INFO: logits: +[2021/11/17 20:15:50] root INFO: mean diff: check passed: True, value: 1.30385160446167e-07 +[2021/11/17 20:15:50] root INFO: diff check passed +``` + +平均绝对误差为1.3e-7,测试通过。 diff --git a/examples/torch_migration/pipeline/Step1/check_step1.py b/examples/torch_migration/pipeline/Step1/check_step1.py new file mode 100644 index 000000000000..6dbb247cf179 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/check_step1.py @@ -0,0 +1,23 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./forward_torch.npy") + paddle_info = diff_helper.load_info("./forward_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + diff_helper.report(path="forward_diff.log") diff --git a/examples/torch_migration/pipeline/Step1/pd_forward_bert.py b/examples/torch_migration/pipeline/Step1/pd_forward_bert.py new file mode 100644 index 000000000000..260386973b20 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/pd_forward_bert.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os + +import numpy as np +import paddle +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + paddle.set_device("cpu") + + # def logger + reprod_logger = ReprodLogger() + + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + # read or gen fake data + + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = paddle.to_tensor(fake_data) + # forward + out = model(fake_data)[0] + reprod_logger.add("logits", out.cpu().detach().numpy()) + reprod_logger.save("forward_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step1/pt_forward_bert.py b/examples/torch_migration/pipeline/Step1/pt_forward_bert.py new file mode 100644 index 000000000000..c2dd64965c99 --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/pt_forward_bert.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys +import os + +import numpy as np +from reprod_log import ReprodLogger +import torch + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pt_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + # def logger + reprod_logger = ReprodLogger() + + pytorch_dump_path = '../weights/torch_weight.bin' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + + classifier_weights = torch.load( + "../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.eval() + + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = torch.from_numpy(fake_data) + # forward + out = model(fake_data)[0] + reprod_logger.add("logits", out.cpu().detach().numpy()) + reprod_logger.save("forward_torch.npy") diff --git a/examples/torch_migration/pipeline/Step1/torch2paddle.py b/examples/torch_migration/pipeline/Step1/torch2paddle.py new file mode 100644 index 000000000000..07e6edc6e4de --- /dev/null +++ b/examples/torch_migration/pipeline/Step1/torch2paddle.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import paddle +import torch +from paddlenlp.transformers import BertForPretraining as PDBertForMaskedLM +from transformers import BertForMaskedLM as PTBertForMaskedLM + + +def convert_pytorch_checkpoint_to_paddle( + pytorch_checkpoint_path="pytorch_model.bin", + paddle_dump_path="model_state.pdparams", + version="old", +): + hf_to_paddle = { + "embeddings.LayerNorm": "embeddings.layer_norm", + "encoder.layer": "encoder.layers", + "attention.self.query": "self_attn.q_proj", + "attention.self.key": "self_attn.k_proj", + "attention.self.value": "self_attn.v_proj", + "attention.output.dense": "self_attn.out_proj", + "intermediate.dense": "linear1", + "output.dense": "linear2", + "attention.output.LayerNorm": "norm1", + "output.LayerNorm": "norm2", + "predictions.decoder.": "predictions.decoder_", + "predictions.transform.dense": "predictions.transform", + "predictions.transform.LayerNorm": "predictions.layer_norm", + } + do_not_transpose = [] + if version == "old": + hf_to_paddle.update({ + "predictions.bias": "predictions.decoder_bias", + ".gamma": ".weight", + ".beta": ".bias", + }) + do_not_transpose = do_not_transpose + ["predictions.decoder.weight"] + + pytorch_state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") + paddle_state_dict = OrderedDict() + for k, v in pytorch_state_dict.items(): + is_transpose = False + if k[-7:] == ".weight": + # embeddings.weight and LayerNorm.weight do not transpose + if all(d not in k for d in do_not_transpose): + if ".embeddings." not in k and ".LayerNorm." 
not in k: + if v.ndim == 2: + if 'embeddings' not in k: + v = v.transpose(0, 1) + is_transpose = True + is_transpose = False + oldk = k + print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}") + paddle_state_dict[k] = v.data.numpy() + + paddle.save(paddle_state_dict, paddle_dump_path) + + +def compare(out_torch, out_paddle): + out_torch = out_torch.detach().numpy() + out_paddle = out_paddle.detach().numpy() + assert out_torch.shape == out_paddle.shape + abs_dif = np.abs(out_torch - out_paddle) + mean_dif = np.mean(abs_dif) + max_dif = np.max(abs_dif) + min_dif = np.min(abs_dif) + print("mean_dif:{}".format(mean_dif)) + print("max_dif:{}".format(max_dif)) + print("min_dif:{}".format(min_dif)) + + +def test_forward(): + paddle.set_device("cpu") + model_torch = PTBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_paddle = PDBertForMaskedLM.from_pretrained("./bert-base-uncased") + model_torch.eval() + model_paddle.eval() + np.random.seed(42) + x = np.random.randint(1, + model_paddle.bert.config["vocab_size"], + size=(4, 64)) + input_torch = torch.tensor(x, dtype=torch.int64) + out_torch = model_torch(input_torch)[0] + + input_paddle = paddle.to_tensor(x, dtype=paddle.int64) + out_paddle = model_paddle(input_paddle)[0] + + print("torch result shape:{}".format(out_torch.shape)) + print("paddle result shape:{}".format(out_paddle.shape)) + compare(out_torch, out_paddle) + + +if __name__ == "__main__": + convert_pytorch_checkpoint_to_paddle("test.bin", "test_paddle.pdparams") +# test_forward() +# torch result shape:torch.Size([4, 64, 30522]) +# paddle result shape:[4, 64, 30522] +# mean_dif:1.666686512180604e-05 +# max_dif:0.00015211105346679688 +# min_dif:0.0 diff --git a/examples/torch_migration/pipeline/Step2/README.md b/examples/torch_migration/pipeline/Step2/README.md new file mode 100644 index 000000000000..029761c85e47 --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/README.md @@ -0,0 +1,131 @@ +# 使用方法 + +## 数据集和数据加载对齐步骤 + +* 使用下面的命令,判断数据预处理以及数据集是否构建正确。 + +```shell +python test_data.py +``` + +显示出以下内容,Dataset以及Dataloader的长度和内容diff均满足小于指定阈值,可以认为复现成功。 + +``` +[2021/11/17 20:57:06] root INFO: length: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_0_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_0_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_0_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_1_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_1_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_1_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_2_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_2_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_2_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_3_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 
0.0 +[2021/11/17 20:57:06] root INFO: dataset_3_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_3_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_4_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_4_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataset_4_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_0_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_0_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_0_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_1_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_1_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_1_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_2_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_2_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_2_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_3_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_3_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_3_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_4_input_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_4_token_type_ids: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: dataloader_4_labels: +[2021/11/17 20:57:06] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 20:57:06] root INFO: diff check passed +``` + + +## 数据评估对齐流程 + +### 评估代码和修改内容说明 + +Pytorch准确率评估指标使用的是huggingface的datasets库。 + +```python +import torch +import numpy as np +from datasets import load_metric +hf_metric = load_metric("accuracy.py") +logits = np.random.normal(0, 1, size=(64, 2)).astype("float32") +labels = np.random.randint(0, 2, size=(64,)).astype("int64") +hf_metric.add_batch(predictions=torch.from_numpy(logits).argmax(dim=-1), references=torch.from_numpy(labels)) +hf_accuracy = hf_metric.compute()["accuracy"] +print(hf_accuracy) +``` + +对应地,PaddlePaddle评估指标代码如下 + +```python +import paddle +import numpy as np +from paddle.metric import Accuracy +pd_metric = Accuracy() +pd_metric.reset() +logits = np.random.normal(0, 1, size=(64, 2)).astype("float32") +labels = np.random.randint(0, 2, size=(64,)).astype("int64") +correct = 
pd_metric.compute(paddle.to_tensor(logits), paddle.to_tensor(labels)) +pd_metric.update(correct) +pd_accuracy = pd_metric.accumulate() +print(pd_accuracy) +``` + +### 操作步骤 + +运行下面的命令,验证数据集评估是否正常。 + +```shell +# 生成paddle和pytorch指标 +python test_metric.py +# 对比生成log +python check_step2.py +``` + +最终结果输出如下,accuracy精度diff为0,小于阈值,结果前向验证, +``` +[2021/11/17 21:15:05] root INFO: accuracy: +[2021/11/17 21:15:05] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:15:05] root INFO: diff check passed + +``` diff --git a/examples/torch_migration/pipeline/Step2/accuracy.py b/examples/torch_migration/pipeline/Step2/accuracy.py new file mode 100644 index 000000000000..ae447e4a398a --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/accuracy.py @@ -0,0 +1,96 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Accuracy metric.""" + +import datasets +from sklearn.metrics import accuracy_score + +_DESCRIPTION = """ +Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: +Accuracy = (TP + TN) / (TP + TN + FP + FN) +TP: True positive +TN: True negative +FP: False positive +FN: False negative +""" + +_KWARGS_DESCRIPTION = """ +Args: + predictions: Predicted labels, as returned by a model. + references: Ground truth labels. + normalize: If False, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + sample_weight: Sample weights. +Returns: + accuracy: Accuracy score. +Examples: + + >>> accuracy_metric = datasets.load_metric("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1]) + >>> print(results) + {'accuracy': 1.0} +""" + +_CITATION = """\ +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, + _KWARGS_DESCRIPTION) +class Accuracy(datasets.Metric): + + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Sequence(datasets.Value("int32")), + "references": datasets.Sequence(datasets.Value("int32")), + } if self.config_name == "multilabel" else { + "predictions": datasets.Value("int32"), + "references": datasets.Value("int32"), + }), + reference_urls=[ + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html" + ], + ) + + def _compute(self, + predictions, + references, + normalize=True, + sample_weight=None): + return { + "accuracy": + accuracy_score( + references, + predictions, + normalize=normalize, + sample_weight=sample_weight, + ).item(), + } diff --git a/examples/torch_migration/pipeline/Step2/check_step2.py b/examples/torch_migration/pipeline/Step2/check_step2.py new file mode 100644 index 000000000000..ac74370e6a99 --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/check_step2.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./metric_torch.npy") + paddle_info = diff_helper.load_info("./metric_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="metric_diff.log") diff --git a/examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv b/examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv new file mode 100644 index 000000000000..fdc6b82affef --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/demo_sst2_sentence/demo.tsv @@ -0,0 +1,33 @@ +sentence label +it 's a charming and often affecting journey . 1 +unflinchingly bleak and desperate 0 +allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 1 +the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 1 +it 's slow -- very , very slow . 0 +although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . 1 +a sometimes tedious film . 0 +or doing last year 's taxes with your ex-wife . 0 +you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . 1 +in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey . 0 +the mesmerizing performances of the leads keep the film grounded and keep the audience riveted . 
1 +it takes a strange kind of laziness to waste the talents of robert forster , anne meara , eugene levy , and reginald veljohnson all in the same movie . 0 +... the film suffers from a lack of humor ( something needed to balance out the violence ) ... 0 +we root for ( clara and paul ) , even like them , though perhaps it 's an emotion closer to pity . 1 +even horror fans will most likely not find what they 're seeking with trouble every day ; the movie lacks both thrills and humor . 0 +a gorgeous , high-spirited musical from india that exquisitely blends music , dance , song , and high drama . 1 +the emotions are raw and will strike a nerve with anyone who 's ever had family trauma . 1 +audrey tatou has a knack for picking roles that magnify her outrageous charm , and in this literate french comedy , she 's as morning-glory exuberant as she was in amélie . 1 +... the movie is just a plain old monster . 0 +in its best moments , resembles a bad high school production of grease , without benefit of song . 0 +pumpkin takes an admirable look at the hypocrisy of political correctness , but it does so with such an uneven tone that you never know when humor ends and tragedy begins . 0 +the iditarod lasts for days - this just felt like it did . 0 +holden caulfield did it better . 0 +a delectable and intriguing thriller filled with surprises , read my lips is an original . 1 +seldom has a movie so closely matched the spirit of a man and his work . 1 +nicks , seemingly uncertain what 's going to make people laugh , runs the gamut from stale parody to raunchy sex gags to formula romantic comedy . 0 +the action switches between past and present , but the material link is too tenuous to anchor the emotional connections that purport to span a 125-year divide . 0 +it 's an offbeat treat that pokes fun at the democratic exercise while also examining its significance for those who take part . 1 +it 's a cookie-cutter movie , a cut-and-paste job . 0 +i had to look away - this was god awful . 0 +thanks to scott 's charismatic roger and eisenberg 's sweet nephew , roger dodger is one of the most compelling variations on in the company of men . 1 +... designed to provide a mix of smiles and tears , `` crossroads '' instead provokes a handful of unintentional howlers and numerous yawns . 0 diff --git a/examples/torch_migration/pipeline/Step2/predict.py b/examples/torch_migration/pipeline/Step2/predict.py new file mode 100644 index 000000000000..93079f6ed8a1 --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/predict.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
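+"""Run the PaddlePaddle BERT classifier on one demo SST-2 sentence.
+
+Added descriptive docstring: the script loads the converted backbone weights and
+the shared classifier weights, tokenizes the first sentence in
+demo_sst2_sentence/demo.tsv, and prints the predicted class id with its
+softmax probability.
+"""
+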
+from functools import partial +import sys +import os + +import paddle +import paddle.nn as nn +from datasets import Dataset +from paddlenlp.data import Dict, Pad, Stack +from paddlenlp.datasets import load_dataset as ppnlp_load_dataset +from paddlenlp.transformers import BertTokenizer as PPNLPBertTokenizer +from reprod_log import ReprodDiffHelper, ReprodLogger +from transformers import BertTokenizer as HFBertTokenizer +import functools +import pandas as pd + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) +from models.pd_bert import BertConfig, BertForSequenceClassification + + +def get_data(): + + def read(data_path): + df = pd.read_csv(data_path, sep="\t") + for _, row in df.iterrows(): + yield {"sentence": row["sentence"], "labels": row["label"]} + + def convert_example(example, tokenizer, max_length=128): + labels = [example["labels"]] + #labels = np.array([example["labels"]], dtype="int64") + example = tokenizer(example["sentence"], max_seq_len=max_length) + return example + + tokenizer = PPNLPBertTokenizer.from_pretrained("bert-base-uncased") + dataset_test = ppnlp_load_dataset(read, + data_path='demo_sst2_sentence/demo.tsv', + lazy=False) + trans_func = partial(convert_example, tokenizer=tokenizer, max_length=128) + + dataset_test = dataset_test.map(trans_func, lazy=False) + one_sentence = dataset_test.new_data[0] + + for k in ["input_ids", "token_type_ids"]: + one_sentence[k] = paddle.to_tensor(one_sentence[k], dtype='int64') + one_sentence[k] = paddle.unsqueeze(one_sentence[k], axis=0) + + return one_sentence + + +@paddle.no_grad() +def main(): + # 模型定义 + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + + model.eval() + tokenizer = PPNLPBertTokenizer.from_pretrained("bert-base-uncased") + # 要预测的句子 + data = get_data() + softmax = nn.Softmax() + # 预测的各类别的概率值 + output = softmax(model(**data)[0]).numpy() + + # 概率值最大的类别 + class_id = output.argmax() + # 对应的概率值 + prob = output[0][class_id] + print(f"class_id: {class_id}, prob: {prob}") + return output + + +if __name__ == "__main__": + main() diff --git a/examples/torch_migration/pipeline/Step2/test_data.py b/examples/torch_migration/pipeline/Step2/test_data.py new file mode 100644 index 000000000000..37ed96699e3b --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/test_data.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
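+"""Check that the PaddlePaddle and PyTorch data pipelines produce the same batches.
+
+Added descriptive docstring: both pipelines read demo_sst2_sentence/demo.tsv,
+tokenize it with the respective bert-base-uncased tokenizer, and batch it with
+padding. Five randomly chosen dataset samples and the first five batches are
+recorded with ReprodLogger and compared through ReprodDiffHelper.
+"""
+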
+import functools +from functools import partial + +import numpy as np +import paddle +import pandas as pd +import torch +from datasets import Dataset +from paddlenlp.data import Dict, Pad, Stack +from paddlenlp.datasets import load_dataset as ppnlp_load_dataset +from paddlenlp.transformers import BertTokenizer as PPNLPBertTokenizer +from reprod_log import ReprodDiffHelper, ReprodLogger +from transformers import BertTokenizer as HFBertTokenizer + + +def build_paddle_data_pipeline(): + from paddlenlp.data import DataCollatorWithPadding + + def read(data_path): + df = pd.read_csv(data_path, sep="\t") + for _, row in df.iterrows(): + yield {"sentence": row["sentence"], "labels": row["label"]} + + def convert_example(example, tokenizer, max_length=128): + labels = [example["labels"]] + example = tokenizer(example["sentence"], max_seq_len=max_length) + + example["labels"] = labels + return example + + # load tokenizer + tokenizer = PPNLPBertTokenizer.from_pretrained("bert-base-uncased") + # load data + dataset_test = ppnlp_load_dataset(read, + data_path='demo_sst2_sentence/demo.tsv', + lazy=False) + trans_func = partial(convert_example, tokenizer=tokenizer, max_length=128) + + # tokenize data + dataset_test = dataset_test.map(trans_func, lazy=False) + + test_sampler = paddle.io.SequenceSampler(dataset_test) + test_batch_sampler = paddle.io.BatchSampler(sampler=test_sampler, + batch_size=4) + data_collator = DataCollatorWithPadding(tokenizer) + data_loader_test = paddle.io.DataLoader( + dataset_test, + batch_sampler=test_batch_sampler, + num_workers=0, + collate_fn=data_collator, + ) + + return dataset_test, data_loader_test + + +def build_torch_data_pipeline(): + from transformers import DataCollatorWithPadding + tokenizer = HFBertTokenizer.from_pretrained("bert-base-uncased") + + def preprocess_function(examples): + result = tokenizer( + examples["sentence"], + padding=False, + max_length=128, + truncation=True, + return_token_type_ids=True, + ) + if "label" in examples: + result["labels"] = [examples["label"]] + return result + + # load data + dataset_test = Dataset.from_csv("demo_sst2_sentence/demo.tsv", sep="\t") + dataset_test = dataset_test.map( + preprocess_function, + batched=False, + remove_columns=dataset_test.column_names, + desc="Running tokenizer on dataset", + ) + dataset_test.set_format("np", + columns=["input_ids", "token_type_ids", "labels"]) + test_sampler = torch.utils.data.SequentialSampler(dataset_test) + collate_fn = DataCollatorWithPadding(tokenizer) + data_loader_test = torch.utils.data.DataLoader( + dataset_test, + batch_size=4, + sampler=test_sampler, + num_workers=0, + collate_fn=collate_fn, + ) + return dataset_test, data_loader_test + + +def test_data_pipeline(): + diff_helper = ReprodDiffHelper() + paddle_dataset, paddle_dataloader = build_paddle_data_pipeline() + torch_dataset, torch_dataloader = build_torch_data_pipeline() + + logger_paddle_data = ReprodLogger() + logger_torch_data = ReprodLogger() + + logger_paddle_data.add("length", np.array(len(paddle_dataset))) + logger_torch_data.add("length", np.array(len(torch_dataset))) + + # random choose 5 images and check + for idx in range(5): + rnd_idx = np.random.randint(0, len(paddle_dataset)) + for k in ["input_ids", "token_type_ids", "labels"]: + + logger_paddle_data.add(f"dataset_{idx}_{k}", + np.array(paddle_dataset[rnd_idx][k])) + + logger_torch_data.add(f"dataset_{idx}_{k}", + np.array(torch_dataset[rnd_idx][k])) + + for idx, (paddle_batch, + torch_batch) in enumerate(zip(paddle_dataloader, + 
torch_dataloader)): + if idx >= 5: + break + for i, k in enumerate(["input_ids", "token_type_ids", "labels"]): + logger_paddle_data.add(f"dataloader_{idx}_{k}", + paddle_batch[k].numpy()) + logger_torch_data.add(f"dataloader_{idx}_{k}", + torch_batch[k].cpu().numpy()) + + diff_helper.compare_info(logger_paddle_data.data, logger_torch_data.data) + diff_helper.report() + + +if __name__ == "__main__": + test_data_pipeline() diff --git a/examples/torch_migration/pipeline/Step2/test_metric.py b/examples/torch_migration/pipeline/Step2/test_metric.py new file mode 100644 index 000000000000..408ffb0e840b --- /dev/null +++ b/examples/torch_migration/pipeline/Step2/test_metric.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import torch +from datasets import load_metric +from paddle.metric import Accuracy +from reprod_log import ReprodLogger + + +def generate(): + pd_metric = Accuracy() + pd_metric.reset() + hf_metric = load_metric("accuracy.py") + for i in range(4): + logits = np.random.normal(0, 1, size=(64, 2)).astype("float32") + labels = np.random.randint(0, 2, size=(64, )).astype("int64") + # paddle metric + correct = pd_metric.compute(paddle.to_tensor(logits), + paddle.to_tensor(labels)) + pd_metric.update(correct) + # hf metric + hf_metric.add_batch( + predictions=torch.from_numpy(logits).argmax(dim=-1), + references=torch.from_numpy(labels), + ) + pd_accuracy = pd_metric.accumulate() + hf_accuracy = hf_metric.compute()["accuracy"] + reprod_logger = ReprodLogger() + reprod_logger.add("accuracy", np.array([pd_accuracy])) + reprod_logger.save("metric_paddle.npy") + reprod_logger = ReprodLogger() + reprod_logger.add("accuracy", np.array([hf_accuracy])) + reprod_logger.save("metric_torch.npy") + + +if __name__ == "__main__": + generate() diff --git a/examples/torch_migration/pipeline/Step3/README.md b/examples/torch_migration/pipeline/Step3/README.md new file mode 100644 index 000000000000..4e6e79ae1bf1 --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/README.md @@ -0,0 +1,67 @@ +# 使用方法 + +## 代码解析 + +以PaddlePaddle为例,下面为定义模型、计算loss并保存的代码。 + +```python +# paddle_loss.py +if __name__ == "__main__": + paddle.set_device("cpu") + + # def logger + reprod_logger = ReprodLogger() + + model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_classes=2) + classifier_weights = paddle.load("../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + + criterion = nn.CrossEntropyLoss() + + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = paddle.to_tensor(fake_data) + + fake_label = np.load("../fake_data/fake_label.npy") + fake_label = paddle.to_tensor(fake_label) + + # forward + out = model(fake_data) + + loss = criterion(out, fake_label) + # + reprod_logger.add("loss", loss.cpu().detach().numpy()) + reprod_logger.save("loss_paddle.npy") + +``` + 
+记录loss并保存在`loss_paddle.npy`文件中。 + + +## 操作步骤 + +* 具体操作步骤如下所示。 + + +```shell +# 生成paddle的前向loss结果 +python paddle_loss.py + +# 生成torch的前向loss结果 +python torch_loss.py + +# 对比生成log +python check_step3.py +``` + +`check_step3.py`的输出结果如下所示,同时也会保存在`loss_diff.log`文件中。 + +``` +[2021/11/17 21:27:35] root INFO: loss: +[2021/11/17 21:27:35] root INFO: mean diff: check passed: True, value: 5.960464477539063e-08 +[2021/11/17 21:27:35] root INFO: diff check passed + +``` + +diff为5.96e-8,check通过。 diff --git a/examples/torch_migration/pipeline/Step3/check_step3.py b/examples/torch_migration/pipeline/Step3/check_step3.py new file mode 100644 index 000000000000..546233dade0e --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/check_step3.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./loss_torch.npy") + paddle_info = diff_helper.load_info("./loss_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="loss_diff.log") diff --git a/examples/torch_migration/pipeline/Step3/paddle_loss.py b/examples/torch_migration/pipeline/Step3/paddle_loss.py new file mode 100644 index 000000000000..fd10a4fc32d2 --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/paddle_loss.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
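+"""Forward-and-loss check for the PaddlePaddle BERT classifier.
+
+Added descriptive docstring: loads the converted backbone and classifier
+weights, runs a forward pass on the fake data, computes the cross-entropy loss
+against the fake labels, and saves it to loss_paddle.npy for comparison.
+"""
+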
+import sys +import os + +import numpy as np +import paddle +import paddle.nn as nn +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + paddle.set_device("cpu") + + # def logger + reprod_logger = ReprodLogger() + + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + + criterion = nn.CrossEntropyLoss() + + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = paddle.to_tensor(fake_data) + + fake_label = np.load("../fake_data/fake_label.npy") + fake_label = paddle.to_tensor(fake_label) + + # forward + out = model(fake_data)[0] + + loss = criterion(out, fake_label) + reprod_logger.add("loss", loss.cpu().detach().numpy()) + reprod_logger.save("loss_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step3/torch_loss.py b/examples/torch_migration/pipeline/Step3/torch_loss.py new file mode 100644 index 000000000000..302520581023 --- /dev/null +++ b/examples/torch_migration/pipeline/Step3/torch_loss.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
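+"""Forward-and-loss check for the PyTorch BERT classifier.
+
+Added descriptive docstring: mirrors paddle_loss.py on the PyTorch side,
+loading the reference weights, running a forward pass on the same fake data,
+computing the cross-entropy loss, and saving it to loss_torch.npy.
+"""
+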
+import sys +import os + +import numpy as np +import paddle +import torch +import torch.nn as nn +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pt_bert import BertConfig, BertForSequenceClassification + +if __name__ == "__main__": + + # def logger + reprod_logger = ReprodLogger() + + criterion = nn.CrossEntropyLoss() + + pytorch_dump_path = '../weights/torch_weight.bin' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + + classifier_weights = torch.load( + "../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.eval() + # read or gen fake data + fake_data = np.load("../fake_data/fake_data.npy") + fake_data = torch.from_numpy(fake_data) + + fake_label = np.load("../fake_data/fake_label.npy") + fake_label = torch.from_numpy(fake_label) + + # forward + out = model(fake_data)[0] + + loss = criterion(out, fake_label) + reprod_logger.add("loss", loss.cpu().detach().numpy()) + reprod_logger.save("loss_torch.npy") diff --git a/examples/torch_migration/pipeline/Step4/README.md b/examples/torch_migration/pipeline/Step4/README.md new file mode 100644 index 000000000000..695b0728a773 --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/README.md @@ -0,0 +1,136 @@ +# 使用方法 + +### 学习率对齐验证 + +运行下面的命令,检查学习率模块设置是否正确。 + +```shell +python test_lr_scheduler.py +``` + +最终输出内容如下。 + +``` +[2021/11/17 21:44:19] root INFO: step_100_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_300_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_500_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_700_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_900_linear_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_100_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_300_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_500_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: False, value: 9.35605818719964e-06 +[2021/11/17 21:44:19] root INFO: step_700_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: False, value: 1.3681476625617212e-05 +[2021/11/17 21:44:19] root INFO: step_900_cosine_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: False, value: 1.8924391285779562e-05 +[2021/11/17 21:44:19] root INFO: step_100_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_300_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_500_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: step_700_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: 
step_900_polynomial_lr: +[2021/11/17 21:44:19] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 21:44:19] root INFO: diff check failed + +``` + +linear和polynomial方式衰减的学习率diff为0,check通过,cosine方式衰减学习率可能由于计算误差未通过。 + + +### 反向对齐操作方法 + +#### 代码讲解 + +以PaddlePaddle为例,训练流程核心代码如下所示。每个iter中输入相同的fake data与fake label,计算loss,进行梯度反传与参数更新,将loss批量返回,用于后续的验证。 + +```python +def pd_train_some_iters(model, + criterion, + optimizer, + fake_data, + fake_label, + max_iter=2): + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = PDBertConfig() + model = PDBertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + criterion = paddle.nn.CrossEntropy() + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW(learning_rate=3e-5, parameters=model.parameters(), + weight_decay=1e-2, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params) + loss_list = [] + for idx in range(max_iter): + input_ids = paddle.to_tensor(fake_data) + labels = paddle.to_tensor(fake_label) + + output = model(input_ids) + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + loss_list.append(loss) + return loss_list +``` + + +#### 操作方法 + +运行下面的命令,基于fake data与fake label,依次生成若干轮loss数据并保存,使用`reprod_log`工具进行diff排查。 + +```shell +# 生成paddle和torch的前向数据 +python test_bp.py + +# 对比生成log +python check_step4.py +``` + +最终输出结果如下,同时会保存在文件`bp_align_diff.log`中。 + +``` +[2021/11/17 22:08:30] root INFO: loss_0: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_1: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_2: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_3: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_4: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_5: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_6: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_7: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_8: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: loss_9: +[2021/11/17 22:08:30] root INFO: mean diff: check passed: True, value: 0.0 +[2021/11/17 22:08:30] root INFO: diff check passed + +``` + +前面10轮的loss diff均等于0,check通过。 diff --git a/examples/torch_migration/pipeline/Step4/check_step4.py b/examples/torch_migration/pipeline/Step4/check_step4.py new file mode 100644 index 000000000000..751be400682b --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/check_step4.py @@ -0,0 +1,23 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("./bp_align_torch.npy") + paddle_info = diff_helper.load_info("./bp_align_paddle.npy") + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="bp_align_diff.log") diff --git a/examples/torch_migration/pipeline/Step4/test_bp.py b/examples/torch_migration/pipeline/Step4/test_bp.py new file mode 100644 index 000000000000..c584ef60ee50 --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/test_bp.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os + +import numpy as np +import paddle +import torch +from reprod_log import ReprodLogger +from transformers import AdamW + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 1)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import ( + BertForSequenceClassification as PDBertForSequenceClassification, ) +from models.pd_bert import ( + BertConfig as PDBertConfig, ) +from models.pt_bert import ( + BertForSequenceClassification as HFBertForSequenceClassification, ) +from models.pt_bert import ( + BertConfig as HFBertConfig, ) + + +def pd_train_some_iters(model, + criterion, + optimizer, + fake_data, + fake_label, + max_iter=2): + paddle_dump_path = '../weights/paddle_weight.pdparams' + config = PDBertConfig() + model = PDBertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + model.eval() + criterion = paddle.nn.CrossEntropy() + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW( + learning_rate=3e-5, + parameters=model.parameters(), + weight_decay=1e-2, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params, + ) + loss_list = [] + for idx in range(max_iter): + input_ids = paddle.to_tensor(fake_data) + labels = paddle.to_tensor(fake_label) + + output = model(input_ids) + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + loss_list.append(loss) + return loss_list + + +def hf_train_some_iters(fake_data, fake_label, max_iter=2): + + pytorch_dump_path = '../weights/torch_weight.bin' + config = HFBertConfig() + model = HFBertForSequenceClassification(config) + 
checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + classifier_weights = torch.load( + "../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.eval() + criterion = torch.nn.CrossEntropyLoss() + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": + 1e-2, + }, + { + "params": [ + p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": + 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5) + + loss_list = [] + for idx in range(max_iter): + input_ids = torch.from_numpy(fake_data) + labels = torch.from_numpy(fake_label) + + output = model(input_ids)[0] + loss = criterion(output, labels) + loss.backward() + optimizer.step() + optimizer.zero_grad() + loss_list.append(loss) + return loss_list + + +if __name__ == "__main__": + print("Start training") + paddle.set_device("cpu") + fake_data = np.load("../fake_data/fake_data.npy") + fake_label = np.load("../fake_data/fake_label.npy") + hf_reprod_logger = ReprodLogger() + hf_loss_list = hf_train_some_iters(fake_data, fake_label, 10) + for idx, loss in enumerate(hf_loss_list): + hf_reprod_logger.add(f"loss_{idx}", loss.detach().cpu().numpy()) + hf_reprod_logger.save("bp_align_torch.npy") + + pd_reprod_logger = ReprodLogger() + pd_loss_list = hf_train_some_iters(fake_data, fake_label, 10) + for idx, loss in enumerate(pd_loss_list): + pd_reprod_logger.add(f"loss_{idx}", loss.detach().cpu().numpy()) + pd_reprod_logger.save("bp_align_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step4/test_lr_scheduler.py b/examples/torch_migration/pipeline/Step4/test_lr_scheduler.py new file mode 100644 index 000000000000..9e5f86520658 --- /dev/null +++ b/examples/torch_migration/pipeline/Step4/test_lr_scheduler.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
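+"""Compare HuggingFace and PaddleNLP learning-rate schedulers step by step.
+
+Added descriptive docstring: for the linear, cosine, and polynomial schedules,
+the learning rate is logged at steps 100/300/500/700/900 with ReprodLogger and
+the two frameworks are compared through ReprodDiffHelper.
+"""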
+ +import numpy as np +import paddle +import torch + +# define paddle scheduler +from paddlenlp.transformers import ( + CosineDecayWithWarmup, + LinearDecayWithWarmup, + PolyDecayWithWarmup, +) +from reprod_log import ReprodDiffHelper, ReprodLogger +from torch.optim import AdamW +from transformers.optimization import get_scheduler as get_hf_scheduler + +scheduler_type2cls = { + "linear": LinearDecayWithWarmup, + "cosine": CosineDecayWithWarmup, + "polynomial": PolyDecayWithWarmup, +} + + +def get_paddle_scheduler( + learning_rate, + scheduler_type, + num_warmup_steps=None, + num_training_steps=None, + **scheduler_kwargs, +): + if scheduler_type not in scheduler_type2cls.keys(): + data = " ".join(scheduler_type2cls.keys()) + raise ValueError(f"scheduler_type must be choson from {data}") + + if num_warmup_steps is None: + raise ValueError( + f"requires `num_warmup_steps`, please provide that argument.") + + if num_training_steps is None: + raise ValueError( + f"requires `num_training_steps`, please provide that argument.") + + return scheduler_type2cls[scheduler_type]( + learning_rate=learning_rate, + total_steps=num_training_steps, + warmup=num_warmup_steps, + **scheduler_kwargs, + ) + + +def test_lr(): + diff_helper = ReprodDiffHelper() + pd_reprod_logger = ReprodLogger() + hf_reprod_logger = ReprodLogger() + lr = 3e-5 + num_warmup_steps = 345 + num_training_steps = 1024 + milestone = [100, 300, 500, 700, 900] + for scheduler_type in ["linear", "cosine", "polynomial"]: + torch_optimizer = AdamW(torch.nn.Linear(1, 1).parameters(), lr=lr) + hf_scheduler = get_hf_scheduler( + name=scheduler_type, + optimizer=torch_optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + pd_scheduler = get_paddle_scheduler( + learning_rate=lr, + scheduler_type=scheduler_type, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + + for i in range(num_training_steps): + hf_scheduler.step() + pd_scheduler.step() + if i in milestone: + hf_reprod_logger.add( + f"step_{i}_{scheduler_type}_lr", + np.array([hf_scheduler.get_last_lr()[-1]]), + ) + pd_reprod_logger.add(f"step_{i}_{scheduler_type}_lr", + np.array([pd_scheduler.get_lr()])) + + diff_helper.compare_info(hf_reprod_logger.data, pd_reprod_logger.data) + diff_helper.report() + + +if __name__ == "__main__": + test_lr() diff --git a/examples/torch_migration/pipeline/Step5/README.md b/examples/torch_migration/pipeline/Step5/README.md new file mode 100644 index 000000000000..bab96301cac7 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/README.md @@ -0,0 +1,29 @@ +# 使用方法 + +首先运行下面的python代码,生成`train_align_torch.npy`和`train_align_paddle.npy`文件。 + +```python +# 运行生成paddle结果 +cd bert_paddle/ +sh train.sh +# 运行生成torch结果 +cd bert_torch/ +sh train.sh +``` + +然后运行下面的代码,运行训练脚本;之后使用`check_step5.py`进行精度diff验证。 + +```shell +# 对比生成log +python check_step5.py +``` + +这里需要注意的是,由于是精度对齐,SST-2数据集的精度diff在0.15%以内时,可以认为对齐,因此将`diff_threshold`参数修改为了`0.0015`。 + +``` +[2021/11/17 22:41:12] root INFO: acc: +[2021/11/17 22:41:12] root INFO: mean diff: check passed: True, value: 0.0011467889908256534 +[2021/11/17 22:41:12] root INFO: diff check passed +``` + +最终diff为`0.00114`,小于阈值标准,检查通过。 diff --git a/examples/torch_migration/pipeline/Step5/bert_paddle/train.py b/examples/torch_migration/pipeline/Step5/bert_paddle/train.py new file mode 100644 index 000000000000..ea0a2ec40302 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_paddle/train.py @@ -0,0 +1,342 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +import datetime +import random +import time +from functools import partial + +import numpy as np +import paddle +import paddle.nn as nn +import utils +from paddle.metric import Accuracy +from paddle.optimizer import AdamW +from paddlenlp.data import Dict, Pad, Stack +from paddlenlp.datasets import load_dataset +from paddlenlp.transformers import BertTokenizer +from reprod_log import ReprodLogger + +CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0] # 当前目录 +CONFIG_PATH = CURRENT_DIR.rsplit('/', 2)[0] +sys.path.append(CONFIG_PATH) + +from models.pd_bert import BertConfig, BertForSequenceClassification + + +def train_one_epoch( + model, + criterion, + optimizer, + lr_scheduler, + data_loader, + epoch, + print_freq, + scaler=None, +): + model.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter("lr", + utils.SmoothedValue(window_size=1, fmt="{value}")) + metric_logger.add_meter("sentence/s", + utils.SmoothedValue(window_size=10, fmt="{value}")) + + header = "Epoch: [{}]".format(epoch) + for batch in metric_logger.log_every(data_loader, print_freq, header): + inputs = {"input_ids": batch[0], "token_type_ids": batch[1]} + labels = batch[2] + start_time = time.time() + with paddle.amp.auto_cast( + enable=scaler is not None, + custom_white_list=["layer_norm", "softmax", "gelu"], + ): + logits = model(**inputs)[0] + loss = criterion( + logits.reshape([-1, 2]), + labels.reshape([ + -1, + ]), + ) + + optimizer.clear_grad() + if scaler is not None: + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + optimizer.step() + lr_scheduler.step() + batch_size = inputs["input_ids"].shape[0] + metric_logger.update(loss=loss.item(), lr=lr_scheduler.get_lr()) + metric_logger.meters["sentence/s"].update(batch_size / + (time.time() - start_time)) + + +def evaluate(model, criterion, data_loader, metric, print_freq=100): + model.eval() + metric.reset() + metric_logger = utils.MetricLogger(delimiter=" ") + header = "Test:" + with paddle.no_grad(): + for batch in metric_logger.log_every(data_loader, print_freq, header): + inputs = {"input_ids": batch[0], "token_type_ids": batch[1]} + labels = batch[2] + logits = model(**inputs)[0] + loss = criterion( + logits.reshape([-1, 2]), + labels.reshape([ + -1, + ]), + ) + metric_logger.update(loss=loss.item()) + correct = metric.compute(logits, labels) + metric.update(correct) + acc_global_avg = metric.accumulate() + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print(" * Accuracy {acc_global_avg:.6f}".format( + acc_global_avg=acc_global_avg)) + return acc_global_avg + + +def set_seed(seed=42): + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + +def convert_example(example, tokenizer, max_length=128): + labels = np.array([example["labels"]], dtype="int64") + example = tokenizer(example["sentence"], max_seq_len=max_length) + return { + 
"input_ids": example["input_ids"], + "token_type_ids": example["token_type_ids"], + "labels": labels, + } + + +def load_data(args, tokenizer): + print("Loading data") + train_ds = load_dataset("glue", args.task_name, splits="train") + validation_ds = load_dataset("glue", args.task_name, splits="dev") + + trans_func = partial(convert_example, + tokenizer=tokenizer, + max_length=args.max_length) + train_ds = train_ds.map(trans_func, lazy=False) + validation_ds = validation_ds.map(trans_func, lazy=False) + + train_sampler = paddle.io.BatchSampler(train_ds, + batch_size=args.batch_size, + shuffle=False) + validation_sampler = paddle.io.BatchSampler(validation_ds, + batch_size=args.batch_size, + shuffle=False) + + return train_ds, validation_ds, train_sampler, validation_sampler + + +def main(args): + if args.output_dir: + pass + # utils.mkdir(args.output_dir) + print(args) + scaler = None + # if args.fp16: + # scaler = paddle.amp.GradScaler() + paddle.set_device(args.device) + + if args.seed is not None: + set_seed(args.seed) + + tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), + "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), + "labels": Stack(dtype="int64"), + }): fn(samples) + train_dataset, validation_dataset, train_sampler, validation_sampler = load_data( + args, tokenizer) + + train_data_loader = paddle.io.DataLoader( + train_dataset, + batch_sampler=train_sampler, + num_workers=args.workers, + collate_fn=batchify_fn, + ) + validation_data_loader = paddle.io.DataLoader( + validation_dataset, + batch_sampler=validation_sampler, + num_workers=args.workers, + collate_fn=batchify_fn, + ) + + print("Creating model") + paddle_dump_path = '../../weights/paddle_weight.pdparams' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = paddle.load(paddle_dump_path) + model.bert.load_dict(checkpoint) + + classifier_weights = paddle.load( + "../../classifier_weights/paddle_classifier_weights.bin") + model.load_dict(classifier_weights) + + print("Creating criterion") + criterion = nn.CrossEntropyLoss() + + print("Creating lr_scheduler") + lr_scheduler = utils.get_scheduler( + learning_rate=args.lr, + scheduler_type=args.lr_scheduler_type, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.num_train_epochs * len(train_data_loader), + ) + + print("Creating optimizer") + # Split weights in two groups, one with weight decay and the other not. 
+ decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = AdamW( + learning_rate=lr_scheduler, + parameters=model.parameters(), + weight_decay=args.weight_decay, + epsilon=1e-6, + apply_decay_param_fun=lambda x: x in decay_params, + ) + metric = Accuracy() + + if args.test_only: + evaluate(model, criterion, validation_data_loader, metric) + return + + print("Start training") + start_time = time.time() + best_accuracy = 0.0 + for epoch in range(args.num_train_epochs): + + train_one_epoch( + model, + criterion, + optimizer, + lr_scheduler, + train_data_loader, + epoch, + args.print_freq, + scaler, + ) + acc = evaluate(model, criterion, validation_data_loader, metric) + best_accuracy = max(best_accuracy, acc) + if args.output_dir: + pass + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("Training time {}".format(total_time_str)) + return best_accuracy + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser( + description="Paddle SST-2 Classification Training", add_help=add_help) + parser.add_argument("--task_name", + default="sst-2", + help="the name of the glue task to train on.") + parser.add_argument( + "--model_name_or_path", + default="bert-base-uncased", + help= + "path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument("--device", default="gpu", help="device") + parser.add_argument("--batch_size", default=32, type=int) + parser.add_argument( + "--max_length", + type=int, + default=128, + help= + ("The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," + ), + ) + parser.add_argument("--num_train_epochs", + default=3, + type=int, + help="number of total epochs to run") + parser.add_argument( + "--workers", + default=0, + type=int, + help="number of data loading workers (default: 16)", + ) + parser.add_argument("--lr", + default=3e-5, + type=float, + help="initial learning rate") + parser.add_argument( + "--weight_decay", + default=1e-2, + type=float, + help="weight decay (default: 1e-2)", + dest="weight_decay", + ) + parser.add_argument( + "--lr_scheduler_type", + default="linear", + help="the scheduler type to use.", + choices=["linear", "cosine", "polynomial"], + ) + parser.add_argument( + "--num_warmup_steps", + default=0, + type=int, + help="number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument("--print_freq", + default=10, + type=int, + help="print frequency") + parser.add_argument("--output_dir", + default="outputs", + help="path where to save") + parser.add_argument( + "--test_only", + help="only test the model", + action="store_true", + ) + parser.add_argument("--seed", + default=42, + type=int, + help="a seed for reproducible training.") + # Mixed precision training parameters + parser.add_argument("--fp16", + action="store_true", + help="whether or not mixed precision training") + + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + acc = main(args) + reprod_logger = ReprodLogger() + reprod_logger.add("acc", np.array([acc])) + reprod_logger.save("train_align_paddle.npy") diff --git a/examples/torch_migration/pipeline/Step5/bert_paddle/train.sh b/examples/torch_migration/pipeline/Step5/bert_paddle/train.sh new file mode 100644 index 000000000000..5c5e367f6404 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_paddle/train.sh @@ 
-0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python -m paddle.distributed.launch --gpus "1" train.py \ + --model_name_or_path bert-base-uncased \ + --batch_size 128 \ + --num_warmup_steps 158 \ + --output_dir paddle_outputs + diff --git a/examples/torch_migration/pipeline/Step5/bert_paddle/utils.py b/examples/torch_migration/pipeline/Step5/bert_paddle/utils.py new file mode 100644 index 000000000000..faf5fbe0e374 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_paddle/utils.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import errno +import os +import time +from collections import OrderedDict, defaultdict, deque + +import paddle +from paddlenlp.transformers import ( + CosineDecayWithWarmup, + LinearDecayWithWarmup, + PolyDecayWithWarmup, +) + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + t = paddle.to_tensor([self.count, self.total], dtype="float64") + t = t.numpy().tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = paddle.to_tensor(list(self.deque)) + return d.median().numpy().item() + + @property + def avg(self): + d = paddle.to_tensor(list(self.deque), dtype="float32") + return d.mean().numpy().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) + + +class MetricLogger(object): + + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, paddle.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if paddle.device.is_compiled_with_cuda(): + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ]) + else: + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ]) + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + )) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("{} Total time: {}".format(header, total_time_str)) + + +scheduler_type2cls = { + "linear": LinearDecayWithWarmup, + "cosine": CosineDecayWithWarmup, + "polynomial": PolyDecayWithWarmup, +} + + +def get_scheduler( + learning_rate, + scheduler_type, + num_warmup_steps=None, + num_training_steps=None, + **scheduler_kwargs, +): + if scheduler_type not in scheduler_type2cls.keys(): + data = " ".join(scheduler_type2cls.keys()) + raise ValueError(f"scheduler_type must be choson from {data}") + + if num_warmup_steps is None: + raise ValueError( + f"requires `num_warmup_steps`, please provide that argument.") + + if num_training_steps is None: + raise ValueError( + f"requires `num_training_steps`, please provide that argument.") + + 
return scheduler_type2cls[scheduler_type]( + learning_rate=learning_rate, + total_steps=num_training_steps, + warmup=num_warmup_steps, + **scheduler_kwargs, + ) + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py b/examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py new file mode 100644 index 000000000000..ae447e4a398a --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/accuracy.py @@ -0,0 +1,96 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Accuracy metric.""" + +import datasets +from sklearn.metrics import accuracy_score + +_DESCRIPTION = """ +Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: +Accuracy = (TP + TN) / (TP + TN + FP + FN) +TP: True positive +TN: True negative +FP: False positive +FN: False negative +""" + +_KWARGS_DESCRIPTION = """ +Args: + predictions: Predicted labels, as returned by a model. + references: Ground truth labels. + normalize: If False, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + sample_weight: Sample weights. +Returns: + accuracy: Accuracy score. +Examples: + + >>> accuracy_metric = datasets.load_metric("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1]) + >>> print(results) + {'accuracy': 1.0} +""" + +_CITATION = """\ +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, + _KWARGS_DESCRIPTION) +class Accuracy(datasets.Metric): + + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Sequence(datasets.Value("int32")), + "references": datasets.Sequence(datasets.Value("int32")), + } if self.config_name == "multilabel" else { + "predictions": datasets.Value("int32"), + "references": datasets.Value("int32"), + }), + reference_urls=[ + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html" + ], + ) + + def _compute(self, + predictions, + references, + normalize=True, + sample_weight=None): + return { + "accuracy": + accuracy_score( + references, + predictions, + normalize=normalize, + sample_weight=sample_weight, + ).item(), + } diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/glue.py b/examples/torch_migration/pipeline/Step5/bert_torch/glue.py new file mode 100644 index 000000000000..028c09918f67 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/glue.py @@ -0,0 +1,633 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""The General Language Understanding Evaluation (GLUE) benchmark.""" + +import csv +import os +import textwrap + +import datasets +import numpy as np + +_GLUE_CITATION = """\ +@inproceedings{wang2019glue, + title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, + author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, + note={In the Proceedings of ICLR.}, + year={2019} +} +""" + +_GLUE_DESCRIPTION = """\ +GLUE, the General Language Understanding Evaluation benchmark +(https://gluebenchmark.com/) is a collection of resources for training, +evaluating, and analyzing natural language understanding systems. 
+ +""" + +_MRPC_DEV_IDS = "https://dl.fbaipublicfiles.com/glue/data/mrpc_dev_ids.tsv" +_MRPC_TRAIN = ( + "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt" +) +_MRPC_TEST = ( + "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt" +) + +_MNLI_BASE_KWARGS = dict( + text_features={ + "premise": "sentence1", + "hypothesis": "sentence2", + }, + label_classes=["entailment", "neutral", "contradiction"], + label_column="gold_label", + data_url="https://dl.fbaipublicfiles.com/glue/data/MNLI.zip", + data_dir="MNLI", + citation=textwrap.dedent("""\ + @InProceedings{N18-1101, + author = "Williams, Adina + and Nangia, Nikita + and Bowman, Samuel", + title = "A Broad-Coverage Challenge Corpus for + Sentence Understanding through Inference", + booktitle = "Proceedings of the 2018 Conference of + the North American Chapter of the + Association for Computational Linguistics: + Human Language Technologies, Volume 1 (Long + Papers)", + year = "2018", + publisher = "Association for Computational Linguistics", + pages = "1112--1122", + location = "New Orleans, Louisiana", + url = "http://aclweb.org/anthology/N18-1101" + } + @article{bowman2015large, + title={A large annotated corpus for learning natural language inference}, + author={Bowman, Samuel R and Angeli, Gabor and Potts, Christopher and Manning, Christopher D}, + journal={arXiv preprint arXiv:1508.05326}, + year={2015} + }"""), + url="http://www.nyu.edu/projects/bowman/multinli/", +) + + +class GlueConfig(datasets.BuilderConfig): + """BuilderConfig for GLUE.""" + + def __init__( + self, + text_features, + label_column, + data_url, + data_dir, + citation, + url, + label_classes=None, + process_label=lambda x: x, + **kwargs, + ): + """BuilderConfig for GLUE. + + Args: + text_features: `dict[string, string]`, map from the name of the feature + dict for each text field to the name of the column in the tsv file + label_column: `string`, name of the column in the tsv file corresponding + to the label + data_url: `string`, url to download the zip file from + data_dir: `string`, the path to the folder containing the tsv files in the + downloaded zip + citation: `string`, citation for the data set + url: `string`, url for information about the data set + label_classes: `list[string]`, the list of classes if the label is + categorical. If not provided, then the label will be of type + `datasets.Value('float32')`. + process_label: `Function[string, any]`, function taking in the raw value + of the label and processing it to the form required by the label feature + **kwargs: keyword arguments forwarded to super. + """ + super(GlueConfig, self).__init__(version=datasets.Version("1.0.0", ""), + **kwargs) + self.text_features = text_features + self.label_column = label_column + self.label_classes = label_classes + self.data_url = data_url + self.data_dir = data_dir + self.citation = citation + self.url = url + self.process_label = process_label + + +class Glue(datasets.GeneratorBasedBuilder): + """The General Language Understanding Evaluation (GLUE) benchmark.""" + + BUILDER_CONFIGS = [ + GlueConfig( + name="cola", + description=textwrap.dedent("""\ + The Corpus of Linguistic Acceptability consists of English + acceptability judgments drawn from books and journal articles on + linguistic theory. 
Each example is a sequence of words annotated + with whether it is a grammatical English sentence."""), + text_features={"sentence": "sentence"}, + label_classes=["unacceptable", "acceptable"], + label_column="is_acceptable", + data_url="https://dl.fbaipublicfiles.com/glue/data/CoLA.zip", + data_dir="CoLA", + citation=textwrap.dedent("""\ + @article{warstadt2018neural, + title={Neural Network Acceptability Judgments}, + author={Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R}, + journal={arXiv preprint arXiv:1805.12471}, + year={2018} + }"""), + url="https://nyu-mll.github.io/CoLA/", + ), + GlueConfig( + name="sst2", + description=textwrap.dedent("""\ + The Stanford Sentiment Treebank consists of sentences from movie reviews and + human annotations of their sentiment. The task is to predict the sentiment of a + given sentence. We use the two-way (positive/negative) class split, and use only + sentence-level labels."""), + text_features={"sentence": "sentence"}, + label_classes=["negative", "positive"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip", + data_dir="SST-2", + citation=textwrap.dedent("""\ + @inproceedings{socher2013recursive, + title={Recursive deep models for semantic compositionality over a sentiment treebank}, + author={Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D and Ng, Andrew and Potts, Christopher}, + booktitle={Proceedings of the 2013 conference on empirical methods in natural language processing}, + pages={1631--1642}, + year={2013} + }"""), + url="https://datasets.stanford.edu/sentiment/index.html", + ), + GlueConfig( + name="mrpc", + description=textwrap.dedent("""\ + The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of + sentence pairs automatically extracted from online news sources, with human annotations + for whether the sentences in the pair are semantically equivalent.""" + ), # pylint: disable=line-too-long + text_features={ + "sentence1": "", + "sentence2": "" + }, + label_classes=["not_equivalent", "equivalent"], + label_column="Quality", + data_url="", # MRPC isn't hosted by GLUE. + data_dir="MRPC", + citation=textwrap.dedent("""\ + @inproceedings{dolan2005automatically, + title={Automatically constructing a corpus of sentential paraphrases}, + author={Dolan, William B and Brockett, Chris}, + booktitle={Proceedings of the Third International Workshop on Paraphrasing (IWP2005)}, + year={2005} + }"""), + url="https://www.microsoft.com/en-us/download/details.aspx?id=52398", + ), + GlueConfig( + name="qqp", + description=textwrap.dedent("""\ + The Quora Question Pairs2 dataset is a collection of question pairs from the + community question-answering website Quora. 
The task is to determine whether a + pair of questions are semantically equivalent."""), + text_features={ + "question1": "question1", + "question2": "question2", + }, + label_classes=["not_duplicate", "duplicate"], + label_column="is_duplicate", + data_url="https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip", + data_dir="QQP", + citation=textwrap.dedent("""\ + @online{WinNT, + author = {Iyer, Shankar and Dandekar, Nikhil and Csernai, Kornel}, + title = {First Quora Dataset Release: Question Pairs}, + year = {2017}, + url = {https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs}, + urldate = {2019-04-03} + }"""), + url= + "https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs", + ), + GlueConfig( + name="stsb", + description=textwrap.dedent("""\ + The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of + sentence pairs drawn from news headlines, video and image captions, and natural + language inference data. Each pair is human-annotated with a similarity score + from 1 to 5."""), + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_column="score", + data_url="https://dl.fbaipublicfiles.com/glue/data/STS-B.zip", + data_dir="STS-B", + citation=textwrap.dedent("""\ + @article{cer2017semeval, + title={Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation}, + author={Cer, Daniel and Diab, Mona and Agirre, Eneko and Lopez-Gazpio, Inigo and Specia, Lucia}, + journal={arXiv preprint arXiv:1708.00055}, + year={2017} + }"""), + url="http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark", + process_label=np.float32, + ), + GlueConfig( + name="mnli", + description=textwrap.dedent("""\ + The Multi-Genre Natural Language Inference Corpus is a crowdsourced + collection of sentence pairs with textual entailment annotations. Given a premise sentence + and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis + (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The premise sentences are + gathered from ten different sources, including transcribed speech, fiction, and government reports. + We use the standard test set, for which we obtained private labels from the authors, and evaluate + on both the matched (in-domain) and mismatched (cross-domain) section. We also use and recommend + the SNLI corpus as 550k examples of auxiliary training data."""), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="mnli_mismatched", + description=textwrap.dedent("""\ + The mismatched validation and test splits from MNLI. + See the "mnli" BuilderConfig for additional information."""), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="mnli_matched", + description=textwrap.dedent("""\ + The matched validation and test splits from MNLI. + See the "mnli" BuilderConfig for additional information."""), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="qnli", + description=textwrap.dedent("""\ + The Stanford Question Answering Dataset is a question-answering + dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn + from Wikipedia) contains the answer to the corresponding question (written by an annotator). We + convert the task into sentence pair classification by forming a pair between each question and each + sentence in the corresponding context, and filtering out pairs with low lexical overlap between the + question and the context sentence. 
The task is to determine whether the context sentence contains + the answer to the question. This modified version of the original task removes the requirement that + the model select the exact answer, but also removes the simplifying assumptions that the answer + is always present in the input and that lexical overlap is a reliable cue.""" + ), # pylint: disable=line-too-long + text_features={ + "question": "question", + "sentence": "sentence", + }, + label_classes=["entailment", "not_entailment"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip", + data_dir="QNLI", + citation=textwrap.dedent("""\ + @article{rajpurkar2016squad, + title={Squad: 100,000+ questions for machine comprehension of text}, + author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy}, + journal={arXiv preprint arXiv:1606.05250}, + year={2016} + }"""), + url="https://rajpurkar.github.io/SQuAD-explorer/", + ), + GlueConfig( + name="rte", + description=textwrap.dedent("""\ + The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual + entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim + et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are + constructed based on news and Wikipedia text. We convert all datasets to a two-class split, where + for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.""" + ), # pylint: disable=line-too-long + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_classes=["entailment", "not_entailment"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/RTE.zip", + data_dir="RTE", + citation=textwrap.dedent("""\ + @inproceedings{dagan2005pascal, + title={The PASCAL recognising textual entailment challenge}, + author={Dagan, Ido and Glickman, Oren and Magnini, Bernardo}, + booktitle={Machine Learning Challenges Workshop}, + pages={177--190}, + year={2005}, + organization={Springer} + } + @inproceedings{bar2006second, + title={The second pascal recognising textual entailment challenge}, + author={Bar-Haim, Roy and Dagan, Ido and Dolan, Bill and Ferro, Lisa and Giampiccolo, Danilo and Magnini, Bernardo and Szpektor, Idan}, + booktitle={Proceedings of the second PASCAL challenges workshop on recognising textual entailment}, + volume={6}, + number={1}, + pages={6--4}, + year={2006}, + organization={Venice} + } + @inproceedings{giampiccolo2007third, + title={The third pascal recognizing textual entailment challenge}, + author={Giampiccolo, Danilo and Magnini, Bernardo and Dagan, Ido and Dolan, Bill}, + booktitle={Proceedings of the ACL-PASCAL workshop on textual entailment and paraphrasing}, + pages={1--9}, + year={2007}, + organization={Association for Computational Linguistics} + } + @inproceedings{bentivogli2009fifth, + title={The Fifth PASCAL Recognizing Textual Entailment Challenge.}, + author={Bentivogli, Luisa and Clark, Peter and Dagan, Ido and Giampiccolo, Danilo}, + booktitle={TAC}, + year={2009} + }"""), + url="https://aclweb.org/aclwiki/Recognizing_Textual_Entailment", + ), + GlueConfig( + name="wnli", + description=textwrap.dedent("""\ + The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task + in which a system must read a sentence with a pronoun and select the referent of that pronoun from + a list of choices. 
The examples are manually constructed to foil simple statistical methods: Each + one is contingent on contextual information provided by a single word or phrase in the sentence. + To convert the problem into sentence pair classification, we construct sentence pairs by replacing + the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the + pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of + new examples derived from fiction books that was shared privately by the authors of the original + corpus. While the included training set is balanced between two classes, the test set is imbalanced + between them (65% not entailment). Also, due to a data quirk, the development set is adversarial: + hypotheses are sometimes shared between training and development examples, so if a model memorizes the + training examples, they will predict the wrong label on corresponding development set + example. As with QNLI, each example is evaluated separately, so there is not a systematic correspondence + between a model's score on this task and its score on the unconverted original task. We + call converted dataset WNLI (Winograd NLI)."""), + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_classes=["not_entailment", "entailment"], + label_column="label", + data_url="https://dl.fbaipublicfiles.com/glue/data/WNLI.zip", + data_dir="WNLI", + citation=textwrap.dedent("""\ + @inproceedings{levesque2012winograd, + title={The winograd schema challenge}, + author={Levesque, Hector and Davis, Ernest and Morgenstern, Leora}, + booktitle={Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning}, + year={2012} + }"""), + url= + "https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html", + ), + GlueConfig( + name="ax", + description=textwrap.dedent("""\ + A manually-curated evaluation dataset for fine-grained analysis of + system performance on a broad range of linguistic phenomena. This + dataset evaluates sentence understanding through Natural Language + Inference (NLI) problems. Use a model trained on MulitNLI to produce + predictions for this dataset."""), + text_features={ + "premise": "sentence1", + "hypothesis": "sentence2", + }, + label_classes=["entailment", "neutral", "contradiction"], + label_column="", # No label since we only have test set. + # We must use a URL shortener since the URL from GLUE is very long and + # causes issues in TFDS. + data_url="https://dl.fbaipublicfiles.com/glue/data/AX.tsv", + data_dir="", # We are downloading a tsv. + citation="", # The GLUE citation is sufficient. 
+ url="https://gluebenchmark.com/diagnostics", + ), + ] + + def _info(self): + features = { + text_feature: datasets.Value("string") + for text_feature in self.config.text_features.keys() + } + if self.config.label_classes: + features["label"] = datasets.features.ClassLabel( + names=self.config.label_classes) + else: + features["label"] = datasets.Value("float32") + features["idx"] = datasets.Value("int32") + return datasets.DatasetInfo( + description=_GLUE_DESCRIPTION, + features=datasets.Features(features), + homepage=self.config.url, + citation=self.config.citation + "\n" + _GLUE_CITATION, + ) + + def _split_generators(self, dl_manager): + if self.config.name == "ax": + data_file = dl_manager.download(self.config.data_url) + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_file": data_file, + "split": "test", + }, + ) + ] + + if self.config.name == "mrpc": + data_dir = None + mrpc_files = dl_manager.download({ + "dev_ids": _MRPC_DEV_IDS, + "train": _MRPC_TRAIN, + "test": _MRPC_TEST, + }) + else: + dl_dir = dl_manager.download_and_extract(self.config.data_url) + data_dir = os.path.join(dl_dir, self.config.data_dir) + mrpc_files = None + train_split = datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "train.tsv"), + "split": "train", + "mrpc_files": mrpc_files, + }, + ) + if self.config.name == "mnli": + return [ + train_split, + _mnli_split_generator("validation_matched", + data_dir, + "dev", + matched=True), + _mnli_split_generator("validation_mismatched", + data_dir, + "dev", + matched=False), + _mnli_split_generator("test_matched", + data_dir, + "test", + matched=True), + _mnli_split_generator("test_mismatched", + data_dir, + "test", + matched=False), + ] + elif self.config.name == "mnli_matched": + return [ + _mnli_split_generator("validation", + data_dir, + "dev", + matched=True), + _mnli_split_generator("test", data_dir, "test", matched=True), + ] + elif self.config.name == "mnli_mismatched": + return [ + _mnli_split_generator("validation", + data_dir, + "dev", + matched=False), + _mnli_split_generator("test", data_dir, "test", matched=False), + ] + else: + return [ + train_split, + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "dev.tsv"), + "split": "dev", + "mrpc_files": mrpc_files, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "test.tsv"), + "split": "test", + "mrpc_files": mrpc_files, + }, + ), + ] + + def _generate_examples(self, data_file, split, mrpc_files=None): + if self.config.name == "mrpc": + # We have to prepare the MRPC dataset from the original sources ourselves. + examples = self._generate_example_mrpc_files(mrpc_files=mrpc_files, + split=split) + for example in examples: + yield example["idx"], example + else: + process_label = self.config.process_label + label_classes = self.config.label_classes + + # The train and dev files for CoLA are the only tsv files without a + # header. 
+ is_cola_non_test = self.config.name == "cola" and split != "test" + + with open(data_file, encoding="utf8") as f: + reader = csv.DictReader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + if is_cola_non_test: + reader = csv.reader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + + for n, row in enumerate(reader): + if is_cola_non_test: + row = { + "sentence": row[3], + "is_acceptable": row[1], + } + + example = { + feat: row[col] + for feat, col in self.config.text_features.items() + } + example["idx"] = n + + if self.config.label_column in row: + label = row[self.config.label_column] + # For some tasks, the label is represented as 0 and 1 in the tsv + # files and needs to be cast to integer to work with the feature. + if label_classes and label not in label_classes: + label = int(label) if label else None + example["label"] = process_label(label) + else: + example["label"] = process_label(-1) + + # Filter out corrupted rows. + for value in example.values(): + if value is None: + break + else: + yield example["idx"], example + + def _generate_example_mrpc_files(self, mrpc_files, split): + if split == "test": + with open(mrpc_files["test"], encoding="utf8") as f: + # The first 3 bytes are the utf-8 BOM \xef\xbb\xbf, which messes with + # the Quality key. + f.seek(3) + reader = csv.DictReader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + for n, row in enumerate(reader): + yield { + "sentence1": row["#1 String"], + "sentence2": row["#2 String"], + "label": int(row["Quality"]), + "idx": n, + } + else: + with open(mrpc_files["dev_ids"], encoding="utf8") as f: + reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + dev_ids = [[row[0], row[1]] for row in reader] + with open(mrpc_files["train"], encoding="utf8") as f: + # The first 3 bytes are the utf-8 BOM \xef\xbb\xbf, which messes with + # the Quality key. + f.seek(3) + reader = csv.DictReader(f, + delimiter="\t", + quoting=csv.QUOTE_NONE) + for n, row in enumerate(reader): + is_row_in_dev = [row["#1 ID"], row["#2 ID"]] in dev_ids + if is_row_in_dev == (split == "dev"): + yield { + "sentence1": row["#1 String"], + "sentence2": row["#2 String"], + "label": int(row["Quality"]), + "idx": n, + } + + +def _mnli_split_generator(name, data_dir, split, matched): + return datasets.SplitGenerator( + name=name, + gen_kwargs={ + "data_file": + os.path.join( + data_dir, + "%s_%s.tsv" % (split, "matched" if matched else "mismatched")), + "split": + split, + "mrpc_files": + None, + }, + ) diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/train.py b/examples/torch_migration/pipeline/Step5/bert_torch/train.py new file mode 100644 index 000000000000..b42d360241ff --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/train.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
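Note: the training script that begins here consumes the two local builder scripts defined
above, loading them by file path rather than from the Hugging Face Hub. A minimal usage
sketch (illustrative only; it assumes the `datasets` package is installed and the working
directory is `Step5/bert_torch/`):

    from datasets import load_dataset, load_metric

    raw_datasets = load_dataset("glue.py", "sst2")  # local GLUE builder script
    metric = load_metric("accuracy.py")             # local accuracy metric script

    metric.add_batch(predictions=[0, 1, 1], references=[0, 1, 0])
    print(metric.compute())  # -> {'accuracy': 0.666...}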
+
+import sys
+import os
+import datetime
+import random
+import time
+
+import paddle
+import numpy as np
+import torch
+import torch.utils.data
+import utils
+from datasets import load_dataset, load_metric
+from reprod_log import ReprodLogger
+from torch import nn
+from transformers import AdamW, BertTokenizer, DataCollatorWithPadding, get_scheduler
+
+CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]  # current directory
+CONFIG_PATH = CURRENT_DIR.rsplit('/', 2)[0]
+sys.path.append(CONFIG_PATH)
+
+from models.pt_bert import BertConfig, BertForSequenceClassification
+
+task_to_keys = {
+    "cola": ("sentence", None),
+    "mnli": ("premise", "hypothesis"),
+    "mrpc": ("sentence1", "sentence2"),
+    "qnli": ("question", "sentence"),
+    "qqp": ("question1", "question2"),
+    "rte": ("sentence1", "sentence2"),
+    "sst2": ("sentence", None),
+    "stsb": ("sentence1", "sentence2"),
+    "wnli": ("sentence1", "sentence2"),
+}
+
+
+def train_one_epoch(
+    model,
+    criterion,
+    optimizer,
+    lr_scheduler,
+    data_loader,
+    device,
+    epoch,
+    print_freq,
+    scaler=None,
+):
+    model.train()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter("lr",
+                            utils.SmoothedValue(window_size=1, fmt="{value}"))
+    metric_logger.add_meter("sentence/s",
+                            utils.SmoothedValue(window_size=10, fmt="{value}"))
+
+    header = "Epoch: [{}]".format(epoch)
+    for batch in metric_logger.log_every(data_loader, print_freq, header):
+        start_time = time.time()
+        batch.to(device)
+        labels = batch.pop("labels")
+        with torch.cuda.amp.autocast(enabled=scaler is not None):
+            logits = model(**batch)[0]
+            loss = criterion(logits.reshape(-1, 2), labels.reshape(-1))
+
+        optimizer.zero_grad()
+        if scaler is not None:
+            scaler.scale(loss).backward()
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            loss.backward()
+            optimizer.step()
+        lr_scheduler.step()
+        batch_size = batch["input_ids"].shape[0]
+        metric_logger.update(loss=loss.item(),
+                             lr=lr_scheduler.get_last_lr()[-1])
+        metric_logger.meters["sentence/s"].update(batch_size /
+                                                  (time.time() - start_time))
+
+
+def evaluate(model, criterion, data_loader, device, metric, print_freq=100):
+    model.eval()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    header = "Test:"
+    with torch.no_grad():
+        for batch in metric_logger.log_every(data_loader, print_freq, header):
+            batch.to(device)
+            labels = batch.pop("labels")
+            logits = model(**batch)[0]
+            loss = criterion(logits.reshape(-1, 2), labels.reshape(-1))
+            metric_logger.update(loss=loss.item())
+            metric.add_batch(
+                predictions=logits.argmax(dim=-1),
+                references=labels,
+            )
+        acc_global_avg = metric.compute()["accuracy"]
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print(" * Accuracy {acc_global_avg:.6f}".format(
+        acc_global_avg=acc_global_avg))
+    return acc_global_avg
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def load_data(args, tokenizer):
+    print("Loading data")
+    raw_datasets = load_dataset("glue.py",
+                                args.task_name,
+                                cache_dir=args.data_cache_dir)
+    sentence1_key, sentence2_key = task_to_keys[args.task_name]
+
+    def preprocess_function(examples):
+        texts = ((examples[sentence1_key], ) if sentence2_key is None else
+                 (examples[sentence1_key], examples[sentence2_key]))
+        result = tokenizer(*texts,
+                           padding=False,
+                           max_length=args.max_length,
+                           truncation=True)
+
+        if "label" in examples:
+            result["labels"] = examples["label"]
+        return result
+
+    train_ds = raw_datasets["train"].map(
preprocess_function, + batched=True, + remove_columns=raw_datasets["train"].column_names, + desc="Running tokenizer on train dataset", + new_fingerprint=f"train_tokenized_dataset_{args.task_name}", + ) + validation_ds = raw_datasets["validation"].map( + preprocess_function, + batched=True, + remove_columns=raw_datasets["validation"].column_names, + desc="Running tokenizer on validation dataset", + new_fingerprint=f"validation_tokenized_dataset_{args.task_name}", + ) + train_sampler = torch.utils.data.SequentialSampler(train_ds) + validation_sampler = torch.utils.data.SequentialSampler(validation_ds) + + return train_ds, validation_ds, train_sampler, validation_sampler + + +def main(args): + if args.output_dir: + utils.mkdir(args.output_dir) + print(args) + scaler = None + if args.fp16: + scaler = torch.cuda.amp.GradScaler() + device = torch.device(args.device) + torch.backends.cudnn.benchmark = True + + if args.seed is not None: + set_seed(args.seed) + + tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) + data_collator = DataCollatorWithPadding( + tokenizer, pad_to_multiple_of=(8 if args.fp16 else None)) + train_dataset, validation_dataset, train_sampler, validation_sampler = load_data( + args, tokenizer) + train_data_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + sampler=train_sampler, + num_workers=args.workers, + collate_fn=data_collator, + ) + + validation_data_loader = torch.utils.data.DataLoader( + validation_dataset, + batch_size=args.batch_size, + sampler=validation_sampler, + num_workers=args.workers, + collate_fn=data_collator, + ) + + print("Creating model") + pytorch_dump_path = '../../weights/torch_weight.bin' + config = BertConfig() + model = BertForSequenceClassification(config) + checkpoint = torch.load(pytorch_dump_path) + model.bert.load_state_dict(checkpoint) + + classifier_weights = torch.load( + "../../classifier_weights/torch_classifier_weights.bin") + model.load_state_dict(classifier_weights, strict=False) + model.to(device) + + print("Creating criterion") + criterion = nn.CrossEntropyLoss() + + print("Creating optimizer") + # Split weights in two groups, one with weight decay and the other not. 
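+    # Names containing "bias" or "LayerNorm.weight" (all biases and LayerNorm scales)
+    # get a weight decay of 0.0 below; every other parameter uses args.weight_decay.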
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if not any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
+
+    print("Creating lr_scheduler")
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.num_train_epochs * len(train_data_loader),
+    )
+
+    metric = load_metric("accuracy.py")
+    if args.test_only:
+        evaluate(model,
+                 criterion,
+                 validation_data_loader,
+                 device=device,
+                 metric=metric)
+        return
+
+    print("Start training")
+    start_time = time.time()
+    best_accuracy = 0.0
+    for epoch in range(args.num_train_epochs):
+        train_one_epoch(
+            model,
+            criterion,
+            optimizer,
+            lr_scheduler,
+            train_data_loader,
+            device,
+            epoch,
+            args.print_freq,
+            scaler,
+        )
+        acc = evaluate(model,
+                       criterion,
+                       validation_data_loader,
+                       device=device,
+                       metric=metric)
+        best_accuracy = max(best_accuracy, acc)
+        if args.output_dir:
+            pass
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print("Training time {}".format(total_time_str))
+    return best_accuracy
+
+
+def get_args_parser(add_help=True):
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="PyTorch SST-2 Classification Training", add_help=add_help)
+    parser.add_argument("--data_cache_dir",
+                        default="data_caches",
+                        help="data cache dir.")
+    parser.add_argument("--task_name",
+                        default="sst2",
+                        help="the name of the glue task to train on.")
+    parser.add_argument(
+        "--model_name_or_path",
+        default="bert-base-uncased",
+        help="path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument("--device", default="cuda:2", help="device")
+    parser.add_argument("--batch_size", default=32, type=int)
+    parser.add_argument(
+        "--max_length",
+        type=int,
+        default=128,
+        help=
+        ("The maximum total input sequence length after tokenization. 
Sequences longer than this will be truncated," + ), + ) + parser.add_argument("--num_train_epochs", + default=3, + type=int, + help="number of total epochs to run") + parser.add_argument( + "--workers", + default=0, + type=int, + help="number of data loading workers (default: 16)", + ) + parser.add_argument("--lr", + default=3e-5, + type=float, + help="initial learning rate") + parser.add_argument( + "--weight_decay", + default=1e-2, + type=float, + help="weight decay (default: 1e-2)", + dest="weight_decay", + ) + parser.add_argument( + "--lr_scheduler_type", + default="linear", + help="the scheduler type to use.", + choices=[ + "linear", + "cosine", + "cosine_with_restarts", + "polynomial", + "constant", + "constant_with_warmup", + ], + ) + parser.add_argument( + "--num_warmup_steps", + default=0, + type=int, + help="number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument("--print_freq", + default=10, + type=int, + help="print frequency") + parser.add_argument("--output_dir", + default="outputs", + help="path where to save") + parser.add_argument( + "--test_only", + help="only test the model", + action="store_true", + ) + parser.add_argument("--seed", + default=42, + type=int, + help="a seed for reproducible training.") + # Mixed precision training parameters + parser.add_argument("--fp16", + action="store_true", + help="whether or not mixed precision training") + + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + acc = main(args) + reprod_logger = ReprodLogger() + reprod_logger.add("acc", np.array([acc])) + reprod_logger.save("train_align_benchmark.npy") diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/train.sh b/examples/torch_migration/pipeline/Step5/bert_torch/train.sh new file mode 100644 index 000000000000..1d26be50340b --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/train.sh @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python train.py \ + --model_name_or_path bert-base-uncased \ + --batch_size 128 \ + --num_warmup_steps 158 \ + --output_dir bert_outputs \ \ No newline at end of file diff --git a/examples/torch_migration/pipeline/Step5/bert_torch/utils.py b/examples/torch_migration/pipeline/Step5/bert_torch/utils.py new file mode 100644 index 000000000000..834b061d7c1a --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/bert_torch/utils.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import errno +import os +import time +from collections import defaultdict, deque + +import torch + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + return + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) + + +class MetricLogger(object): + + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if torch.cuda.is_available(): + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ]) + else: + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + )) + else: + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + 
data=str(data_time), + )) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("{} Total time: {}".format(header, total_time_str)) + + +def accuracy(output, target, topk=(1, )): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target[None]) + + res = [] + for k in topk: + correct_k = correct[:k].flatten().sum(dtype=torch.float32) + res.append(correct_k * (100.0 / batch_size)) + return res + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/examples/torch_migration/pipeline/Step5/check_step5.py b/examples/torch_migration/pipeline/Step5/check_step5.py new file mode 100644 index 000000000000..79d3556a8ae0 --- /dev/null +++ b/examples/torch_migration/pipeline/Step5/check_step5.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + torch_info = diff_helper.load_info("bert_torch/train_align_benchmark.npy") + paddle_info = diff_helper.load_info("bert_paddle/train_align_paddle.npy") + + diff_helper.compare_info(torch_info, paddle_info) + + diff_helper.report(path="train_align_diff.log", diff_threshold=0.0025) diff --git a/examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py b/examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py new file mode 100644 index 000000000000..8d9a4f6de25b --- /dev/null +++ b/examples/torch_migration/pipeline/classifier_weights/generate_classifier_weights.py @@ -0,0 +1,37 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
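The classifier weights written by the script below are shared by the Paddle and PyTorch
pipelines so that both models start from an identical classification head. A quick
consistency check, as a rough sketch (it assumes `generate()` has already been run in this
directory; `torch.nn.Linear` stores its weight as (out_features, in_features), which is why
`generate()` saves a transposed copy for torch):

    import numpy as np
    import paddle
    import torch

    pd_w = paddle.load("paddle_classifier_weights.bin")
    pt_w = torch.load("torch_classifier_weights.bin")

    # The torch copy was saved transposed, so transpose it back before comparing.
    assert np.allclose(np.asarray(pd_w["classifier.weight"]),
                       pt_w["classifier.weight"].t().numpy())
    assert np.allclose(np.asarray(pd_w["classifier.bias"]),
                       pt_w["classifier.bias"].numpy())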
+ +import numpy as np +import paddle +import torch + + +def generate(seed): + np.random.seed(seed) + weight = np.random.normal(0, 0.02, (768, 2)).astype("float32") + bias = np.zeros((2, )).astype("float32") + paddle_weights = { + "classifier.weight": weight, + "classifier.bias": bias, + } + torch_weights = { + "classifier.weight": torch.from_numpy(weight).t(), + "classifier.bias": torch.from_numpy(bias), + } + torch.save(torch_weights, "torch_classifier_weights.bin") + paddle.save(paddle_weights, "paddle_classifier_weights.bin") + + +if __name__ == "__main__": + generate(seed=42) diff --git a/examples/torch_migration/pipeline/fake_data/gen_fake_data.py b/examples/torch_migration/pipeline/fake_data/gen_fake_data.py new file mode 100644 index 000000000000..e083799c0484 --- /dev/null +++ b/examples/torch_migration/pipeline/fake_data/gen_fake_data.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def gen_fake_data(): + fake_data = np.random.randint(1, 30522, size=(4, 64)).astype(np.int64) + fake_label = np.array([0, 1, 1, 0]).astype(np.int64) + np.save("fake_data.npy", fake_data) + np.save("fake_label.npy", fake_label) + + +if __name__ == "__main__": + gen_fake_data() diff --git a/examples/torch_migration/pipeline/models/pd_bert.py b/examples/torch_migration/pipeline/models/pd_bert.py new file mode 100644 index 000000000000..ca1118e933e3 --- /dev/null +++ b/examples/torch_migration/pipeline/models/pd_bert.py @@ -0,0 +1,454 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
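The model file that begins here is a self-contained Paddle re-implementation of BERT. As a
rough sketch of how it connects to the fake data generated above (the import assumes the
`pipeline/` directory is on `sys.path`, as the training scripts arrange explicitly; the
weights here are randomly initialized, so the logits only become meaningful after loading
converted pretrained weights):

    import numpy as np
    import paddle

    from models.pd_bert import BertConfig, BertForSequenceClassification

    model = BertForSequenceClassification(BertConfig())
    model.eval()

    input_ids = paddle.to_tensor(np.load("fake_data.npy"))  # (4, 64) int64 token ids
    with paddle.no_grad():
        logits = model(input_ids)[0]
    print(logits.shape)  # [4, 2]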
+"""Paddle BERT model.""" + +import math +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +ACT2FN = { + "relu": F.relu, + "gelu": F.gelu, + "tanh": F.tanh, + "sigmoid": F.sigmoid, +} +NEG_INF = -1e4 + + +class BertConfig: + + def __init__(self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + layer_norm_eps: float = 1e-12, + output_attentions: bool = False, + output_hidden_states: bool = False, + num_labels=2, + **kwargs): + self.pad_token_id = pad_token_id + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.layer_norm_eps = layer_norm_eps + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.num_labels = num_labels + + +class BertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, + epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", + paddle.arange(config.max_position_embeddings).reshape((1, -1))) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + input_shape = input_ids.shape + seq_length = input_ids.shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Layer): + + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = 
nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: paddle.Tensor) -> paddle.Tensor: + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, self.attention_head_size + ] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + + # compute q,k,v + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, + key_layer, + transpose_y=True) + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + return outputs + + +class BertSelfOutput(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, + input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Layer): + + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + 
epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, + input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Layer): + + def __init__(self, config): + super().__init__() + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + # self attn + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + # ffn + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + + outputs = (layer_output, ) + outputs + + return outputs + + +class BertEncoder(nn.Layer): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for layer_module in self.layer: + # add hidden_states + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_outputs = layer_module( + hidden_states, + attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + + # add self attn + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + return tuple(v for v in [ + hidden_states, + all_hidden_states, + all_self_attentions, + ] if v is not None) + + +class BertPooler(nn.Layer): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = ACT2FN[config.pool_act] + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPreTrainedModel(nn.Layer): + + def _init_weights(self, module): + """Initialize the weights""" + normal_init = nn.initializer.Normal(mean=0.0, + std=self.config.initializer_range) + zero_init = nn.initializer.Constant(0.) + one_init = nn.initializer.Constant(1.) 
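+        # Mirror the usual BERT scheme: Normal(0, initializer_range) for Linear and
+        # Embedding weights, zeros for biases and the padding embedding row, and a
+        # (weight=1, bias=0) LayerNorm initialization.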
+ if isinstance(module, nn.Linear): + normal_init(module.weight) + if module.bias is not None: + zero_init(module.bias) + elif isinstance(module, nn.Embedding): + normal_init(module.weight) + if module._padding_idx is not None: + with paddle.no_grad(): + module.weight[module._padding_idx] = 0 + elif isinstance(module, nn.LayerNorm): + zero_init(module.bias) + one_init(module.weight) + + +class BertModel(BertPreTrainedModel): + + def __init__(self, config, add_pooling_layer=True): + super().__init__() + self.config = config + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.apply(self._init_weights) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[paddle.Tensor]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_ids.shape, dtype=paddle.int64) + + if attention_mask is not None: + attention_mask = (1.0 - attention_mask[:, :, None, None]) * NEG_INF + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + ) + encoder_outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + return (sequence_output, pooled_output) + encoder_outputs[1:] + + +class BertForSequenceClassification(BertPreTrainedModel): + + def __init__(self, config): + super().__init__() + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.apply(self._init_weights) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[paddle.Tensor]: + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + output = (logits, ) + outputs[2:] + return output diff --git a/examples/torch_migration/pipeline/models/pt_bert.py b/examples/torch_migration/pipeline/models/pt_bert.py new file mode 100644 index 000000000000..c7eee9829cd6 --- /dev/null +++ b/examples/torch_migration/pipeline/models/pt_bert.py @@ -0,0 +1,456 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +import math +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +ACT2FN = { + "relu": F.relu, + "gelu": F.gelu, + "tanh": F.tanh, + "sigmoid": F.sigmoid, +} +NEG_INF = -1e4 + + +class BertConfig: + + def __init__(self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + layer_norm_eps: float = 1e-12, + output_attentions: bool = False, + output_hidden_states: bool = False, + num_labels=2, + **kwargs): + self.pad_token_id = pad_token_id + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.layer_norm_eps = layer_norm_eps + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.num_labels = num_labels + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", + torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + input_shape = input_ids.size() + seq_length = input_ids.shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, + dtype=torch.long, + device=self.position_ids.device) + + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + 
token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + + # compute q,k,v + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + 
if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # self attn + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + # ffn + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + + outputs = (layer_output, ) + outputs + + return outputs + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for layer_module in self.layer: + # add hidden_states + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_outputs = layer_module( + hidden_states, + attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + + # add self attn + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + return tuple(v for v in [ + hidden_states, + all_hidden_states, + all_self_attentions, + ] if v is not None) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = ACT2FN[config.pool_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPreTrainedModel(nn.Module): + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF 
version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0,
+                                       std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0,
+                                       std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class BertModel(BertPreTrainedModel):
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__()
+        self.config = config
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+
+        self.pooler = BertPooler(config) if add_pooling_layer else None
+
+        self.apply(self._init_weights)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Tuple[torch.Tensor]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else
+                                self.config.output_hidden_states)
+
+        device = input_ids.device
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_ids.shape,
+                                         dtype=torch.long,
+                                         device=device)
+
+        if attention_mask is not None:
+            # reshape the [batch_size, seq_len] padding mask to
+            # [batch_size, 1, 1, seq_len] so it broadcasts over heads and
+            # query positions, then turn it into an additive bias
+            attention_mask = (1.0 - attention_mask[:, None, None, :]) * NEG_INF
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(
+            sequence_output) if self.pooler is not None else None
+
+        return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+
+class BertForSequenceClassification(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.apply(self._init_weights)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Tuple[torch.Tensor]:
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        output = (logits, ) + outputs[2:]
+        return output
diff --git a/examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py b/examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py
new file mode 100644
index 000000000000..de23d245c6f6
--- /dev/null
+++ b/examples/torch_migration/pipeline/reprod_log_demo/check_log_diff.py
@@ -0,0 +1,28 
@@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from reprod_log import ReprodDiffHelper + +if __name__ == "__main__": + diff_helper = ReprodDiffHelper() + + info1 = diff_helper.load_info("./result_1.npy") + info2 = diff_helper.load_info("./result_2.npy") + + diff_helper.compare_info(info1, info2) + + diff_helper.report(diff_method="mean", + diff_threshold=1e-6, + path="./diff.txt") diff --git a/examples/torch_migration/pipeline/reprod_log_demo/write_log.py b/examples/torch_migration/pipeline/reprod_log_demo/write_log.py new file mode 100644 index 000000000000..b2985e3db724 --- /dev/null +++ b/examples/torch_migration/pipeline/reprod_log_demo/write_log.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from reprod_log import ReprodLogger + +if __name__ == "__main__": + reprod_log_1 = ReprodLogger() + reprod_log_2 = ReprodLogger() + + data_1 = np.random.rand(4, 64, 768).astype(np.float32) + data_2 = np.random.rand(4, 64, 768).astype(np.float32) + + reprod_log_1.add("demo_test_1", data_1) + reprod_log_1.add("demo_test_2", data_1) + reprod_log_1.save("result_1.npy") + + reprod_log_2.add("demo_test_1", data_1) + reprod_log_2.add("demo_test_2", data_2) + reprod_log_2.save("result_2.npy") diff --git a/examples/torch_migration/pipeline/weights/torch2paddle.py b/examples/torch_migration/pipeline/weights/torch2paddle.py new file mode 100644 index 000000000000..1af08c937769 --- /dev/null +++ b/examples/torch_migration/pipeline/weights/torch2paddle.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
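+
+"""
+Convert a Hugging Face PyTorch BERT checkpoint into a Paddle state dict.
+
+torch.nn.Linear stores its weight as [out_features, in_features] while
+paddle.nn.Linear stores it as [in_features, out_features], so 2-D Linear
+weights are transposed during conversion; embedding and LayerNorm weights
+are copied unchanged.
+"""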
+
+from collections import OrderedDict
+
+import numpy as np
+import paddle
+import torch
+from paddlenlp.transformers import BertForPretraining as PDBertForMaskedLM
+from transformers import BertForMaskedLM as PTBertForMaskedLM
+
+
+def convert_pytorch_checkpoint_to_paddle(
+    pytorch_checkpoint_path="pytorch_model.bin",
+    paddle_dump_path="model_state.pdparams",
+    version="old",
+):
+    hf_to_paddle = {
+        "embeddings.LayerNorm": "embeddings.layer_norm",
+        "encoder.layer": "encoder.layers",
+        "attention.self.query": "self_attn.q_proj",
+        "attention.self.key": "self_attn.k_proj",
+        "attention.self.value": "self_attn.v_proj",
+        "attention.output.dense": "self_attn.out_proj",
+        "intermediate.dense": "linear1",
+        "output.dense": "linear2",
+        "attention.output.LayerNorm": "norm1",
+        "output.LayerNorm": "norm2",
+        "predictions.decoder.": "predictions.decoder_",
+        "predictions.transform.dense": "predictions.transform",
+        "predictions.transform.LayerNorm": "predictions.layer_norm",
+    }
+    do_not_transpose = []
+    if version == "old":
+        hf_to_paddle.update({
+            "predictions.bias": "predictions.decoder_bias",
+            ".gamma": ".weight",
+            ".beta": ".bias",
+        })
+        do_not_transpose = do_not_transpose + ["predictions.decoder.weight"]
+
+    pytorch_state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
+    paddle_state_dict = OrderedDict()
+    for k, v in pytorch_state_dict.items():
+        is_transpose = False
+        if k[-7:] == ".weight":
+            # only 2-D Linear weights need transposing; embeddings.weight and
+            # LayerNorm.weight keep their layout
+            if all(d not in k for d in do_not_transpose):
+                if ".embeddings." not in k and ".LayerNorm." not in k:
+                    if v.ndim == 2 and "embeddings" not in k:
+                        v = v.transpose(0, 1)
+                        is_transpose = True
+        oldk = k
+        # The custom pd_bert model in this pipeline keeps the Hugging Face
+        # parameter names, so the hf_to_paddle renaming is only needed when
+        # targeting PaddleNLP's built-in BertModel and is left disabled here.
+        # for hf_name, pd_name in hf_to_paddle.items():
+        #     k = k.replace(hf_name, pd_name)
+
+        # add prefix `bert.`
+        if "bert." not in k and "cls." not in k and "classifier" not in k:
+            k = "bert." + k
+
+        print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}")
+        paddle_state_dict[k] = v.data.numpy()
+
+    paddle.save(paddle_state_dict, paddle_dump_path)
+
+
+def compare(out_torch, out_paddle):
+    out_torch = out_torch.detach().numpy()
+    out_paddle = out_paddle.detach().numpy()
+    assert out_torch.shape == out_paddle.shape
+    abs_dif = np.abs(out_torch - out_paddle)
+    mean_dif = np.mean(abs_dif)
+    max_dif = np.max(abs_dif)
+    min_dif = np.min(abs_dif)
+    print("mean_dif:{}".format(mean_dif))
+    print("max_dif:{}".format(max_dif))
+    print("min_dif:{}".format(min_dif))
+
+
+def test_forward():
+    paddle.set_device("cpu")
+    model_torch = PTBertForMaskedLM.from_pretrained("./bert-base-uncased")
+    model_paddle = PDBertForMaskedLM.from_pretrained("./bert-base-uncased")
+    model_torch.eval()
+    model_paddle.eval()
+    np.random.seed(42)
+    x = np.random.randint(1,
+                          model_paddle.bert.config["vocab_size"],
+                          size=(4, 64))
+    input_torch = torch.tensor(x, dtype=torch.int64)
+    out_torch = model_torch(input_torch)[0]
+
+    input_paddle = paddle.to_tensor(x, dtype=paddle.int64)
+    out_paddle = model_paddle(input_paddle)[0]
+
+    print("torch result shape:{}".format(out_torch.shape))
+    print("paddle result shape:{}".format(out_paddle.shape))
+    compare(out_torch, out_paddle)
+
+
+if __name__ == "__main__":
+    convert_pytorch_checkpoint_to_paddle("./torch_weight.bin",
+                                         "./paddle_weight.pdparams")
diff --git a/examples/torch_migration/pipeline/weights/torch_bert_weight.py b/examples/torch_migration/pipeline/weights/torch_bert_weight.py
new file mode 100644
index 000000000000..52d778b25ff7
--- /dev/null
+++ b/examples/torch_migration/pipeline/weights/torch_bert_weight.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import BertModel
+import torch
+
+hf_model = BertModel.from_pretrained("bert-base-uncased")
+hf_model.eval()
+PATH = './torch_weight.bin'
+torch.save(hf_model.state_dict(), PATH)
diff --git a/examples/torch_migration/requirements.txt b/examples/torch_migration/requirements.txt
new file mode 100644
index 000000000000..4d3875d03156
--- /dev/null
+++ b/examples/torch_migration/requirements.txt
@@ -0,0 +1,5 @@
+paddlepaddle-gpu==2.2.0
+torch>=1.7
+transformers
+paddlenlp
+git+https://github.com/WenmuZhou/reprod_log.git
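+# reprod_log (installed from source above) provides the ReprodLogger and
+# ReprodDiffHelper utilities used by the alignment checks in this pipeline.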