From 39fe51715c4d38ac8c7af5dd35831edd2ab9eff3 Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Mon, 19 Apr 2021 23:24:34 +0800 Subject: [PATCH 01/20] Merge pull request #1150 from bjjwwang/k8s-2 K8s 2 --- doc/PADDLE_SERVING_ON_KUBERNETES.md | 74 ++++++++++++++--------------- tools/k8s_serving.yaml_template | 2 +- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/doc/PADDLE_SERVING_ON_KUBERNETES.md b/doc/PADDLE_SERVING_ON_KUBERNETES.md index e643b9c6d..21f4854c0 100644 --- a/doc/PADDLE_SERVING_ON_KUBERNETES.md +++ b/doc/PADDLE_SERVING_ON_KUBERNETES.md @@ -1,6 +1,6 @@ ## 在Kubenetes集群上部署Paddle Serving -Paddle Serving在0.6.0版本开始支持在Kubenetes集群上部署,并提供反向代理和安全网关支持。与Paddle Serving在Docker镜像中开发类似,Paddle Serving模型在Kubenetes集群部署需要制作轻量化的运行镜像,并使用kubectl工具在集群上部署。 +Paddle Serving在0.6.0版本开始支持在Kubenetes集群上部署,并提供反向代理和安全网关支持。与Paddle Serving在Docker镜像中开发类似,Paddle Serving 模型在Kubenetes集群部署需要制作轻量化的运行镜像,并使用kubectl工具在集群上部署。 ### 集群准备 @@ -84,7 +84,7 @@ web service模式本质上和pipeline模式类似,因此我们以`Serving/pyth ```bash #假设您已经拥有Serving运行镜像,假设镜像名为registry.baidubce.com/paddlepaddle/serving:0.6.0-cuda10.2-py37 -docker run --rm -dit --name webservice_serving_demo registry.baidubce.com/paddlepaddle/serving:0.6.0-cpu-py37 bash +docker run --rm -dit --name webservice_serving_demo registry.baidubce.com/paddlepaddle/serving:0.6.0-cpu-py27 bash cd Serving/python/examples/bert ### download model wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz @@ -109,33 +109,7 @@ python3.7 bert_web_service.py 9292 **为了方便您对照,我们也提供了示例镜像registry.baidubce.com/paddlepaddle/serving:k8s-web-demo** -#### RPC模式: -相比之下,RPC模式只需要Serving模型的服务端配置即可,我们也以`Serving/python/examples/detection/faster_rcnn`为例 - -```bash -#假设您已经拥有Serving运行镜像,假设镜像名为paddle_serving:cuda10.2-py37 -docker run --rm -dit --name rpc_serving_demo paddle_serving:cuda10.2-py37 bash -cd Serving/python/example/detections/faster_rcnn_r50_fpn_1x_coco] -## get model -wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/faster_rcnn_r50_fpn_1x_coco.tar -tar xf faster_rcnn_r50_fpn_1x_coco.tar -cd .. 
-docker cp faster_rcnn_r50_fpn_1x_coco rpc_serving_demo:/home/faster_rcnn -docker commit rpc_serving_demo detection_serving:latest -``` - -**提示:如果您对runtime镜像是否可运行需要验证,可以执行** - -``` -docker exec -it rpc_serving_demo bash -cd /home/faster_rcnn -python3.7 -m paddle_serving_server.serve --model serving_server --port 9292 -``` - -进入容器到工程目录之后,剩下的操作和调试代码的工作是类似的。 - -**为了方便您对照,我们也提供了示例镜像registry.baidubce.com/paddlepaddle/serving:k8s-rpc-demo** ### 在Kubenetes集群上部署 @@ -144,19 +118,13 @@ kubenetes集群操作需要`kubectl`去操纵yaml文件。我们这里给出了 - pipeline ocr示例 ```bash -sh tools/generate_k8s_yamls.sh --app_name ocr --image_name registry.baidubce.com/paddlepaddle/serving:k8s-pipeline-demo --workdir /home/ocr --command "python3.7 web_service.py" --port 18080 +sh tools/generate_k8s_yamls.sh --app_name ocr --image_name registry.baidubce.com/paddlepaddle/serving:k8s-pipeline-demo --workdir /home/ocr --command "python2.7 web_service.py" --port 9999 ``` - web service bert示例 ```bash -sh tools/generate_k8s_yamls.sh --app_name bert --image_name registry.baidubce.com/paddlepaddle/serving:k8s-web-demo --workdir /home/bert --command "python3.7 bert_web_service.py 9292" --port 9292 -``` - -- rpc faster rcnn示例 - -```bash -sh tools/generate_k8s_yamls.sh --app_name faster_rcnn --image_name registry.baidubce.com/paddlepaddle/serving:k8s-r pc-demo --workdir /home/faster_rcnn --command "python3.7 -m paddle_serving_server.serve --model serving_server --port 9292" --port 9292 +sh tools/generate_k8s_yamls.sh --app_name bert --image_name registry.baidubce.com/paddlepaddle/serving:k8s-web-demo --workdir /home/bert --command "python2.7 bert_web_service.py 9292" --port 9292 ``` 接下来我们会看到有两个yaml文件,分别是`k8s_serving.yaml`和 k8s_ingress.yaml`. @@ -174,7 +142,7 @@ metadata: spec: ports: - port: 18080 - name: rpc + name: http protocol: TCP targetPort: 18080 selector: @@ -264,4 +232,36 @@ NAME READY UP-TO-DATE AVAILABLE AGE ocr 1/1 1 1 2d20h ``` +我们使用 + +``` +kubectl get service --all-namespaces +``` + +可以看到 + +``` +NAMESPACE NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +default bert ClusterIP 172.16.86.12 9292/TCP 20m +default kubernetes ClusterIP 172.16.0.1 443/TCP 28d +default ocr ClusterIP 172.16.152.43 9999/TCP 50m +kong kong-proxy LoadBalancer 172.16.88.132 80:8893/TCP,443:8805/TCP 25d +kong kong-validation-webhook ClusterIP 172.16.38.100 443/TCP 25d +kube-system heapster ClusterIP 172.16.240.64 80/TCP 28d +kube-system kube-dns ClusterIP 172.16.0.10 53/UDP,53/TCP,9153/TCP 28d +kube-system metrics-server ClusterIP 172.16.34.157 443/TCP 28d +``` + +访问的方式就在 + +```: +http://${KONG_IP}:80/${APP_NAME}/prediction +``` + +例如Bert + +``` +curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://172.16.88.132:80/bert/prediction +``` +就会从KONG的网关转发给bert服务。同理,OCR服务也可以把对应的IP地址换成`http://172.16.88.132:80/ocr/prediction` diff --git a/tools/k8s_serving.yaml_template b/tools/k8s_serving.yaml_template index b1e9bdf74..17d103c87 100644 --- a/tools/k8s_serving.yaml_template +++ b/tools/k8s_serving.yaml_template @@ -7,7 +7,7 @@ metadata: spec: ports: - port: << PORT >> - name: rpc + name: http protocol: TCP targetPort: << PORT >> selector: From a6b9c216c1c6350520f474a43f21afec0c164b21 Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Tue, 20 Apr 2021 22:13:35 +0800 Subject: [PATCH 02/20] Merge pull request #1156 from bjjwwang/v0.6_doc fix doc and add gcc54 --- doc/COMPILE.md | 40 ++---- doc/COMPILE_CN.md | 20 +-- doc/DOCKER_IMAGES.md | 41 ++---- doc/DOCKER_IMAGES_CN.md | 49 ++----- 
tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel | 136 +++++++++++++++++++ 5 files changed, 175 insertions(+), 111 deletions(-) create mode 100644 tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel diff --git a/doc/COMPILE.md b/doc/COMPILE.md index 861032aa3..ef161d141 100755 --- a/doc/COMPILE.md +++ b/doc/COMPILE.md @@ -7,10 +7,10 @@ | module | version | | :--------------------------: | :-------------------------------: | | OS | Ubuntu16 and 18/CentOS 7 | -| gcc | 4.8.5(Cuda 9.0 and 10.0) and 8.2(Others) | -| gcc-c++ | 4.8.5(Cuda 9.0 and 10.0) and 8.2(Others) | +| gcc | 5.4.0(Cuda 10.1) and 8.2.0 | +| gcc-c++ | 5.4.0(Cuda 10.1) and 8.2.0 | | cmake | 3.2.0 and later | -| Python | 2.7.2 and later / 3.5.1 and later | +| Python | 3.6.0 and later | | Go | 1.9.2 and later | | git | 2.17.1 and later | | glibc-static | 2.17 | @@ -42,18 +42,6 @@ export PYTHONROOT=/usr If you are using a Docker development image, please follow the following to determine the Python version to be compiled, and set the corresponding environment variables ``` -#Python 2.7 -export PYTHONROOT=/usr/local/python2.7.15/ -export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -export PYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python2.7 - -#Python 3.5 -export PYTHONROOT=/usr/local/python3.5.1 -export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.5m -export PYTHON_LIBRARIES=$PYTHONROOT/lib/libpython3.5m.so -export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.5 - #Python3.6 export PYTHONROOT=/usr/local/ export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.6m @@ -108,9 +96,9 @@ go get -u google.golang.org/grpc@v1.33.0 ``` shell mkdir server-build-cpu && cd server-build-cpu -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ - -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ - -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ +cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \ + -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -DSERVER=ON .. make -j10 ``` @@ -176,10 +164,10 @@ make -j10 ``` shell mkdir client-build && cd client-build -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ - -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ - -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ - -DCLIENT=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \ + -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -DCLIENT=ON .. make -j10 ``` @@ -191,9 +179,9 @@ execute `make install` to put targets under directory `./output` ```bash mkdir app-build && cd app-build -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ - -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ - -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ +cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \ + -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -DAPP=ON .. 
make ``` @@ -257,8 +245,6 @@ The following is the base library version matching relationship used by the Padd | | CUDA | CuDNN | TensorRT | | :----: | :-----: | :----------: | :----: | -| post9 | 9.0 | CuDNN 7.6.4 | | -| post10 | 10.0 | CuDNN 7.6.5 | | | post101 | 10.1 | CuDNN 7.6.5 | 6.0.1 | | post102 | 10.2 | CuDNN 8.0.5 | 7.1.3 | | post11 | 11.0 | CuDNN 8.0.4 | 7.1.3 | diff --git a/doc/COMPILE_CN.md b/doc/COMPILE_CN.md index c5d8e424b..53c3548a4 100755 --- a/doc/COMPILE_CN.md +++ b/doc/COMPILE_CN.md @@ -7,10 +7,10 @@ | 组件 | 版本要求 | | :--------------------------: | :-------------------------------: | | OS | Ubuntu16 and 18/CentOS 7 | -| gcc | 4.8.5(Cuda 9.0 and 10.0) and 8.2(Others) | -| gcc-c++ | 4.8.5(Cuda 9.0 and 10.0) and 8.2(Others) | +| gcc | 5.4.0(Cuda 10.1) and 8.2.0 | +| gcc-c++ | 5.4.0(Cuda 10.1) and 8.2.0 | | cmake | 3.2.0 and later | -| Python | 2.7.2 and later / 3.5.1 and later | +| Python | 3.6.0 and later | | Go | 1.9.2 and later | | git | 2.17.1 and later | | glibc-static | 2.17 | @@ -41,18 +41,6 @@ export PYTHONROOT=/usr 如果您使用的是Docker开发镜像,请按照如下,确定好需要编译的Python版本,设置对应的环境变量 ``` -#Python 2.7 -export PYTHONROOT=/usr/local/python2.7.15/ -export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -export PYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python2.7 - -#Python 3.5 -export PYTHONROOT=/usr/local/python3.5.1 -export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.5m -export PYTHON_LIBRARIES=$PYTHONROOT/lib/libpython3.5m.so -export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.5 - #Python3.6 export PYTHONROOT=/usr/local/ export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.6m @@ -259,8 +247,6 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选 | | CUDA | CuDNN | TensorRT | | :----: | :-----: | :----------: | :----: | -| post9 | 9.0 | CuDNN 7.6.4 | | -| post10 | 10.0 | CuDNN 7.6.5 | | | post101 | 10.1 | CuDNN 7.6.5 | 6.0.1 | | post102 | 10.2 | CuDNN 8.0.5 | 7.1.3 | | post11 | 11.0 | CuDNN 8.0.4 | 7.1.3 | diff --git a/doc/DOCKER_IMAGES.md b/doc/DOCKER_IMAGES.md index aa60c2cac..469a11dfb 100644 --- a/doc/DOCKER_IMAGES.md +++ b/doc/DOCKER_IMAGES.md @@ -31,17 +31,10 @@ If you want to customize your Serving based on source code, use the version with | Description | OS | TAG | Dockerfile | | :----------------------------------------------------------: | :-----: | :--------------------------: | :----------------------------------------------------------: | -| CPU runtime | CentOS7 | latest | [Dockerfile](../tools/Dockerfile) | -| CPU development | CentOS7 | latest-devel | [Dockerfile.devel](../tools/Dockerfile.devel) | -| GPU (cuda9.0-cudnn7) runtime | CentOS7 | latest-cuda9.0-cudnn7 | [Dockerfile.cuda9.0-cudnn7](../tools/Dockerfile.cuda9.0-cudnn7) | -| GPU (cuda9.0-cudnn7) development | CentOS7 | latest-cuda9.0-cudnn7-devel | [Dockerfile.cuda9.0-cudnn7.devel](../tools/Dockerfile.cuda9.0-cudnn7.devel) | -| GPU (cuda10.0-cudnn7) runtime | CentOS7 | latest-cuda10.0-cudnn7 | [Dockerfile.cuda10.0-cudnn7](../tools/Dockerfile.cuda10.0-cudnn7) | -| GPU (cuda10.0-cudnn7) development | CentOS7 | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) | -| GPU (cuda10.1-cudnn7-tensorRT6) runtime | Ubuntu16 | latest-cuda10.1-cudnn7 | [Dockerfile.cuda10.1-cudnn7](../tools/Dockerfile.cuda10.1-cudnn7) | +| CPU development | Ubuntu16 | latest-devel | [Dockerfile.devel](../tools/Dockerfile.devel) | +| GPU (cuda10.1-cudnn7-tensorRT6-gcc54) development | Ubuntu16 | latest-cuda10.1-cudnn7-gcc54-devel | 
[Dockerfile.cuda10.1-cudnn7-gcc54.devel](../tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel) | | GPU (cuda10.1-cudnn7-tensorRT6) development | Ubuntu16 | latest-cuda10.1-cudnn7-devel | [Dockerfile.cuda10.1-cudnn7.devel](../tools/Dockerfile.cuda10.1-cudnn7.devel) | -| GPU (cuda10.2-cudnn8-tensorRT7) runtime | Ubuntu16| latest-cuda10.2-cudnn8 | [Dockerfile.cuda10.2-cudnn8](../tools/Dockerfile.cuda10.2-cudnn8) | | GPU (cuda10.2-cudnn8-tensorRT7) development | Ubuntu16 | latest-cuda10.2-cudnn8-devel | [Dockerfile.cuda10.2-cudnn8.devel](../tools/Dockerfile.cuda10.2-cudnn8.devel) | -| GPU (cuda11-cudnn8-tensorRT7) runtime | Ubuntu18| latest-cuda11-cudnn8 | [Dockerfile.cuda11-cudnn8](../tools/Dockerfile.cuda11-cudnn8) | | GPU (cuda11-cudnn8-tensorRT7) development | Ubuntu18 | latest-cuda11-cudnn8-devel | [Dockerfile.cuda11-cudnn8.devel](../tools/Dockerfile.cuda11-cudnn8.devel) | **Java Client:** @@ -68,34 +61,18 @@ Develop Images: | Env | Version | Docker images tag | OS | Gcc Version | |----------|---------|------------------------------|-----------|-------------| -| CPU | 0.5.0 | 0.5.0-devel | Ubuntu 16 | 8.2.0 | +| CPU | >=0.5.0 | 0.6.0-devel | Ubuntu 16 | 8.2.0 | | | <=0.4.0 | 0.4.0-devel | CentOS 7 | 4.8.5 | -| Cuda9.0 | 0.5.0 | 0.5.0-cuda9.0-cudnn7-devel | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda9.0-cudnn7-devel | CentOS 7 | 4.8.5 | -| Cuda10.0 | 0.5.0 | 0.5.0-cuda10.0-cudnn7-devel | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda10.0-cudnn7-devel | CentOS 7 | 4.8.5 | -| Cuda10.1 | 0.5.0 | 0.5.0-cuda10.1-cudnn7-devel | Ubuntu 16 | 8.2.0 | -| | <=0.4.0 | 0.4.0-cuda10.1-cudnn7-devel | CentOS 7 | 4.8.5 | -| Cuda10.2 | 0.5.0 | 0.5.0-cuda10.2-cudnn8-devel | Ubuntu 16 | 8.2.0 | +| Cuda10.1 | >=0.5.0 | 0.6.0-cuda10.1-cudnn7-devel | Ubuntu 16 | 8.2.0 | +| | 0.6.0 | 0.5.0-cuda10.1-cudnn7-gcc54-devel | Ubuntu 16 | 5.4.0 | +| | <=0.4.0 | 0.6.0-cuda10.1-cudnn7-devel | CentOS 7 | 4.8.5 | +| Cuda10.2 | >=0.5.0 | 0.5.0-cuda10.2-cudnn8-devel | Ubuntu 16 | 8.2.0 | | | <=0.4.0 | Nan | Nan | Nan | -| Cuda11.0 | 0.5.0 | 0.5.0-cuda11.0-cudnn8-devel | Ubuntu 18 | 8.2.0 | +| Cuda11.0 | >=0.5.0 | 0.6.0-cuda11.0-cudnn8-devel | Ubuntu 18 | 8.2.0 | | | <=0.4.0 | Nan | Nan | Nan | Running Images: -| Env | Version | Docker images tag | OS | Gcc Version | -|----------|---------|-----------------------|-----------|-------------| -| CPU | 0.5.0 | 0.5.0 | Ubuntu 16 | 8.2.0 | -| | <=0.4.0 | 0.4.0 | CentOS 7 | 4.8.5 | -| Cuda9.0 | 0.5.0 | 0.5.0-cuda9.0-cudnn7 | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda9.0-cudnn7 | CentOS 7 | 4.8.5 | -| Cuda10.0 | 0.5.0 | 0.5.0-cuda10.0-cudnn7 | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda10.0-cudnn7 | CentOS 7 | 4.8.5 | -| Cuda10.1 | 0.5.0 | 0.5.0-cuda10.1-cudnn7 | Ubuntu 16 | 8.2.0 | -| | <=0.4.0 | 0.4.0-cuda10.1-cudnn7 | CentOS 7 | 4.8.5 | -| Cuda10.2 | 0.5.0 | 0.5.0-cuda10.2-cudnn8 | Ubuntu 16 | 8.2.0 | -| | <=0.4.0 | Nan | Nan | Nan | -| Cuda11.0 | 0.5.0 | 0.5.0-cuda11.0-cudnn8 | Ubuntu 18 | 8.2.0 | -| | <=0.4.0 | Nan | Nan | Nan | +Running Images is lighter than Develop Images, and Running Images are too many due to multiple combinations of python, device environment. If you want to know about it, plese check the document [Paddle Serving on Kubernetes.](PADDLE_SERVING_ON_KUBERNETES.md). **Tips:** If you want to use CPU server and GPU server (version>=0.5.0) at the same time, you should check the gcc version, only Cuda10.1/10.2/11 can run with CPU server owing to the same gcc version(8.2). 
diff --git a/doc/DOCKER_IMAGES_CN.md b/doc/DOCKER_IMAGES_CN.md index 05f5f87c1..8b16f4dd6 100644 --- a/doc/DOCKER_IMAGES_CN.md +++ b/doc/DOCKER_IMAGES_CN.md @@ -34,18 +34,11 @@ | 镜像选择 | 操作系统 | TAG | Dockerfile | | :----------------------------------------------------------: | :-----: | :--------------------------: | :----------------------------------------------------------: | -| CPU 运行镜像 | CentOS7 | latest | [Dockerfile](../tools/Dockerfile) | -| CPU 开发镜像 | CentOS7 | latest-devel | [Dockerfile.devel](../tools/Dockerfile.devel) | -| GPU (cuda9.0-cudnn7) 运行镜像 | CentOS7 | latest-cuda9.0-cudnn7 | [Dockerfile.cuda9.0-cudnn7](../tools/Dockerfile.cuda9.0-cudnn7) | -| GPU (cuda9.0-cudnn7) 开发镜像 | CentOS7 | latest-cuda9.0-cudnn7-devel | [Dockerfile.cuda9.0-cudnn7.devel](../tools/Dockerfile.cuda9.0-cudnn7.devel) | -| GPU (cuda10.0-cudnn7) 运行镜像 | CentOS7 | latest-cuda10.0-cudnn7 | [Dockerfile.cuda10.0-cudnn7](../tools/Dockerfile.cuda10.0-cudnn7) | -| GPU (cuda10.0-cudnn7) 开发镜像 | CentOS7 | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) | -| GPU (cuda10.1-cudnn7-tensorRT6) 运行镜像 | Ubuntu16 | latest-cuda10.1-cudnn7 | [Dockerfile.cuda10.1-cudnn7](../tools/Dockerfile.cuda10.1-cudnn7) | -| GPU (cuda10.1-cudnn7-tensorRT6) 开发镜像 | Ubuntu16 | latest-cuda10.1-cudnn7-devel | [Dockerfile.cuda10.1-cudnn7.devel](../tools/Dockerfile.cuda10.1-cudnn7.devel) | -| GPU (cuda10.2-cudnn8-tensorRT7) 运行镜像 | Ubuntu16| latest-cuda10.2-cudnn8 | [Dockerfile.cuda10.2-cudnn8](../tools/Dockerfile.cuda10.2-cudnn8) | -| GPU (cuda10.2-cudnn8-tensorRT7) 开发镜像 | Ubuntu16 | latest-cuda10.2-cudnn8-devel | [Dockerfile.cuda10.2-cudnn8.devel](../tools/Dockerfile.cuda10.2-cudnn8.devel) | -| GPU (cuda11-cudnn8-tensorRT7) 运行镜像 | Ubuntu18| latest-cuda11-cudnn8 | [Dockerfile.cuda11-cudnn8](../tools/Dockerfile.cuda11-cudnn8) | -| GPU (cuda11-cudnn8-tensorRT7) 开发镜像 | Ubuntu18 | latest-cuda11-cudnn8-devel | [Dockerfile.cuda11-cudnn8.devel](../tools/Dockerfile.cuda11-cudnn8.devel) | +| CPU development | Ubuntu16 | latest-devel | [Dockerfile.devel](../tools/Dockerfile.devel) | +| GPU (cuda10.1-cudnn7-tensorRT6-gcc54) development | Ubuntu16 | latest-cuda10.1-cudnn7-gcc54-devel | [Dockerfile.cuda10.1-cudnn7-gcc54.devel](../tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel) | +| GPU (cuda10.1-cudnn7-tensorRT6) development | Ubuntu16 | latest-cuda10.1-cudnn7-devel | [Dockerfile.cuda10.1-cudnn7.devel](../tools/Dockerfile.cuda10.1-cudnn7.devel) | +| GPU (cuda10.2-cudnn8-tensorRT7) development | Ubuntu16 | latest-cuda10.2-cudnn8-devel | [Dockerfile.cuda10.2-cudnn8.devel](../tools/Dockerfile.cuda10.2-cudnn8.devel) | +| GPU (cuda11-cudnn8-tensorRT7) development | Ubuntu18 | latest-cuda11-cudnn8-devel | [Dockerfile.cuda11-cudnn8.devel](../tools/Dockerfile.cuda11-cudnn8.devel) | **Java镜像:** ``` @@ -70,36 +63,22 @@ registry.baidubce.com/paddlepaddle/serving:xpu-beta 编译镜像: +开发镜像: + | Env | Version | Docker images tag | OS | Gcc Version | |----------|---------|------------------------------|-----------|-------------| -| CPU | 0.5.0 | 0.5.0-devel | Ubuntu 16 | 8.2.0 | +| CPU | >=0.5.0 | 0.6.0-devel | Ubuntu 16 | 8.2.0 | | | <=0.4.0 | 0.4.0-devel | CentOS 7 | 4.8.5 | -| Cuda9.0 | 0.5.0 | 0.5.0-cuda9.0-cudnn7-devel | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda9.0-cudnn7-devel | CentOS 7 | 4.8.5 | -| Cuda10.0 | 0.5.0 | 0.5.0-cuda10.0-cudnn7-devel | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda10.0-cudnn7-devel | CentOS 7 | 4.8.5 | -| Cuda10.1 | 0.5.0 | 0.5.0-cuda10.1-cudnn7-devel | Ubuntu 16 | 8.2.0 | -| | 
<=0.4.0 | 0.4.0-cuda10.1-cudnn7-devel | CentOS 7 | 4.8.5 | -| Cuda10.2 | 0.5.0 | 0.5.0-cuda10.2-cudnn8-devel | Ubuntu 16 | 8.2.0 | +| Cuda10.1 | >=0.5.0 | 0.6.0-cuda10.1-cudnn7-devel | Ubuntu 16 | 8.2.0 | +| | 0.6.0 | 0.5.0-cuda10.1-cudnn7-gcc54-devel | Ubuntu 16 | 5.4.0 | +| | <=0.4.0 | 0.6.0-cuda10.1-cudnn7-devel | CentOS 7 | 4.8.5 | +| Cuda10.2 | >=0.5.0 | 0.5.0-cuda10.2-cudnn8-devel | Ubuntu 16 | 8.2.0 | | | <=0.4.0 | Nan | Nan | Nan | -| Cuda11.0 | 0.5.0 | 0.5.0-cuda11.0-cudnn8-devel | Ubuntu 18 | 8.2.0 | +| Cuda11.0 | >=0.5.0 | 0.6.0-cuda11.0-cudnn8-devel | Ubuntu 18 | 8.2.0 | | | <=0.4.0 | Nan | Nan | Nan | 运行镜像: -| Env | Version | Docker images tag | OS | Gcc Version | -|----------|---------|-----------------------|-----------|-------------| -| CPU | 0.5.0 | 0.5.0 | Ubuntu 16 | 8.2.0 | -| | <=0.4.0 | 0.4.0 | CentOS 7 | 4.8.5 | -| Cuda9.0 | 0.5.0 | 0.5.0-cuda9.0-cudnn7 | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda9.0-cudnn7 | CentOS 7 | 4.8.5 | -| Cuda10.0 | 0.5.0 | 0.5.0-cuda10.0-cudnn7 | Ubuntu 16 | 4.8.5 | -| | <=0.4.0 | 0.4.0-cuda10.0-cudnn7 | CentOS 7 | 4.8.5 | -| Cuda10.1 | 0.5.0 | 0.5.0-cuda10.1-cudnn7 | Ubuntu 16 | 8.2.0 | -| | <=0.4.0 | 0.4.0-cuda10.1-cudnn7 | CentOS 7 | 4.8.5 | -| Cuda10.2 | 0.5.0 | 0.5.0-cuda10.2-cudnn8 | Ubuntu 16 | 8.2.0 | -| | <=0.4.0 | Nan | Nan | Nan | -| Cuda11.0 | 0.5.0 | 0.5.0-cuda11.0-cudnn8 | Ubuntu 18 | 8.2.0 | -| | <=0.4.0 | Nan | Nan | Nan | +运行镜像比开发镜像更加轻量化, 且由于python,运行环境的多种组合,进而导致运行镜像种类过多。 如果您想了解有关信息,请检查文档[在Kubernetes上使用Paddle Serving](PADDLE_SERVING_ON_KUBERNETES.md)。 **注意事项:** 如果您在0.5.0及以上版本需要在一个容器当中同时运行CPU server和GPU server,需要选择Cuda10.1/10.2/11的镜像,因为他们和CPU环境有着相同版本的gcc。 diff --git a/tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel b/tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel new file mode 100644 index 000000000..ca45336ad --- /dev/null +++ b/tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel @@ -0,0 +1,136 @@ +# A image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment +# When you modify it, please be aware of cudnn-runtime version +FROM hub.baidubce.com/ctr/cuda:10.1-cudnn7-devel-ubuntu16.04 +MAINTAINER PaddlePaddle Authors + +# ENV variables +ARG WITH_GPU +ARG WITH_AVX + +ENV WITH_GPU=${WITH_GPU:-ON} +ENV WITH_AVX=${WITH_AVX:-ON} + +ENV HOME /root +# Add bash enhancements +COPY tools/dockerfiles/root/ /root/ + +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +RUN apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + patchelf git python-pip python-dev python-opencv openssh-server bison \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ + curl sed grep graphviz libjpeg-dev zlib1g-dev \ + python-matplotlib unzip \ + automake locales clang-format swig \ + liblapack-dev liblapacke-dev libcurl4-openssl-dev \ + net-tools libtool module-init-tools vim && \ + apt-get clean -y + +RUN ln -s /usr/lib/x86_64-linux-gnu/libssl.so /usr/lib/libssl.so.10 && \ + ln -s /usr/lib/x86_64-linux-gnu/libcrypto.so /usr/lib/libcrypto.so.10 + +RUN wget https://github.com/koalaman/shellcheck/releases/download/v0.7.1/shellcheck-v0.7.1.linux.x86_64.tar.xz -O shellcheck-v0.7.1.linux.x86_64.tar.xz && \ + tar -xf shellcheck-v0.7.1.linux.x86_64.tar.xz && cp shellcheck-v0.7.1/shellcheck /usr/bin/shellcheck && \ + rm -rf shellcheck-v0.7.1.linux.x86_64.tar.xz shellcheck-v0.7.1 + +# 
install cmake +WORKDIR /home +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz + +RUN wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig && cd .. && rm -rf Python-3.6.0* + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig && cd .. && rm -rf Python-3.7.0* + +# Install Python3.8 +RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \ + tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig && cd .. && rm -rf Python-3.8.0* + +ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} +RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3 && ln -sf /usr/local/bin/python3.6 /usr/bin/python3 && ln -sf /usr/local/bin/pip3.6 /usr/local/bin/pip3 && ln -sf /usr/local/bin/pip3.6 /usr/bin/pip3 + +RUN rm -r /root/python_build + +# Install Go and glide +RUN wget -qO- https://dl.google.com/go/go1.14.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/go && \ + mkdir /root/go/bin && \ + mkdir /root/go/src && \ + echo "GOROOT=/usr/local/go" >> /root/.bashrc && \ + echo "GOPATH=/root/go" >> /root/.bashrc && \ + echo "PATH=/usr/local/go/bin:/root/go/bin:$PATH" >> /root/.bashrc +ENV GOROOT=/usr/local/go GOPATH=/root/go +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=usr/local/go/bin:/root/go/bin:${PATH} + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. + +# Downgrade TensorRT +COPY tools/dockerfiles/build_scripts /build_scripts +RUN bash /build_scripts/install_trt.sh +RUN rm -rf /build_scripts + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN apt-get install libprotobuf-dev -y + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. 
+RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ + dpkg -i patchelf_0.10-2_amd64.deb + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +CMD source ~/.bashrc + +# ccache 3.7.9 +RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ + tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ + ./configure -prefix=/usr/local/ccache-3.7.9 && \ + make -j8 && make install && \ + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + +RUN python3.8 -m pip install --upgrade pip requests && \ + python3.7 -m pip install --upgrade pip requests && \ + python3.6 -m pip install --upgrade pip requests + +RUN wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && \ + tar xf centos_ssl.tar && rm -rf centos_ssl.tar && \ + mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && \ + ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && \ + ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && \ + ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && \ + ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so + +EXPOSE 22 From 9c4ce45edae40dd94da12453cd7f1252fb445338 Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Tue, 20 Apr 2021 17:54:22 +0800 Subject: [PATCH 03/20] Merge pull request #1154 from HexToString/fix_grpc_bug fix reader-op log bug --- core/general-server/op/general_reader_op.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/general-server/op/general_reader_op.cpp b/core/general-server/op/general_reader_op.cpp index 17a1aaa60..4b4e25cb0 100644 --- a/core/general-server/op/general_reader_op.cpp +++ b/core/general-server/op/general_reader_op.cpp @@ -157,15 +157,13 @@ int GeneralReaderOp::inference() { } // implement lod tensor here // only support 1-D lod - // TODO:support 2-D lod + // TODO(HexToString): support 2-D lod if (tensor.lod_size() > 0) { VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor"; lod_tensor.lod.resize(1); for (int k = 0; k < tensor.lod_size(); ++k) { lod_tensor.lod[0].push_back(tensor.lod(k)); } - VLOG(2) << "(logid=" << log_id << ") var[" << i - << "] has lod_tensor and len=" << out->at(i).lod[0].back(); } for (int k = 0; k < tensor.shape_size(); ++k) { @@ -180,7 +178,10 @@ int GeneralReaderOp::inference() { << "]: " << data_len; databuf_size[i] = data_len * elem_size[i]; out->at(i).data.Resize(data_len * elem_size[i]); - + if (out->at(i).lod.size() > 0) { + VLOG(2) << "(logid=" << log_id << ") var[" << i + << "] has lod_tensor and len=" << out->at(i).lod[0].back(); + } if (elem_type[i] == P_INT64) { int64_t *dst_ptr = static_cast(out->at(i).data.data()); VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i From e3322fb4a52d9957206e87031b63edac013fc13d Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Tue, 20 Apr 2021 14:46:26 +0800 Subject: [PATCH 04/20] Merge pull request #1153 from HexToString/fix_grpc_bug Fix grpc bug --- python/examples/bert/bert_web_service_gpu.py | 2 +- python/paddle_serving_client/client.py | 9 ++++++++- python/paddle_serving_server/rpc_service.py | 9 ++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/examples/bert/bert_web_service_gpu.py 
b/python/examples/bert/bert_web_service_gpu.py index cbdd321c0..fb332bca3 100644 --- a/python/examples/bert/bert_web_service_gpu.py +++ b/python/examples/bert/bert_web_service_gpu.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # pylint: disable=doc-string-missing -from paddle_serving_server_gpu.web_service import WebService +from paddle_serving_server.web_service import WebService from paddle_serving_app.reader import ChineseBertReader import sys import os diff --git a/python/paddle_serving_client/client.py b/python/paddle_serving_client/client.py index 8f1218b84..48ad112ab 100755 --- a/python/paddle_serving_client/client.py +++ b/python/paddle_serving_client/client.py @@ -555,7 +555,14 @@ def connect(self, endpoints): ) resp = self.stub_.GetClientConfig(get_client_config_req) model_config_path_list = resp.client_config_str_list - self._parse_model_config(model_config_path_list) + file_path_list = [] + for single_model_config in model_config_path_list: + if os.path.isdir(single_model_config): + file_path_list.append("{}/serving_server_conf.prototxt".format( + single_model_config)) + elif os.path.isfile(single_model_config): + file_path_list.append(single_model_config) + self._parse_model_config(file_path_list) def _flatten_list(self, nested_list): for item in nested_list: diff --git a/python/paddle_serving_server/rpc_service.py b/python/paddle_serving_server/rpc_service.py index f41a4e242..d9d302831 100755 --- a/python/paddle_serving_server/rpc_service.py +++ b/python/paddle_serving_server/rpc_service.py @@ -34,11 +34,18 @@ def __init__(self, model_config_path_list, is_multi_model, endpoints): self._parse_model_config(self.model_config_path_list) def _init_bclient(self, model_config_path_list, endpoints, timeout_ms=None): + file_path_list = [] + for single_model_config in model_config_path_list: + if os.path.isdir(single_model_config): + file_path_list.append("{}/serving_server_conf.prototxt".format( + single_model_config)) + elif os.path.isfile(single_model_config): + file_path_list.append(single_model_config) from paddle_serving_client import Client self.bclient_ = Client() if timeout_ms is not None: self.bclient_.set_rpc_timeout_ms(timeout_ms) - self.bclient_.load_client_config(model_config_path_list) + self.bclient_.load_client_config(file_path_list) self.bclient_.connect(endpoints) def _parse_model_config(self, model_config_path_list): From f88ab1b8f0687a123e072b245cc6af706dca91bb Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Tue, 20 Apr 2021 22:22:06 +0800 Subject: [PATCH 05/20] Merge pull request #1155 from ZhangYulongg/patch-5 Update ipipe_py3.sh --- tools/scripts/ipipe_py3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/ipipe_py3.sh b/tools/scripts/ipipe_py3.sh index 1f34f6952..9ae4012a2 100644 --- a/tools/scripts/ipipe_py3.sh +++ b/tools/scripts/ipipe_py3.sh @@ -825,7 +825,7 @@ function main() { if [ -f ${log_dir}error_models.txt ]; then cat ${log_dir}error_models.txt echo "error occurred!" 
- # exit 1 + exit 1 fi } From 3932b95c15d1e619bc4b6af5a478bcb7ac180988 Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Wed, 21 Apr 2021 19:38:37 +0800 Subject: [PATCH 06/20] Update CREATING.md --- doc/deprecated/CREATING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/deprecated/CREATING.md b/doc/deprecated/CREATING.md index d057af4c3..bc4faa55f 100644 --- a/doc/deprecated/CREATING.md +++ b/doc/deprecated/CREATING.md @@ -75,7 +75,7 @@ service ImageClassifyService { #### 2.2.2 示例配置 -关于Serving端的配置的详细信息,可以参考[Serving端配置](SERVING_CONFIGURE.md) +关于Serving端的配置的详细信息,可以参考[Serving端配置](../SERVING_CONFIGURE.md) 以下配置文件将ReaderOP, ClassifyOP和WriteJsonOP串联成一个workflow (关于OP/workflow等概念,可参考[设计文档](DESIGN.md)) From ae451be0684d4296eb1f2743b8c692d234c4f65a Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Wed, 21 Apr 2021 19:39:55 +0800 Subject: [PATCH 07/20] Update CREATING.md --- doc/deprecated/CREATING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/deprecated/CREATING.md b/doc/deprecated/CREATING.md index bc4faa55f..2e77f3c12 100644 --- a/doc/deprecated/CREATING.md +++ b/doc/deprecated/CREATING.md @@ -392,4 +392,4 @@ predictors { } } ``` -关于客户端的详细配置选项,可参考[CLIENT CONFIGURATION](CLIENT_CONFIGURE.md) +关于客户端的详细配置选项,可参考[CLIENT CONFIGURATION](../CLIENT_CONFIGURE.md) From 88d664f56d1a491332c66816577f467f235ac2d7 Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Wed, 21 Apr 2021 19:41:17 +0800 Subject: [PATCH 08/20] Update CLUSTERING.md --- doc/deprecated/CLUSTERING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/deprecated/CLUSTERING.md b/doc/deprecated/CLUSTERING.md index 7c372ac56..b9da2e561 100644 --- a/doc/deprecated/CLUSTERING.md +++ b/doc/deprecated/CLUSTERING.md @@ -1,6 +1,6 @@ # 搭建预测服务集群 -从[客户端配置](CLIENT_CONFIGURE.md)中我们已经知道,通过在客户端SDK的配置文件predictors.prototxt适当配置,可以搭建多副本和多Variant的预测集群。以下以图像分类任务为例,在单机上模拟搭建单Variant的多副本、和多Variant的预测集群 +从[客户端配置](../CLIENT_CONFIGURE.md)中我们已经知道,通过在客户端SDK的配置文件predictors.prototxt适当配置,可以搭建多副本和多Variant的预测集群。以下以图像分类任务为例,在单机上模拟搭建单Variant的多副本、和多Variant的预测集群 ## 1. 
单Variant多副本的预测集群 From edbeb1387e38c255857efe0e96b1c88b88fed90c Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Wed, 21 Apr 2021 19:42:23 +0800 Subject: [PATCH 09/20] Update DESIGN_CN.md --- doc/deprecated/DESIGN_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/deprecated/DESIGN_CN.md b/doc/deprecated/DESIGN_CN.md index 846074b91..b2574a4dc 100644 --- a/doc/deprecated/DESIGN_CN.md +++ b/doc/deprecated/DESIGN_CN.md @@ -126,7 +126,7 @@ Paddle Serving实例可以同时加载多个模型,每个模型用一个Servic ![调用层级关系](../multi-variants.png) 一个Service对应一个预测模型,模型下有1个endpoint。模型的不同版本,通过endpoint下多个variant概念实现: -同一个模型预测服务,可以配置多个variant,每个variant有自己的下游IP列表。客户端代码可以对各个variant配置相对权重,以达到调节流量比例的关系(参考[客户端配置](CLIENT_CONFIGURE.md)第3.2节中关于variant_weight_list的说明)。 +同一个模型预测服务,可以配置多个variant,每个variant有自己的下游IP列表。客户端代码可以对各个variant配置相对权重,以达到调节流量比例的关系(参考[客户端配置](../CLIENT_CONFIGURE.md)第3.2节中关于variant_weight_list的说明)。 ![Client端proxy功能](../client-side-proxy.png) From a9a5d00ec8a2da8b7c0ffc862801d58f85dcc192 Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Wed, 21 Apr 2021 19:43:19 +0800 Subject: [PATCH 10/20] Update DESIGN_CN.md --- doc/deprecated/DESIGN_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/deprecated/DESIGN_CN.md b/doc/deprecated/DESIGN_CN.md index b2574a4dc..42bc2b470 100644 --- a/doc/deprecated/DESIGN_CN.md +++ b/doc/deprecated/DESIGN_CN.md @@ -143,7 +143,7 @@ Paddle Serving实例可以同时加载多个模型,每个模型用一个Servic ### 5.1 数据压缩方法 -Baidu-rpc内置了snappy, gzip, zlib等数据压缩方法,可在配置文件中配置(参考[客户端配置](CLIENT_CONFIGURE.md)第3.1节关于compress_type的介绍) +Baidu-rpc内置了snappy, gzip, zlib等数据压缩方法,可在配置文件中配置(参考[客户端配置](../CLIENT_CONFIGURE.md)第3.1节关于compress_type的介绍) ### 5.2 C++ SDK API接口 From 8848e741b9950067fbba0ed37cb20ed993325041 Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Wed, 21 Apr 2021 19:46:11 +0800 Subject: [PATCH 11/20] Update FAQ.md --- doc/FAQ.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/FAQ.md b/doc/FAQ.md index 4a20c2558..0d9e27a63 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -186,7 +186,7 @@ wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && \ (2)Cuda和Cudnn动态库:文件名通常为 `libcudart.so.$CUDA_VERSION`,和 `libcudnn.so.$CUDNN_VERSION`。例如Cuda9就是 `libcudart.so.9.0`,Cudnn7就是 `libcudnn.so.7`。Cuda和Cudnn与Serving的版本匹配参见[Serving所有镜像列表](DOCKER_IMAGES_CN.md#%E9%99%84%E5%BD%95%E6%89%80%E6%9C%89%E9%95%9C%E5%83%8F%E5%88%97%E8%A1%A8). - (3) Cuda10.1及更高版本需要TensorRT。安装TensorRT相关文件的脚本参考 [install_trt.sh](../tools/dockerfile/build_scripts/install_trt.sh). + (3) Cuda10.1及更高版本需要TensorRT。安装TensorRT相关文件的脚本参考 [install_trt.sh](../tools/dockerfiles/build_scripts/install_trt.sh). 
## 部署问题 From 57619ebe760b6007ff31edbbb62b422ed0efa3c8 Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Thu, 22 Apr 2021 14:03:14 +0800 Subject: [PATCH 12/20] V0.6.0 cherry-pick 1161 --- python/setup.py.app.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.app.in b/python/setup.py.app.in index cffa63cf8..0d4763dfd 100644 --- a/python/setup.py.app.in +++ b/python/setup.py.app.in @@ -44,7 +44,7 @@ REQUIRED_PACKAGES = [ 'six >= 1.10.0', 'pillow', 'pyclipper', 'shapely', - 'sentencepiece<=0.1.83; platform_machine != "aarch64"', + 'sentencepiece<=0.1.92; platform_machine != "aarch64"', 'sentencepiece; platform_machine == "aarch64"', 'opencv-python<=4.2.0.32; platform_machine != "aarch64"', 'opencv-python; platform_machine == "aarch64"', From da12719e13508336067a2d29a230ad0fd378126c Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Thu, 22 Apr 2021 15:06:57 +0800 Subject: [PATCH 13/20] Update PIPELINE_SERVING_CN.md --- doc/PIPELINE_SERVING_CN.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/doc/PIPELINE_SERVING_CN.md b/doc/PIPELINE_SERVING_CN.md index 268a962dc..f902b9327 100644 --- a/doc/PIPELINE_SERVING_CN.md +++ b/doc/PIPELINE_SERVING_CN.md @@ -132,20 +132,21 @@ def __init__(name=None, 各参数含义如下 -| 参数名 | 含义 | -| :-------------------: | :----------------------------------------------------------: | -| name | (str)用于标识 OP 类型的字符串,该字段必须全局唯一。 | -| input_ops | (list)当前 OP 的所有前继 OP 的列表。 | -| server_endpoints | (list)远程 Paddle Serving Service 的 endpoints 列表。如果不设置该参数,认为是local_precditor模式,从local_service_conf中读取配置。 | -| fetch_list | (list)远程 Paddle Serving Service 的 fetch 列表。 | -| client_config | (str)Paddle Serving Service 对应的 Client 端配置文件路径。 | -| client_type | (str) 可选择brpc、grpc或local_predictor。local_predictor不启动Serving服务,进程内预测。 | -| concurrency | (int)OP 的并发数。 | -| timeout | (int)process 操作的超时时间,单位为毫秒。若该值小于零,则视作不超时。 | -| retry | (int)超时重试次数。当该值为 1 时,不进行重试。 | -| batch_size | (int)进行 Auto-Batching 的期望 batch_size 大小,由于构建 batch 可能超时,实际 batch_size 可能小于设定值,默认为 1。 | -| auto_batching_timeout | (float)进行 Auto-Batching 构建 batch 的超时时间,单位为毫秒。batch_size > 1时,要设置auto_batching_timeout,否则请求数量不足batch_size时会阻塞等待。 | -| local_service_handler | (object) local predictor handler,Op init()入参赋值 或 在Op init()中创建| +| 参数名 | 类型 | 含义 | +| :-------------------: | :---------: |:------------------------------------------------: | +| name | (str) | 用于标识 OP 类型的字符串,该字段必须全局唯一。 | +| input_ops | (list) | 当前 OP 的所有前继 OP 的列表。 | +| server_endpoints | (list) |远程 Paddle Serving Service 的 endpoints 列表。如果不设置该参数,认为是local_precditor模式,从local_service_conf中读取配置。 | +| fetch_list | (list) |远程 Paddle Serving Service 的 fetch 列表。 | +| client_config | (str) |Paddle Serving Service 对应的 Client 端配置文件路径。 | +| client_type | (str) |可选择brpc、grpc或local_predictor。local_predictor不启动Serving服务,进程内预测。 | +| concurrency | (int) | OP 的并发数。 | +| timeout | (int) |process 操作的超时时间,单位为毫秒。若该值小于零,则视作不超时。 | +| retry | (int) |超时重试次数。当该值为 1 时,不进行重试。 | +| batch_size | (int) |进行 Auto-Batching 的期望 batch_size 大小,由于构建 batch 可能超时,实际 batch_size 可能小于设定值,默认为 1。 | +| auto_batching_timeout | (float) |进行 Auto-Batching 构建 batch 的超时时间,单位为毫秒。batch_size > 1时,要设置auto_batching_timeout,否则请求数量不足batch_size时会阻塞等待。 | +| local_service_handler | (object) |local predictor handler,Op init()入参赋值 或 在Op init()中创建| + From 79fa74b436659768681187ec034954def5a7be25 Mon Sep 17 00:00:00 2001 From: HexToString <506181616@qq.com> Date: Thu, 22 Apr 2021 11:38:31 +0000 Subject: [PATCH 14/20] fix ocr core dump --- .../op/general_detection_op.cpp 
| 207 +++++++++--------- core/general-server/op/general_reader_op.cpp | 89 ++++---- 2 files changed, 149 insertions(+), 147 deletions(-) diff --git a/core/general-server/op/general_detection_op.cpp b/core/general-server/op/general_detection_op.cpp index f02465e0a..7c33ec8ef 100755 --- a/core/general-server/op/general_detection_op.cpp +++ b/core/general-server/op/general_detection_op.cpp @@ -22,7 +22,6 @@ #include "core/predictor/framework/resource.h" #include "core/util/include/timer.h" - /* #include "opencv2/imgcodecs/legacy/constants_c.h" #include "opencv2/imgproc/types_c.h" @@ -52,18 +51,18 @@ int GeneralDetectionOp::inference() { } const std::string pre_name = pre_node_names[0]; - const GeneralBlob *input_blob = get_depend_argument(pre_name); + const GeneralBlob* input_blob = get_depend_argument(pre_name); if (!input_blob) { LOG(ERROR) << "input_blob is nullptr,error"; - return -1; + return -1; } uint64_t log_id = input_blob->GetLogId(); VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; - GeneralBlob *output_blob = mutable_data(); + GeneralBlob* output_blob = mutable_data(); if (!output_blob) { LOG(ERROR) << "output_blob is nullptr,error"; - return -1; + return -1; } output_blob->SetLogId(log_id); @@ -73,7 +72,7 @@ int GeneralDetectionOp::inference() { return -1; } - const TensorVector *in = &input_blob->tensor_vector; + const TensorVector* in = &input_blob->tensor_vector; TensorVector* out = &output_blob->tensor_vector; int batch_size = input_blob->_batch_size; @@ -81,38 +80,39 @@ int GeneralDetectionOp::inference() { output_blob->_batch_size = batch_size; - VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; - std::vector input_shape; - int in_num =0; + int in_num = 0; void* databuf_data = NULL; char* databuf_char = NULL; size_t databuf_size = 0; + // now only support single string + char* total_input_ptr = static_cast(in->at(0).data.data()); + std::string base64str = total_input_ptr; - std::string* input_ptr = static_cast(in->at(0).data.data()); - std::string base64str = input_ptr[0]; float ratio_h{}; float ratio_w{}; - cv::Mat img = Base2Mat(base64str); cv::Mat srcimg; cv::Mat resize_img; - + cv::Mat resize_img_rec; cv::Mat crop_img; img.copyTo(srcimg); - this->resize_op_.Run(img, resize_img, this->max_side_len_, ratio_h, ratio_w, + this->resize_op_.Run(img, + resize_img, + this->max_side_len_, + ratio_h, + ratio_w, this->use_tensorrt_); - this->normalize_op_.Run(&resize_img, this->mean_det, this->scale_det, - this->is_scale_); + this->normalize_op_.Run( + &resize_img, this->mean_det, this->scale_det, this->is_scale_); std::vector input(1 * 3 * resize_img.rows * resize_img.cols, 0.0f); this->permute_op_.Run(&resize_img, input.data()); - TensorVector* real_in = new TensorVector(); if (!real_in) { LOG(ERROR) << "real_in is nullptr,error"; @@ -121,14 +121,15 @@ int GeneralDetectionOp::inference() { for (int i = 0; i < in->size(); ++i) { input_shape = {1, 3, resize_img.rows, resize_img.cols}; - in_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); - databuf_size = in_num*sizeof(float); + in_num = std::accumulate( + input_shape.begin(), input_shape.end(), 1, std::multiplies()); + databuf_size = in_num * sizeof(float); databuf_data = MempoolWrapper::instance().malloc(databuf_size); if (!databuf_data) { - LOG(ERROR) << "Malloc failed, size: " << databuf_size; - return -1; + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; } - memcpy(databuf_data,input.data(),databuf_size); + 
memcpy(databuf_data, input.data(), databuf_size); databuf_char = reinterpret_cast(databuf_data); paddle::PaddleBuf paddleBuf(databuf_char, databuf_size); paddle::PaddleTensor tensor_in; @@ -143,21 +144,23 @@ int GeneralDetectionOp::inference() { Timer timeline; int64_t start = timeline.TimeStampUS(); timeline.Start(); - + if (InferManager::instance().infer( engine_name().c_str(), real_in, out, batch_size)) { LOG(ERROR) << "(logid=" << log_id << ") Failed do infer in fluid model: " << engine_name().c_str(); return -1; } + delete real_in; + std::vector output_shape; - int out_num =0; + int out_num = 0; void* databuf_data_out = NULL; char* databuf_char_out = NULL; size_t databuf_size_out = 0; - //this is special add for PaddleOCR postprecess - int infer_outnum = out->size(); - for (int k = 0;k size(); + for (int k = 0; k < infer_outnum; ++k) { int n2 = out->at(k).shape[2]; int n3 = out->at(k).shape[3]; int n = n2 * n3; @@ -171,17 +174,19 @@ int GeneralDetectionOp::inference() { cbuf[i] = (unsigned char)((out_data[i]) * 255); } - cv::Mat cbuf_map(n2, n3, CV_8UC1, (unsigned char *)cbuf.data()); - cv::Mat pred_map(n2, n3, CV_32F, (float *)pred.data()); + cv::Mat cbuf_map(n2, n3, CV_8UC1, (unsigned char*)cbuf.data()); + cv::Mat pred_map(n2, n3, CV_32F, (float*)pred.data()); const double threshold = this->det_db_thresh_ * 255; const double maxvalue = 255; cv::Mat bit_map; cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY); cv::Mat dilation_map; - cv::Mat dila_ele = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2)); + cv::Mat dila_ele = + cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2)); cv::dilate(bit_map, dilation_map, dila_ele); - boxes = post_processor_.BoxesFromBitmap(pred_map, dilation_map, + boxes = post_processor_.BoxesFromBitmap(pred_map, + dilation_map, this->det_db_box_thresh_, this->det_db_unclip_ratio_); @@ -192,25 +197,28 @@ int GeneralDetectionOp::inference() { float wh_ratio = float(crop_img.cols) / float(crop_img.rows); - this->resize_op_rec.Run(crop_img, resize_img_rec, wh_ratio, this->use_tensorrt_); + this->resize_op_rec.Run( + crop_img, resize_img_rec, wh_ratio, this->use_tensorrt_); - this->normalize_op_.Run(&resize_img_rec, this->mean_rec, this->scale_rec, - this->is_scale_); + this->normalize_op_.Run( + &resize_img_rec, this->mean_rec, this->scale_rec, this->is_scale_); - std::vector output_rec(1 * 3 * resize_img_rec.rows * resize_img_rec.cols, 0.0f); + std::vector output_rec( + 1 * 3 * resize_img_rec.rows * resize_img_rec.cols, 0.0f); this->permute_op_.Run(&resize_img_rec, output_rec.data()); // Inference. 
output_shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols}; - out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); - databuf_size_out = out_num*sizeof(float); + out_num = std::accumulate( + output_shape.begin(), output_shape.end(), 1, std::multiplies()); + databuf_size_out = out_num * sizeof(float); databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out); if (!databuf_data_out) { - LOG(ERROR) << "Malloc failed, size: " << databuf_size_out; - return -1; + LOG(ERROR) << "Malloc failed, size: " << databuf_size_out; + return -1; } - memcpy(databuf_data_out,output_rec.data(),databuf_size_out); + memcpy(databuf_data_out, output_rec.data(), databuf_size_out); databuf_char_out = reinterpret_cast(databuf_data_out); paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out); paddle::PaddleTensor tensor_out; @@ -221,9 +229,8 @@ int GeneralDetectionOp::inference() { out->push_back(tensor_out); } } - out->erase(out->begin(),out->begin()+infer_outnum); + out->erase(out->begin(), out->begin() + infer_outnum); - int64_t end = timeline.TimeStampUS(); CopyBlobInfo(input_blob, output_blob); AddBlobInfo(output_blob, start); @@ -231,68 +238,62 @@ int GeneralDetectionOp::inference() { return 0; } -cv::Mat GeneralDetectionOp::Base2Mat(std::string &base64_data) -{ - cv::Mat img; - std::string s_mat; - s_mat = base64Decode(base64_data.data(), base64_data.size()); - std::vector base64_img(s_mat.begin(), s_mat.end()); - img = cv::imdecode(base64_img, cv::IMREAD_COLOR);//CV_LOAD_IMAGE_COLOR - return img; +cv::Mat GeneralDetectionOp::Base2Mat(std::string& base64_data) { + cv::Mat img; + std::string s_mat; + s_mat = base64Decode(base64_data.data(), base64_data.size()); + std::vector base64_img(s_mat.begin(), s_mat.end()); + img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // CV_LOAD_IMAGE_COLOR + return img; } -std::string GeneralDetectionOp::base64Decode(const char* Data, int DataByte) -{ - - const char DecodeTable[] = - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 62, // '+' - 0, 0, 0, - 63, // '/' - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' - 0, 0, 0, 0, 0, 0, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z' - 0, 0, 0, 0, 0, 0, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, - 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z' - }; - - std::string strDecode; - int nValue; - int i = 0; - while (i < DataByte) - { - if (*Data != '\r' && *Data != '\n') - { - nValue = DecodeTable[*Data++] << 18; - nValue += DecodeTable[*Data++] << 12; - strDecode += (nValue & 0x00FF0000) >> 16; - if (*Data != '=') - { - nValue += DecodeTable[*Data++] << 6; - strDecode += (nValue & 0x0000FF00) >> 8; - if (*Data != '=') - { - nValue += DecodeTable[*Data++]; - strDecode += nValue & 0x000000FF; - } - } - i += 4; - } - else// 回车换行,跳过 - { - Data++; - i++; - } - } - return strDecode; +std::string GeneralDetectionOp::base64Decode(const char* Data, int DataByte) { + const char + DecodeTable[] = + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 62, // '+' + 0, 0, 0, + 63, // '/' + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, // 'A'-'Z' + 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 
35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+        50, 51,  // 'a'-'z'
+    };
+
+    std::string strDecode;
+    int nValue;
+    int i = 0;
+    while (i < DataByte) {
+      if (*Data != '\r' && *Data != '\n') {
+        nValue = DecodeTable[*Data++] << 18;
+        nValue += DecodeTable[*Data++] << 12;
+        strDecode += (nValue & 0x00FF0000) >> 16;
+        if (*Data != '=') {
+          nValue += DecodeTable[*Data++] << 6;
+          strDecode += (nValue & 0x0000FF00) >> 8;
+          if (*Data != '=') {
+            nValue += DecodeTable[*Data++];
+            strDecode += nValue & 0x000000FF;
+          }
+        }
+        i += 4;
+      } else  // skip CR and LF
+      {
+        Data++;
+        i++;
+      }
+    }
+    return strDecode;
 }
 
-cv::Mat GeneralDetectionOp::GetRotateCropImage(const cv::Mat &srcimage,
-                                               std::vector<std::vector<int>> box) {
+cv::Mat GeneralDetectionOp::GetRotateCropImage(
+    const cv::Mat& srcimage, std::vector<std::vector<int>> box) {
   cv::Mat image;
   srcimage.copyTo(image);
   std::vector<std::vector<int>> points = box;
@@ -332,7 +333,9 @@ cv::Mat GeneralDetectionOp::GetRotateCropImage(const cv::Mat &srcimage,
   cv::Mat M = cv::getPerspectiveTransform(pointsf, pts_std);
 
   cv::Mat dst_img;
-  cv::warpPerspective(img_crop, dst_img, M,
+  cv::warpPerspective(img_crop,
+                      dst_img,
+                      M,
                       cv::Size(img_crop_width, img_crop_height),
                       cv::BORDER_REPLICATE);
 
@@ -350,4 +353,4 @@ DEFINE_OP(GeneralDetectionOp);
 
 }  // namespace serving
 }  // namespace paddle_serving
-}  // namespace baidu
\ No newline at end of file
+}  // namespace baidu
diff --git a/core/general-server/op/general_reader_op.cpp b/core/general-server/op/general_reader_op.cpp
index 4b4e25cb0..3e1091dd8 100644
--- a/core/general-server/op/general_reader_op.cpp
+++ b/core/general-server/op/general_reader_op.cpp
@@ -77,9 +77,6 @@ int GeneralReaderOp::inference() {
   uint64_t log_id = req->log_id();
   int input_var_num = 0;
-  std::vector<int64_t> elem_type;
-  std::vector<int64_t> elem_size;
-  std::vector<int64_t> databuf_size;
 
   GeneralBlob *res = mutable_data<GeneralBlob>();
   if (!res) {
@@ -119,40 +116,44 @@ int GeneralReaderOp::inference() {
   }
   */
   // package tensor
-
-  elem_type.resize(var_num);
-  elem_size.resize(var_num);
-  databuf_size.resize(var_num);
   // prepare basic information for input
   // specify the memory needed for output tensor_vector
   // fill the data into output general_blob
   int data_len = 0;
+  int64_t elem_type = 0;
+  int64_t elem_size = 0;
+  int64_t databuf_size = 0;
   for (int i = 0; i < var_num; ++i) {
-    paddle::PaddleTensor lod_tensor;
+    paddle::PaddleTensor paddleTensor;
     const Tensor &tensor = req->insts(0).tensor_array(i);
     data_len = 0;
-    elem_type[i] = tensor.elem_type();
-    VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i];
-    if (elem_type[i] == P_INT64) {  // int64
-      elem_size[i] = sizeof(int64_t);
-      lod_tensor.dtype = paddle::PaddleDType::INT64;
+    elem_type = 0;
+    elem_size = 0;
+    databuf_size = 0;
+    elem_type = tensor.elem_type();
+    VLOG(2) << "var[" << i << "] has elem type: " << elem_type;
+    if (elem_type == P_INT64) {  // int64
+      elem_size = sizeof(int64_t);
+      paddleTensor.dtype = paddle::PaddleDType::INT64;
       data_len = tensor.int64_data_size();
-    } else if (elem_type[i] == P_FLOAT32) {
-      elem_size[i] = sizeof(float);
-      lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
+    } else if (elem_type == P_FLOAT32) {
+      elem_size = sizeof(float);
+      paddleTensor.dtype = paddle::PaddleDType::FLOAT32;
       data_len = tensor.float_data_size();
-    } else if (elem_type[i] == P_INT32) {
-      elem_size[i] = sizeof(int32_t);
-      lod_tensor.dtype = paddle::PaddleDType::INT32;
+    } else if (elem_type == P_INT32) {
+      elem_size = sizeof(int32_t);
+      paddleTensor.dtype = paddle::PaddleDType::INT32;
       data_len = tensor.int_data_size();
-    } else if (elem_type[i] == P_STRING) {
+    } else if (elem_type == P_STRING) {
       // use paddle::PaddleDType::UINT8 as for String.
-      elem_size[i] = sizeof(uint8_t);
-      lod_tensor.dtype = paddle::PaddleDType::UINT8;
+      elem_size = sizeof(char);
+      paddleTensor.dtype = paddle::PaddleDType::UINT8;
       // this is for vector<std::string>, cause the databuf_size !=
       // vector<std::string>.size()*sizeof(char);
+      // data_len should be +1 cause '\0'
+      // now only support single string
       for (int idx = 0; idx < tensor.data_size(); idx++) {
-        data_len += tensor.data()[idx].length();
+        data_len += tensor.data()[idx].length() + 1;
       }
     }
     // implement lod tensor here
@@ -160,29 +161,29 @@ int GeneralReaderOp::inference() {
     // TODO(HexToString): support 2-D lod
     if (tensor.lod_size() > 0) {
       VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
-      lod_tensor.lod.resize(1);
+      paddleTensor.lod.resize(1);
       for (int k = 0; k < tensor.lod_size(); ++k) {
-        lod_tensor.lod[0].push_back(tensor.lod(k));
+        paddleTensor.lod[0].push_back(tensor.lod(k));
       }
     }
     for (int k = 0; k < tensor.shape_size(); ++k) {
       int dim = tensor.shape(k);
       VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
               << "]: " << dim;
-      lod_tensor.shape.push_back(dim);
+      paddleTensor.shape.push_back(dim);
     }
-    lod_tensor.name = model_config->_feed_name[i];
-    out->push_back(lod_tensor);
+    paddleTensor.name = model_config->_feed_name[i];
+    out->push_back(paddleTensor);
 
     VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
             << "]: " << data_len;
-    databuf_size[i] = data_len * elem_size[i];
-    out->at(i).data.Resize(data_len * elem_size[i]);
+    databuf_size = data_len * elem_size;
+    out->at(i).data.Resize(databuf_size);
     if (out->at(i).lod.size() > 0) {
       VLOG(2) << "(logid=" << log_id << ") var[" << i
               << "] has lod_tensor and len=" << out->at(i).lod[0].back();
     }
-    if (elem_type[i] == P_INT64) {
+    if (elem_type == P_INT64) {
       int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
       VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
               << "] is " << tensor.int64_data(0);
       if (!dst_ptr) {
         LOG(ERROR) << "dst_ptr is nullptr";
         return -1;
       }
-      memcpy(dst_ptr, tensor.int64_data().data(), databuf_size[i]);
+      memcpy(dst_ptr, tensor.int64_data().data(), databuf_size);
       /*
       int elem_num = tensor.int64_data_size();
       for (int k = 0; k < elem_num; ++k) {
         dst_ptr[k] = tensor.int64_data(k);
       }
       */
-    } else if (elem_type[i] == P_FLOAT32) {
+    } else if (elem_type == P_FLOAT32) {
       float *dst_ptr = static_cast<float *>(out->at(i).data.data());
       VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
               << "] is " << tensor.float_data(0);
       if (!dst_ptr) {
         LOG(ERROR) << "dst_ptr is nullptr";
         return -1;
       }
-      memcpy(dst_ptr, tensor.float_data().data(), databuf_size[i]);
+      memcpy(dst_ptr, tensor.float_data().data(), databuf_size);
       /*int elem_num = tensor.float_data_size();
       for (int k = 0; k < elem_num; ++k) {
         dst_ptr[k] = tensor.float_data(k);
       }*/
-    } else if (elem_type[i] == P_INT32) {
+    } else if (elem_type == P_INT32) {
       int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
       VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
               << "] is " << tensor.int_data(0);
       if (!dst_ptr) {
         LOG(ERROR) << "dst_ptr is nullptr";
         return -1;
       }
-      memcpy(dst_ptr, tensor.int_data().data(), databuf_size[i]);
-      /*
-      int elem_num = tensor.int_data_size();
-      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[k] = tensor.int_data(k);
-      }
-      */
-    } else if (elem_type[i] == P_STRING) {
-      std::string *dst_ptr =
-          static_cast<std::string *>(out->at(i).data.data());
+      memcpy(dst_ptr, tensor.int_data().data(), databuf_size);
+    } else if (elem_type == P_STRING) {
+      char *dst_ptr = static_cast<char *>(out->at(i).data.data());
       VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
               << "] is " << tensor.data(0);
       if (!dst_ptr) {
@@ -234,8 +229,12 @@ int GeneralReaderOp::inference() {
         LOG(ERROR) << "dst_ptr is nullptr";
         return -1;
       }
       int elem_num = tensor.data_size();
+      int offset = 0;
       for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[k] = tensor.data(k);
+        memcpy(dst_ptr + offset,
+               tensor.data(k).c_str(),
+               strlen(tensor.data(k).c_str()) + 1);
+        offset += strlen(tensor.data(k).c_str()) + 1;
       }
     }
   }

From c345319767fca14eafc3def9d1910c8669f0f8e2 Mon Sep 17 00:00:00 2001
From: Thomas Young <35565423+HexToString@users.noreply.github.com>
Date: Thu, 22 Apr 2021 21:08:09 +0800
Subject: [PATCH 15/20] 11 11

---
 doc/LOD_CN.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 doc/LOD_CN.md

diff --git a/doc/LOD_CN.md b/doc/LOD_CN.md
new file mode 100644
index 000000000..55c307aad
--- /dev/null
+++ b/doc/LOD_CN.md
@@ -0,0 +1,24 @@
+# Lod字段说明
+
+(简体中文|[English](LOD.md))
+
+## 概念
+
+LoD(Level-of-Detail) Tensor是Paddle的高级特性,是对Tensor的一种扩充。LoDTensor通过牺牲灵活性来提升训练的效率。
+注:对于大部分用户来说,无需关注LoDTensor的用法,目前Serving中仅支持一维Lod的用法。
+
+## 使用
+
+**前提:** 首先您的预测模型需要支持变长Tensor的输入。
+
+
+以视觉任务为例。在视觉任务中,时常需要处理视频和图像,这些元素是高维的对象。假设现存的一个mini-batch包含3个视频,分别有3个、1个和2个帧。
+每个帧都具有相同大小:640x480,则这个mini-batch可以被表示为:
+```
+3 1 2
+口口口 口 口口
+```
+最底层tensor大小为(3+1+2)x640x480,每一个 口 表示一个640x480的图像。
+那么此时,Tensor的shape为[6,640,480],lod=[0,3,4,6]。
+其中0为起始值,3-0=3;4-3=1;6-4=2,这三个值正好表示您的变长信息,lod中的最后一个元素6,应等于shape中第一维度的总长度。
+lod中记录的变长信息与Tensor中shape的第一维度的信息应按照上述方式对齐。

From ad134a1bb6e0b61ba76bf7fcb2f6b8b7dfea0468 Mon Sep 17 00:00:00 2001
From: Thomas Young <35565423+HexToString@users.noreply.github.com>
Date: Thu, 22 Apr 2021 21:08:47 +0800
Subject: [PATCH 16/20] Update LOD_CN.md

---
 doc/LOD_CN.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/LOD_CN.md b/doc/LOD_CN.md
index 55c307aad..061879b52 100644
--- a/doc/LOD_CN.md
+++ b/doc/LOD_CN.md
@@ -19,6 +19,9 @@ LoD(Level-of-Detail) Tensor是Paddle的高级特性,是对Tensor的一种扩
 口口口 口 口口
 ```
 最底层tensor大小为(3+1+2)x640x480,每一个 口 表示一个640x480的图像。
+
 那么此时,Tensor的shape为[6,640,480],lod=[0,3,4,6]。
+
 其中0为起始值,3-0=3;4-3=1;6-4=2,这三个值正好表示您的变长信息,lod中的最后一个元素6,应等于shape中第一维度的总长度。
+
 lod中记录的变长信息与Tensor中shape的第一维度的信息应按照上述方式对齐。

From e0db4bbc3461ba82675754ea316ff16374a413e5 Mon Sep 17 00:00:00 2001
From: Thomas Young <35565423+HexToString@users.noreply.github.com>
Date: Thu, 22 Apr 2021 21:10:15 +0800
Subject: [PATCH 17/20] 11 11

---
 doc/LOD_CN.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/LOD_CN.md b/doc/LOD_CN.md
index 061879b52..8f0263370 100644
--- a/doc/LOD_CN.md
+++ b/doc/LOD_CN.md
@@ -5,7 +5,9 @@
 ## 概念
 
 LoD(Level-of-Detail) Tensor是Paddle的高级特性,是对Tensor的一种扩充。LoDTensor通过牺牲灵活性来提升训练的效率。
+```
 注:对于大部分用户来说,无需关注LoDTensor的用法,目前Serving中仅支持一维Lod的用法。
+```
 
 ## 使用
 
From 5a131d2f661abf925f6b511d1914f3528493e2f7 Mon Sep 17 00:00:00 2001
From: Thomas Young <35565423+HexToString@users.noreply.github.com>
Date: Thu, 22 Apr 2021 21:10:54 +0800
Subject: [PATCH 18/20] 1 2

---
 doc/LOD_CN.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/LOD_CN.md b/doc/LOD_CN.md
index 8f0263370..ff04bd3d7 100644
--- a/doc/LOD_CN.md
+++ b/doc/LOD_CN.md
@@ -5,9 +5,9 @@
 ## 概念
 
 LoD(Level-of-Detail) Tensor是Paddle的高级特性,是对Tensor的一种扩充。LoDTensor通过牺牲灵活性来提升训练的效率。
-```
-注:对于大部分用户来说,无需关注LoDTensor的用法,目前Serving中仅支持一维Lod的用法。
-```
+
+**注:** 对于大部分用户来说,无需关注LoDTensor的用法,目前Serving中仅支持一维Lod的用法。
+
 
 ## 使用
 
From fe90c7c0de9c8dd715c639e1cb8d6b19ebc922f8 Mon Sep 17 00:00:00 2001
From: Thomas Young <35565423+HexToString@users.noreply.github.com>
Date: Thu, 22 Apr 2021 21:23:35 +0800
Subject: [PATCH 19/20] 123 321

---
 doc/LOD.md | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 doc/LOD.md

diff --git a/doc/LOD.md b/doc/LOD.md
new file mode 100644
index 000000000..4e20c4953
--- /dev/null
+++ b/doc/LOD.md
@@ -0,0 +1,32 @@
+# LoD Introduction
+
+(English|[简体中文](LOD_CN.md))
+
+## Principle
+
+LoD (Level-of-Detail) Tensor is an advanced feature of Paddle and an extension of Tensor. A LoD Tensor trades some flexibility for training efficiency.
+
+**Notice:** Most users do not need to pay attention to LoD Tensor. Currently, Serving only supports one-dimensional LoD.
+
+
+## Use
+
+**Prerequisite:** Your prediction model must accept variable-length tensor input.
+
+
+Take vision tasks as an example: they frequently process videos and images, which are high-dimensional objects.
+Suppose a mini-batch contains three videos, holding three frames, one frame and two frames respectively.
+If every frame has the same size, 640x480, the mini-batch can be represented as:
+```
+3 1 2
+口口口 口 口口
+```
+The underlying tensor has size (3+1+2)x640x480, where each 口 represents one 640x480 image.
+
+The shape of the tensor is then [6,640,480], with lod=[0,3,4,6].
+
+Here 0 is the starting offset, and 3-0=3, 4-3=1, 6-4=2 recover exactly the variable-length information.
+
+The last element of lod, 6, must equal the total length of the first dimension in shape.
+
+The variable-length information recorded in lod and the first dimension of the tensor's shape must be aligned in this way.
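A quick way to sanity-check the offset arithmetic described in the two LoD documents above is to recompute it in plain Python. The following is an illustrative sketch only, not part of the patch; the variable names are invented and numpy is assumed to be available:

```python
import numpy as np

# Frame counts of the three videos from the example above.
lengths = [3, 1, 2]

# The flattened mini-batch: one 640x480 frame per entry along dim 0.
batch = np.zeros((sum(lengths), 640, 480), dtype="float32")

# A 1-D lod is the cumulative-offset form of the lengths: [0, 3, 4, 6].
lod = [0]
for n in lengths:
    lod.append(lod[-1] + n)

print(batch.shape)  # (6, 640, 480)
print(lod)          # [0, 3, 4, 6]
assert lod[-1] == batch.shape[0]  # last offset equals the first dim of shape
```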
From 1ee32878c308c17a4a9947430b3060918ddac02d Mon Sep 17 00:00:00 2001
From: HexToString <506181616@qq.com>
Date: Sun, 25 Apr 2021 11:38:25 +0000
Subject: [PATCH 20/20] add benchmark

---
 python/examples/bert/README.md                 |  6 ++++++
 python/examples/bert/README_CN.md              |  7 +++++++
 python/examples/bert/benchmark.py              | 11 +++++++----
 python/examples/bert/benchmark.sh              |  4 ++--
 python/examples/bert/benchmark_with_profile.sh |  4 ++--
 python/examples/fit_a_line/README.md           |  6 ++++++
 python/examples/fit_a_line/README_CN.md        |  7 +++++++
 python/examples/fit_a_line/benchmark.py        | 18 +++++++++++++-----
 python/examples/util/show_profile.py           |  4 ++++
 9 files changed, 54 insertions(+), 13 deletions(-)
 mode change 100644 => 100755 python/examples/bert/benchmark.py
 mode change 100644 => 100755 python/examples/bert/benchmark.sh
 mode change 100644 => 100755 python/examples/bert/benchmark_with_profile.sh
 mode change 100644 => 100755 python/examples/util/show_profile.py

diff --git a/python/examples/bert/README.md b/python/examples/bert/README.md
index 1fde6d466..7bada9387 100644
--- a/python/examples/bert/README.md
+++ b/python/examples/bert/README.md
@@ -84,3 +84,9 @@ set environmental variable to specify which gpus are used, the command above mea
 ```
 curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
 ```
+
+## Benchmark
+``` shell
+bash benchmark.sh bert_seq128_model bert_seq128_client
+```
+The benchmark log file is named `profile_log_bert_seq128_model`.
diff --git a/python/examples/bert/README_CN.md b/python/examples/bert/README_CN.md
index 060c5579a..ef28089b5 100644
--- a/python/examples/bert/README_CN.md
+++ b/python/examples/bert/README_CN.md
@@ -88,3 +88,10 @@ python bert_web_service_gpu.py bert_seq128_model/ 9292 #启动gpu预测服务
 ```
 curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
 ```
+
+## 性能测试
+``` shell
+bash benchmark.sh bert_seq128_model bert_seq128_client
+```
+性能测试的日志文件为profile_log_bert_seq128_model
+如需修改性能测试用例的参数,请修改benchmark.sh中的配置信息。
diff --git a/python/examples/bert/benchmark.py b/python/examples/bert/benchmark.py
old mode 100644
new mode 100755
index c177d4b8c..eff82051c
--- a/python/examples/bert/benchmark.py
+++ b/python/examples/bert/benchmark.py
@@ -21,6 +21,7 @@
 import time
 import json
 import requests
+import numpy as np
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
 from paddle_serving_client.utils import benchmark_args, show_latency
@@ -56,7 +57,11 @@ def single_func(idx, resource):
             feed_batch = []
             b_start = time.time()
             for bi in range(args.batch_size):
-                feed_batch.append(reader.process(dataset[bi]))
+                feed_dict = reader.process(dataset[bi])
+                for key in feed_dict.keys():
+                    feed_dict[key] = np.array(feed_dict[key]).reshape(
+                        (1, 128, 1))
+                feed_batch.append(feed_dict)
             b_end = time.time()
 
             if profile_flags:
@@ -116,9 +121,7 @@ def single_func(idx, resource):
 
 if __name__ == '__main__':
     multi_thread_runner = MultiThreadRunner()
-    endpoint_list = [
-        "127.0.0.1:9292", "127.0.0.1:9293", "127.0.0.1:9294", "127.0.0.1:9295"
-    ]
+    endpoint_list = ["127.0.0.1:9292", "127.0.0.1:9293"]
     turns = 100
     start = time.time()
     result = multi_thread_runner.run(
diff --git a/python/examples/bert/benchmark.sh b/python/examples/bert/benchmark.sh
old mode 100644
new mode 100755
index 525e955e9..1a8263556
--- a/python/examples/bert/benchmark.sh
+++ b/python/examples/bert/benchmark.sh
@@ -1,5 +1,5 @@
 rm profile_log*
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0,1
 export FLAGS_profile_server=1
 export FLAGS_profile_client=1
 export FLAGS_serving_latency=1
@@ -12,7 +12,7 @@ else
     mkdir utilization
 fi
 #start server
-$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim > elog 2>&1 &
+$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1 --mem_optim --ir_optim > elog 2>&1 &
 sleep 5
 
 #warm up
diff --git a/python/examples/bert/benchmark_with_profile.sh b/python/examples/bert/benchmark_with_profile.sh
old mode 100644
new mode 100755
index 074a9acd2..f36fbbce9
--- a/python/examples/bert/benchmark_with_profile.sh
+++ b/python/examples/bert/benchmark_with_profile.sh
@@ -1,5 +1,5 @@
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+export CUDA_VISIBLE_DEVICES=0,1
+python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1 2> elog > stdlog &
 export FLAGS_profile_client=1
 export FLAGS_profile_server=1
 sleep 5
diff --git a/python/examples/fit_a_line/README.md b/python/examples/fit_a_line/README.md
index af45b2a85..77583ce59 100644
--- a/python/examples/fit_a_line/README.md
+++ b/python/examples/fit_a_line/README.md
@@ -42,3 +42,9 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 ``` shell
 curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
 ```
+
+## Benchmark
+``` shell
+bash benchmark.sh uci_housing_model uci_housing_client
+```
+The benchmark log file is named `profile_log_uci_housing_model`.
diff --git a/python/examples/fit_a_line/README_CN.md b/python/examples/fit_a_line/README_CN.md
index 9ef55749b..e115b6deb 100644
--- a/python/examples/fit_a_line/README_CN.md
+++ b/python/examples/fit_a_line/README_CN.md
@@ -43,3 +43,10 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 ``` shell
 curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
 ```
+
+## 性能测试
+``` shell
+bash benchmark.sh uci_housing_model uci_housing_client
+```
+性能测试的日志文件为profile_log_uci_housing_model
+如需修改性能测试用例的参数,请修改benchmark.sh中的配置信息。
diff --git a/python/examples/fit_a_line/benchmark.py b/python/examples/fit_a_line/benchmark.py
index b1550b2ff..77f0965f7 100644
--- a/python/examples/fit_a_line/benchmark.py
+++ b/python/examples/fit_a_line/benchmark.py
@@ -15,7 +15,7 @@
 
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
+from paddle_serving_client.utils import benchmark_args, show_latency
 import time
 import paddle
 import sys
@@ -37,9 +37,6 @@ def single_func(idx, resource):
         client.connect([args.endpoint])
         start = time.time()
         for data in train_reader():
-            #new_data = np.zeros((1, 13)).astype("float32")
-            #new_data[0] = data[0][0]
-            #fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=True)
             fetch_map = client.predict(feed={"x": data[0][0]},
                                        fetch=["price"])
         end = time.time()
         return [[end - start], [total_number]]
@@ -57,6 +54,17 @@ def single_func(idx, resource):
         return [[end - start], [total_number]]
 
 
+start = time.time()
 multi_thread_runner = MultiThreadRunner()
 result = multi_thread_runner.run(single_func, args.thread, {})
-print(result)
+end = time.time()
+total_cost = end - start
+avg_cost = 0
+for i in range(args.thread):
+    avg_cost += result[0][i]
+avg_cost = avg_cost / args.thread
+
+print("total cost: {}s".format(total_cost))
+print("each thread cost: {}s".format(avg_cost))
+print("qps: {} samples/s".format(args.batch_size * args.thread / total_cost))
+show_latency(result[1])
diff --git a/python/examples/util/show_profile.py b/python/examples/util/show_profile.py
old mode 100644
new mode 100755
index 3815ad9ec..a726e765e
--- a/python/examples/util/show_profile.py
+++ b/python/examples/util/show_profile.py
@@ -5,6 +5,7 @@
 profile_file = sys.argv[1]
 thread_num = sys.argv[2]
 time_dict = collections.OrderedDict()
+query_count = 0
 
 
 def prase(line):
@@ -26,12 +27,15 @@
 
 
 with open(profile_file) as f:
+    query_count = 0
     for line in f.readlines():
         line = line.strip().split("\t")
         if line[0] == "PROFILE":
             prase(line[2])
+            query_count += 1
 
 print("thread_num: {}".format(thread_num))
+print("query_count: {}".format(query_count))
 for name in time_dict:
     print("{} cost: {}s in each thread ".format(name, time_dict[name] / (
         1000000.0 * float(thread_num))))
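To make the summary arithmetic added to `fit_a_line/benchmark.py` above concrete, here is a toy recomputation. The numbers below are invented placeholders, not measured results; only the formulas mirror the patch:

```python
# Hypothetical values a 4-thread run might produce (not real measurements).
thread_num = 4
batch_size = 1
total_cost = 2.0                        # wall-clock seconds around run()
per_thread_cost = [1.9, 2.0, 1.8, 2.0]  # result[0]: seconds spent per thread

# Same aggregation as the patch: mean per-thread cost and overall qps.
avg_cost = sum(per_thread_cost) / thread_num
qps = batch_size * thread_num / total_cost

print("total cost: {}s".format(total_cost))      # total cost: 2.0s
print("each thread cost: {}s".format(avg_cost))  # each thread cost: 1.925s
print("qps: {} samples/s".format(qps))           # qps: 2.0 samples/s
```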