Provide high-performance MKLDNN inference in Python pipeline mode #1264

Merged: 1 commit, merged May 27, 2021
8 changes: 4 additions & 4 deletions python/examples/pipeline/ocr/benchmark.sh
@@ -1,5 +1,5 @@
export FLAGS_profile_pipeline=1
alias python3="python3.6"
alias python3="python3.7"
modelname="ocr"

# HTTP
@@ -11,11 +11,11 @@ rm -rf profile_log_$modelname

echo "Starting HTTP Clients..."
# Start a client in each thread, testing the case of multiple threads.
for thread_num in 1 2 4 8 12 16
for thread_num in 1 2 4 6 8 12 16
do
for batch_size in 1
do
echo '----$modelname thread num: $thread_num batch size: $batch_size mode:http ----' >>profile_log_$modelname
echo "----$modelname thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
# Start one web service. If you start the service yourself, you can ignore it here.
#python3 web_service.py >web.log 2>&1 &
#sleep 3
@@ -51,7 +51,7 @@ sleep 3

# Create yaml. If you already have the config.yaml, ignore it.
#python3 benchmark.py yaml local_predictor 1 gpu
rm -rf profile_log_$modelname
#rm -rf profile_log_$modelname

# Start a client in each thread, testing the case of multiple threads.
for thread_num in 1 2 4 6 8 12 16
28 changes: 23 additions & 5 deletions python/examples/pipeline/ocr/config.yml
@@ -6,7 +6,7 @@ http_port: 9999

#worker_num, maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, and each process builds its own grpcServer and DAG
##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the grpc thread pool of the main thread
worker_num: 5
worker_num: 20

#build_dag_each_worker, False: the framework creates one DAG inside the process; True: the framework creates an independent DAG inside each process
build_dag_each_worker: false
@@ -26,7 +26,7 @@ dag:
op:
det:
#Concurrency; thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
concurrency: 2
concurrency: 6

#When the op config has no server_endpoints, read the local service configuration from local_service_conf
local_service_conf:
@@ -40,10 +40,19 @@ op:
fetch_list: ["concat_1.tmp_0"]

#Compute device IDs. When devices is "" or unset, inference runs on CPU; when devices is "0" or "0,1,2", inference runs on GPU, listing the GPU cards to use
devices: "0"
devices: ""

#use_mkldnn
#use_mkldnn: True

#thread_num
thread_num: 2

#ir_optim
ir_optim: True
rec:
#Concurrency; thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
concurrency: 2
concurrency: 3

#Timeout, in ms
timeout: -1
@@ -64,4 +73,13 @@ op:
fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]

#Compute device IDs. When devices is "" or unset, inference runs on CPU; when devices is "0" or "0,1,2", inference runs on GPU, listing the GPU cards to use
devices: "0"
devices: ""

#use_mkldnn
#use_mkldnn: True

#thread_num
thread_num: 2

#ir_optim
ir_optim: True
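
Taken together, the det and rec sections above switch the OCR pipeline example from GPU (devices: "0") to CPU inference with MKLDNN. As a hedged sketch, not part of this PR, of what these YAML keys translate to at the Paddle Inference level (the model directory below is a placeholder):

```python
# Hedged sketch: what the new CPU keys in config.yml map to at the Paddle
# Inference level. The model directory is a placeholder.
import paddle.inference as paddle_infer

config = paddle_infer.Config("./ocr_det_model")   # placeholder model path
config.disable_gpu()                              # devices: ""
config.set_cpu_math_library_num_threads(2)        # thread_num: 2
config.switch_ir_optim(True)                      # ir_optim: True
config.enable_mkldnn()                            # use_mkldnn: True
predictor = paddle_infer.create_predictor(config)
```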
15 changes: 11 additions & 4 deletions python/examples/pipeline/simple_web_service/config.yml
@@ -9,10 +9,14 @@ http_port: 18082
dag:
#Op resource type: True for the thread model, False for the process model
is_thread_op: False

#tracer
tracer:
interval_s: 10
op:
uci:
#Concurrency; thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
concurrency: 2
concurrency: 1

#When the op config has no server_endpoints, read the local service configuration from local_service_conf
local_service_conf:
@@ -35,7 +39,10 @@ op:
#precision, inference precision; lowering the precision can speed up inference
#GPU supports: "fp32"(default), "fp16", "int8";
#CPU supports: "fp32"(default), "fp16", "bf16"(mkldnn); "int8" is not supported
precision: "FP16"
precision: "fp32"

#ir_optim switch, False by default
ir_optim: True

#ir_optim switch
ir_optim: False
#use_mkldnn switch, False by default; there is a performance gain only when use_mkldnn and ir_optim are both enabled
use_mkldnn: True
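
As the comment above notes, use_mkldnn only pays off when ir_optim is enabled as well. Below is a hedged client sketch for checking the uci service once it is started with this config; the endpoint pattern (http://host:http_port/op_name/prediction) and the feature string are assumptions for illustration, not something introduced by this PR:

```python
# Hedged client sketch: send one request to the uci web service started with
# the config above. Endpoint pattern and feature values are assumptions.
import json
import requests

url = "http://127.0.0.1:18082/uci/prediction"  # http_port 18082 and op name "uci" from config.yml
data = {
    "key": ["x"],
    "value": ["0.0137, -0.1136, 0.2553, -0.0062, 0.7469, 0.4293, -0.0980, "
              "0.2267, 0.0905, -0.6671, -0.0545, 0.6173, -0.0332"]
}
resp = requests.post(url, data=json.dumps(data))
print(resp.json())
```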
51 changes: 39 additions & 12 deletions python/paddle_serving_app/local_predict.py
@@ -64,6 +64,10 @@ def load_model_config(self,
use_xpu=False,
precision="fp32",
use_calib=False,
use_mkldnn=False,
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None,
use_feed_fetch_ops=False):
"""
Load model configs and create the paddle predictor by Paddle Inference API.
@@ -73,14 +77,18 @@ def load_model_config(self,
use_gpu: calculating with gpu, False default.
gpu_id: gpu id, 0 default.
use_profile: use predictor profiles, False default.
thread_num: thread nums, default 1.
thread_num: thread nums of cpu math library, default 1.
mem_optim: memory optimization, True default.
ir_optim: enable computation graph (IR) optimization, False default.
use_trt: use nvidia TensorRT optimization, False default
use_lite: use Paddle-Lite engine, False default
use_xpu: run predict on Baidu Kunlun, False default
precision: precision mode, "fp32" default
use_calib: use TensorRT calibration, False default
use_mkldnn: use MKLDNN, False default.
mkldnn_cache_capacity: cache capacity for input shapes, 0 default.
mkldnn_op_list: op list accelerated using MKLDNN, None default.
mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
use_feed_fetch_ops: use feed/fetch ops, False default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path)
@@ -96,13 +104,15 @@
config = paddle_infer.Config(model_path)

logger.info(
"LocalPredictor load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_lite:{}, use_xpu: {}, precision: {}, use_calib: {},\
use_feed_fetch_ops:{}"
.format(model_path, use_gpu, gpu_id, use_profile, thread_num,
mem_optim, ir_optim, use_trt, use_lite, use_xpu, precision,
use_calib, use_feed_fetch_ops))
"LocalPredictor load_model_config params: model_path:{}, use_gpu:{}, "
"gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
"use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, ".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_lite, use_xpu, precision, use_calib,
use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
mkldnn_bf16_op_list, use_feed_fetch_ops))

self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -118,21 +128,35 @@ def load_model_config(self,
self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type

# set precision of inference.
precision_type = paddle_infer.PrecisionType.Float32
if precision is not None and precision.lower() in precision_map:
precision_type = precision_map[precision.lower()]
else:
logger.warning("precision error!!! Please check precision:{}".
format(precision))
# set profile
if use_profile:
config.enable_profile()
# set memory optimization
if mem_optim:
config.enable_memory_optim()
# set ir optimization, threads of cpu math library
config.switch_ir_optim(ir_optim)
config.set_cpu_math_library_num_threads(thread_num)
# use feed & fetch ops
config.switch_use_feed_fetch_ops(use_feed_fetch_ops)
# pass optim
config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")

# set cpu & mkldnn
config.set_cpu_math_library_num_threads(thread_num)
if use_mkldnn:
config.enable_mkldnn()
if mkldnn_cache_capacity > 0:
config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
if mkldnn_op_list is not None:
config.set_mkldnn_op(mkldnn_op_list)
# set gpu
if not use_gpu:
config.disable_gpu()
else:
@@ -145,18 +169,18 @@
min_subgraph_size=3,
use_static=False,
use_calib_mode=False)

# set lite
if use_lite:
config.enable_lite_engine(
precision_mode=precision_type,
zero_copy=True,
passes_filter=[],
ops_filter=[])

# set xpu
if use_xpu:
# 2MB l3 cache
config.enable_xpu(8 * 1024 * 1024)

# set cpu low precision
if not use_gpu and not use_lite:
if precision_type == paddle_infer.PrecisionType.Int8:
logger.warning(
@@ -165,6 +189,9 @@ def load_model_config(self,
#config.enable_quantizer()
if precision is not None and precision.lower() == "bf16":
config.enable_mkldnn_bfloat16()
if mkldnn_bf16_op_list is not None:
config.set_bfloat16_op(mkldnn_bf16_op_list)

self.predictor = paddle_infer.create_predictor(config)

def predict(self, feed=None, fetch=None, batch=False, log_id=0):
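
For reference, a minimal sketch of calling the extended LocalPredictor API directly with the MKLDNN arguments added above; the model path and the op list are illustrative placeholders, not recommendations from this PR:

```python
# Minimal sketch of the extended load_model_config signature added above.
# model_path and mkldnn_op_list are illustrative placeholders.
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config(
    model_path="./ocr_det_model",         # placeholder model directory
    use_gpu=False,
    thread_num=2,                         # threads of the CPU math library
    ir_optim=True,
    use_mkldnn=True,
    mkldnn_cache_capacity=100,            # cache capacity for input shapes
    mkldnn_op_list=["conv2d", "pool2d"])  # ops to accelerate with MKLDNN (illustrative)
# feed/fetch names come from serving_server_conf.prototxt in the model directory:
# result = predictor.predict(feed={"image": img}, fetch=["concat_1.tmp_0"], batch=True)
```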
38 changes: 31 additions & 7 deletions python/pipeline/local_service_handler.py
@@ -45,7 +45,11 @@ def __init__(self,
ir_optim=False,
available_port_generator=None,
use_profile=False,
precision="fp32"):
precision="fp32",
use_mkldnn=False,
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None):
"""
Initialization of LocalServiceHandler

@@ -64,6 +68,10 @@ def __init__(self,
available_port_generator: generate available ports
use_profile: use profiling, False default.
precision: inference precision, e.g. "fp32", "fp16", "int8"
use_mkldnn: use mkldnn, default False.
mkldnn_cache_capacity: cache capacity of mkldnn, 0 means no limit.
mkldnn_op_list: OP list optimized by mkldnn, None default.
mkldnn_bf16_op_list: OP list optimized by mkldnn bf16, None default.

Returns:
None
@@ -78,6 +86,10 @@ def __init__(self,
self._use_trt = False
self._use_lite = False
self._use_xpu = False
self._use_mkldnn = False
self._mkldnn_cache_capacity = 0
self._mkldnn_op_list = None
self._mkldnn_bf16_op_list = None

if device_type == -1:
# device_type is not set, determined by `devices`,
@@ -140,16 +152,24 @@ def __init__(self,
self._use_profile = use_profile
self._fetch_names = fetch_names
self._precision = precision
self._use_mkldnn = use_mkldnn
self._mkldnn_cache_capacity = mkldnn_cache_capacity
self._mkldnn_op_list = mkldnn_op_list
self._mkldnn_bf16_op_list = mkldnn_bf16_op_list

_LOGGER.info(
"Models({}) will be launched by device {}. use_gpu:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{} precision:{}".format(
"client_type:{}, fetch_names:{}, precision:{}, use_mkldnn:{}, "
"mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices, self.
_mem_optim, self._ir_optim, self._use_profile, self._thread_num,
self._client_type, self._fetch_names, self._precision))
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names,
self._precision, self._use_mkldnn, self._mkldnn_cache_capacity,
self._mkldnn_op_list, self._mkldnn_bf16_op_list))

def get_fetch_list(self):
return self._fetch_names
@@ -189,7 +209,7 @@ def get_client(self, concurrency_idx):
from paddle_serving_app.local_predict import LocalPredictor
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()

# load model config and init predictor
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=self._use_gpu,
Expand All @@ -201,7 +221,11 @@ def get_client(self, concurrency_idx):
use_trt=self._use_trt,
use_lite=self._use_lite,
use_xpu=self._use_xpu,
precision=self._precision)
precision=self._precision,
use_mkldnn=self._use_mkldnn,
mkldnn_cache_capacity=self._mkldnn_cache_capacity,
mkldnn_op_list=self._mkldnn_op_list,
mkldnn_bf16_op_list=self._mkldnn_bf16_op_list)
return self._local_predictor_client

def get_client_config(self):
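
Finally, a hedged sketch of how a pipeline op could wire these options through LocalServiceHandler. The import path is assumed (the module lives at python/pipeline/local_service_handler.py), model_config is a placeholder, and all constructor arguments not shown in this diff are left at their defaults:

```python
# Hedged sketch: constructing the handler with the new MKLDNN options and
# fetching the local predictor client. Import path and model path are assumptions.
from paddle_serving_server.pipeline.local_service_handler import LocalServiceHandler

handler = LocalServiceHandler(
    model_config="./ocr_det_model",   # placeholder model directory
    ir_optim=True,
    precision="fp32",
    use_mkldnn=True,
    mkldnn_cache_capacity=100,
    mkldnn_op_list=None,              # None: let MKLDNN choose supported ops
    mkldnn_bf16_op_list=None)
client = handler.get_client(0)        # concurrency_idx 0
print(handler.get_fetch_list())
```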