Merge pull request #219 from JZZ-NOTE/share_external_data
add share_external_data interface demo
jiweibo authored Feb 23, 2022
2 parents c67d8e2 + 4759044 commit eee5f72
Showing 5 changed files with 137 additions and 4 deletions.
6 changes: 5 additions & 1 deletion c++/lib/CMakeLists.txt
@@ -71,8 +71,12 @@ endif()

if(WITH_GPU)
  if(NOT WIN32)
    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
    include_directories("/usr/local/cuda/include")
    if(CUDA_LIB STREQUAL "")
      set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
    endif()
  else()
    include_directories("C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include")
    if(CUDA_LIB STREQUAL "")
      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
    endif()
8 changes: 6 additions & 2 deletions c++/resnet50/README.md
@@ -42,17 +42,21 @@ Paddle-Inference-Demo/c++/lib/
Click the [link](https://paddle-inference-dist.bj.bcebos.com/Paddle-Inference-Demo/resnet50.tgz) to download the model. For more information on **model training**, visit [PaddleClas](https://github.com/PaddlePaddle/PaddleClas).
### **Compiling the sample**

The file `resnet50_test.cc` is the sample inference program (its input is hard-coded; if you want to read data with OpenCV or in some other way, you will need to modify the program accordingly).
The file `resnet50_share_data.cc` is the sample program that creates the input/output Tensors through the `ShareExternalData` interface (see the sketch after this list).
The script `compile.sh` contains the configuration of the third-party libraries and the prebuilt inference library.
The script `run.sh` runs everything with a single command.
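
To make the new interface concrete, here is a minimal CPU-only sketch of what `resnet50_share_data.cc` does with `ShareExternalData` (condensed from the full program added in this commit; it assumes a `predictor` already created from a `Config`, and batch size 1 is hard-coded for brevity):

```c++
// Condensed sketch of the ShareExternalData flow (CPU place, batch size 1).
auto input_names = predictor->GetInputNames();
auto output_names = predictor->GetOutputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
auto output_t = predictor->GetOutputHandle(output_names[0]);

std::vector<float> input_buf(1 * 3 * 224 * 224);  // caller-owned input buffer
std::vector<float> output_buf(1000);              // caller-owned output buffer

// Instead of CopyFromCpu/CopyToCpu, the tensors share the caller's buffers.
input_t->ShareExternalData<float>(input_buf.data(), {1, 3, 224, 224},
                                  paddle_infer::PlaceType::kCPU);
output_t->ShareExternalData<float>(output_buf.data(), {1, 1000},
                                   paddle_infer::PlaceType::kCPU);
predictor->Run();  // results are written directly into output_buf
```

The buffers must stay valid until `Run()` returns, and the output buffer must be at least as large as the model output, as noted in the `Tensor.md` change below.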

To compile the Resnet50 sample, we first need to modify the configuration in the script `run_impl.sh`.
To compile the Resnet50 sample, we first need to modify the configuration in the script `compile.sh`.

1) **Modify `compile.sh`**

Open `compile.sh` and modify the following settings:

```shell
# Name of the demo to compile: resnet50_test or resnet50_share_data
DEMO_NAME=resnet50_test

# Decide whether to turn on the following three flags according to version.txt in the prebuilt library
WITH_MKL=ON
WITH_GPU=ON
2 changes: 1 addition & 1 deletion c++/resnet50/compile.sh
@@ -30,7 +30,7 @@ USE_TENSORRT=OFF
LIB_DIR=${work_path}/../lib/paddle_inference
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
CUDA_LIB=/usr/local/cuda/lib64
TENSORRT_ROOT=/usr/local/TensorRT-6.0.1.5
TENSORRT_ROOT=/usr/local/TensorRT-7.1.3.4

WITH_ROCM=OFF
ROCM_LIB=/opt/rocm/lib
110 changes: 110 additions & 0 deletions c++/resnet50/resnet50_share_data.cc
@@ -0,0 +1,110 @@
#include <chrono>
#include <iostream>
#include <memory>
#include <numeric>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <cuda_runtime.h>

#include "paddle/include/paddle_inference_api.h"

using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::CreatePredictor;
using paddle_infer::DataLayout;
using paddle_infer::PlaceType;

DEFINE_string(model_file, "", "Path of the inference model file.");
DEFINE_string(params_file, "", "Path of the inference params file.");
DEFINE_string(model_dir, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size.");
DEFINE_int32(warmup, 0, "warmup.");
DEFINE_int32(repeats, 1, "repeats.");
DEFINE_bool(use_gpu, false, "use gpu.");

using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); }
double time_diff(Time t1, Time t2) {
  typedef std::chrono::microseconds ms;
  auto diff = t2 - t1;
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
}

std::shared_ptr<Predictor> InitPredictor() {
  Config config;
  if (FLAGS_model_dir != "") {
    config.SetModel(FLAGS_model_dir);
  }
  config.SetModel(FLAGS_model_file, FLAGS_params_file);
  if (FLAGS_use_gpu) {
    config.EnableUseGpu(100, 0);
  } else {
    config.EnableMKLDNN();
  }

  // Open the memory optim.
  config.EnableMemoryOptim();
  return CreatePredictor(config);
}

void run(Predictor *predictor, const float *input,
         const std::vector<int> &input_shape, float *output) {
  int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1,
                                  std::multiplies<int>());

  auto input_names = predictor->GetInputNames();
  auto output_names = predictor->GetOutputNames();
  auto input_t = predictor->GetInputHandle(input_names[0]);
  auto output_t = predictor->GetOutputHandle(output_names[0]);
  // Bind the caller-owned input/output buffers to the tensors instead of
  // copying them with CopyFromCpu/CopyToCpu.
  if (FLAGS_use_gpu) {
    input_t->ShareExternalData<float>(input, input_shape, PlaceType::kGPU);
    output_t->ShareExternalData<float>(output, {FLAGS_batch_size, 1000}, PlaceType::kGPU);
  } else {
    input_t->ShareExternalData<float>(input, input_shape, PlaceType::kCPU);
    output_t->ShareExternalData<float>(output, {FLAGS_batch_size, 1000}, PlaceType::kCPU);
  }

  for (size_t i = 0; i < FLAGS_warmup; ++i)
    CHECK(predictor->Run());

  auto st = time();
  for (size_t i = 0; i < FLAGS_repeats; ++i) {
    CHECK(predictor->Run());
  }
  LOG(INFO) << "run avg time is " << time_diff(st, time()) / FLAGS_repeats
            << " ms";
}

int main(int argc, char *argv[]) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  auto predictor = InitPredictor();

  std::vector<int> input_shape = {FLAGS_batch_size, 3, 224, 224};
  std::vector<float> input_data(FLAGS_batch_size * 3 * 224 * 224);
  for (size_t i = 0; i < input_data.size(); ++i)
    input_data[i] = i % 255 * 0.1;
  std::vector<float> out_data;
  out_data.resize(FLAGS_batch_size * 1000);
  if (FLAGS_use_gpu) {
    // Copy the synthetic input to device memory so the predictor can share
    // the GPU buffers directly.
    float *input;
    cudaMalloc((void **)&input,
               FLAGS_batch_size * 3 * 224 * 224 * sizeof(float));
    cudaMemcpy(input, input_data.data(),
               FLAGS_batch_size * 3 * 224 * 224 * sizeof(float),
               cudaMemcpyHostToDevice);

    float *output;
    cudaMalloc((void **)&output, FLAGS_batch_size * 1000 * sizeof(float));
    run(predictor.get(), input, input_shape, output);
    cudaMemcpy(out_data.data(), output,
               FLAGS_batch_size * 1000 * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(input);
    cudaFree(output);
  } else {
    run(predictor.get(), input_data.data(), input_shape, out_data.data());
  }

  for (size_t i = 0; i < out_data.size(); i += 100) {
    LOG(INFO) << i << " : " << out_data[i] << std::endl;
  }
  return 0;
}
15 changes: 15 additions & 0 deletions docs/api_reference/cxx_api_doc/Tensor.md
@@ -24,6 +24,18 @@ void CopyFromCpu(const T* data);
template <typename T>
void CopyToCpu(T* data);

// Create an input/output Tensor from a user-supplied data pointer
// For an input Tensor, the user must keep the pointed-to data valid for the whole inference run
// For an output Tensor, the user must guarantee the buffer is at least as large as the model output
// Parameter: data - CPU/GPU data pointer
// Parameter: shape - shape of the data
// Parameter: place - where the data resides
// Parameter: layout - data layout; defaults to NCHW, and currently only NCHW is supported
// Returns: None
template <typename T>
void ShareExternalData(const T* data, const std::vector<int>& shape,
                       PlaceType place, DataLayout layout = DataLayout::kNCHW);

// Get the underlying data pointer of the Tensor, used to set the Tensor input data
// Reshape the input Tensor before calling this API
// Parameter: place - the PlaceType of the Tensor
@@ -93,6 +105,9 @@ std::copy_n(input_data.begin(), input_data.size(),
// Method 2: set the input data via CopyFromCpu
input_tensor->CopyFromCpu(input_data.data());
// Method 3: set the input data via ShareExternalData
input_tensor->ShareExternalData<float>(input, INPUT_SHAPE, PlaceType::kCPU);
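// Illustrative variant, not part of the original example: if `input` points to
// device memory (e.g. allocated with cudaMalloc, as in the resnet50_share_data
// demo), pass PlaceType::kGPU so the GPU buffer is shared without a copy.
// input_tensor->ShareExternalData<float>(input, INPUT_SHAPE, PlaceType::kGPU);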
// Run inference
predictor->Run();
