diff --git a/c++/lib/CMakeLists.txt b/c++/lib/CMakeLists.txt
index 65a5752aea..25b34c8425 100644
--- a/c++/lib/CMakeLists.txt
+++ b/c++/lib/CMakeLists.txt
@@ -71,8 +71,12 @@ endif()
 
 if(WITH_GPU)
   if(NOT WIN32)
-    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+    include_directories("/usr/local/cuda/include")
+    if(CUDA_LIB STREQUAL "")
+      set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+    endif()
   else()
+    include_directories("C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include")
     if(CUDA_LIB STREQUAL "")
       set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
     endif()
diff --git a/c++/resnet50/README.md b/c++/resnet50/README.md
index fdecf36ffb..583a52a46e 100644
--- a/c++/resnet50/README.md
+++ b/c++/resnet50/README.md
@@ -42,17 +42,21 @@ Paddle-Inference-Demo/c++/lib/
 
 Click the [link](https://paddle-inference-dist.bj.bcebos.com/Paddle-Inference-Demo/resnet50.tgz) to download the model. If you want more **model training information**, please visit [PaddleClas](https://github.com/PaddlePaddle/PaddleClas).
 
 ### **Building the sample**
 
-The file `resnet50_test.cc` is the sample inference program (its input is a fixed value; if you need to read data with OpenCV or by other means, modify the program accordingly).
+The file `resnet50_test.cc` is the sample inference program (its input is a fixed value; if you need to read data with OpenCV or by other means, modify the program accordingly).
+The file `resnet50_share_data.cc` is a sample program that creates input/output Tensors through the `ShareExternalData` API.
 
 The script `compile.sh` configures the third-party libraries and the prebuilt inference library.
 
 The script `run.sh` runs the sample with a single command.
 
-To build the ResNet50 sample, first edit the configuration in the script `run_impl.sh`.
+To build the ResNet50 sample, first edit the configuration in the script `compile.sh`.
 
 1) **Edit `compile.sh`**
 
 Open `compile.sh` and modify the following settings:
 
 ```shell
+# Name of the demo to build: resnet50_test or resnet50_share_data
+DEMO_NAME=resnet50_test
+
 # Turn the following three flags on or off according to version.txt in the prebuilt library
 WITH_MKL=ON
 WITH_GPU=ON
diff --git a/c++/resnet50/compile.sh b/c++/resnet50/compile.sh
index 387e088ade..6ef2f96a96 100755
--- a/c++/resnet50/compile.sh
+++ b/c++/resnet50/compile.sh
@@ -30,7 +30,7 @@ USE_TENSORRT=OFF
 LIB_DIR=${work_path}/../lib/paddle_inference
 CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
 CUDA_LIB=/usr/local/cuda/lib64
-TENSORRT_ROOT=/usr/local/TensorRT-6.0.1.5
+TENSORRT_ROOT=/usr/local/TensorRT-7.1.3.4
 
 WITH_ROCM=OFF
 ROCM_LIB=/opt/rocm/lib
diff --git a/c++/resnet50/resnet50_share_data.cc b/c++/resnet50/resnet50_share_data.cc
new file mode 100644
index 0000000000..30a0888ba4
--- /dev/null
+++ b/c++/resnet50/resnet50_share_data.cc
@@ -0,0 +1,110 @@
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <numeric>
+
+#include <cuda_runtime.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "paddle/include/paddle_inference_api.h"
+
+using paddle_infer::Config;
+using paddle_infer::Predictor;
+using paddle_infer::CreatePredictor;
+using paddle_infer::DataLayout;
+using paddle_infer::PlaceType;
+
+DEFINE_string(model_file, "", "Path of the inference model file.");
+DEFINE_string(params_file, "", "Path of the inference params file.");
+DEFINE_string(model_dir, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size.");
+DEFINE_int32(warmup, 0, "warmup.");
+DEFINE_int32(repeats, 1, "repeats.");
+DEFINE_bool(use_gpu, false, "use gpu.");
+
+using Time = decltype(std::chrono::high_resolution_clock::now());
+Time time() { return std::chrono::high_resolution_clock::now(); }
+double time_diff(Time t1, Time t2) {
+  typedef std::chrono::microseconds ms;
+  auto diff = t2 - t1;
+  ms counter = std::chrono::duration_cast<ms>(diff);
+  return counter.count() / 1000.0;
+}
+
+std::shared_ptr<Predictor> InitPredictor() {
+  Config config;
+  if (FLAGS_model_dir != "") {
+    config.SetModel(FLAGS_model_dir);
+  }
+  config.SetModel(FLAGS_model_file, FLAGS_params_file);
+  if (FLAGS_use_gpu) {
+    config.EnableUseGpu(100, 0);
+  } else {
+    config.EnableMKLDNN();
+  }
+
+  // Open the memory optim.
+  config.EnableMemoryOptim();
+  return CreatePredictor(config);
+}
+
+void run(Predictor *predictor, const float *input,
+         const std::vector<int> &input_shape, float *output) {
+  int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1,
+                                  std::multiplies<int>());
+
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  // Bind the user-owned input/output buffers directly to the tensors.
+  if (FLAGS_use_gpu) {
+    input_t->ShareExternalData(input, input_shape, PlaceType::kGPU);
+    output_t->ShareExternalData(output, {FLAGS_batch_size, 1000}, PlaceType::kGPU);
+  } else {
+    input_t->ShareExternalData(input, input_shape, PlaceType::kCPU);
+    output_t->ShareExternalData(output, {FLAGS_batch_size, 1000}, PlaceType::kCPU);
+  }
+
+  for (size_t i = 0; i < FLAGS_warmup; ++i)
+    CHECK(predictor->Run());
+
+  auto st = time();
+  for (size_t i = 0; i < FLAGS_repeats; ++i) {
+    CHECK(predictor->Run());
+  }
+  LOG(INFO) << "run avg time is " << time_diff(st, time()) / FLAGS_repeats
+            << " ms";
+}
+
+int main(int argc, char *argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  auto predictor = InitPredictor();
+
+  std::vector<int> input_shape = {FLAGS_batch_size, 3, 224, 224};
+  std::vector<float> input_data(FLAGS_batch_size * 3 * 224 * 224);
+  for (size_t i = 0; i < input_data.size(); ++i)
+    input_data[i] = i % 255 * 0.1;
+  std::vector<float> out_data;
+  out_data.resize(FLAGS_batch_size * 1000);
+  if (FLAGS_use_gpu) {
+    // Copy the host input into device memory and let the predictor write its
+    // output into a device buffer as well.
+    float *input;
+    cudaMalloc((void **)&input, FLAGS_batch_size * 3 * 224 * 224 * sizeof(float));
+    cudaMemcpy(input, input_data.data(), FLAGS_batch_size * 3 * 224 * 224 * sizeof(float), cudaMemcpyHostToDevice);
+
+    float *output;
+    cudaMalloc((void **)&output, FLAGS_batch_size * 1000 * sizeof(float));
+    run(predictor.get(), input, input_shape, output);
+    cudaMemcpy(out_data.data(), output, FLAGS_batch_size * 1000 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    cudaFree(input);
+    cudaFree(output);
+  } else {
+    run(predictor.get(), input_data.data(), input_shape, out_data.data());
+  }
+
+  for (size_t i = 0; i < out_data.size(); i += 100) {
+    LOG(INFO) << i << " : " << out_data[i] << std::endl;
+  }
+  return 0;
+}
diff --git a/docs/api_reference/cxx_api_doc/Tensor.md b/docs/api_reference/cxx_api_doc/Tensor.md
index d4515c9afb..421cb40204 100644
--- a/docs/api_reference/cxx_api_doc/Tensor.md
+++ b/docs/api_reference/cxx_api_doc/Tensor.md
@@ -24,6 +24,18 @@ void CopyFromCpu(const T* data);
 template <typename T>
 void CopyToCpu(T* data);
 
+// Create an input/output Tensor from a user-provided data pointer
+// For an input Tensor, the user must keep the pointed-to data valid throughout inference
+// For an output Tensor, the user must provide a buffer no smaller than the model's output
+// Param: data - CPU/GPU data pointer
+// Param: shape - shape of the data
+// Param: place - where the data resides
+// Param: layout - data layout, defaults to NCHW; currently only NCHW is supported
+// Returns: None
+template <typename T>
+void ShareExternalData(const T* data, const std::vector<int>& shape,
+                       PlaceType place, DataLayout layout = DataLayout::kNCHW);
+
 // Get the underlying data pointer of the Tensor, used to set the Tensor's input data
 // Reshape the input Tensor before calling this API
 // Param: place - PlaceType of the Tensor
@@ -93,6 +105,9 @@ std::copy_n(input_data.begin(), input_data.size(),
 
 // Option 2: set the input data via CopyFromCpu
 input_tensor->CopyFromCpu(input_data.data());
 
+// Option 3: set the input data via ShareExternalData
+input_tensor->ShareExternalData(input, INPUT_SHAPE, PlaceType::kCPU);
+
 // Run inference
 predictor->Run();
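---

For reviewers, a minimal sketch of how the new demo might be built and run. It follows the existing resnet50 demo conventions; the model/params file names inside `resnet50.tgz` (`inference.pdmodel` / `inference.pdiparams`) and the `build/` output directory are assumptions based on the current README and run.sh, not part of this change.

```shell
# Assumed workflow: download the model, build the ShareExternalData demo, run it on GPU.
wget https://paddle-inference-dist.bj.bcebos.com/Paddle-Inference-Demo/resnet50.tgz
tar xzf resnet50.tgz

# Set DEMO_NAME=resnet50_share_data in compile.sh before building.
bash compile.sh

./build/resnet50_share_data \
    --model_file=resnet50/inference.pdmodel \
    --params_file=resnet50/inference.pdiparams \
    --batch_size=1 \
    --use_gpu=true
```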