Merge pull request #2339 from alibaba/feature/sync

[MNN:Sync] Sync Internal Gitlab 2.4.3
alibaba · Apr 18, 2023 · fa653d7 · fa653d7
2 parents f631e59 + 39e2e7e
commit fa653d7
Show file tree

Hide file tree

Showing 138 changed files with 2,437 additions and 1,206 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -25,8 +25,6 @@ set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
   "${CMAKE_CURRENT_LIST_DIR}/cmake"
 )
-# Required for OpenCL/OpenGL/Vulkan CodeGen
-include(FindPythonInterp REQUIRED)
 # build options
 option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
 option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
@@ -279,6 +277,10 @@ if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
     endif()
 endif()
 
+if(MNN_DEBUG_MEMORY)  # opt-in: compile all C and C++ sources with -fsanitize=address
+    string(APPEND CMAKE_C_FLAGS " -fsanitize=address")
+    string(APPEND CMAKE_CXX_FLAGS " -fsanitize=address")
+endif()
 
 IF(CMAKE_BUILD_TYPE MATCHES Debug)
     add_definitions(-DMNN_DEBUG -DDEBUG)

diff --git a/benchmark/android/bench_android.sh b/benchmark/android/bench_android.sh
@@ -0,0 +1,107 @@
+set -e  # abort the script as soon as any command fails
+ABI="arm64-v8a"  # passed to CMake as ANDROID_ABI; the -32 flag switches it to armeabi-v7a
+OPENMP="OFF"  # forwarded to CMake as MNN_OPENMP
+VULKAN="ON"  # forwarded to CMake as MNN_VULKAN
+OPENCL="ON"  # forwarded to CMake as MNN_OPENCL
+OPENGL="OFF"  # forwarded to CMake as MNN_OPENGL
+RUN_LOOP=10  # passed to benchmark.out in the CPU and Vulkan runs (presumably the loop count - confirm)
+FORWARD_TYPE=0  # forward type argument of the first benchmark run
+CLEAN=""  # set to "-c" by the -c flag: recreate the build folder before building
+PUSH_MODEL=""  # set to "-p" by the -p flag: push the model folder to the device
+
+WORK_DIR=`pwd`  # directory the script is started from
+BUILD_DIR=build  # build folder, relative to the starting directory
+BENCHMARK_MODEL_DIR=$WORK_DIR/../models  # host-side model folder that -p pushes to the device
+ANDROID_DIR=/data/local/tmp  # on-device directory receiving binaries, models and results
+
+# Print the supported command-line flags, one per line, as: flag, TAB, description.
+function usage() {
+    # printf %b expands the \t escapes; bash's plain `echo` would print them literally.
+    printf -- '%b\n' '-32\tBuild 32bit.' '-c\tClean up build folders.' '-p\tPush models to device'
+}
+function die() {  # die MESSAGE: report MESSAGE and abort the script with exit status 1
+    echo "$1" >&2  # quoted so the message is not word-split or glob-expanded; diagnostics go to stderr
+    exit 1
+}
+
+# clean_build DIR: recreate DIR empty; refuses a DIR that does not contain $BUILD_DIR ending on a word boundary.
+function clean_build() {
+    if ! echo "$1" | grep -q "$BUILD_DIR\b"; then  # tested inside `if` so `set -e` cannot abort before die runs
+        die "Warnning: $1 seems not to be a BUILD folder."
+    fi
+    rm -rf "$1"
+    mkdir "$1"
+}
+
+# Configure MNN for Android in $BUILD_DIR (recreated first when -c was given), build benchmark.out and timeProfile.out; the working directory stays in $BUILD_DIR.
+function build_android_bench() {
+    if [ "-c" == "$CLEAN" ]; then
+        clean_build "$BUILD_DIR"
+    fi
+    mkdir -p "$BUILD_DIR"
+    cd "$BUILD_DIR"
+    cmake ../../../ \
+          -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DANDROID_ABI="${ABI}" \
+          -DANDROID_STL=c++_static \
+          -DANDROID_NATIVE_API_LEVEL=android-21  \
+          -DMNN_USE_LOGCAT:BOOL=OFF \
+          -DMNN_VULKAN:BOOL=$VULKAN \
+          -DMNN_OPENCL:BOOL=$OPENCL \
+          -DMNN_OPENMP:BOOL=$OPENMP \
+          -DMNN_OPENGL:BOOL=$OPENGL \
+          -DMNN_ARM82:BOOL=ON \
+          -DMNN_BUILD_BENCHMARK:BOOL=ON \
+          -DMNN_BUILD_FOR_ANDROID_COMMAND=true \
+          -DNATIVE_LIBRARY_OUTPUT=.
+    make -j8 benchmark.out timeProfile.out
+}
+
+# Build, push the binaries and shared libraries (plus the models when -p was given), then run the CPU, Vulkan and OpenCL benchmarks on the device.
+function bench_android() {
+    build_android_bench
+    find . -name "*.so" | while IFS= read -r solib; do
+        adb push "$solib" "$ANDROID_DIR"
+    done
+    adb push benchmark.out $ANDROID_DIR
+    adb push timeProfile.out $ANDROID_DIR
+    adb shell chmod 0777 $ANDROID_DIR/benchmark.out
+    if [ "" != "$PUSH_MODEL" ]; then
+        adb shell "rm -rf $ANDROID_DIR/benchmark_models"
+        adb push "$BENCHMARK_MODEL_DIR" $ANDROID_DIR/benchmark_models
+    fi
+    adb shell "cat /proc/cpuinfo > $ANDROID_DIR/benchmark.txt"
+    adb shell "echo >> $ANDROID_DIR/benchmark.txt"
+    adb shell "echo Build Flags: ABI=$ABI  OpenMP=$OPENMP Vulkan=$VULKAN OpenCL=$OPENCL >> $ANDROID_DIR/benchmark.txt"
+    # CPU run (forward type $FORWARD_TYPE); this first run starts benchmark.err afresh
+    adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models $RUN_LOOP 5 $FORWARD_TYPE 2>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
+    # Vulkan run (forward type 7); stderr is appended so the CPU run's errors are kept
+    adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models $RUN_LOOP 5 7 2>>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
+    # OpenCL run (forward type 3); stderr is appended for the same reason
+    adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models 100 20 3 2>>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
+}
+
+# Parse the command-line flags; any unrecognised argument prints the usage text and exits with status 1.
+while [ -n "$1" ]; do
+    case "$1" in
+        -32)
+            ABI="armeabi-v7a"
+            ;;
+        -c)
+            CLEAN="-c"
+            ;;
+        -p)
+            PUSH_MODEL="-p"
+            ;;
+        *)
+            usage
+            exit 1
+            ;;
+    esac
+    shift
+done
+# Run the benchmarks, then fetch the report from the device and print it.
+bench_android
+adb pull "$ANDROID_DIR/benchmark.txt" .
+cat benchmark.txt
diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md
@@ -43,6 +43,7 @@ MNN使用CMake构建项目，CMake中的宏定义列表如下：
 | MNN_AVX512           | 是否构建`avx512`后端，默认为`OFF` |
 | MNN_CUDA             | 是否构建`Cuda`后端，默认为`OFF` |
 | MNN_CUDA_PROFILE     | 是否打开CUDA profile工具，默认为`OFF` |
+| MNN_CUDA_QUANT       | 是否打开CUDA 量化文件编译，默认为`OFF` |
 | MNN_TENSORRT         | 是否构建`TensorRT`后端，默认为`OFF` |
 | MNN_COREML           | 是否构建`CoreML`后端，默认为`OFF` |
 | MNN_NNAPI            | 是否构建`NNAPI`后端，默认为`OFF`  |

diff --git a/docs/compile/engine.md b/docs/compile/engine.md
@@ -3,12 +3,12 @@
 ## Linux/MacOS
 - 环境要求
   - cmake >= 3.10
-  - protobuf >= 3.0
   - gcc >= 4.9
 - 相关编译选项
   - `MNN_ONEDNN` 是否使用oneDNN库来加速卷积运算
-  - `MNN_AVX512` 是否使用AVX512指令
-  - `MNN_OPENCL` 是否使用OpenCL后端，针对AMD GPU设备
+  - `MNN_AVX512` 是否使用AVX512指令，需要gcc9以上版本编译
+  - `MNN_OPENCL` 是否使用OpenCL后端，针对GPU设备
+  - `MNN_VULKAN` 是否使用Vulkan后端，针对GPU设备
   - `MNN_CUDA`  是否使用CUDA后端，针对Nvidia GPU设备
   - `MNN_TENSORRT` 是否使用TensorRT后端，针对Nvidia GPU设备
 - 具体步骤
@@ -42,27 +42,26 @@
   2. 编译
      - 64位编译：在设置中找到vcvars64.bat（适用于 VS 2017 的 x64 本机工具命令提示）并单击，打开VS编译x64架构程序的虚拟环境
      - 32位编译：在设置中找到vcvarsamd64_x86.bat（VS 2017的 x64_x86 交叉工具命令提示符）并单击，打开VS交叉编译x86架构程序的虚拟环境 
+     - 在虚拟环境中执行如下编译命令：
         ```bash
         cd /path/to/MNN
-        powershell # 运行该命令从cmd环境进入powershell环境，后者功能更强大
-        ./schema/generate.ps1
-        # CPU, 64位编译
-        .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x64
-        # CPU, 32位编译
-        .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x86
-        # CPU+OpenCL+Vulkan, 64位编译
-        .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x64 -backends "opencl,vulkan"
-        # CPU+OpenCL+Vulkan, 32位编译
-        .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x86 -backends "opencl,vulkan"
+        ./schema/generate.ps1 # 非必须
+        mkdir build && cd build
+        cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF
+        ninja
         ```
+     - 若需要编译模型转换工具，cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF
+     - 若需要编译 MNN CUDA，MNN_WIN_RUNTIME_MT 和 MNN_BUILD_SHARED_LIBS 需要设成 ON ，另外加上 -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON
+     - Windows 上建议使用 Interpreter::destroy , Tensor::destroy , Module::destroy 等方法进行 MNN 相关内存对象的析构，不要直接使用 delete （直接使用 delete 在 -DMNN_WIN_RUNTIME_MT=ON 时会出问题）
 ## Android
 - 环境要求
   - cmake >= 3.10
-  - protobuf >= 3.0
   - ndk
 - 相关编译选项
   - `MNN_OPENCL` 是否使用OpenCL后端，OpenCL后端可以利用GPU加速
-  - `MNN_ARM82`  是否使用Arm82后端，Arm82后端支持低精度(fp16)推理，同时uint8量化模型加速也需要ARM82
+  - `MNN_NNAPI` 是否使用NNAPI后端，NNAPI后端会尝试使用设备上的NPU进行加速
+  - `MNN_ARM82`  是否支持fp16推理，开启该编译选项后，在precision设成Precision_Low时，会在支持的设备（ARMv8.2 及以上架构）上启用低精度(fp16)推理，减少内存占用，提升性能
+  - `MNN_SUPPORT_BF16`  是否支持bf16推理，开启该编译选项后，在precision设成Precision_Low_BF16 时，会启用bf16推理，减少内存占用，提升性能
 - 具体步骤
   1. 在[NDK download](https://developer.android.com/ndk/downloads/)下载安装NDK，建议使用最新稳定版本；
   2. 在 .bashrc 或者 .bash_profile 中设置NDK环境变量，例如：export ANDROID_NDK=/Users/username/path/to/android-ndk-r14b
@@ -85,7 +84,7 @@
 - 相关编译选项
   - `MNN_METAL` 是否使用Metal后端，Metal后端可以利用GPU加速
   - `MNN_COREML`  是否使用CoreML后端，CoreML后端可以利用ANE硬件加速
-  - `MNN_ARM82`  是否使用Arm82后端，Arm82后端支持低精度(fp16)推理，同时uint8量化模型加速也需要ARM82
+  - `MNN_ARM82`  是否支持fp16推理，开启该编译选项后，在precision设成Precision_Low时，会在支持的设备（ARMv8.2 及以上架构）上启用低精度(fp16)推理，减少内存占用，提升性能
 - 具体步骤
   - 在macOS下，用Xcode打开project/ios/MNN.xcodeproj，点击编译即可
 ## 其他平台交叉编译
@@ -106,7 +105,7 @@
         cmake .. \
         -DCMAKE_SYSTEM_NAME=宿主系统，例如Linux \
         -DCMAKE_SYSTEM_VERSION=1 \
-        -DCMAKE_SYSTEM_PROCESSOR=交叉编译目标处理器的信息。例如arm或aarch64 \
+        -DCMAKE_SYSTEM_PROCESSOR=交叉编译目标处理器的信息。例如armv7或aarch64 \
         -DCMAKE_C_COMPILER=交叉编译器中C编译器的路径 \
         -DCMAKE_CXX_COMPILER=交叉编译器中C++编译器的路径
         ```
@@ -129,4 +128,4 @@
         -DCMAKE_C_COMPILER=$cross_compile_toolchain/bin/aarch64-linux-gnu-gcc \
         -DCMAKE_CXX_COMPILER=$cross_compile_toolchain/bin/aarch64-linux-gnu-g++
         make -j4
-        ```
+        ```
diff --git a/docs/compile/tools.md b/docs/compile/tools.md
@@ -10,7 +10,7 @@
     ```
 - 编译产物
   - `MNNConvert` 模型转换工具
-  - `TestConvertResult` 模型转换正确性测试工具，*Windows下没有此产物*
+  - `TestConvertResult` 模型转换正确性测试工具，*Windows下没有此产物，用`MNNConvert`对应功能替代*
   - `TestPassManager` 模型转换工具测试用例
   - `MNNDump2Json` 模型转换为Json
   - `MNNRevert2Buffer` Json转换为模型

diff --git a/docs/contribute/op.md b/docs/contribute/op.md
@@ -109,14 +109,14 @@ DECLARE_OP_COVERTER(MyCustomOpTflite);
 
 需要实现函数：
 ```cpp
-MyCustomOpTflite::opType(bool quantizedModel);
-MyCustomOpTflite::type(bool quantizedModel);
+MyCustomOpTflite::opType(int quantizedModel);
+MyCustomOpTflite::type(int quantizedModel);
 MyCustomOpTflite::run(MNN::OpT *dstOp, 
                       const std::unique_ptr<tflite::OperatorT> &tfliteOp, 
                       const std::vector<std::unique_ptr<tflite::TensorT> > &tfliteTensors,
                       const std::vector<std::unique_ptr<tflite::BufferT> > &tfliteModelBuffer,
                       const std::vector<std::unique_ptr<tflite::OperatorCodeT> > &tfliteOpSet,
-                      bool quantizedModel)
+                      int quantizedModel)
 ```
 
 其中，`run`函数相比TensorFlow的版本，多一个`quantizedModel`参数。若`quantizedModel`为true，则模型为量化模型，需转为相应的量化Op；若为false，转为浮点Op。在run函数中需要设置输入、输出tensor的index：

diff --git a/docs/tools/benchmark.md b/docs/tools/benchmark.md
@@ -10,8 +10,8 @@
 - warm_up_count: 预热次数
 - forwardtype: 可选，默认是0，即CPU，forwardtype有0->CPU，1->Metal，3->OpenCL，6->OpenGL，7->Vulkan
 ## Android
-在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark)下直接执行脚本`bench_android.sh`，默认编译armv7，加参数-64编译armv8，参数-p将[benchmarkModels](https://github.com/alibaba/MNN/tree/master/benchmark/models) push到机器上。
-脚本执行完成在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark)下得到测试结果`benchmark.txt`
+在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark/android)下直接执行脚本`bench_android.sh`，默认编译armv8（arm64-v8a），加参数-32编译armv7（armeabi-v7a），参数-c清理编译目录后重新编译，参数-p将[benchmarkModels](https://github.com/alibaba/MNN/tree/master/benchmark/models) push到机器上。
+脚本执行完成在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark/android)下得到测试结果`benchmark.txt`
 ## iOS
 1. 先准备模型文件，进入tools/script目录下执行脚本`get_model.sh`；
 2. 打开demo/iOS目录下的demo工程，点击benchmark；可通过底部工具栏切换模型、推理类型、线程数。
@@ -29,4 +29,4 @@
  ./benchmarkExprModels.out SqueezeNet_100 10 0 4 
  ./benchmarkExprModels.out ShuffleNet_100_4 10 0 4
 ```
-相应模型的paper链接附在头文件里，如`benchmark/exprModels/MobileNetExpr.hpp`
+相应模型的paper链接附在头文件里，如`benchmark/exprModels/MobileNetExpr.hpp`
diff --git a/docs/tools/test.md b/docs/tools/test.md
@@ -26,14 +26,14 @@ Model Version: < 2.0.0
 测试性能、输出结果，可检查与Caffe/Tensorflow的预期结果是否匹配。
 **注意：对非CPU后端来说，只有总耗时是准确的，单个op耗时和op耗时占比都是不准确的**
 ### 参数
-`./MNNV2Basic.out model [runLoops runMask forwardType numberThread inputSize precision]`
+`./MNNV2Basic.out model [runLoops runMask forwardType numberThread precision_memory inputSize]`
 - `model:str` 模型文件路径
 - `runLoops:int` 性能测试的循环次数，可选，默认为`1`
 - `runMask:int` 是否输出推理中间结果，0为不输出，1为只输出每个算子的输出结果（{op_name}.txt），2为输出每个算子的输入（Input_{op_name}.txt）和输出（{op_name}.txt）结果； 默认输出当前目录的output目录下（使用工具之前要自己建好output目录），可选，默认为`0`
 - `forwardType:int` 执行推理的计算设备，有效值为：0（CPU）、1（Metal）、2（CUDA）、3（OpenCL）、6（OpenGL），7(Vulkan) ，9 (TensorRT)，可选，默认为`0`
 - `numberThread:int` 线程数仅对CPU有效，可选，默认为`4`
+- `precision_memory:int` 测试精度与内存模式，precision_memory % 4 为精度，有效输入为：0(Normal), 1(High), 2(Low), 3(Low_BF16)，可选，默认为`2` ; precision_memory / 4 为内存设置，默认为 0 (memory_normal) 。例如测试 memory 为 low (2) ，precision 为 1 (high) 时，设置 precision_memory = 9 (2 * 4 + 1)
 - `inputSize:str` 输入tensor的大小，输入格式为：`1x3x224x224`，可选，默认使用模型默认输入
-- `precision:int` 测试精度，有效输入为：0(Normal), 1(High), 2(Low), 3(Low_BF16)，可选，默认为`2`
 ### 默认输入与输出
 只支持单一输入、单一输出。输入为运行目录下的input_0.txt；输出为推理完成后的第一个输出tensor，转换为文本后，输出到output.txt中。
 ### 示例
@@ -64,14 +64,14 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
 ### 功能
 类似`MNNV2Basic.out`，对于带控制流模型，或者多输入多输出的模型，建议采用这个工具
 ### 参数
-`./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision cacheFile]`
+`./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision_memory cacheFile]`
 - `model:str` 模型文件路径
 - `dir:str` 输入输出信息文件夹，可使用 fastTestOnnx.py / fastTestTf.py / fastTestTflite.py 等脚本生成，参考模型转换的正确性校验部分。
 - `runMask:int` 是否输出推理中间结果，0为不输出，1为只输出每个算子的输出结果（{op_name}.txt），2为输出每个算子的输入（Input_{op_name}.txt）和输出（{op_name}.txt）结果； 默认输出当前目录的output目录下（使用工具之前要自己建好output目录），可选，默认为`0`
 - `forwardType:int` 执行推理的计算设备，有效值为：0（CPU）、1（Metal）、2（CUDA）、3（OpenCL）、6（OpenGL），7(Vulkan) ，9 (TensorRT)，可选，默认为`0`
 - `runLoops:int` 性能测试的循环次数，可选，默认为`0`即不做性能测试
 - `numberThread:int` GPU的线程数，可选，默认为`1`
-- `precision:int` 测试精度，有效输入为：0(Normal), 1(High), 2(Low), 3(Low_BF16)，可选，默认为`0`
+- `precision_memory:int` 测试精度与内存模式，precision_memory % 4 为精度，有效输入为：0(Normal), 1(High), 2(Low), 3(Low_BF16)，可选，默认为`2` ; precision_memory / 4 为内存设置，默认为 0 (memory_normal) 。例如测试 memory 为 2(low) ，precision 为 1 (high) 时，设置 precision_memory = 9 (2 * 4 + 1)
 ### 默认输出
 在当前目录 output 文件夹下，依次打印输出为 0.txt , 1.txt , 2.txt , etc
 ### 示例
@@ -486,4 +486,4 @@ G
 1.0000000	
 Generate winogradTransformSource1_2_0.5.comp
 Generate winogradTransformDest1_2_0.5.comp
-```
+```
diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
 #define MNN_VERSION_MINOR 4
-#define MNN_VERSION_PATCH 2
+#define MNN_VERSION_PATCH 3
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
diff --git a/include/MNN/MNNSharedContext.h b/include/MNN/MNNSharedContext.h
@@ -42,6 +42,14 @@ struct MNNMetalTensorContent {
 MNN_PUBLIC int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor);
 #endif
 
+#ifdef MNN_USER_SET_DEVICE
+// Declared only when MNN_USER_SET_DEVICE is defined at compile time.
+struct MNNDeviceContext {
+    uint32_t deviceId; // NOTE(review): presumably identifies the device a backend should use - confirm against the backend code
+};
+
+#endif
+
 
 #ifdef __cplusplus
 }