diff --git a/.gitignore b/.gitignore
index ca1a4320f..d11586811 100644
--- a/.gitignore
+++ b/.gitignore
@@ -361,10 +361,3 @@ pymnn_build/
 
 # mnncompress generated
 MNN_compression_pb2.py
-
-# model path
-model/
-
-# datasets
-datasets/*
-!datasets/*.sh
\ No newline at end of file
diff --git a/3rd_party/OpenCLHeaders/CL/cl2.hpp b/3rd_party/OpenCLHeaders/CL/cl2.hpp
index 305e88f30..b74fdbe11 100644
--- a/3rd_party/OpenCLHeaders/CL/cl2.hpp
+++ b/3rd_party/OpenCLHeaders/CL/cl2.hpp
@@ -3810,6 +3810,24 @@ class Buffer : public Memory
         }
     }
 
+    Buffer( // Constructs a Buffer from memory imported through clImportMemoryARM.
+        const Context& context,                     // context whose handle is passed to clImportMemoryARM
+        cl_mem_flags flags,                         // forwarded unchanged to clImportMemoryARM
+        const cl_import_properties_arm *properties, // forwarded unchanged to clImportMemoryARM
+        void *memory,                               // forwarded unchanged to clImportMemoryARM
+        size_type size,                             // forwarded unchanged to clImportMemoryARM
+        cl_int* err = NULL)                         // optional out-parameter; receives the error code when non-NULL
+    {
+        cl_int error;
+        object_ = ::clImportMemoryARM(context(), flags, properties, memory, size, &error);
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR); // reported under the buffer-creation error string
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+
     /*!
      * \brief Construct a Buffer from a host container via iterators using a specified context.
      * IteratorType must be random access.
diff --git a/3rd_party/OpenCLHeaders/CL/cl_ext.h b/3rd_party/OpenCLHeaders/CL/cl_ext.h
index 7b101d737..47afb42f2 100644
--- a/3rd_party/OpenCLHeaders/CL/cl_ext.h
+++ b/3rd_party/OpenCLHeaders/CL/cl_ext.h
@@ -430,6 +430,23 @@ typedef struct _cl_mem_android_native_buffer_host_ptr
 } cl_mem_android_native_buffer_host_ptr;
 
 
+/*********************************
+* cl_qcom_ahardwarebuffer_host_ptr extension
+*********************************/
+
+#define CL_MEM_ANDROID_AHARDWAREBUFFER_HOST_PTR_QCOM                0x4119
+
+typedef struct _cl_mem_ahardwarebuffer_host_ptr /* host-pointer descriptor for the cl_qcom_ahardwarebuffer_host_ptr extension */
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ANDROID_AHARDWAREBUFFER_HOST_PTR_QCOM for Android Hardware buffers. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+
+    /* Virtual pointer to the Android hardware buffer. */
+    void*                ahb_ptr;
+
+} cl_mem_ahardwarebuffer_host_ptr;
+
 /******************************************
  * cl_img_yuv_image extension *
  ******************************************/
@@ -583,6 +600,11 @@ typedef intptr_t cl_import_properties_arm;
 
 /* Protected DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
 #define CL_IMPORT_TYPE_PROTECTED_ARM              0x40B5
+#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM          0x41E2
+#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM    0x41E3
+#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM               SIZE_MAX
+#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_PLANE_INDEX_ARM   0x41EF
+#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_LAYER_INDEX_ARM   0x41F0
 
 /* This extension adds a new function that allows for direct memory import into
  * OpenCL via the clImportMemoryARM function.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7768e340..6048bf4d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,9 +20,7 @@ endif()
 project(MNN VERSION ${MNN_VERSION} LANGUAGES C CXX ASM)
 # complier options
 set(CMAKE_C_STANDARD 99)
-IF (NOT (CMAKE_CXX_STANDARD EQUAL 17))
-  set(CMAKE_CXX_STANDARD 11)
-ENDIF()
+set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
   "${CMAKE_CURRENT_LIST_DIR}/cmake"
@@ -49,7 +47,7 @@ option(MNN_BUILD_TOOLS "Build tools/cpp or not" ON)
 option(MNN_BUILD_QUANTOOLS "Build Quantized Tools or not" OFF)
 option(MNN_EVALUATION "Build Evaluation Tools or not" OFF)
 option(MNN_BUILD_CONVERTER "Build Converter" OFF)
-option(MNN_SUPPORT_DEPRECATED_OP "Enable MNN's tflite quantized op" ON)
+option(MNN_SUPPORT_DEPRECATED_OP "Enable MNN's tflite quantized op" OFF)
 option(MNN_DEBUG_MEMORY "MNN Debug Memory Access" OFF)
 option(MNN_DEBUG_TENSOR_SIZE "Enable Tensor Size" OFF)
 option(MNN_GPU_TRACE "Enable MNN Gpu Debug" OFF)
@@ -74,6 +72,7 @@ option(MNN_JNI "Build MNN Jni for java to use" OFF)
 option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF)
 option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF)
 option(MNN_CPU_WEIGHT_DEQUANT_GEMM "Build MNN CPU weight dequant related gemm kernels." OFF)
+option(MNN_BUILD_AUDIO "Build audio api in MNN." OFF)
 
 IF (OHOS AND MNN_INTERNAL)
   include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake)
@@ -192,6 +191,9 @@ endif()
 if(MNN_SUPPORT_TRANSFORMER_FUSE)
     add_definitions(-DMNN_SUPPORT_TRANSFORMER_FUSE)
 endif()
+if(MNN_BUILD_AUDIO)
+    add_definitions(-DMNN_BUILD_AUDIO)
+endif()
 # debug options
 if(MNN_DEBUG_MEMORY)
     add_definitions(-DMNN_DEBUG_MEMORY)
@@ -287,7 +289,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "^Android")
 endif()
 option(MNN_USE_CPP11 "Enable MNN use c++11" ON)
 if (NOT MSVC)
-    if((MNN_CUDA AND MNN_SUPPORT_TRANSFORMER_FUSE) OR (CMAKE_CXX_STANDARD EQUAL 17))
+    if(MNN_CUDA AND MNN_SUPPORT_TRANSFORMER_FUSE)
         set(CMAKE_CXX_STANDARD 17)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
@@ -463,6 +465,10 @@ IF(MNN_BUILD_OPENCV)
   list(APPEND MNN_EXTRA_HEADERS ${MNN_CV_HDRS})
   list(APPEND MNN_EXTRA_HEADERS ${MNN_CV_IMGHDRS})
 ENDIF()
+IF(MNN_BUILD_AUDIO) # collect the audio headers and add them to MNN_EXTRA_HEADERS
+  file(GLOB MNN_AUDIO_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/tools/audio/include/audio/*.hpp)
+  list(APPEND MNN_EXTRA_HEADERS ${MNN_AUDIO_HDRS})
+ENDIF()
 IF(MNN_BUILD_LLM)
   file(GLOB MNN_LLM_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/llm/engine/include/llm/*)
   list(APPEND MNN_EXTRA_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/llm/engine/include/llm/llm.hpp)
@@ -775,6 +781,14 @@ IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
   ENDIF()
   target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
 ENDIF()
+add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/audio)
+IF(MNN_BUILD_AUDIO AND NOT MNN_SEP_BUILD) # add the MNNAudio object files to the MNN target
+  IF(MSVC)
+    target_compile_definitions(MNNAudio PRIVATE "-DBUILDING_MNN_DLL" INTERFACE "-DUSING_MNN_DLL")
+  ENDIF()
+  message(STATUS "### build MNNAudio into MNN")
+  target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNAudio>)
+ENDIF()
 
 
 if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
@@ -884,6 +898,14 @@ ELSE()
       SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/cv/imgproc )
     ENDFOREACH()
   ENDIF()
+  IF(MNN_BUILD_AUDIO)
+    if (NOT MNN_AAPL_FMWK)
+      INSTALL(FILES ${MNN_AUDIO_HDRS} DESTINATION include/MNN/audio)
+    endif()
+    FOREACH(HDR ${MNN_AUDIO_HDRS})
+      SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/audio/ )
+    ENDFOREACH()
+  ENDIF()
   IF(MNN_BUILD_LLM)
     if (NOT MNN_AAPL_FMWK)
         INSTALL(FILES ${MNN_LLM_HDRS} DESTINATION include/MNN/llm)
diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.cpp b/backupcode/cpubackend/compute/DeconvolutionWithStride.cpp
similarity index 100%
rename from source/backend/cpu/compute/DeconvolutionWithStride.cpp
rename to backupcode/cpubackend/compute/DeconvolutionWithStride.cpp
diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.hpp b/backupcode/cpubackend/compute/DeconvolutionWithStride.hpp
similarity index 100%
rename from source/backend/cpu/compute/DeconvolutionWithStride.hpp
rename to backupcode/cpubackend/compute/DeconvolutionWithStride.hpp
diff --git a/source/backend/cpu/compute/GemmInt8Executor.cpp b/backupcode/cpubackend/compute/GemmInt8Executor.cpp
similarity index 100%
rename from source/backend/cpu/compute/GemmInt8Executor.cpp
rename to backupcode/cpubackend/compute/GemmInt8Executor.cpp
diff --git a/source/backend/cpu/compute/GemmInt8Executor.hpp b/backupcode/cpubackend/compute/GemmInt8Executor.hpp
similarity index 100%
rename from source/backend/cpu/compute/GemmInt8Executor.hpp
rename to backupcode/cpubackend/compute/GemmInt8Executor.hpp
diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md
index 9307038ad..a4d45bca4 100644
--- a/docs/compile/cmake.md
+++ b/docs/compile/cmake.md
@@ -16,7 +16,7 @@ MNN使用CMake构建项目，CMake中的宏定义列表如下：
 | MNN_BUILD_QUANTOOLS  | 是否构建MNN的量化工具，默认为`OFF` |
 | MNN_EVALUATION       | 是否构建MNN的评估工具，默认为`OFF` |
 | MNN_BUILD_CONVERTER  | 是否构建MNN的转换工具，默认为`OFF` |
-| MNN_SUPPORT_DEPRECATED_OP | 是否支持Tflite的量化算子，默认为`ON` |
+| MNN_SUPPORT_DEPRECATED_OP | 是否支持Tflite的量化算子等已经废弃的算子，用于兼容历史模型(1.1.0版本之前)，默认为`OFF` |
 | MNN_DEBUG_MEMORY     | 是否开启MNN内存调试，默认为`OFF` |
 | MNN_DEBUG_TENSOR_SIZE | 是否开启MNN tensor size调试，默认为`OFF` |
 | MNN_GPU_TRACE        | 是否开启MNN GPU调试，默认为`OFF` |
@@ -32,6 +32,7 @@ MNN使用CMake构建项目，CMake中的宏定义列表如下：
 | MNN_ENABLE_COVERAGE  | 是否开启MNN的代码覆盖率，默认为`OFF` |
 | MNN_BUILD_PROTOBUFFER | 是否使用MNN中的`protobuffer`，默认为`ON` |
 | MNN_BUILD_OPENCV     | 是否构建MNN的OpenCV功能，默认为`OFF` |
+| MNN_BUILD_AUDIO      | 是否构建MNN的Audio功能，默认为`OFF` |
 | MNN_INTERNAL         | 是否构建MNN的一些内部功能，如：日志；默认为`OFF` |
 | MNN_JNI              | 是否构建MNN的JNI支持，默认为`OFF` |
 | MNN_METAL            | 是否构建`Metal`后端，默认为`OFF` |
@@ -79,6 +80,7 @@ MNN使用CMake构建项目，CMake中的宏定义列表如下：
 | MNN_CVCORE           | 构建MNN的OpenCV功能是否开启`core`功能，默认为`ON` |
 | MNN_OPENCV_TEST      | 构建MNN的OpenCV功能是否开启单元测试，默认为`OFF` |
 | MNN_OPENCV_BENCH     | 构建MNN的OpenCV功能是否开启性能benchmark，默认为`OFF` |
+| MNN_AUDIO_TEST       | 构建MNN的Audio功能是否开启单元测试，默认为`OFF` |
 | MNN_VULKAN_IMAGE     | 构建MNN的Vulkan后端时采用Image内存模式，以便支持FP16和部分移动端上GPU的加速，默认为`ON` |
 | MNN_LOW_MEMORY       | 是否支持低内存模式，支持低内存模式使用权值量化模型并设置`low_memory`则会使用计算时反量化，默认为`OFF` |
 | MNN_CPU_WEIGHT_DEQUANT_GEMM       | 是否编译CPU权重反量化的矩阵乘Kernel， 如果打开该编译宏并且在CPU推理时设置MNN::BackendConfig::MemoryMode=Memory_Normal，就会使用权重反量化算子进行权重量化模型的推理，默认为`OFF` |
diff --git a/docs/compile/other.md b/docs/compile/other.md
index d0209f61b..f6418cc27 100644
--- a/docs/compile/other.md
+++ b/docs/compile/other.md
@@ -133,6 +133,19 @@
   - `libMNNOpenCV.so` MNN OpenCV函数库
   - `opencv_test` MNN OpenCV单元测试
   - `opencv_bench` MNN OpenCV性能测试
+## MNN Audio库
+- 相关编译选项
+  - `MNN_BUILD_AUDIO` 是否编译Audio函数接口
+  - `MNN_AUDIO_TEST` 是否编译Audio单元测试
+- 编译命令
+    ```bash
+    mkdir build && cd build
+    cmake .. -DMNN_BUILD_AUDIO=ON -DMNN_AUDIO_TEST=ON
+    make -j4
+    ```
+- 编译产物
+  - `libMNNAudio.so` MNN Audio函数库
+  - `audio_test` MNN Audio单元测试
 
 ## 示例工程
 - 相关编译选项
diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md
index b0fbd4932..bdd40739f 100644
--- a/docs/transformers/llm.md
+++ b/docs/transformers/llm.md
@@ -49,7 +49,7 @@ python llmexport.py \
 
 ### 功能
 - 直接转为mnn模型，使用`--export mnn`，注意，你需要先安装pymnn或者通过`--mnnconvert`选项指定MNNConvert工具的地址，两种条件必须满足其中一个。如果没有安装pymnn并且没有通过`--mnnconvert`指定MNNConvert工具的地址，那么llmexport.py脚本会在目录"../../../build/"下寻找MNNConvert工具，需保证该目录下存在MNNConvert文件。此方案目前支持导出4bit和8bit模型
-- 如果直接转为mnn模型遇到问题，或者需要其他bits数的量化（如5bit/6bit），可以先将模型先转为onnx模型，使用`--export onnx`，然后使用./MNNConvert工具将onnx模型转为mnn模型: 
+- 如果直接转为mnn模型遇到问题，或者需要其他bits数的量化（如5bit/6bit），可以先将模型先转为onnx模型，使用`--export onnx`，然后使用./MNNConvert工具将onnx模型转为mnn模型:
 
 ```
 ./MNNConvert --modelFile ../transformers/llm/export/model/onnx/llm.onnx --MNNModel llm.mnn --keepInputFormat --weightQuantBits=4 --weightQuantBlock=128 -f ONNX --transformerFuse=1 --allowCustomOp --saveExternalData
@@ -98,13 +98,17 @@ options:
 [从源码编译](../compile/other.html#id4)
 在原有编译过程中增加必需编译宏即可：
 ```
--DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true 
+-DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true
 ```
 
 - 需要开启视觉功能时，增加相关编译宏
 ```
 -DLLM_SUPPORT_VISION=true -DMNN_BUILD_OPENCV=true -DMNN_IMGCODECS=true
 ```
+- 需要开启音频功能时，增加相关编译宏
+```
+-DLLM_SUPPORT_AUDIO=true
+```
 
 #### mac / linux / windows
 
@@ -137,7 +141,7 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
 ```
 
 #### Web
-环境配置参考 https://mnn-docs.readthedocs.io/en/latest/compile/engine.html#web 
+环境配置参考 https://mnn-docs.readthedocs.io/en/latest/compile/engine.html#web
 
 - 编译库，产出 `libMNN.a`，`libMNN_Express.a`，`libllm.a`
 
@@ -189,7 +193,7 @@ node llm_demo.js ~/qwen2.0_1.5b/config.json ~/qwen2.0_1.5b/prompt.txt
   - visual_model: 当使用VL模型时，visual_model的实际路径为`base_dir + visual_model`，默认为`base_dir + 'visual.mnn'`
 - 推理配置
   - max_new_tokens: 生成时最大token数，默认为`512`
-  - reuse_kv: 多轮对话时是否复用之前对话的`kv cache`，默认为`false`, 目前只有CPU后端支持设置为`true`.
+  - reuse_kv: 多轮对话时是否复用之前对话的`kv cache`，默认为`false`
   - quant_qkv: CPU attention 算子中`query, key, value`是否量化，可选为：`0, 1, 2, 3, 4`，默认为`0`，含义如下：
     - 0: key和value都不量化
     - 1: 使用非对称8bit量化存储key
@@ -205,19 +209,6 @@ node llm_demo.js ~/qwen2.0_1.5b/config.json ~/qwen2.0_1.5b/prompt.txt
   - thread_num: CPU推理使用硬件线程数，默认为：`4`; OpenCL推理时使用`68`
   - precision: 推理使用精度策略，默认为：`"low"`，尽量使用`fp16`
   - memory: 推理使用内存策略，默认为：`"low"`，开启运行时量化
-- Sampler配置
-  - sampler_type: 使用的sampler种类，目前支持`greedy`, `temperature`, `topK`, `topP`, `minP`, `tfs`, `typical`, `penalty`8种基本sampler，外加`mixed`(混合sampler)。当选择`mixed`时，依次执行mixed_samplers中的sampler。默认为`mixed`。
-  - mixed_samplers: 当`sampler_type`为`mixed`时有效，默认为`["topK", "tfs", "typical", "topP", "min_p", "temperature"]`
-  - temperature: `temperature`, `topP`, `minP`, `tfsZ`, `typical`中temerature值，默认为1.0
-  - topK: `topK`中top K 个的个数，默认为40
-  - topP: `topP`中top P的值，默认为0.9
-  - minP: `minP`中min P的值，默认为0.1
-  - tfsZ: `tfs`中Z的值，默认为1.0，即不使用tfs算法
-  - typical: `typical`中p的值，默认为1.0，即不使用typical算法
-  - penalty: `penalty`中对于logits的惩罚项，默认为0.0，即不惩罚
-  - n_gram: `penalty`中最大存储的ngram大小，默认为8
-  - ngram_factor: `penalty`中对于重复ngram的额外惩罚，默认为1.0，即没有额外惩罚
-  - penalty_sampler: `penalty`中最后一步采用的sampling策略，可选"greedy"或"temperature"，默认greedy.
 
 ##### 配置文件示例
 - `config.json`
@@ -229,15 +220,7 @@ node llm_demo.js ~/qwen2.0_1.5b/config.json ~/qwen2.0_1.5b/prompt.txt
       "backend_type": "cpu",
       "thread_num": 4,
       "precision": "low",
-      "memory": "low",
-      "sampler_type": "mixed",
-      "mixed_samplers": ["topK", "tfs", "typical", "topP", "min_p", "temperature"],
-      "temperature": 1.0,
-      "topK": 40,
-      "topP": 0.9,
-      "tfsZ": 1.0,
-      "minP": 0.1,
-      "reuse_kv": true
+      "memory": "low"
   }
   ```
 - `llm_config.json`
@@ -261,8 +244,7 @@ node llm_demo.js ~/qwen2.0_1.5b/config.json ~/qwen2.0_1.5b/prompt.txt
 
 #### 推理用法
 `llm_demo`的用法如下：
-pc端直接推理
-```bash
+```
 # 使用config.json
 ## 交互式聊天
 ./llm_demo model_dir/config.json
@@ -276,16 +258,15 @@ pc端直接推理
 ./llm_demo model_dir/llm.mnn prompt.txt
 ```
 
-android手机端adb推理用法：
-```bash
-# 利用adb push将链接库push到手机上
-adb shell mkdir /data/local/tmp/llm
-adb push llm_demo ppl_demo libllm.so libMNN_CL.so libMNN_Express.so libMNN.so tools/cv/libMNNOpenCV.so /data/local/tmp/llm
-```
-
 - 对于视觉大模型，在prompt中嵌入图片输入
 ```
 <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>介绍一下图片里的内容
+# 指定图片大小
+<img><hw>280, 420</hw>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>介绍一下图片里的内容
+```
+- 对于音频大模型，在prompt中嵌入音频输入
+```
+<audio>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav</audio>介绍一下音频里的内容
 ```
 
 #### GPTQ权重加载
diff --git a/docs/transformers/models.md b/docs/transformers/models.md
index 5587b41a5..ee83463a4 100644
--- a/docs/transformers/models.md
+++ b/docs/transformers/models.md
@@ -47,4 +47,5 @@
 | [reader-lm-0.5b](https://huggingface.co/jinaai/reader-lm-0.5b) | [Q4_1](https://modelscope.cn/models/MNN/reader-lm-0.5b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/reader-lm-0.5b-MNN) |
 | [reader-lm-1.5b](https://huggingface.co/jinaai/reader-lm-1.5b) | [Q4_1](https://modelscope.cn/models/MNN/reader-lm-1.5b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/reader-lm-1.5b-MNN) |
 | [TinyLlama-1.1B-Chat-v1.0](https://modelscope.cn/models/AI-ModelScope/TinyLlama-1.1B-Chat-v1.0/summary) | [Q4_1](https://modelscope.cn/models/MNN/TinyLlama-1.1B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/TinyLlama-1.1B-Chat-MNN) |
-| [Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Yi-6B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Yi-6B-Chat-MNN) |
\ No newline at end of file
+| [Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Yi-6B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Yi-6B-Chat-MNN) |
+| [QwQ-32B-Preview](https://modelscope.cn/models/Qwen/QwQ-32B-Preview/summary) | [Q4_1](https://modelscope.cn/models/MNN/QwQ-32B-Preview-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/QwQ-32B-Preview-MNN) |
\ No newline at end of file
diff --git a/express/MathOp.cpp b/express/MathOp.cpp
index c14a902f1..eaf93f9fc 100644
--- a/express/MathOp.cpp
+++ b/express/MathOp.cpp
@@ -1208,7 +1208,7 @@ VARP _LinSpace(VARP start, VARP stop, VARP num) {
     return (Variable::create(Expr::create(std::move(op), {start, stop, num})));
 }
 
-VARP _EltwiseProdInt8(VARP x, VARP y, 
+VARP _EltwiseProdInt8(VARP x, VARP y,
                     std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale)
@@ -1219,7 +1219,7 @@ VARP _EltwiseProdInt8(VARP x, VARP y,
                         output_weight, output_bias, output_scale, output_tensorScale);
 }
 
-VARP _EltwiseSumInt8(VARP x, VARP y, 
+VARP _EltwiseSumInt8(VARP x, VARP y,
                     std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale)
@@ -1230,7 +1230,7 @@ VARP _EltwiseSumInt8(VARP x, VARP y,
                         output_weight, output_bias, output_scale, output_tensorScale);
 }
 
-VARP _EltwiseSubInt8(VARP x, VARP y, 
+VARP _EltwiseSubInt8(VARP x, VARP y,
                     std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale)
@@ -1241,7 +1241,7 @@ VARP _EltwiseSubInt8(VARP x, VARP y,
                         output_weight, output_bias, output_scale, output_tensorScale);
 }
 
-VARP _EltwiseMaxInt8(VARP x, VARP y, 
+VARP _EltwiseMaxInt8(VARP x, VARP y,
                     std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale)
@@ -1320,5 +1320,20 @@ VARP _Histogram(VARP x, int bin, int min, int max, int channel) {
     return (Variable::create(Expr::create(std::move(op), {x})));
 }
 
+#ifdef MNN_BUILD_AUDIO
+// Creates an OpType_Stft expression over {sample, window}; n_fft, hop_length and abs are stored in its StftParam.
+VARP _Stft(VARP sample, VARP window, int n_fft, int hop_length, bool abs) {
+    std::unique_ptr<OpT> stftOp(new OpT);
+    stftOp->type      = OpType_Stft;
+    stftOp->main.type = OpParameter_StftParam;
+    auto stftParam        = new StftParamT; // raw pointer: stored in stftOp->main.value below
+    stftParam->n_fft      = n_fft;
+    stftParam->hop_length = hop_length;
+    stftParam->abs        = abs;
+    stftOp->main.value = stftParam;
+    return Variable::create(Expr::create(std::move(stftOp), {sample, window}));
+}
+#endif
+
 } // namespace Express
 } // namespace MNN
diff --git a/express/NeuralNetWorkOp.cpp b/express/NeuralNetWorkOp.cpp
index 18d58c3ec..28580d792 100644
--- a/express/NeuralNetWorkOp.cpp
+++ b/express/NeuralNetWorkOp.cpp
@@ -70,7 +70,7 @@ VARP _Scalar(const void* ptr, halide_type_t type) {
 ptr: A pointer. Indicates the values.
 shape: A vector, the shape of the variable.
 format: A enum, NCHW/NHWC/NC4HW4 is allowed.
-type: The type of the elements of the resulting variable. 
+type: The type of the elements of the resulting variable.
 Returns:
 output: A constant variable.
 */
@@ -118,7 +118,7 @@ VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP
         ipParam->biasTerm = 1;
     }
     ipParam->weightSize = (int)weight.size();
-    
+
     ipParam->weight = std::move(weight);
     ipParam->bias = std::move(bias);
     return (Variable::create(Expr::create(ipOp.get(), {x})));
@@ -369,9 +369,9 @@ VARP _MaxPool(VARP x, INTS kernel, INTS stride, PaddingMode pad, INTS pads) {
 }
 /*Reshapes a variable.
 Args:
-x: A variable. 
+x: A variable.
 shape: A vector, the shape of the target variable.
-original_format: A enum, only NCHW/NHWC is allowed, NC4HW4 is not allowed, 
+original_format: A enum, only NCHW/NHWC is allowed, NC4HW4 is not allowed,
 as it provides additional information(x comes from NCHW or NHWC) When x is NC4HW4.
 Returns:
 output: A variable with the same type as `x`.
@@ -387,7 +387,7 @@ VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format) {
 }
 /*Reshapes a variable.
 Args:
-x: A variable. 
+x: A variable.
 shape: A variable, the shape of the target variable.
 Returns:
 output: A variable with the same type as `x`.
@@ -415,10 +415,10 @@ VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float
     scale->main.AsScale()->biasData  = std::move(bias);
     return (Variable::create(Expr::create(std::move(scale), {x})));
 }
-/*Given an input value x, it computes the output as x if x > 0 and slope * x if x <= 0. 
+/*Given an input value x, it computes the output as x if x > 0 and slope * x if x <= 0.
 Args:
-x: A variable. 
-slope: A float, a positive float value, it leakes the negative part by multiplying with `slope` rather than setting it to 0.0f. 
+x: A variable.
+slope: A float, a positive float value, it leaks the negative part by multiplying with `slope` rather than setting it to 0.0f.
 Returns:
 output: A variable with the same type as `x`.
 */
@@ -432,7 +432,7 @@ VARP _Relu(VARP x, float slope) {
 }
 /*Given an input value x, it computes Rectified Linear 6: min(max(x, 0), 6).
 Args:
-x: A variable. 
+x: A variable.
 Returns:
 output: A variable with the same type as `x`.
 */
@@ -445,9 +445,9 @@ VARP _Relu6(VARP x, float minValue, float maxValue) {
     relu->main.AsRelu6()->minValue = minValue;
     return (Variable::create(Expr::create(relu.get(), {x})));
 }
-/*Given an input value x, it computes the output as x if x > 0 and slopes * x if x <= 0. 
+/*Given an input value x, it computes the output as x if x > 0 and slopes * x if x <= 0.
 Args:
-x: A variable, must be 4-D with NC4HW4 format. 
+x: A variable, must be 4-D with NC4HW4 format.
 slopes: A vector, has save size as x.
 Returns:
 output: A variable with the same type as `x`.
@@ -497,10 +497,10 @@ VARP _Softsign(VARP features) {
 /*Concatenates variables along one dimension.
 Args:
 values: A list of variables a single variable.
-axis: A int. Dimension along which to concatenate. 
-Must be in the range [-rank(values), rank(values)). 
-As in Python, indexing for axis is 0-based. 
-Positive axis in the rage of [0, rank(values)) refers to axis-th dimension. 
+axis: A int. Dimension along which to concatenate.
+Must be in the range [-rank(values), rank(values)).
+As in Python, indexing for axis is 0-based.
+Positive axis in the range of [0, rank(values)) refers to axis-th dimension.
 And negative axis refers to axis + rank(values)-th dimension.
 Returns:
 A variable resulting from concatenation of the input variables.
@@ -516,7 +516,7 @@ VARP _Concat(VARPS values, int axis) {
 /*Convert a variable to another format(possibily added after `input`).
 Args:
 input: A variable.
-format: The target format. 
+format: The target format.
 Returns:
 A variable. If `input` is already `format`, then return `input` directly, otherwize add a variable after `input` with `format`.
 */
@@ -537,7 +537,7 @@ VARP _Convert(VARP input, Dimensionformat format) {
 /*Splits a variable value into a list of sub variables.
 Args:
 value: The variable to split.
-size_splits: A vector, a 1-D integer containing the sizes of each output variable along axis. 
+size_splits: A vector, a 1-D integer containing the sizes of each output variable along axis.
 axis: A int, the dimension along which to split. Must be in the range [-rank(value), rank(value)). Defaults to 0
 Returns:
 A list of variables.
@@ -645,7 +645,7 @@ VARP _ReverseSequence(VARP x, VARP y, int batchDim, int seqDim) {
 /*Convert a variable to another format(possibily added before `input`).
 Args:
 input: A variable.
-format: The target format. 
+format: The target format.
 Returns:
 A variable. If `input` is already `format`, then return `input` directly, otherwize add a variable before `input` with `format`.
 */
@@ -735,15 +735,15 @@ VARP _PoolGrad(VARP originInput, VARP originOutput, VARP inputGrad, INTS kernel,
     pool->main.AsPool()->type    = (PoolType)type;
     return (Variable::create(Expr::create(std::move(pool), {originInput, originOutput, inputGrad})));
 }
-/*Crop images. 
+/*Crop images.
 Args:
-images: 4-D variable of NC4HW4 format.  
+images: 4-D variable of NC4HW4 format.
 size: A variable. It takes the shape of `size` as output cropped variable's shape  while omits the values/format of `size`.
-axis: A int indicating the dimention to crop. Must be >=2. All dimensions up to but excluding `axis` are preserved, while the dimensions including and trailing `axis` are cropped.  
+axis: A int indicating the dimension to crop. Must be >=2. All dimensions up to but excluding `axis` are preserved, while the dimensions including and trailing `axis` are cropped.
 offset: A vector of int indicating the offsets. length(`offset`) must be >=1 and <=2. If length(`offset`) is 1, then all dimensions are offset by this amount.Otherwise, the number of offsets must equal the number of cropped axes in each dimension accordingly.
 Returns:
 The cropped 4-D variable of NC4HW4 format.
-*/  
+*/
 VARP _Crop(VARP images, VARP size, int axis, INTS offset) {
     std::unique_ptr<OpT> crop(new OpT);
     crop->type                  = OpType_Crop;
@@ -753,13 +753,13 @@ VARP _Crop(VARP images, VARP size, int axis, INTS offset) {
     crop->main.AsCrop()->offset = offset;
     return (Variable::create(Expr::create(std::move(crop), {images, size})));
 }
-/*Resize images. 
+/*Resize images.
 Args:
-images: 4-D variable of NC4HW4 format.  
-xScale: A float. 
+images: 4-D variable of NC4HW4 format.
+xScale: A float.
 yScale: A float.
 Returns:
-The resized 4-D variable of NC4HW4 format.  
+The resized 4-D variable of NC4HW4 format.
 */
 VARP _Resize(VARP images, float xScale, float yScale) {
     std::unique_ptr<OpT> resize(new OpT);
@@ -773,8 +773,8 @@ VARP _Resize(VARP images, float xScale, float yScale) {
 /*Pads a variable.
 Args:
 x: A variable.
-paddings: A variable of type Halide_Type_Int. The shape is [n, 2] where  n is the rank of variable. 
-mode: A enum, One of PadValueMode_CONSTANT, PadValueMode_SYMMETRIC, or PadValueMode_REFLECT. 
+paddings: A variable of type Halide_Type_Int. The shape is [n, 2] where  n is the rank of variable.
+mode: A enum, One of PadValueMode_CONSTANT, PadValueMode_SYMMETRIC, or PadValueMode_REFLECT.
 Returns:
 A variable. Has the same type as x.
 */
@@ -802,7 +802,7 @@ VARP _Pad(VARP x, VARP paddings, PadValueMode mode) {
 /*Returns a variable with an additional dimension inserted at index axis.
 Args:
 input: A variable.
-axis: A int, specifying the dimension index at which to expand the shape of input. 
+axis: A int, specifying the dimension index at which to expand the shape of input.
 Given an input of D dimensions, axis must be in range [-(D+1), D] (inclusive).
 Returns:
 A variable with the same data as input, with an additional dimension inserted at the index specified by axis.
@@ -827,7 +827,7 @@ VARP _ExpandDims(VARP input, VARP axis) {
 input: A variable.
 Returns:
 A variable of Halide_Type_Int.
-*/ 
+*/
 VARP _Shape(VARP input, bool nchw) {
     std::unique_ptr<OpT> shape(new OpT);
     shape->type = OpType_Shape;
@@ -838,13 +838,13 @@ VARP _Shape(VARP input, bool nchw) {
 }
 /*Stacks a list of rank-R variables into one rank-(R+1) variable.
 Packs the list of variables in `values` into a ariable with rank one higher than each variable in values,
-by packing them along the axis dimension. 
+by packing them along the axis dimension.
 Given a list of length N of variables of shape (A, B, C);
-if axis == 0 then the output variable will have the shape (N, A, B, C). 
+if axis == 0 then the output variable will have the shape (N, A, B, C).
 if axis == 1 then the output variable will have the shape (A, N, B, C). Etc.
 Args:
 values: A list of variable objects with the same shape and type.
-axis: An int. The axis to stack along. Defaults to the first dimension. Negative values wrap around, 
+axis: An int. The axis to stack along. Defaults to the first dimension. Negative values wrap around,
 so the valid range is [-(R+1), R+1).
 Returns:
 output: A stacked variable with the same type as `values`.
@@ -858,13 +858,13 @@ VARP _Stack(VARPS values, int axis) {
     return (Variable::create(Expr::create(std::move(pack), values)));
 }
 /*Extracts crops from the input image variable and resizes them using bilinear sampling or nearest neighbor sampling (possibly with aspect ratio change)
-to a common output size specified by crop_size. 
-Returns a variable with crops from the input image at positions defined at the bounding box locations in boxes. 
-The cropped boxes are all resized (with bilinear or nearest neighbor interpolation) to a fixed size = [crop_height, crop_width]. 
+to a common output size specified by crop_size.
+Returns a variable with crops from the input image at positions defined at the bounding box locations in boxes.
+The cropped boxes are all resized (with bilinear or nearest neighbor interpolation) to a fixed size = [crop_height, crop_width].
 The result is a 4-D tensor [num_boxes, crop_height, crop_width, depth](supposing NHWC format).
 Arguments:
 image: A 4-D variable of shape [batch, image_height, image_width, depth](supposing NHWC format). Both image_height and image_width need to be positive.
-boxes: A 2-D variable of shape [num_boxes, 4]. The i-th row of the variable specifies the coordinates of a box in the box_ind[i] image and is specified in normalized coordinates [y1, x1, y2, x2]. 
+boxes: A 2-D variable of shape [num_boxes, 4]. The i-th row of the variable specifies the coordinates of a box in the box_ind[i] image and is specified in normalized coordinates [y1, x1, y2, x2].
 A normalized coordinate value of y is mapped to the image coordinate at y * (image_height - 1), so as the [0, 1] interval of normalized image height is mapped to [0, image_height - 1] in image height coordinates. We do allow y1 > y2, in which case the sampled crop is an up-down flipped version of the original image. The width dimension is treated similarly. Normalized coordinates outside the [0, 1] range are allowed, in which case we use extrapolation_value to extrapolate the input image values.
 box_ind: A 1-D variable of shape [num_boxes] with int values in [0, batch). The value of box_ind[i] specifies the image that the i-th box refers to.
 crop_size: A 1-D variable of 2 elements, size = [crop_height, crop_width]. All cropped image patches are resized to this size. The aspect ratio of the image content is not preserved. Both crop_height and crop_width need to be positive.
@@ -893,7 +893,7 @@ VARP _CropAndResize(VARP image, VARP boxes, VARP box_ind, VARP crop_size, Interp
 /*Creates a variable filled with a scalar value.
 Args:
 dims: A variable. Must be 1-D Halide_Type_Int. Represents the shape of the output variable.
-value: A variable. 0-D (scalar). Value to fill the returned variable. 
+value: A variable. 0-D (scalar). Value to fill the returned variable.
 Returns:
 A variable. Has the same type as value.
 */
@@ -918,7 +918,7 @@ VARP _Tile(VARP input, VARP multiples) {
 }
 /*Gather slices from params according to indices.
 Arguments:
-params: The variable from which to gather values. 
+params: The variable from which to gather values.
 indices: Index variable. Must be Halide_Type_Int in range [0, ndims(params)-1].
 Returns:
 Output: Values from params gathered from indices given by indices.
@@ -930,10 +930,10 @@ VARP _Gather(VARP params, VARP indices) {
 }
 /*Gather slices from params axis according to indices.
 Arguments:
-params: The variable from which to gather values. 
+params: The variable from which to gather values.
 indices: Index variable. Must be Halide_Type_Int in range [0, ndims(params)-1].
-axis: A int, the axis in params to gather indices from. Supports negative indexes. 
-If set to 0, it's same as _Gather. Currently only 0 is supported. 
+axis: An int, the axis in params to gather indices from. Supports negative indexes.
+If set to 0, it's the same as _Gather. Currently only 0 is supported.
 Returns:
 Output: Values from params gathered from indices given by indices.
 */
@@ -951,8 +951,8 @@ VARP _GatherV2(VARP params, VARP indices, VARP axis) {
 /*Removes dimensions of size 1 from the shape of a variable.
 Args:
 input: A variable. The input to squeeze.
-axis: A vector, Defaults to {}. If specified, only squeezes the dimensions listed. The dimension index starts at 0. 
-Must be in the range [-rank(input), rank(input)). 
+axis: A vector, Defaults to {}. If specified, only squeezes the dimensions listed. The dimension index starts at 0.
+Must be in the range [-rank(input), rank(input)).
 Returns:
 A variable. Has the same type as input. Contains the same data as input, but has one or more dimensions of size 1 removed.
 */
@@ -1062,24 +1062,24 @@ VARP _GatherElements(VARP params, VARP indices, VARP axis) {
 }
 
 /*BatchToSpace for N-D variables
-This operation reshapes the "batch" dimension 0 into M + 1 dimensions of shape block_shape + [batch], 
-interleaves these blocks back into the grid defined by the spatial dimensions [1, ..., M], 
-to obtain a result with the same rank as the input. 
-The spatial dimensions of this intermediate result are then optionally cropped according to crops to 
+This operation reshapes the "batch" dimension 0 into M + 1 dimensions of shape block_shape + [batch],
+interleaves these blocks back into the grid defined by the spatial dimensions [1, ..., M],
+to obtain a result with the same rank as the input.
+The spatial dimensions of this intermediate result are then optionally cropped according to crops to
 produce the output. This is the reverse of SpaceToBatch. See below for a precise description.
 Arguments:
 input: must be 4-D with NC4HW4 format. N-D with shape input_shape = [batch] + spatial_shape + remaining_shape, where spatial_shape has M dimensions.
 block_shape: 1-D with shape [M], all values must be >= 1.
-crops: 2-D with shape [M, 2], all values must be >= 0. crops[i] = [crop_start, crop_end] specifies the amount to crop from input dimension i + 1, 
+crops: 2-D with shape [M, 2], all values must be >= 0. crops[i] = [crop_start, crop_end] specifies the amount to crop from input dimension i + 1,
 which corresponds to spatial dimension i. It is required that crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1].
 This operation is equivalent to the following steps:
-Reshape input to reshaped of shape: [block_shape[0], ..., block_shape[M-1], batch / prod(block_shape), 
+Reshape input to reshaped of shape: [block_shape[0], ..., block_shape[M-1], batch / prod(block_shape),
 input_shape[1], ..., input_shape[N-1]]
-Permute dimensions of reshaped to produce permuted of shape 
+Permute dimensions of reshaped to produce permuted of shape
 [batch / prod(block_shape),input_shape[1], block_shape[0], ..., input_shape[M], block_shape[M-1],input_shape[M+1], ..., input_shape[N-1]]
-Reshape permuted to produce reshaped_permuted of shape 
+Reshape permuted to produce reshaped_permuted of shape
 [batch / prod(block_shape),input_shape[1] * block_shape[0], ..., input_shape[M] * block_shape[M-1],input_shape[M+1], ..., input_shape[N-1]]
-Crop the start and end of dimensions [1, ..., M] of reshaped_permuted according to crops to produce the output of shape: 
+Crop the start and end of dimensions [1, ..., M] of reshaped_permuted according to crops to produce the output of shape:
 [batch / prod(block_shape),input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],input_shape[M+1], ..., input_shape[N-1]]
 Some examples:
 for the following input of shape [4, 1, 1, 3], block_shape = [2, 2], and crops = [[0, 0], [0, 0]]:
@@ -1095,14 +1095,14 @@ VARP _BatchToSpaceND(VARP input, VARP block_shape, VARP crops) {
     std::unique_ptr<OpT> op(new OpT);
     std::unique_ptr<BlobT> blob_blockShape(new BlobT);
     std::unique_ptr<BlobT> blob_paddings(new BlobT);
-    
+
     auto info_block_shape = block_shape->getInfo();
     auto info_crops = crops->getInfo();
     MNN_ASSERT(info_block_shape != nullptr);
     MNN_ASSERT(info_crops != nullptr);
     MNN_ASSERT(halide_type_int == info_block_shape->type.code);
     MNN_ASSERT(halide_type_int == info_crops->type.code);
-  
+
     blob_blockShape->dims = info_block_shape->dim;
     blob_blockShape->dataFormat = (MNN_DATA_FORMAT)Utils::convertFormat(info_block_shape->order);
     blob_blockShape->dataType = (MNN::DataType)Utils::convertDataType(info_block_shape->type);
@@ -1144,7 +1144,7 @@ VARP _MatrixBandPart(VARP input, VARP num_lower, VARP num_upper) {
 Args:
 x: A variable. must be 4-D with NC4HW4 format.
 axes: Array of ints. Axes along which to compute mean and variance. Ignored for this implementation: must be {2, 3}
-shift: Not used in the current implementation. 
+shift: Not used in the current implementation.
 keepdims: produce moments with the same dimensionality as the input.  Ignored for this implementation: must be true.
 Returns:
 Two variable objects: mean and variance.
@@ -1153,7 +1153,7 @@ std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims) {
     std::unique_ptr<OpT> op(new OpT);
     axis = {2, 3};
     keepDims = true;
-    // if axis != {2,3} or keepDims != true, print warning. 
+    // if axis != {2,3} or keepDims != true, print warning.
     // ignore shift.
     op->type       = OpType_Moments;
     auto momentsParam = new MomentsParamT;
@@ -1168,11 +1168,11 @@ std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims) {
     return res;
 }
 /*Computes the difference between two lists of numbers or strings.
-Given a list x and a list y, this operation returns a list out that represents all values that are in x but not in y. 
-The returned list out is sorted in the same order that the numbers appear in x (duplicates are preserved). 
-This operation also returns a list idx that represents the position of each out element in x. 
+Given a list x and a list y, this operation returns a list out that represents all values that are in x but not in y.
+The returned list out is sorted in the same order that the numbers appear in x (duplicates are preserved).
+This operation also returns a list idx that represents the position of each out element in x.
 Arguments:
-x: 1-D variable of type Halide_Type_Int. Values to keep. 
+x: 1-D variable of type Halide_Type_Int. Values to keep.
 y: 1-D variable of type Halide_Type_Int. Values to remove.
 Returns:
 Output out: 1-D variable of type Halide_Type_Int. Values present in x but not in y.
@@ -1184,8 +1184,8 @@ VARP _SetDiff1D(VARP x, VARP y) {
     op->main.value = nullptr;
     return Variable::create(Expr::create(std::move(op), {x, y}));
 }
-/*Rearranges blocks of spatial data, into depth. 
-More specifically, it outputs a copy of the input variable where values from the height and width dimensions are moved to the depth dimension. 
+/*Rearranges blocks of spatial data, into depth.
+More specifically, it outputs a copy of the input variable where values from the height and width dimensions are moved to the depth dimension.
 The block_size indicates the input block size.
 Non-overlapping blocks of size block_size x block_size are rearranged into depth at each location.
 The depth of the output variable is block_size * block_size * input_depth.
@@ -1207,11 +1207,11 @@ VARP _SpaceToDepth(VARP input, int block_size) {
     return Variable::create(Expr::create(std::move(op), {input}));
 }
 
-/*This operation divides "spatial" dimensions [1, ..., M] of the input into a grid of blocks of shape block_shape, 
-and interleaves these blocks with the "batch" dimension 
+/*This operation divides "spatial" dimensions [1, ..., M] of the input into a grid of blocks of shape block_shape,
+and interleaves these blocks with the "batch" dimension
 such that in the output, the spatial dimensions [1, ..., M] correspond to the position within the grid,
 and the batch dimension combines both the position within a spatial block and the original batch position.
-Prior to division into blocks, the spatial dimensions of the input are optionally zero padded according to paddings. 
+Prior to division into blocks, the spatial dimensions of the input are optionally zero padded according to paddings.
 See below for a precise description.
 Args:
 input: A variable. must be 4-D with NC4HW4 format. N-D with shape input_shape = [batch] + spatial_shape + remaining_shape, where spatial_shape has M dimensions.
@@ -1232,7 +1232,7 @@ VARP _SpaceToBatchND(VARP input, VARP block_shape, VARP paddings) {
     MNN_ASSERT(info_paddings != nullptr);
     MNN_ASSERT(halide_type_int == info_block_shape->type.code);
     MNN_ASSERT(halide_type_int == info_paddings->type.code);
-  
+
     blob_blockShape->dims = info_block_shape->dim;
     blob_blockShape->dataFormat = (MNN::MNN_DATA_FORMAT)Utils::convertFormat(info_block_shape->order);
     blob_blockShape->dataType = (MNN::DataType)Utils::convertDataType(info_block_shape->type);
@@ -1271,9 +1271,9 @@ VARP _ZerosLike(VARP input) {
 }
 /*Unpacks the given dimension of a rank-R tensor into rank-(R-1) variable.
 For example, given a variable of shape (A, B, C, D);
-If axis == 0 then the i'th variable in output is the slice value[i, :, :, :] and each variable in output will have shape (B, C, D). 
+If axis == 0 then the i'th variable in output is the slice value[i, :, :, :] and each variable in output will have shape (B, C, D).
 (Note that the dimension unpacked along is gone, unlike split).
-If axis == 1 then the i'th variable in output is the slice value[:, i, :, :] and each variable in output will have shape (A, C, D). 
+If axis == 1 then the i'th variable in output is the slice value[:, i, :, :] and each variable in output will have shape (A, C, D).
 Args:
 value: A rank R > 0 variable to be unstacked.
 num: An int. The length of the dimension axis. Automatically inferred if None (the default).
@@ -1304,13 +1304,13 @@ std::vector <VARP> _Unstack(VARP value, int axis) {
     for (int i = 0; i < size; ++i) {
         res.emplace_back(Variable::create(expr, i));
     }
-    return res;   
+    return res;
 }
 
 /*Returns the rank of a variable.
 Returns a 0-D int32 variable representing the rank of input.
-Note: The rank of a variable is not the same as the rank of a matrix. 
-It's the number of indices required to uniquely select each element of the variable. 
+Note: The rank of a variable is not the same as the rank of a matrix.
+It's the number of indices required to uniquely select each element of the variable.
 It's also known as "order", "degree", or "ndims."
 Args:
 input: A variable.
@@ -1326,9 +1326,9 @@ VARP _Rank(VARP input) {
 }
 /*Creates a sequence of numbers.
 Args:
-start: A 0-D variable (scalar). 
-limit: A 0-D variable (scalar). 
-delta: A 0-D variable (scalar). 
+start: A 0-D variable (scalar).
+limit: A 0-D variable (scalar).
+delta: A 0-D variable (scalar).
 */
 VARP _Range(VARP start, VARP limit, VARP delta) {
     std::unique_ptr<OpT> op(new OpT);
@@ -1338,9 +1338,9 @@ VARP _Range(VARP start, VARP limit, VARP delta) {
     op->main.value = rangeParam;
     return Variable::create(Expr::create(std::move(op), {start, limit, delta}));
 }
-/*Rearranges data from depth into blocks of spatial data. 
+/*Rearranges data from depth into blocks of spatial data.
 It is the reverse transformation of SpaceToDepth. More specifically,
-it outputs a copy of the input variable where values from the depth dimension are moved in spatial blocks to the height and width dimensions. 
+it outputs a copy of the input variable where values from the depth dimension are moved in spatial blocks to the height and width dimensions.
 Args:
 input: A variable.
 block_size: An int that is >= 2. The size of the spatial block, same as in Space2Depth.
@@ -1356,25 +1356,25 @@ VARP _DepthToSpace(VARP input, int block_size) {
     op->main.value = depthtospaceParam;
     return Variable::create(Expr::create(std::move(op), {input}));
 }
-/*SSD network's priorbox layer. 
+/*SSD network's priorbox layer.
 Args:
-feature: A variable. Contains the feature map. Namely bottom[0] in caffe. 
+feature: A variable. Contains the feature map. Namely bottom[0] in caffe.
 image: A variable. Contains the image. Namely bottom[1] in caffe.
-min_size: Minimum box size (in pixels). 
+min_size: Minimum box size (in pixels).
 max_size: Maximum box size (in pixels).
-aspect_ratio: Various of aspect ratios. Duplicate ratios are ignored. If none is provided, use default 1.0. 
-flip: If true, flips each aspect ratio. For example, if there is aspect ratio "r", generates aspect ratio "1.0/r" as well. Default true. 
-clip: If true, clips the prior so that it is within [0, 1]. Default false. 
-variance: Variance for adjusting the prior bboxes. 
-img_h: image height. If 0, uses information in image. 
+aspect_ratio: A list of aspect ratios. Duplicate ratios are ignored. If none is provided, use default 1.0.
+flip: If true, flips each aspect ratio. For example, if there is aspect ratio "r", generates aspect ratio "1.0/r" as well. Default true.
+clip: If true, clips the prior so that it is within [0, 1]. Default false.
+variance: Variance for adjusting the prior bboxes.
+img_h: image height. If 0, uses information in image.
 img_w: image width.  If 0, uses information in image.
-step_h: step in height. 
-step_w: step in width. 
-offset: Offset to the top left corner of each cell. 
-Returns: 
-A variable. 
+step_h: step in height.
+step_w: step in width.
+offset: Offset to the top left corner of each cell.
+Returns:
+A variable.
 */
-VARP _PriorBox(VARP feature, VARP image, std::vector<float> min_size, std::vector<float> max_size, std::vector<float>aspect_ratio, 
+VARP _PriorBox(VARP feature, VARP image, std::vector<float> min_size, std::vector<float> max_size, std::vector<float>aspect_ratio,
             bool flip, bool clip, std::vector<float>variance,
             unsigned int img_h, unsigned int img_w, float step_h, float step_w, float offset) {
     std::unique_ptr<OpT> op(new OpT);
@@ -1395,12 +1395,12 @@ VARP _PriorBox(VARP feature, VARP image, std::vector<float> min_size, std::vecto
     op->main.value = param;
     return Variable::create(Expr::create(std::move(op), {feature, image}));
 }
-/*SSD network's permute layer.  
+/*SSD network's permute layer.
 Args:
-input: A variable. Contains the feature map. Namely bottom[0] in caffe. 
+input: A variable. Contains the feature map. Namely bottom[0] in caffe.
 dims:  A vector. Contains the order.
-Returns: 
-A variable. 
+Returns:
+A variable.
 */
 VARP _Permute(VARP input, INTS dims) {
     std::unique_ptr<OpT> op(new OpT);
@@ -1411,27 +1411,27 @@ VARP _Permute(VARP input, INTS dims) {
     op->main.value = param;
     return Variable::create(Expr::create(std::move(op), {input}));
 }
-/*SSD network's detectionoutput layer.  
+/*SSD network's detectionoutput layer.
 Args:
-location: A variable. 
+location: A variable.
 confidence:  A variable.
 priorbox: A variable.
 num_classes: number of classes.
-share_location: indicates wheter share location between different classes, default true. 
-background_label_id: default = 0. 
+share_location: indicates whether to share location between different classes, default true.
+background_label_id: default = 0.
 nms_threshhold: nonmaximumsupression threshhold.
 mns_topk: nonmaximumsupression topk.
-code_type: indicates the mode to encode bbox,  default = CORNER. 
-variance_encoded_in_target: indicates whether encode variance in target, default false. 
-keep_top_k: indicates the number of boxes kept, default -1(all boxes are kept). 
-confidence_threshold: the threshhold for confidence. 
+code_type: indicates the mode to encode bbox,  default = CORNER.
+variance_encoded_in_target: indicates whether to encode variance in target, default false.
+keep_top_k: indicates the number of boxes kept, default -1(all boxes are kept).
+confidence_threshold: the threshold for confidence.
 visualize_threshold: The threshold used to visualize the detection results.
-Returns: 
-A variable. 
+Returns:
+A variable.
 */
-VARP _DetectionOutput(VARP location, VARP confidence, VARP priorbox, 
-                        unsigned int num_classes, bool share_location, int background_label_id, 
-                        float nms_threshhold, int nms_topk, int code_type, 
+VARP _DetectionOutput(VARP location, VARP confidence, VARP priorbox,
+                        unsigned int num_classes, bool share_location, int background_label_id,
+                        float nms_threshhold, int nms_topk, int code_type,
                         bool variance_encoded_in_target,
                         int keep_top_k, float confidence_threshold, float visualize_threshold){
     std::unique_ptr<OpT> op(new OpT);
@@ -1451,26 +1451,26 @@ VARP _DetectionOutput(VARP location, VARP confidence, VARP priorbox,
     op->main.value = param;
     return Variable::create(Expr::create(std::move(op), {location, confidence, priorbox}));
 }
-/*SSD network's detectionpostprocess layer.  
+/*SSD network's detectionpostprocess layer.
 Args:
-encode_boxes: A variable. 
+encode_boxes: A variable.
 class_predictions:  A variable.
 anchors: A variable.
 num_classes: number of classes.
 max_detections: A int, indicates max detections.
-max_class_per_detection: A int, indicates max class per detection. 
-detections_per_class: A int, indicates detections per class. 
+max_class_per_detection: A int, indicates max class per detection.
+detections_per_class: A int, indicates detections per class.
 nms_threshhold: A float, the threshold for nms.
-iou_threshold: A float, the threshold for iou. 
-use_regular_nms: A bool, indicates whether use regular nms method, only false is implemented currently. 
-centersize_encoding: A float vector, indicates the centersize encoding.  
-Returns: 
+iou_threshold: A float, the threshold for iou.
+use_regular_nms: A bool, indicates whether to use the regular nms method, only false is implemented currently.
+centersize_encoding: A float vector, indicates the centersize encoding.
+Returns:
 4 variable, detection_boxes, detection_class, detection_scores, num_detections
 */
-std::vector<VARP> _DetectionPostProcess(VARP encode_boxes, VARP class_predictions, VARP anchors, 
-                        int num_classes, int max_detections, 
-                        int max_class_per_detection, int detections_per_class, 
-                        float nms_threshold, float iou_threshold, 
+std::vector<VARP> _DetectionPostProcess(VARP encode_boxes, VARP class_predictions, VARP anchors,
+                        int num_classes, int max_detections,
+                        int max_class_per_detection, int detections_per_class,
+                        float nms_threshold, float iou_threshold,
                         bool use_regular_nms, std::vector<float> centersize_encoding){
     std::unique_ptr<OpT> op(new OpT);
     op->type       = OpType_DetectionPostProcess;
@@ -1649,7 +1649,7 @@ VARP _Conv(std::vector<int8_t>&& weight, std::vector<float>&& bias, std::vector<
     }
 
     conv2D->bias = bias;
-    
+
     conv2D->symmetricQuan->weight = std::move(weight);
     conv2D->symmetricQuan->zeroPoint = std::move(inputZeroPoint);
     conv2D->symmetricQuan->outputZeroPoint = std::move(outputZeroPoint);
diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h
index 695a55cad..bd0b72a30 100644
--- a/include/MNN/MNNDefine.h
+++ b/include/MNN/MNNDefine.h
@@ -75,6 +75,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 3
 #define MNN_VERSION_MINOR 0
-#define MNN_VERSION_PATCH 1
+#define MNN_VERSION_PATCH 2
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
diff --git a/include/MNN/MNNForwardType.h b/include/MNN/MNNForwardType.h
index c115113ea..31665c1ec 100644
--- a/include/MNN/MNNForwardType.h
+++ b/include/MNN/MNNForwardType.h
@@ -40,14 +40,16 @@ typedef enum {
     MNN_FORWARD_USER_2 = 10,
     MNN_FORWARD_USER_3 = 11,
 
-    MNN_FORWARD_ALL,
+    MNN_FORWARD_ALL = 12,
 
     /* Apply arm extension instruction set to accelerate some Ops, this forward type
        is only used in MNN internal, and will be active automatically when user set forward type
        to be MNN_FORWARD_CPU and extension instruction set is valid on hardware.
     */
-    MNN_FORWARD_CPU_EXTENSION
-
+    MNN_FORWARD_CPU_EXTENSION = 13,
+    // Used for shared memory on Android devices
+    
+    MNN_MEMORY_AHARDWAREBUFFER = 14
 } MNNForwardType;
 
 typedef enum {
diff --git a/include/MNN/expr/MathOp.hpp b/include/MNN/expr/MathOp.hpp
index c5595fa5a..9d7e41763 100644
--- a/include/MNN/expr/MathOp.hpp
+++ b/include/MNN/expr/MathOp.hpp
@@ -13,7 +13,7 @@ namespace MNN {
 namespace Express {
 //BinaryOPs
 MNN_PUBLIC VARP _Add(VARP x, VARP y);
-MNN_PUBLIC VARP _Subtract(VARP x, VARP y);    
+MNN_PUBLIC VARP _Subtract(VARP x, VARP y);
 MNN_PUBLIC VARP _Multiply(VARP x, VARP y);
 MNN_PUBLIC VARP _Divide(VARP x, VARP y);
 MNN_PUBLIC VARP _Pow(VARP x, VARP y);
@@ -92,19 +92,19 @@ MNN_PUBLIC VARP _Prod(VARP a, VARP b, std::vector<float> coeff);
 MNN_PUBLIC VARP _Sum(VARP a, VARP b, std::vector<float> coeff);
 MNN_PUBLIC VARP _Max(VARP a, VARP b, std::vector<float> coeff);
 MNN_PUBLIC VARP _Sub(VARP a, VARP b, std::vector<float> coeff);
-MNN_PUBLIC VARP _EltwiseProdInt8(VARP x, VARP y, 
+MNN_PUBLIC VARP _EltwiseProdInt8(VARP x, VARP y,
                     std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale);
-MNN_PUBLIC VARP _EltwiseSumInt8(VARP x, VARP y, 
+MNN_PUBLIC VARP _EltwiseSumInt8(VARP x, VARP y,
                      std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale);
-MNN_PUBLIC VARP _EltwiseSubInt8(VARP x, VARP y, 
+MNN_PUBLIC VARP _EltwiseSubInt8(VARP x, VARP y,
                      std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale);
-MNN_PUBLIC VARP _EltwiseMaxInt8(VARP x, VARP y, 
+MNN_PUBLIC VARP _EltwiseMaxInt8(VARP x, VARP y,
                       std::vector<int8_t> x_weight, std::vector<int32_t> x_bias, std::vector<float> x_scale, std::vector<float> x_tensorScale,
                     std::vector<int8_t> y_weight, std::vector<int32_t> y_bias, std::vector<float> y_scale, std::vector<float> y_tensorScale,
                     std::vector<int8_t> output_weight, std::vector<int32_t> output_bias, std::vector<float> output_scale, std::vector<float> output_tensorScale);
@@ -138,6 +138,9 @@ MNN_PUBLIC VARP _CumSum(VARP x, int axis, bool exclusive = false, bool reverse =
 MNN_PUBLIC VARP _CumProd(VARP x, int axis);
 MNN_PUBLIC VARPS _Svd(VARP x);
 MNN_PUBLIC VARP _Histogram(VARP x, int bin, int min, int max, int channel = -1);
+#ifdef MNN_BUILD_AUDIO
+MNN_PUBLIC VARP _Stft(VARP sample, VARP window, int n_fft, int hop_length, bool abse = true);
+#endif
 }; // namespace Express
 }; // namespace MNN
 
diff --git a/project/android/build_32_stl_shared.sh b/project/android/build_32_stl_shared.sh
index 137dced1e..1fb38d5d8 100755
--- a/project/android/build_32_stl_shared.sh
+++ b/project/android/build_32_stl_shared.sh
@@ -5,7 +5,7 @@ cmake ../../../ \
 -DANDROID_ABI="armeabi-v7a" \
 -DANDROID_STL=c++_shared \
 -DCMAKE_BUILD_TYPE=Release \
--DANDROID_NATIVE_API_LEVEL=android-21  \
+-DANDROID_NATIVE_API_LEVEL=android-14  \
 -DANDROID_TOOLCHAIN=clang \
 -DMNN_BUILD_FOR_ANDROID_COMMAND=true \
 -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3
diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj
index 6aafd2121..36b0971e1 100644
--- a/project/ios/MNN.xcodeproj/project.pbxproj
+++ b/project/ios/MNN.xcodeproj/project.pbxproj
@@ -486,11 +486,9 @@
 		92FF02E223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */; };
 		92FF02E323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */; };
 		92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */; };
-		92FF02E623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */; };
 		92FF02E723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */; };
 		92FF02E823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */; };
 		92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */; };
-		92FF02EC23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */; };
 		92FF02EE23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */; };
 		92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
 		92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
@@ -530,11 +528,9 @@
 		92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */; };
 		92FF032323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */; };
 		92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */; };
-		92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */; };
 		92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */; };
 		92FF032823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */; };
 		92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */; };
-		92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */; };
 		92FF032E23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */; };
 		92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
 		92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
@@ -592,7 +588,6 @@
 		92FF03A123AA0B5A00AC97F6 /* Int8FunctionsOpt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022323AA0B5600AC97F6 /* Int8FunctionsOpt.cpp */; };
 		92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022523AA0B5600AC97F6 /* ConvOpt.cpp */; };
 		92FF03A423AA0B5A00AC97F6 /* OptimizedComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022623AA0B5600AC97F6 /* OptimizedComputer.cpp */; };
-		92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022723AA0B5600AC97F6 /* DeconvolutionWithStride.hpp */; };
 		92FF03A623AA0B5A00AC97F6 /* ConvolutionTiledExecutor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022823AA0B5600AC97F6 /* ConvolutionTiledExecutor.hpp */; };
 		92FF03A723AA0B5A00AC97F6 /* ConvolutionIntFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022923AA0B5600AC97F6 /* ConvolutionIntFactory.cpp */; };
 		92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */; };
@@ -609,7 +604,6 @@
 		92FF03B923AA0B5A00AC97F6 /* ConvOpt.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023B23AA0B5600AC97F6 /* ConvOpt.h */; };
 		92FF03BC23AA0B5A00AC97F6 /* OptimizedComputer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023E23AA0B5600AC97F6 /* OptimizedComputer.hpp */; };
 		92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023F23AA0B5600AC97F6 /* Int8FunctionsOpt.h */; };
-		92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024023AA0B5600AC97F6 /* DeconvolutionWithStride.cpp */; };
 		92FF03BF23AA0B5A00AC97F6 /* ConvolutionTiledExecutor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024123AA0B5600AC97F6 /* ConvolutionTiledExecutor.cpp */; };
 		92FF03C323AA0B5A00AC97F6 /* CPUEltwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024523AA0B5700AC97F6 /* CPUEltwise.cpp */; };
 		92FF03C423AA0B5A00AC97F6 /* CPUInterp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024623AA0B5700AC97F6 /* CPUInterp.cpp */; };
@@ -740,8 +734,6 @@
 		95772DCF2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S in Sources */ = {isa = PBXBuildFile; fileRef = 95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */; };
 		95772DD02C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S in Sources */ = {isa = PBXBuildFile; fileRef = 95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */; };
 		958375352A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S in Sources */ = {isa = PBXBuildFile; fileRef = 958375342A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S */; };
-		958B046429D2C89D00FC3AEF /* GemmInt8Executor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */; };
-		958B046629D2C8AF00FC3AEF /* GemmInt8Executor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */; };
 		95CE1DFF2AC57F6200EFB51E /* MNNReluWithSlopeChannelInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 95CE1DFE2AC57F6200EFB51E /* MNNReluWithSlopeChannelInt8.S */; };
 		95CE1E012AC57F7600EFB51E /* MNNReluWithSlopeChannelInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 95CE1E002AC57F7600EFB51E /* MNNReluWithSlopeChannelInt8.S */; };
 		C43C81FA251894A600A0FF84 /* CommonOptFunctionNeon.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C43C81F8251894A500A0FF84 /* CommonOptFunctionNeon.cpp */; };
@@ -1342,11 +1334,9 @@
 		92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixAdd.S; sourceTree = "<group>"; };
 		92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNExpC8.S; sourceTree = "<group>"; };
 		92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23SourceTransUnit.S; sourceTree = "<group>"; };
-		92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductLeft.S; sourceTree = "<group>"; };
 		92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDeconvRunForUnitDepthWise.S; sourceTree = "<group>"; };
 		92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1BilinearOpt.S; sourceTree = "<group>"; };
 		92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit.S; sourceTree = "<group>"; };
-		92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductRight.S; sourceTree = "<group>"; };
 		92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannel.S; sourceTree = "<group>"; };
 		92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
 		92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
@@ -1386,11 +1376,9 @@
 		92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixAdd.S; sourceTree = "<group>"; };
 		92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNExpC8.S; sourceTree = "<group>"; };
 		92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23SourceTransUnit.S; sourceTree = "<group>"; };
-		92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductLeft.S; sourceTree = "<group>"; };
 		92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDeconvRunForUnitDepthWise.S; sourceTree = "<group>"; };
 		92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1BilinearOpt.S; sourceTree = "<group>"; };
 		92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit.S; sourceTree = "<group>"; };
-		92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductRight.S; sourceTree = "<group>"; };
 		92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannel.S; sourceTree = "<group>"; };
 		92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
 		92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
@@ -1448,7 +1436,6 @@
 		92FF022323AA0B5600AC97F6 /* Int8FunctionsOpt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Int8FunctionsOpt.cpp; sourceTree = "<group>"; };
 		92FF022523AA0B5600AC97F6 /* ConvOpt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvOpt.cpp; sourceTree = "<group>"; };
 		92FF022623AA0B5600AC97F6 /* OptimizedComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = OptimizedComputer.cpp; sourceTree = "<group>"; };
-		92FF022723AA0B5600AC97F6 /* DeconvolutionWithStride.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = DeconvolutionWithStride.hpp; sourceTree = "<group>"; };
 		92FF022823AA0B5600AC97F6 /* ConvolutionTiledExecutor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionTiledExecutor.hpp; sourceTree = "<group>"; };
 		92FF022923AA0B5600AC97F6 /* ConvolutionIntFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionIntFactory.cpp; sourceTree = "<group>"; };
 		92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WinogradOptFunction.cpp; sourceTree = "<group>"; };
@@ -1465,7 +1452,6 @@
 		92FF023B23AA0B5600AC97F6 /* ConvOpt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvOpt.h; sourceTree = "<group>"; };
 		92FF023E23AA0B5600AC97F6 /* OptimizedComputer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = OptimizedComputer.hpp; sourceTree = "<group>"; };
 		92FF023F23AA0B5600AC97F6 /* Int8FunctionsOpt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Int8FunctionsOpt.h; sourceTree = "<group>"; };
-		92FF024023AA0B5600AC97F6 /* DeconvolutionWithStride.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DeconvolutionWithStride.cpp; sourceTree = "<group>"; };
 		92FF024123AA0B5600AC97F6 /* ConvolutionTiledExecutor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionTiledExecutor.cpp; sourceTree = "<group>"; };
 		92FF024523AA0B5700AC97F6 /* CPUEltwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUEltwise.cpp; sourceTree = "<group>"; };
 		92FF024623AA0B5700AC97F6 /* CPUInterp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp.cpp; sourceTree = "<group>"; };
@@ -1597,8 +1583,6 @@
 		95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4Int8ForMatMulA_ARM82.S; sourceTree = "<group>"; };
 		95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4Int8ForMatMulA_ARM86.S; sourceTree = "<group>"; };
 		958375342A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S; path = arm/arm64/MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S; sourceTree = "<group>"; };
-		958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GemmInt8Executor.cpp; sourceTree = "<group>"; };
-		958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = GemmInt8Executor.hpp; sourceTree = "<group>"; };
 		95CE1DFE2AC57F6200EFB51E /* MNNReluWithSlopeChannelInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannelInt8.S; sourceTree = "<group>"; };
 		95CE1E002AC57F7600EFB51E /* MNNReluWithSlopeChannelInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannelInt8.S; sourceTree = "<group>"; };
 		C43C81F8251894A500A0FF84 /* CommonOptFunctionNeon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CommonOptFunctionNeon.cpp; sourceTree = "<group>"; };
@@ -2643,11 +2627,9 @@
 				92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */,
 				92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */,
 				92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */,
-				92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */,
 				92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */,
 				92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */,
 				92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */,
-				92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */,
 				92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */,
 				92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
 				92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
@@ -2737,11 +2719,9 @@
 				92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */,
 				92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */,
 				92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */,
-				92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */,
 				92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */,
 				92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */,
 				92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */,
-				92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */,
 				92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */,
 				92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
 				92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
@@ -2761,8 +2741,6 @@
 			children = (
 				CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */,
 				CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */,
-				958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */,
-				958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */,
 				C48CAE2528900C4A00271A6D /* ConvInt8Winograd.cpp */,
 				C48CAE2428900C4A00271A6D /* ConvInt8Winograd.hpp */,
 				4A224A1227D0C56E000A9260 /* ConvolutionWinogradBridge.cpp */,
@@ -2790,7 +2768,6 @@
 				92FF022323AA0B5600AC97F6 /* Int8FunctionsOpt.cpp */,
 				92FF022523AA0B5600AC97F6 /* ConvOpt.cpp */,
 				92FF022623AA0B5600AC97F6 /* OptimizedComputer.cpp */,
-				92FF022723AA0B5600AC97F6 /* DeconvolutionWithStride.hpp */,
 				92FF022823AA0B5600AC97F6 /* ConvolutionTiledExecutor.hpp */,
 				92FF022923AA0B5600AC97F6 /* ConvolutionIntFactory.cpp */,
 				92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */,
@@ -2807,7 +2784,6 @@
 				92FF023B23AA0B5600AC97F6 /* ConvOpt.h */,
 				92FF023E23AA0B5600AC97F6 /* OptimizedComputer.hpp */,
 				92FF023F23AA0B5600AC97F6 /* Int8FunctionsOpt.h */,
-				92FF024023AA0B5600AC97F6 /* DeconvolutionWithStride.cpp */,
 				92FF024123AA0B5600AC97F6 /* ConvolutionTiledExecutor.cpp */,
 			);
 			path = compute;
@@ -2939,7 +2915,6 @@
 			buildActionMask = 2147483647;
 			files = (
 				48C84B89250F711700EE7666 /* StaticModule.hpp in Headers */,
-				958B046629D2C8AF00FC3AEF /* GemmInt8Executor.hpp in Headers */,
 				1F501F812397BA5B004E8721 /* AutoTime.hpp in Headers */,
 				92FF04A523AA0BFB00AC97F6 /* AutoStorage.h in Headers */,
 				EBECA3A124643D4E0062C7A3 /* MNNAsmGlobal.h in Headers */,
@@ -3105,7 +3080,6 @@
 				92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */,
 				EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */,
 				4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */,
-				92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */,
 				92FF03D123AA0B5A00AC97F6 /* CPUTopKV2.hpp in Headers */,
 				92FF033F23AA0B5A00AC97F6 /* CPUArgMax.hpp in Headers */,
 				92FF034C23AA0B5A00AC97F6 /* CPUSetDiff1D.hpp in Headers */,
@@ -3335,7 +3309,6 @@
 				92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */,
 				4819FB2D24C1396A0050BD09 /* GeometryConv2D.cpp in Sources */,
 				48747D63245D9E33000B9709 /* GeometryPermute.cpp in Sources */,
-				92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */,
 				48BB6EF625220AA80056E195 /* MNNTranspose32Bit4x4.S in Sources */,
 				CE072A1C2C91AEE700F190FD /* MNNRGBAToBGRFast.S in Sources */,
 				CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */,
@@ -3597,7 +3570,6 @@
 				48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
 				92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
 				95772DCF2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S in Sources */,
-				92FF02E623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */,
 				48747D64245D9E33000B9709 /* GeometryTile.cpp in Sources */,
 				92FF043723AA0B7100AC97F6 /* ShapeDetectionOutput.cpp in Sources */,
 				92FF042623AA0B7100AC97F6 /* ShapeCosineSimilarity.cpp in Sources */,
@@ -3633,7 +3605,6 @@
 				92FF043023AA0B7100AC97F6 /* ShapeQuantizedAvgPool.cpp in Sources */,
 				92FF030623AA0B5A00AC97F6 /* MNNStrassenMergeCFunction.S in Sources */,
 				92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */,
-				92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */,
 				92FF044923AA0B7100AC97F6 /* ShapeGatherND.cpp in Sources */,
 				489D7AB32550FDC900AD896A /* MetalPReLU.mm in Sources */,
 				19D0FE7028534C4500B74B1A /* MetalSoftmax.mm in Sources */,
@@ -3787,13 +3758,11 @@
 				92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
 				92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */,
 				CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */,
-				92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */,
 				92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */,
 				CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */,
 				92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */,
 				92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */,
 				92FF02D723AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */,
-				958B046429D2C89D00FC3AEF /* GemmInt8Executor.cpp in Sources */,
 				92FF026123AA0B5A00AC97F6 /* CPUCropAndResize.cpp in Sources */,
 				48FA474923AA127B00172C3B /* MathOp.cpp in Sources */,
 				4819FB3C24C69E680050BD09 /* GeometryBatchMatMul.cpp in Sources */,
@@ -3826,7 +3795,6 @@
 				92FF032823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */,
 				4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */,
 				92FF044323AA0B7100AC97F6 /* ShapeTopKV2.cpp in Sources */,
-				92FF02EC23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */,
 				48C84BA1250F725600EE7666 /* InitNet.cpp in Sources */,
 				4894C6E927016F7200D8BE79 /* CPUResizeCache.cpp in Sources */,
 				4DD1791B2684815A00B0098F /* ShapeSetDiff1D.cpp in Sources */,
@@ -4164,7 +4132,7 @@
 				METAL_LIBRARY_FILE_BASE = mnn;
 				ONLY_ACTIVE_ARCH = YES;
 				OTHER_CFLAGS = "";
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde;
 				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
 				PROVISIONING_PROFILE_SPECIFIER = "";
 				"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@@ -4260,7 +4228,7 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
@@ -4287,7 +4255,7 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
diff --git a/pymnn/CMakeLists.txt b/pymnn/CMakeLists.txt
index 7e3694407..aedbbe378 100644
--- a/pymnn/CMakeLists.txt
+++ b/pymnn/CMakeLists.txt
@@ -16,6 +16,7 @@ option(PYMNN_TRAIN_API "MNN train API be exposed" OFF)
 option(PYMNN_INTERNAL_SERVING "Internal use only." OFF)
 option(PYMNN_OPENCV_API "MNN OpenCV API be exposed" ON)
 option(PYMNN_IMGCODECS "MNN IMGCODECS API be exposed" OFF)
+option(PYMNN_AUDIO_API "MNN Audio API be exposed" ON)
 option(PYMNN_OHOS_INTERNAL "compile for harmony internal." OFF)
 
 if (PYMNN_OHOS_INTERNAL)
@@ -91,6 +92,10 @@ if(PYMNN_CVCORE)
     target_compile_definitions(mnnpybridge PRIVATE PYMNN_CVCORE)
 endif()
 
+if(PYMNN_AUDIO_API)
+    target_compile_definitions(mnnpybridge PRIVATE PYMNN_AUDIO_API)
+endif()
+
 if(PYMNN_INTERNAL_SERVING)
     message(STATUS "mnnpybridge define PYMNN_INTERNAL_SERVING")
     target_compile_definitions(mnnpybridge PRIVATE PYMNN_INTERNAL_SERVING)
@@ -197,7 +202,7 @@ else()
         endif()
         export_headers(DIR ${CMAKE_SOURCE_DIR}/pip_package/MNN)
     else()
-        target_link_libraries(mnnpybridge PRIVATE log MNN MNN_Express MNNOpenCV)
+        target_link_libraries(mnnpybridge PRIVATE log MNN MNN_Express MNNOpenCV MNNAudio)
         if(PYMNN_USE_ALINNPYTHON)
             target_link_libraries(mnnpybridge PRIVATE AliNNPython)
         endif()
diff --git a/pymnn/pip_package/MNN/__init__.py b/pymnn/pip_package/MNN/__init__.py
index 5c2a4b1c4..89ed46b14 100644
--- a/pymnn/pip_package/MNN/__init__.py
+++ b/pymnn/pip_package/MNN/__init__.py
@@ -9,3 +9,4 @@
 from . import optim
 from . import numpy
 from . import cv
+from . import audio
\ No newline at end of file
diff --git a/pymnn/pip_package/MNN/audio/__init__.py b/pymnn/pip_package/MNN/audio/__init__.py
new file mode 100644
index 000000000..98b970209
--- /dev/null
+++ b/pymnn/pip_package/MNN/audio/__init__.py
@@ -0,0 +1,96 @@
+from _mnncengine.audio import *
+import _mnncengine.audio as _F
+import MNN.expr as _expr
+import MNN.numpy as _np
+import MNN
+
+# Integer constants used as `window_type` / `pad_mode` arguments by the wrappers below.
+# Window selectors (enum WINDOW_TYPE); HANNING is the default `window_type` below.
+HAMMING = 0
+HANNING = 1
+POVEY = 2
+RECTANGULAR = 3
+BLACKMAN = 4
+# Padding selectors (enum PadValueMode); REFLECT is the default `pad_mode` below.
+CONSTANT = 0
+REFLECT = 1
+SYMMETRIC = 2
+EDGE = 3
+
+def load(filename, sr = 0, frame_offset = 0, num_frames = -1):
+    """
+    Loads a portion of an audio file.
+    Parameters:
+        filename (str): The path to the audio file.
+        sr (int): Passed through to the native loader; presumably the target sample rate (TODO confirm). Default is 0.
+        frame_offset (int): The offset in frames from which to start loading the audio data. Default is 0.
+        num_frames (int): The number of frames to load. If set to -1, the entire audio file will be loaded. Default is -1.
+
+    Returns:
+        The loaded audio var and the sample rate.
+    """
+    return _F.load(filename, sr, frame_offset, num_frames)
+
+def save(filename, audio, sample_rate):
+    """
+    Saves an audio var to a file.
+    Parameters:
+        filename (str): The path to the audio file.
+        audio (Var): The audio var to save.
+        sample_rate (int): The sample rate of the audio var.
+    Returns:
+        The native binding's result, passed through unchanged (previously documented as None; TODO confirm).
+    """
+    return _F.save(filename, audio, sample_rate)
+
+def hamming_window(window_size, periodic = False, alpha = 0.54, beta = 0.46):
+    """
+    Generates a Hamming window.
+    Parameters:
+        window_size (int): The size of the window.
+        periodic (bool): Whether the window is periodic. Default is False.
+        alpha (float): The alpha parameter of the Hamming window. Default is 0.54.
+        beta (float): The beta parameter of the Hamming window. Default is 0.46.
+    Returns:
+        The Hamming window.
+    """
+    return _F.hamming_window(window_size, periodic, alpha, beta)
+
+def hanning_window(window_size, periodic = False):
+    """Generates a Hann window.
+    Parameters:
+        window_size (int): The size of the window.
+        periodic (bool): Whether the window is periodic. Default is False.
+    Returns:
+        The Hann window.
+    """
+    # NOTE(review): the C++ handler is named PyMNNAUDIO_hann_window; confirm the binding really exports `hanning_window`.
+    return _F.hanning_window(window_size, periodic)
+
+def melscale_fbanks(n_mels, n_fft, sampe_rate = 16000, htk = True, norm = False,
+                    f_min = 0.0, f_max = 0.0):  # NOTE(review): `sampe_rate` looks like a typo of `sample_rate`; renaming would break keyword callers
+    return _F.melscale_fbanks(n_mels, n_fft, sampe_rate, htk, norm, f_min, f_max)  # positional pass-through to the native binding
+
+def spectrogram(waveform, n_fft = 400, hop_length = 0, win_length = 0, window_type = HANNING,
+                pad_left = 0, pad_right = 0, center = False, normalized = False, pad_mode = REFLECT,
+                power = 2.0):
+    """Passes every argument positionally, in declaration order, to the native `spectrogram` binding."""
+    return _F.spectrogram(waveform, n_fft, hop_length, win_length, window_type, pad_left,
+                          pad_right, center, normalized, pad_mode, power)
+
+def mel_spectrogram(waveform, n_mels, n_fft, sampe_rate = 16000, htk = True, norm = False,
+                    f_min = 0.0, f_max = 0.0, hop_length = 0, win_length = 0, window_type = HANNING,
+                    pad_left = 0, pad_right = 0, center = False, normalized = False, pad_mode = REFLECT,
+                    power = 2.0):  # NOTE(review): `sampe_rate` looks like a typo of `sample_rate`; renaming would break keyword callers
+    return _F.mel_spectrogram(waveform, n_mels, n_fft, sampe_rate, htk, norm, f_min, f_max,  # positional pass-through,
+                              hop_length, win_length, window_type, pad_left, pad_right, center,  # in declaration order,
+                              normalized, pad_mode, power)  # to the native binding
+
+def fbank(waveform, sample_rate = 16000, n_mels = 80, n_fft = 400, hop_length = 160,
+          dither = 0.0, preemphasis = 0.97):
+    """Passes every argument positionally, in declaration order, to the native `fbank` binding."""
+    return _F.fbank(waveform, sample_rate, n_mels, n_fft, hop_length, dither, preemphasis)
+
+def whisper_fbank(waveform, sample_rate = 16000, n_mels = 128, n_fft = 400,
+                  hop_length = 160, chunk_len = 30):
+    return _F.whisper_fbank(waveform, sample_rate, n_mels, n_fft, hop_length, chunk_len)  # positional pass-through to the native binding
\ No newline at end of file
diff --git a/pymnn/pip_package/MNN/llm/__init__.py b/pymnn/pip_package/MNN/llm/__init__.py
index f144b3e06..ebf4cf84e 100644
--- a/pymnn/pip_package/MNN/llm/__init__.py
+++ b/pymnn/pip_package/MNN/llm/__init__.py
@@ -57,7 +57,25 @@ def response(self, prompt, stream = False):
         '''
         return super.response(prompt, stream)
 
-def create(config_path):
+    def txt_embedding(self, prompt):
+        '''
+        get the embedding of a prompt
+
+        Parameters
+        ----------
+        prompt : input prompt
+
+        Returns
+        -------
+        res : embedding var
+
+        Example:
+        -------
+        >>> res = qwen.txt_embedding('Hello')
+        '''
+        return super().txt_embedding(prompt)  # `super` must be called; the bare builtin has no `txt_embedding`
+
+def create(config_path, embedding_model = False):
     '''
     create LLM instance by `config.json`
 
@@ -73,4 +91,4 @@ def create(config_path):
     -------
     >>> qwen = llm.create('./qwen-1.8b-int4/config.json')
     '''
-    return _F.create(config_path)
\ No newline at end of file
+    return _F.create(config_path, embedding_model)
\ No newline at end of file
diff --git a/pymnn/pip_package/build_deps.py b/pymnn/pip_package/build_deps.py
index 6ee2398a5..c3fa00278 100644
--- a/pymnn/pip_package/build_deps.py
+++ b/pymnn/pip_package/build_deps.py
@@ -99,7 +99,7 @@ def build_deps():
     if IS_WINDOWS:
         os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\
             -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON\
-            -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF .. && ninja MNN MNNConvertDeps')
+            -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_BUILD_AUDIO=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF .. && ninja MNN MNNConvertDeps')
     elif IS_LINUX:
         extra_opts += '-DMNN_TENSORRT=ON \
         -DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' '
@@ -113,14 +113,14 @@ def build_deps():
         os.system('cmake ' + extra_opts +
             '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
             -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
-             .. && make MNN MNNTrain MNNConvertDeps -j32')
+            -DMNN_BUILD_AUDIO=ON  .. && make MNN MNNTrain MNNConvertDeps -j32')
     else:
         extra_opts += ' -DMNN_INTERNAL=ON ' if USE_INTERNAL else ' '
         extra_opts += ' -DMNN_BUILD_TORCH=ON ' if USE_TORCH else ' '
         print(extra_opts)
         os.system('cmake ' + extra_opts + '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
             -DMNN_BUILD_SHARED_LIBS=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF\
-            -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
+            -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_BUILD_AUDIO=ON\
             .. && make MNN MNNConvertDeps -j64')
 ################################################################################
 # Building dependent libraries
diff --git a/pymnn/pip_package/setup.py b/pymnn/pip_package/setup.py
index 633968edb..5955a9310 100644
--- a/pymnn/pip_package/setup.py
+++ b/pymnn/pip_package/setup.py
@@ -166,7 +166,7 @@ def configure_extension_build():
         ]
         if check_env_flag('WERROR'):
             extra_compile_args.append('-Werror')
-    extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API']
+    extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API', '-DPYMNN_AUDIO_API']
     if IS_LINUX and USE_INTERNAL:
         extra_compile_args += ['-DPYMNN_INTERNAL_SERVING']
         if args.env == 'daily':
@@ -177,6 +177,7 @@ def configure_extension_build():
     engine_library_dirs = [os.path.join(root_dir, BUILD_DIR)]
     engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]
     engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")]
+    engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "audio")]
     engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
     engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "cuda")]
     if USE_TRT or USE_CUDA:
@@ -214,6 +215,8 @@ def configure_extension_build():
         engine_include_dirs += [os.path.join(root_dir, "3rd_party", "rapidjson")]
     # cv include
     engine_include_dirs += [os.path.join(root_dir, "tools", "cv", "include")]
+    # audio include
+    engine_include_dirs += [os.path.join(root_dir, "tools", "audio", "include")]
     # llm include
     engine_include_dirs += [os.path.join(root_dir, "transformers", "llm", "engine", "include")]
     engine_include_dirs += [os.path.join(root_dir, "3rd_party")]
@@ -439,6 +442,7 @@ def make_relative_rpath(path):
     extensions, cmdclass, packages, entry_points, lib_files = configure_extension_build()
 
     setup(
+        zip_safe=False,
         name=package_name,
         version=version,
         description=("C methods for MNN Package"),
diff --git a/pymnn/src/MNN.cc b/pymnn/src/MNN.cc
index 1ec9a15e1..1f065e27e 100644
--- a/pymnn/src/MNN.cc
+++ b/pymnn/src/MNN.cc
@@ -22,6 +22,9 @@ using namespace MNN::Express;
 #ifdef PYMNN_OPENCV_API
 #include "cv/cv.hpp"
 #endif
+#ifdef PYMNN_AUDIO_API
+#include "audio/audio.hpp"
+#endif
 #endif // PYMNN_EXPR_API
 
 #ifdef BUILD_OPTYPE
@@ -64,6 +67,9 @@ using RegularizationMethod = ParameterOptimizer::RegularizationMethod;
 #ifdef PYMNN_OPENCV_API
 #include "cv.h"
 #endif
+#ifdef PYMNN_AUDIO_API
+#include "audio.h"
+#endif
 #endif
 
 #ifdef PYMNN_LLM_API
@@ -1587,7 +1593,8 @@ static PyObject* PyMNNTensor_repr(PyObject *self) {
 #ifdef PYMNN_NUMPY_USABLE
     auto content = PyMNNTensor_getNumpyData(((PyMNNTensor*)self), NULL);
 #else
-    auto content = PyMNNVar_read_as_tuple((PyMNNVar*)self, NULL);
+    // print shape of tensor
+    auto content = PyMNNTensor_getShape((PyMNNTensor*)self, NULL);
 #endif
     auto reprfunc = PyObject_GetAttrString(content, "__repr__");
     auto str = PyEval_CallObject(reprfunc, NULL);
@@ -2713,6 +2720,15 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) {
         def_method(cv_module, &PyMNNCV_methods[i]);
     }
 #endif
+#ifdef PYMNN_AUDIO_API
+    // audio submodule
+    auto audio_module = def_submodule(m, "audio");
+    // add methods of audio
+    constexpr int audio_method_num = sizeof(PyMNNAUDIO_methods) / sizeof(PyMethodDef);
+    for (int i = 0; i < audio_method_num; i++) {
+        def_method(audio_module, &PyMNNAUDIO_methods[i]);
+    }
+#endif
 #endif
 #ifdef PYMNN_LLM_API
     // llm submodule
diff --git a/pymnn/src/audio.h b/pymnn/src/audio.h
new file mode 100644
index 000000000..a7afc6ec4
--- /dev/null
+++ b/pymnn/src/audio.h
@@ -0,0 +1,105 @@
+// MNN AUDIO
+static PyObject *PyMNNAUDIO_load(PyObject *self, PyObject *args) {  // audio.load(filename[, sr, frame_offset, num_frames])
+    const char *path = NULL;
+    int rate = 0, first_frame = 0, frame_count = -1;  // values forwarded when the optional ints are omitted
+    if (!PyArg_ParseTuple(args, "s|iii", &path, &rate, &first_frame, &frame_count) || !path) {
+        PyMNN_ERROR("load require args: (string, int, int, int)");
+    }
+    return toPyObj<VARP, toPyObj, int, toPyObj>(AUDIO::load(path, rate, first_frame, frame_count));
+}
+static PyObject *PyMNNAUDIO_save(PyObject *self, PyObject *args) {  // audio.save(filename, Var, sample_rate)
+    const char *path = NULL;
+    PyObject *wave   = nullptr;
+    int rate         = 0;
+    if (!PyArg_ParseTuple(args, "sOi", &path, &wave, &rate) || !path || !isVar(wave)) {
+        PyMNN_ERROR("save require args: (string, Var, int)");
+    }
+    return toPyObj(AUDIO::save(path, toVar(wave), rate));
+}
+static PyObject *PyMNNAUDIO_hamming_window(PyObject *self, PyObject *args) {
+    int size = 0, is_periodic = 0;  // the periodic flag is read with the "i" unit, i.e. as a C int
+    float a = 0.54, b = 0.46;       // forwarded as alpha / beta when the caller omits them
+    if (!PyArg_ParseTuple(args, "i|iff", &size, &is_periodic, &a, &b)) {
+        PyMNN_ERROR("hamming_window require args: (int, |bool, float, float)");
+    }
+    return toPyObj(AUDIO::hamming_window(size, is_periodic, a, b));
+}
+static PyObject *PyMNNAUDIO_hann_window(PyObject *self, PyObject *args) {  // audio.hann_window(window_size[, periodic])
+    int size = 0, is_periodic = 0;
+    if (!PyArg_ParseTuple(args, "i|i", &size, &is_periodic)) {
+        PyMNN_ERROR("hann_window require args: (int, |bool)");
+    }
+    return toPyObj(AUDIO::hann_window(size, is_periodic));
+}
+static PyObject *PyMNNAUDIO_melscale_fbanks(PyObject *self, PyObject *args) {  // one format unit per pointer: "ii|" "iii" "ff"
+    AUDIO::MelscaleParams mel;  // NOTE(review): "i" stores a C int - confirm htk/norm are int-sized fields
+    if (PyArg_ParseTuple(args, "ii|" "iii" "ff", &mel.n_mels, &mel.n_fft, &mel.sample_rate, &mel.htk, &mel.norm, &mel.f_min, &mel.f_max)) {
+        return toPyObj(AUDIO::melscale_fbanks(&mel));
+    }
+    PyMNN_ERROR("melscale_fbanks require args: (int, int, |int, bool, bool, float, float)");
+}
+static PyObject *PyMNNAUDIO_spectrogram(PyObject *self, PyObject *args) {
+    PyObject *waveform = nullptr;
+    AUDIO::SpectrogramParams spec;  // members the caller does not supply stay as the struct initialises them
+    if (PyArg_ParseTuple(args, "O|" "iii" "iii" "iii" "f", &waveform, &spec.n_fft, &spec.hop_length, &spec.win_length,
+                         &spec.window_type, &spec.pad_left, &spec.pad_right, &spec.center, &spec.normalized,
+                         &spec.pad_mode, &spec.power) &&  // ten optional units for ten pointers; power is the "f"
+        isVar(waveform)) {  // NOTE(review): "i" stores a C int - confirm center/normalized/pad_mode are int-sized
+        return toPyObj(AUDIO::spectrogram(toVar(waveform), &spec));
+    }
+    PyMNN_ERROR("spectrogram require args: (Var, |int, int, int, int, int, int, bool, bool, PadValueMode, float)");
+}
+static PyObject *PyMNNAUDIO_mel_spectrogram(PyObject *self, PyObject *args) {
+    PyObject *waveform = nullptr;
+    AUDIO::MelscaleParams mel;
+    AUDIO::SpectrogramParams spec;
+    // One format unit per pointer, in order: 7 mel fields ("iii" "ii" "ff"), then 9 spec fields (n_fft comes from mel).
+    if (PyArg_ParseTuple(args, "O|" "iii" "ii" "ff" "iiiii" "ii" "i" "f", &waveform, &mel.n_mels, &mel.n_fft,
+                         &mel.sample_rate, &mel.htk, &mel.norm, &mel.f_min, &mel.f_max, &spec.hop_length,
+                         &spec.win_length, &spec.window_type, &spec.pad_left, &spec.pad_right, &spec.center,
+                         &spec.normalized, &spec.pad_mode, &spec.power) &&
+        isVar(waveform)) {  // NOTE(review): "i" stores a C int - confirm the bool-like fields are int-sized
+        spec.n_fft = mel.n_fft;
+        return toPyObj(AUDIO::mel_spectrogram(toVar(waveform), &mel, &spec));
+    }
+    PyMNN_ERROR(
+        "mel_spectrogram require args: (Var, |int, int, int, bool, bool, float, float, "
+        "int, int, int, int, int, bool, bool, "
+        "PadValueMode, float)");
+}
+static PyObject *PyMNNAUDIO_fbank(PyObject *self, PyObject *args) {
+    // Binding for AUDIO::fbank: a Var waveform followed by up to four ints and two floats.
+    PyObject *wave = nullptr;
+    int rate = 16000, mels = 80, fft = 400, hop = 160;  // forwarded when the optional ints are omitted
+    float dither_amount = 0.0, preemph = 0.97;
+    const bool parsed = PyArg_ParseTuple(args, "O|iiiiff", &wave, &rate, &mels, &fft, &hop, &dither_amount, &preemph);
+    if (!parsed || !isVar(wave)) {
+        PyMNN_ERROR("fbank require args: (Var, |int, int, int, int, float, float)");
+    }
+    return toPyObj(
+        AUDIO::fbank(toVar(wave), rate, mels, fft, hop, dither_amount, preemph));
+}
+
+static PyObject *PyMNNAUDIO_whisper_fbank(PyObject *self, PyObject *args) {
+    // Binding for AUDIO::whisper_fbank: a Var waveform followed by up to five optional ints.
+    PyObject *wave = nullptr;
+    int rate = 16000, mels = 128, fft = 400, hop = 160, chunk = 30;  // forwarded when the optional ints are omitted
+    if (!PyArg_ParseTuple(args, "O|iiiii", &wave, &rate, &mels, &fft, &hop, &chunk) || !isVar(wave)) {
+        PyMNN_ERROR("whisper_fbank require args: (Var, |int, int, int, int, int)");
+    }
+    return toPyObj(AUDIO::whisper_fbank(toVar(wave), rate, mels, fft, hop, chunk));
+}
+
+static PyMethodDef PyMNNAUDIO_methods[] = {  // one entry per PyMNNAUDIO_* binding defined in this header
+    register_methods(AUDIO,
+        load, "load",
+        save, "save",
+        hamming_window, "hamming_window",
+        hann_window, "hann_window",
+        melscale_fbanks, "melscale_fbanks",
+        spectrogram, "spectrogram",
+        mel_spectrogram, "mel_spectrogram",
+        fbank, "fbank",
+        whisper_fbank, "whisper_fbank"
+    )
+};  // no sentinel entry visible here; MNN.cc walks the array by sizeof, not by terminator
diff --git a/pymnn/src/llm.h b/pymnn/src/llm.h
index 0d363fe98..93329cd60 100644
--- a/pymnn/src/llm.h
+++ b/pymnn/src/llm.h
@@ -4,6 +4,7 @@
 typedef struct {
     PyObject_HEAD
     MNN::Transformer::Llm* llm;
+    bool is_embedding = false;  // set by PyMNNLLM_create when llm was built via Embedding::createEmbedding
 } LLM;
 
 static PyObject* PyMNNLLM_new(struct _typeobject *type, PyObject *args, PyObject *kwds) {
@@ -25,6 +26,9 @@ static PyObject* PyMNNLLM_load(LLM *self, PyObject *args) {
 }
 
 static PyObject* PyMNNLLM_forward(LLM *self, PyObject *args) {
+    if (self->is_embedding) {
+        Py_RETURN_NONE;
+    }
     PyObject *input_ids = nullptr;
     if (!PyArg_ParseTuple(args, "O", &input_ids) && isInts(input_ids)) {
         Py_RETURN_NONE;
@@ -37,6 +41,9 @@ static PyObject* PyMNNLLM_forward(LLM *self, PyObject *args) {
 }
 
 static PyObject* PyMNNLLM_generate(LLM *self, PyObject *args) {
+    if (self->is_embedding) {
+        Py_RETURN_NONE;
+    }
     PyObject *input_ids = nullptr;
     if (!PyArg_ParseTuple(args, "O", &input_ids) && isInts(input_ids)) {
         Py_RETURN_NONE;
@@ -46,6 +53,9 @@ static PyObject* PyMNNLLM_generate(LLM *self, PyObject *args) {
 }
 
 static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
+    if (self->is_embedding) {
+        Py_RETURN_NONE;
+    }
     const char* query = NULL;
     int stream = 0;
     if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) {
@@ -57,6 +67,9 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
 }
 
 static PyObject* PyMNNLLM_tokenizer_encode(LLM *self, PyObject *args) {
+    if (self->is_embedding) {
+        Py_RETURN_NONE;
+    }
     const char* prompt = NULL;
     int use_template = 0;
     if (!PyArg_ParseTuple(args, "s|p", &prompt, &use_template)) {
@@ -67,6 +80,9 @@ static PyObject* PyMNNLLM_tokenizer_encode(LLM *self, PyObject *args) {
 }
 
 static PyObject* PyMNNLLM_tokenizer_decode(LLM *self, PyObject *args) {
+    if (self->is_embedding) {
+        Py_RETURN_NONE;
+    }
     PyObject *id = nullptr;
     if (!PyArg_ParseTuple(args, "O", &id) && isInt(id)) {
         Py_RETURN_NONE;
@@ -75,6 +91,19 @@ static PyObject* PyMNNLLM_tokenizer_decode(LLM *self, PyObject *args) {
     return string2Object(query);
 }
 
+static PyObject* PyMNNLLM_txt_embedding(LLM *self, PyObject *args) {
+    if (!self->is_embedding) {
+        Py_RETURN_NONE;  // object was not created as an embedding model, so there is nothing to compute
+    }
+    const char* query = NULL;
+    if (!PyArg_ParseTuple(args, "s", &query)) {
+        return NULL;  // the parser already set the exception; returning None here would raise SystemError
+    }
+    auto embeds = getVar();
+    *(embeds->var) = ((MNN::Transformer::Embedding*)self->llm)->txt_embedding(query);
+    return (PyObject *)embeds;
+}
+
 static PyMethodDef PyMNNLLM_methods[] = {
     {"load", (PyCFunction)PyMNNLLM_load, METH_VARARGS, "load model."},
     {"forward", (PyCFunction)PyMNNLLM_forward, METH_VARARGS, "forward `logits` by `input_ids`."},
@@ -82,6 +111,7 @@ static PyMethodDef PyMNNLLM_methods[] = {
     {"response", (PyCFunction)PyMNNLLM_response, METH_VARARGS, "response `query` without hsitory."},
     {"tokenizer_encode", (PyCFunction)PyMNNLLM_tokenizer_encode, METH_VARARGS, "tokenizer encode."},
     {"tokenizer_decode", (PyCFunction)PyMNNLLM_tokenizer_decode, METH_VARARGS, "tokenizer decode."},
+    {"txt_embedding", (PyCFunction)PyMNNLLM_txt_embedding, METH_VARARGS, "txt embedding."},
     {NULL}  /* Sentinel */
 };
 
@@ -131,14 +161,21 @@ static PyObject* PyMNNLLM_create(PyObject *self, PyObject *args) {
         return NULL;
     }
     const char* path = NULL;
-    if (!PyArg_ParseTuple(args, "s", &path)) {
+    int embedding_model = 0;
+    if (!PyArg_ParseTuple(args, "s|p", &path, &embedding_model)) {
         return NULL;
     }
     LLM *llm = (LLM *)PyObject_Call((PyObject*)&PyMNNLLM, PyTuple_New(0), NULL);
     if (!llm) {
         return NULL;
     }
-    llm->llm = MNN::Transformer::Llm::createLLM(path);
+    if (embedding_model) {
+        llm->llm = MNN::Transformer::Embedding::createEmbedding(path);
+        llm->is_embedding = true;
+    } else {
+        llm->llm = MNN::Transformer::Llm::createLLM(path);
+    }
+
     return (PyObject*)llm;
 }
 
diff --git a/pymnn/test/unit_test.py b/pymnn/test/unit_test.py
index 551d08208..426f99d87 100644
--- a/pymnn/test/unit_test.py
+++ b/pymnn/test/unit_test.py
@@ -88,7 +88,8 @@ def test_Tensor(self):
         self.assertEqualArray(x.getNumpyData(), data)
         x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, data.__array_interface__['data'][0], MNN.Tensor_DimensionType_Tensorflow)
         self.assertEqualArray(x.getNumpyData(), data)
-        x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, mp.array([[1., 2.], [3., 4.]]).ptr, MNN.Tensor_DimensionType_Tensorflow)
+        v = mp.array([[1., 2.], [3., 4.]])
+        x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, v.ptr, MNN.Tensor_DimensionType_Tensorflow)
         self.assertEqualArray(x.getNumpyData(), data)
     def test_image_process(self):
         src = np.asarray([[50, 50], [200, 50], [50, 200]], dtype=np.float32)
@@ -481,14 +482,6 @@ def test_matrix_band_part(self):
         upper  = expr.scalar(-1)
         y = np.asarray([0, 1, 2, 3, -1, 0, 1, 2, -0, -1, 0, 1, -0, -0, -1, 0]).reshape([4, 4]).astype(np.float32)
         self.assertEqualVar(expr.matrix_band_part(matrix, lower, upper), y)
-    def test_moments(self):
-        x = expr.const([0.0, 1.0, 2.0, 3.0, -1.0, 0.0, 1.0, 2.0, -2.0, -1.0, 0.0, 1.0, -3.0, -2.0, -1.0, 0.0], [1, 4, 4, 1], expr.NCHW, expr.float)
-        x = expr.convert(x, expr.NC4HW4)
-        shift = expr.scalar(1.0)
-        res = expr.moments(x, [2, 3], shift, True)
-        self.assertEqual(len(res), 2)
-        self.assertEqual(res[0].read_as_tuple(), (1.5, 0.5, -0.5, -1.5))   # mean
-        self.assertEqual(res[1].read_as_tuple(), (1.25, 1.25, 1.25, 1.25)) # var
     def test_setdiff1d(self):
         x = expr.const([-1, 2, -3, 4, 5, -6, 7, -8, -9, -10, 11, 12, 13, 14, -15, -16], [16], expr.NHWC, expr.int)
         y = expr.const([-1, 2, -3, 4, 5, -6, 7, -8], [8], expr.NHWC, expr.int)
diff --git a/schema/current/MNN_generated.h b/schema/current/MNN_generated.h
index bb4f48a44..1c9647c87 100644
--- a/schema/current/MNN_generated.h
+++ b/schema/current/MNN_generated.h
@@ -33,6 +33,9 @@ struct FmhaV2ParamT;
 struct FmhcaParam;
 struct FmhcaParamT;
 
+struct StftParam;
+struct StftParamT;
+
 struct WhileParam;
 struct WhileParamT;
 
@@ -78,6 +81,8 @@ inline const flatbuffers::TypeTable *FmhaV2ParamTypeTable();
 
 inline const flatbuffers::TypeTable *FmhcaParamTypeTable();
 
+inline const flatbuffers::TypeTable *StftParamTypeTable();
+
 inline const flatbuffers::TypeTable *WhileParamTypeTable();
 
 inline const flatbuffers::TypeTable *IfParamTypeTable();
@@ -252,6 +257,7 @@ enum OpType {
   OpType_Svd = 153,
   OpType_Histogram = 154,
   OpType_DynamicQuant = 155,
+  OpType_Stft = 156,
   OpType_Plugin = 256,
   OpType_Select = 257,
   OpType_ZerosLike = 258,
@@ -287,7 +293,7 @@ enum OpType {
   OpType_MAX = OpType_GridSample
 };
 
-inline const OpType (&EnumValuesOpType())[182] {
+inline const OpType (&EnumValuesOpType())[183] {
   static const OpType values[] = {
     OpType_AbsVal,
     OpType_QuantizedAdd,
@@ -440,6 +446,7 @@ inline const OpType (&EnumValuesOpType())[182] {
     OpType_Svd,
     OpType_Histogram,
     OpType_DynamicQuant,
+    OpType_Stft,
     OpType_Plugin,
     OpType_Select,
     OpType_ZerosLike,
@@ -633,7 +640,7 @@ inline const char * const *EnumNamesOpType() {
     "Svd",
     "Histogram",
     "DynamicQuant",
-    "",
+    "Stft",
     "",
     "",
     "",
@@ -1193,11 +1200,12 @@ enum OpParameter {
   OpParameter_FmhaV2Param = 96,
   OpParameter_FmhcaParam = 97,
   OpParameter_AttentionParam = 98,
+  OpParameter_StftParam = 99,
   OpParameter_MIN = OpParameter_NONE,
-  OpParameter_MAX = OpParameter_AttentionParam
+  OpParameter_MAX = OpParameter_StftParam
 };
 
-inline const OpParameter (&EnumValuesOpParameter())[99] {
+inline const OpParameter (&EnumValuesOpParameter())[100] {
   static const OpParameter values[] = {
     OpParameter_NONE,
     OpParameter_QuantizedAdd,
@@ -1297,7 +1305,8 @@ inline const OpParameter (&EnumValuesOpParameter())[99] {
     OpParameter_GroupNorm,
     OpParameter_FmhaV2Param,
     OpParameter_FmhcaParam,
-    OpParameter_AttentionParam
+    OpParameter_AttentionParam,
+    OpParameter_StftParam
   };
   return values;
 }
@@ -1403,13 +1412,14 @@ inline const char * const *EnumNamesOpParameter() {
     "FmhaV2Param",
     "FmhcaParam",
     "AttentionParam",
+    "StftParam",
     nullptr
   };
   return names;
 }
 
 inline const char *EnumNameOpParameter(OpParameter e) {
-  if (e < OpParameter_NONE || e > OpParameter_AttentionParam) return "";
+  if (e < OpParameter_NONE || e > OpParameter_StftParam) return "";
   const size_t index = static_cast<int>(e);
   return EnumNamesOpParameter()[index];
 }
@@ -1810,6 +1820,10 @@ template<> struct OpParameterTraits<AttentionParam> {
   static const OpParameter enum_value = OpParameter_AttentionParam;
 };
 
+template<> struct OpParameterTraits<StftParam> {  // maps the table type to its OpParameter union tag
+  static const OpParameter enum_value = OpParameter_StftParam;
+};
+
 struct OpParameterUnion {
   OpParameter type;
   void *value;
@@ -2625,6 +2639,14 @@ struct OpParameterUnion {
     return type == OpParameter_AttentionParam ?
       reinterpret_cast<const AttentionParamT *>(value) : nullptr;
   }
+  StftParamT *AsStftParam() {  // nullptr unless the union's type tag is OpParameter_StftParam
+    return type == OpParameter_StftParam ?
+      reinterpret_cast<StftParamT *>(value) : nullptr;
+  }
+  const StftParamT *AsStftParam() const {  // const overload of the accessor above, same tag check
+    return type == OpParameter_StftParam ?
+      reinterpret_cast<const StftParamT *>(value) : nullptr;
+  }
 };
 
 bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj, OpParameter type);
@@ -3084,6 +3106,82 @@ inline flatbuffers::Offset<FmhcaParam> CreateFmhcaParam(
 
 flatbuffers::Offset<FmhcaParam> CreateFmhcaParam(flatbuffers::FlatBufferBuilder &_fbb, const FmhcaParamT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct StftParamT : public flatbuffers::NativeTable {  // plain-struct form produced by StftParam::UnPack
+  typedef StftParam TableType;
+  int32_t n_fft;
+  int32_t hop_length;
+  bool abs;
+  StftParamT()
+      : n_fft(0),
+        hop_length(0),
+        abs(true) {  // same defaults the StftParam accessors fall back to (0, 0, true)
+  }
+};
+
+struct StftParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {  // read-only view over a serialized StftParam
+  typedef StftParamT NativeTableType;
+  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
+    return StftParamTypeTable();
+  }
+  int32_t n_fft() const {
+    return GetField<int32_t>(4, 0);  // field slot 4, falls back to 0
+  }
+  int32_t hop_length() const {
+    return GetField<int32_t>(6, 0);  // field slot 6, falls back to 0
+  }
+  bool abs() const {
+    return GetField<uint8_t>(8, 1) != 0;  // field slot 8, stored as a byte, falls back to 1 (true)
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {  // checks the table start and each of the three fields
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, 4) &&
+           VerifyField<int32_t>(verifier, 6) &&
+           VerifyField<uint8_t>(verifier, 8) &&
+           verifier.EndTable();
+  }
+  StftParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(StftParamT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<StftParam> Pack(flatbuffers::FlatBufferBuilder &_fbb, const StftParamT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct StftParamBuilder {  // incremental writer for one StftParam table inside fbb_
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_n_fft(int32_t n_fft) {
+    fbb_.AddElement<int32_t>(4, n_fft, 0);  // slot 4; last argument is the field default
+  }
+  void add_hop_length(int32_t hop_length) {
+    fbb_.AddElement<int32_t>(6, hop_length, 0);  // slot 6; last argument is the field default
+  }
+  void add_abs(bool abs) {
+    fbb_.AddElement<uint8_t>(8, static_cast<uint8_t>(abs), 1);  // slot 8; written as a byte, default 1
+  }
+  explicit StftParamBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();  // remembered so Finish() can close this table
+  }
+  StftParamBuilder &operator=(const StftParamBuilder &);
+  flatbuffers::Offset<StftParam> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<StftParam>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<StftParam> CreateStftParam(  // one-call helper around StftParamBuilder
+    flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t n_fft = 0,
+    int32_t hop_length = 0,
+    bool abs = true) {
+  StftParamBuilder builder_(_fbb);
+  builder_.add_hop_length(hop_length);
+  builder_.add_n_fft(n_fft);
+  builder_.add_abs(abs);
+  return builder_.Finish();  // offset of the finished table within _fbb
+}
+
+flatbuffers::Offset<StftParam> CreateStftParam(flatbuffers::FlatBufferBuilder &_fbb, const StftParamT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct WhileParamT : public flatbuffers::NativeTable {
   typedef WhileParam TableType;
   std::string cond_graph;
@@ -3863,6 +3961,9 @@ struct Op FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const AttentionParam *main_as_AttentionParam() const {
     return main_type() == OpParameter_AttentionParam ? static_cast<const AttentionParam *>(main()) : nullptr;
   }
+  const StftParam *main_as_StftParam() const {  // nullptr unless main_type() is OpParameter_StftParam
+    return main_type() == OpParameter_StftParam ? static_cast<const StftParam *>(main()) : nullptr;
+  }
   const flatbuffers::String *name() const {
     return GetPointer<const flatbuffers::String *>(10);
   }
@@ -4292,6 +4393,10 @@ template<> inline const AttentionParam *Op::main_as<AttentionParam>() const {
   return main_as_AttentionParam();
 }
 
+template<> inline const StftParam *Op::main_as<StftParam>() const {  // template spelling of main_as_StftParam()
+  return main_as_StftParam();
+}
+
 struct OpBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -5167,6 +5272,38 @@ inline flatbuffers::Offset<FmhcaParam> CreateFmhcaParam(flatbuffers::FlatBufferB
       _heads);
 }
 
+inline StftParamT *StftParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new StftParamT();  // raw allocation: the returned pointer is handed to the caller as-is
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void StftParam::UnPackTo(StftParamT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;  // unused here; the cast silences the warning
+  { auto _e = n_fft(); _o->n_fft = _e; };
+  { auto _e = hop_length(); _o->hop_length = _e; };
+  { auto _e = abs(); _o->abs = _e; };  // each field is copied through its accessor, so defaults apply
+}
+
+inline flatbuffers::Offset<StftParam> StftParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const StftParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateStftParam(_fbb, _o, _rehasher);  // forwards to the StftParamT overload of CreateStftParam
+}
+
+inline flatbuffers::Offset<StftParam> CreateStftParam(flatbuffers::FlatBufferBuilder &_fbb, const StftParamT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const StftParamT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _n_fft = _o->n_fft;  // _o is dereferenced without a null check
+  auto _hop_length = _o->hop_length;
+  auto _abs = _o->abs;
+  return MNN::CreateStftParam(  // delegates to the scalar-argument overload
+      _fbb,
+      _n_fft,
+      _hop_length,
+      _abs);
+}
+
 inline WhileParamT *WhileParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new WhileParamT();
   UnPackTo(_o, _resolver);
@@ -6015,6 +6152,10 @@ inline bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj,
       auto ptr = reinterpret_cast<const AttentionParam *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case OpParameter_StftParam: {
+      auto ptr = reinterpret_cast<const StftParam *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -6425,6 +6566,10 @@ inline void *OpParameterUnion::UnPack(const void *obj, OpParameter type, const f
       auto ptr = reinterpret_cast<const AttentionParam *>(obj);
       return ptr->UnPack(resolver);
     }
+    case OpParameter_StftParam: {
+      auto ptr = reinterpret_cast<const StftParam *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -6823,6 +6968,10 @@ inline flatbuffers::Offset<void> OpParameterUnion::Pack(flatbuffers::FlatBufferB
       auto ptr = reinterpret_cast<const AttentionParamT *>(value);
       return CreateAttentionParam(_fbb, ptr, _rehasher).Union();
     }
+    case OpParameter_StftParam: {
+      auto ptr = reinterpret_cast<const StftParamT *>(value);
+      return CreateStftParam(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -7221,6 +7370,10 @@ inline OpParameterUnion::OpParameterUnion(const OpParameterUnion &u) FLATBUFFERS
       value = new AttentionParamT(*reinterpret_cast<AttentionParamT *>(u.value));
       break;
     }
+    case OpParameter_StftParam: {
+      value = new StftParamT(*reinterpret_cast<StftParamT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -7718,6 +7871,11 @@ inline void OpParameterUnion::Reset() {
       delete ptr;
       break;
     }
+    case OpParameter_StftParam: {
+      auto ptr = reinterpret_cast<StftParamT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
@@ -7907,12 +8065,13 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
     { flatbuffers::ET_INT, 0, 0 },
     { flatbuffers::ET_INT, 0, 0 },
     { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
     { flatbuffers::ET_INT, 0, 0 }
   };
   static const flatbuffers::TypeFunction type_refs[] = {
     OpTypeTypeTable
   };
-  static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 299, 300, 301, 302, 303, 304, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 };
+  static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 299, 300, 301, 302, 303, 304, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 };
   static const char * const names[] = {
     "AbsVal",
     "QuantizedAdd",
@@ -8065,6 +8224,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
     "Svd",
     "Histogram",
     "DynamicQuant",
+    "Stft",
     "Plugin",
     "Select",
     "ZerosLike",
@@ -8098,7 +8258,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
     "GridSample"
   };
   static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_ENUM, 182, type_codes, type_refs, values, names
+    flatbuffers::ST_ENUM, 183, type_codes, type_refs, values, names
   };
   return &tt;
 }
@@ -8203,7 +8363,8 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() {
     { flatbuffers::ET_SEQUENCE, 0, 94 },
     { flatbuffers::ET_SEQUENCE, 0, 95 },
     { flatbuffers::ET_SEQUENCE, 0, 96 },
-    { flatbuffers::ET_SEQUENCE, 0, 97 }
+    { flatbuffers::ET_SEQUENCE, 0, 97 },
+    { flatbuffers::ET_SEQUENCE, 0, 98 }
   };
   static const flatbuffers::TypeFunction type_refs[] = {
     QuantizedAddTypeTable,
@@ -8303,7 +8464,8 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() {
     GroupNormTypeTable,
     FmhaV2ParamTypeTable,
     FmhcaParamTypeTable,
-    AttentionParamTypeTable
+    AttentionParamTypeTable,
+    StftParamTypeTable
   };
   static const char * const names[] = {
     "NONE",
@@ -8404,10 +8566,11 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() {
     "GroupNorm",
     "FmhaV2Param",
     "FmhcaParam",
-    "AttentionParam"
+    "AttentionParam",
+    "StftParam"
   };
   static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_UNION, 99, type_codes, type_refs, nullptr, names
+    flatbuffers::ST_UNION, 100, type_codes, type_refs, nullptr, names
   };
   return &tt;
 }
@@ -8550,6 +8713,23 @@ inline const flatbuffers::TypeTable *FmhcaParamTypeTable() {
   return &tt;
 }
 
+inline const flatbuffers::TypeTable *StftParamTypeTable() {  // reflection metadata returned by StftParam::MiniReflectTypeTable
+  static const flatbuffers::TypeCode type_codes[] = {
+    { flatbuffers::ET_INT, 0, -1 },
+    { flatbuffers::ET_INT, 0, -1 },
+    { flatbuffers::ET_BOOL, 0, -1 }
+  };
+  static const char * const names[] = {  // same order as type_codes above
+    "n_fft",
+    "hop_length",
+    "abs"
+  };
+  static const flatbuffers::TypeTable tt = {
+    flatbuffers::ST_TABLE, 3, type_codes, nullptr, nullptr, names
+  };
+  return &tt;
+}
+
 inline const flatbuffers::TypeTable *WhileParamTypeTable() {
   static const flatbuffers::TypeCode type_codes[] = {
     { flatbuffers::ET_STRING, 0, -1 },
diff --git a/schema/default/MNN.fbs b/schema/default/MNN.fbs
index d415bddcb..e5e588a66 100644
--- a/schema/default/MNN.fbs
+++ b/schema/default/MNN.fbs
@@ -168,6 +168,7 @@ enum OpType : int {
     Svd = 153,
     Histogram = 154,
     DynamicQuant = 155,
+    Stft = 156,
 
     Plugin = 256, //The Type load from plugin
     //Training Op Start from 257
@@ -239,6 +240,12 @@ table FmhcaParam {
     heads: int;
 }
 
+table StftParam { // member of the OpParameter union; presumably the parameters of the Stft op - confirm
+    n_fft: int;
+    hop_length: int;
+    abs: bool = true; // only field with an explicit default
+}
+
 table WhileParam {
     // The name of condition subgraph.
     cond_graph: string;
@@ -414,7 +421,8 @@ union OpParameter {
     GroupNorm,
     FmhaV2Param,
     FmhcaParam,
-    AttentionParam
+    AttentionParam,
+    StftParam
 }
 
 table Op {
diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp
index 2e297263e..4c4cb7873 100644
--- a/source/backend/cpu/CPUBackend.cpp
+++ b/source/backend/cpu/CPUBackend.cpp
@@ -645,8 +645,8 @@ Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::v
     // TODO: rm this convert when merge diff datatyoe of op
     auto map  = gCreator;
     auto iter = map->find(opType);
-    if (iter == map->end()) {
-        MNN_PRINT("Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str());
+    if (iter == map->end() ) {
+        MNN_PRINT("Don't support type [%s]\n", MNN::EnumNameOpType(op->type()));
         return nullptr;
     }
     Execution* exe = nullptr;
diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp
index b23bc8ead..3ec321c99 100644
--- a/source/backend/cpu/CPUBackend.hpp
+++ b/source/backend/cpu/CPUBackend.hpp
@@ -237,6 +237,9 @@ class CastWrapExecution : public Execution {
         CPUBackend::addCreator(opType, &_temp); \
     }
 
+#define REGISTER_CPU_OP_CREATOR_AUDIO(name, opType) \
+    REGISTER_CPU_OP_CREATOR(name, opType) // currently a plain alias of REGISTER_CPU_OP_CREATOR
+
 } // namespace MNN
 
 #endif /* CPUBackend_hpp */
diff --git a/source/backend/cpu/CPUBinaryInt8.cpp b/source/backend/cpu/CPUBinaryInt8.cpp
index a1da4a2e4..cf46a1af5 100644
--- a/source/backend/cpu/CPUBinaryInt8.cpp
+++ b/source/backend/cpu/CPUBinaryInt8.cpp
@@ -80,16 +80,16 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
 
     int inpBytes = 1;
     int outBytes = 1;
+    QuanPrePostParameters params;
+    
+    params.inputScale = mInputScales.data();
+    params.outputScale = mOutputScales.data();
+    params.outputZeroPoint = mOutputZeros.data();
+    params.inputZeroPoint = mInputZeros.data();
+    params.minValue = (ssize_t)mMinValue;
+    params.maxValue = (ssize_t)TensorUtils::getDescribe(outputs[0])->quantAttr->max;
 
     MNN_CONCURRENCY_BEGIN(tId, schedule.second) {
-        QuanPrePostParameters params;
-        
-        params.inputScale = mInputScales.data();
-        params.outputScale = mOutputScales.data();
-        params.outputZeroPoint = mOutputZeros.data();
-        params.inputZeroPoint = mInputZeros.data();
-        params.minValue = (ssize_t)mMinValue;
-        params.maxValue = (ssize_t)TensorUtils::getDescribe(outputs[0])->quantAttr->max;
 
         int start = schedule.first * (int)tId;
         int realSize = schedule.first;
diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp
index bdef005cb..8bfcf0738 100644
--- a/source/backend/cpu/CPUDeconvolution.cpp
+++ b/source/backend/cpu/CPUDeconvolution.cpp
@@ -18,7 +18,6 @@
 #include "core/ConvolutionCommon.hpp"
 #include "compute/CommonOptFunction.h"
 #include "compute/ConvOpt.h"
-#include "compute/DeconvolutionWithStride.hpp"
 //#define MNN_OPEN_TIME_TRACE
 #include <MNN/AutoTime.hpp>
 
@@ -83,63 +82,13 @@ static void _transformWeight(const uint8_t* tempWeight, uint8_t* dest, int outpu
     //printf("%d - %d - %d - %d\n", outputCount, srcCount, fh, fw);
     core->MNNPackForMatMul_B((float*)dest, (const float*)cache, outputC4 * fw * fh * core->pack, srcCount, false);
 }
-// Int8 Weight.
-static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, const int8_t* srcPtr,
-                               std::shared_ptr<Tensor>& weight) {
-    auto core = static_cast<CPUBackend*>(bn)->int8Functions();
-    auto gcore =  static_cast<CPUBackend*>(bn)->functions();
-    int UNIT, SRC_UNIT, DST_XUNIT;
-    core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    UNIT = gcore->pack;
 
-    int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
-    std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
-
-    weight.reset(Tensor::createDevice<int8_t>(shape));
-    bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC);
-    if (!succ) {
-        MNN_ERROR("Memory not enough");
-        return;
-    }
-    auto dstPtr = weight->host<int8_t>();
-    ::memset(dstPtr, 0, weight->size());
-
-    int icDiv = UP_DIV(ic, SRC_UNIT);
-     for (int k = 0; k < kernelCount; ++k) {
-        auto srcK = srcPtr + k;
-        auto dstK = dstPtr + k * SRC_UNIT * UNIT * icDiv;
-        for (int x = 0; x < oc; ++x) {
-            int xout = x / UNIT;
-            int xin = x % UNIT;
-            auto srcY = srcK + x * kernelCount;
-            auto dstY = dstK + xout * SRC_UNIT * UNIT * icDiv * kernelCount + xin * SRC_UNIT;
-            for (int y = 0; y < ic; ++y) {
-                int yout = y / SRC_UNIT;
-                int yin = y % SRC_UNIT;
-
-                const int dstIndex = yout * SRC_UNIT * UNIT + yin;
-                const int srcIndex = y * oc * kernelCount;
-                dstY[dstIndex] = srcY[srcIndex];
-            }
-        }
-    }
-}
 CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backend* backend, bool dynamicWeight)
     : MNN::CPUDeconvolutionCommon(input, convOp, backend, dynamicWeight) {
     auto core               = static_cast<CPUBackend*>(backend)->functions();
     auto coreInt8           = static_cast<CPUBackend*>(backend)->int8Functions();
     int eP, lP, hP;
     core->MNNGetMatMulPackMode(&eP, &lP, &hP);
-    int UNIT, SRC_UNIT, DST_XUNIT;
-    coreInt8->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    bool ModeInt8        =  false;
-
-    if (CPUBackend::getDataType(input) == DataType_DT_INT8 || input->getType().bytes() == 1) {
-        eP = DST_XUNIT;
-        lP = SRC_UNIT;
-        hP = UNIT;
-        ModeInt8 = true;
-    }
     auto conv2d                  = convOp->main_as_Convolution2D();
     auto layer                   = conv2d->common();
     int outputCount              = layer->outputCount();
@@ -155,30 +104,17 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
     mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
     std::shared_ptr<Tensor> cache(Tensor::createDevice<float>({outputAlign * srcCount}));
     if (dynamicWeight) {
-        mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, ModeInt8));
+        mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, false));
         mWeightTransformCache = cache;
         return;
     }
 
     const float* tempWeight      = nullptr;
-    const int8_t* quanWeightInt8 = nullptr;
 
     int tempWeightSize   = 0;
-    std::unique_ptr<Tensor> externalWeightTensor;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
 
-    std::vector<int32_t> _bias(outputChannleUp4, 0);
-    std::vector<float> _scale(outputChannleUp4, 0);
-    std::vector<int32_t> _beta(outputChannleUp4, 0);
-    auto biasPtr = _bias.data();
-    auto scalePtr = _scale.data();
-    auto betaPtr = _beta.data();
-
-    if (ModeInt8) {
-        ConvolutionCommon::getConvInt8Parameters(convOp, quanCommon, backend, quanWeightInt8, tempWeightSize, scalePtr, biasPtr, betaPtr);
-    } else {
-        ConvolutionCommon::getConvParameters(&quanCommon, backend, convOp, &tempWeight, &tempWeightSize);
-    }
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, convOp, &tempWeight, &tempWeightSize);
 
     bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) &&
                    backend->onAcquireBuffer(cache.get(), Backend::STATIC);
@@ -196,26 +132,16 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
         core->MNNFp32ToLowp(tempWeight, (int16_t*)lowpWeight.get(), outputCount * srcCount * fh * fw);
         tempWeight = (float*)lowpWeight.get();
     }
-    if (!ModeInt8) {
-        mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
-        success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
-        if (!success) {
-            mValid = false;
-            return;
-        }
-        auto dest = mWeight->host<uint8_t>();
-        _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host<uint8_t>(), core);
-    } else {
-        mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
-        success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
-        if (!success) {
-            mValid = false;
-            return;
-        }
-        _reorderWeightInt8(backend, layer, quanWeightInt8, mWeight);
+    mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
+    success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
+    if (!success) {
+        mValid = false;
+        return;
     }
+    auto dest = mWeight->host<uint8_t>();
+    _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host<uint8_t>(), core);
     backend->onReleaseBuffer(cache.get(), Backend::STATIC);
-    mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, ModeInt8));
+    mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, false));
 }
 
 CPUDeconvolution::~CPUDeconvolution() {
@@ -261,68 +187,21 @@ ErrorCode CPUDeconvolution::onResize(const std::vector<Tensor *> &inputs, const
 }
 
 CPUDeconvolutionOrigin::CPUDeconvolutionOrigin(const Tensor *input, Tensor *weight, const Op *convOp, Backend *b, bool ModeInt8) : CPUDeconvolutionBasic(input, convOp, b) {
-    if (ModeInt8) {
-        const auto weightDataPtr = weight->host<int8_t>();
-        auto conv2d = convOp->main_as_Convolution2D();
-        auto common = conv2d->common();
-        auto pack = static_cast<CPUBackend*>(b)->functions()->pack;
-        mResource = CPUConvolution::makeResourceInt8(backend(), convOp, pack);
-        CPUConvolution::MutableResourceInt8 mutableResource(mResource, b);
-        auto core = static_cast<CPUBackend*>(b)->int8Functions();
-        auto gemmKernel = core->Int8GemmKernel;
-        int UNIT, SRC_UNIT, DST_XUNIT;
-        core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-        const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY();
-        const int ocDiv4 = UP_DIV(common->outputCount(), pack) * kEleCnt;
-        const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT);
-        const int ocDivUnit = UP_DIV(common->outputCount(), UNIT);
-        const int oc4 = ocDiv4 / kEleCnt;
-        const int bias_elesize = ocDiv4 * pack;
-        // set offset if use SSE.
-        auto inputQuant = TensorUtils::getQuantInfo(input);
-        auto inputZeroPoint = inputQuant[1];
-        std::vector<int32_t> _bias(bias_elesize, inputZeroPoint);
-#ifdef MNN_USE_SSE
-        int actBits = conv2d->symmetricQuan()->nbits();
-        if (actBits <= 7) {
-            gemmKernel = core->Int8GemmKernelFast;
-        }
-        for (int a = 0; a < kEleCnt; ++a){
-            for (int oz = 0; oz < ocDivUnit * UNIT; ++oz) {
-                int offset = inputZeroPoint, oz4 = oz / UNIT, ozRemain = oz % UNIT;
-                for (int sz = 0; sz < icDiv4 * SRC_UNIT; ++sz) {
-                    int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT;
-                    int index = (((a * oc4 + oz4) * icDiv4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain;
-                    auto weightInt8Data = weightDataPtr[index];
-                    offset += weightInt8Data * (-128);
-                }
-                if (oz < oc4 * pack) {
-                    _bias[a * oc4 * pack + oz] = offset;
-                }
-            }
-        }
-#else
-        if(conv2d->symmetricQuan() && conv2d->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){
-            gemmKernel = core->Int8GemmKernelFast;
-        }
-#endif
-        mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, convOp, gemmKernel, _bias));
-    }
+    // Nothing to do here; the ModeInt8 argument is currently unused.
 }
 
 ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
     CPUDeconvolutionBasic::onResize(inputs, outputs);
     auto core = static_cast<CPUBackend*>(backend())->functions();
-    auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
     int bytes = core->bytes;
     auto input  = inputs[0];
     auto output = outputs[0];
     auto oc     = output->channel();
-    int UNIT, SRC_UNIT, DST_XUNIT;
-    gcore->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
     if (UP_DIV(oc, core->pack) * core->pack != inputs[2]->length(0)) {
         return INPUT_DATA_ERROR;
     }
+    int eP, lP, hP;
+    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
 
     auto ocC4       = UP_DIV(output->channel(), core->pack);
     auto icC4       = UP_DIV(input->channel(), core->pack);
@@ -339,136 +218,132 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     auto src_height = output->height();
     auto src_width  = output->width();
     auto batch      = output->batch();
+    auto weightTensor = inputs[1];
+    auto biasTensor = inputs[2];
 
     auto kernelCount = ocC4 * mCommon->kernelX() * mCommon->kernelY();
-    mPostFunctions.clear();
-    auto plane         = width * height * batch;
-    const int maxDepth = 5;
+    auto plane = width * height * batch;
     auto allocator = static_cast<CPUBackend*>(backend())->getBufferAllocator();
-    //int zeroPoint = 0;
-
-    auto biasTensor = inputs[2];
-
-    // prepare for float2int8 if necessary.
-    auto outputQuant = TensorUtils::getQuantInfo(outputs[0]);
-    float scale = outputQuant[0];
-    scale = (scale == 0.f ? 0.f : 1.f / scale);
-    auto maxValue = outputQuant[3];
-    auto minValue = outputQuant[2];
-    auto zeroPoint = outputQuant[1];
-
-    AutoRelease<Tensor> tempInput(Tensor::createDevice<float>({icC4, plane, core->pack}));
-    bool needReleaseTempInput = true;
-    int outi8 = 0;
-    if (CPUBackend::getDataType(output) == DataType_DT_INT8 || output->getType().bytes() == 1) {
-        outi8 = 1;
+    auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
+    auto tileCount = UP_DIV(plane, eP);
+    threadNumber = ALIMIN(tileCount, threadNumber);
+    auto im2colOutputStride = input->channel() * eP * core->bytes;
+    mGemmInput = allocator->alloc(threadNumber * im2colOutputStride);
+    auto gemmOutputStride = kernelCount * core->pack * eP * core->bytes;
+    mGemmOutput = allocator->alloc(threadNumber * gemmOutputStride);
+    auto outputSize = batch*src_width*src_height*ocC4*core->pack*core->bytes;
+    if (threadNumber > 1) {
+        mExtraOutput = allocator->alloc((threadNumber-1)*outputSize);
     }
-    if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
-        mTempOutput.reset(Tensor::createDevice<float>({batch, height, width, ocC4 * kw * kh * core->pack}));
-        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-        mDeconvInt8Exe->onResize({input}, {mTempOutput.get()});
-        if (mResource->mRelu) {
-            minValue = outputQuant[1];
-        }
+    allocator->free(mGemmInput);
+    allocator->free(mGemmOutput);
+    if (threadNumber > 1) {
+        allocator->free(mExtraOutput);
     }
-    else {
-        mTempOutput.reset(Tensor::createDevice<float>({kernelCount, plane, core->pack}));
-        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-        mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
-        // tempInput->buffer().host = (uint8_t*)inputPtr;
-
-        needReleaseTempInput = false;
-        TensorUtils::getDescribeOrigin(tempInput.get())->mem = new CPUMemObj(nullptr, TensorUtils::getDescribeOrigin(input)->mem->chunk(), 0);
-        mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()});
-    }
-    auto threadNumber = ((CPUBackend*)backend())->threadNumber();
-    std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
-    MemChunk outputFp32Ptr;
-    if (outi8) {
-        outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes);
-        if (outputFp32Ptr.invalid()) {
-            return OUT_OF_MEMORY;
-        }
-    }
-
-    mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
-                       strideX, threadNumber, src_width, src_height, plane, input, biasTensor, this, core, gcore, batch, outi8, scale,
-                       minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) {
-        auto colBufferPtr = mTempOutput->host<uint8_t>();
-        auto biasPtr      = biasTensor->host<float>();
-        auto inputPtr  = input->host<float>();
+    auto first = std::make_pair([=](uint8_t* outputPtr, int tId) {
+        auto gemmInputBufferPtr = mGemmInput.ptr() + tId * im2colOutputStride;
+        auto colBufferPtr = mGemmOutput.ptr() + tId * gemmOutputStride;
+        auto inputPtr  = input->host<uint8_t>();
         auto unitBytes = core->pack * core->bytes;
         auto tempOutPtr = outputPtr;
-        auto float2Int8_step = src_height * src_width * batch;
-        if (outi8) {
-            tempOutPtr = outputFp32Ptr.ptr();
+        if (tId > 0) {
+            tempOutPtr = mExtraOutput.ptr() + (tId-1) * outputSize;
         }
-        for (int z = (tId); z < ocC4; z += threadNumber) {
-            auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
-            auto srcZ = colBufferPtr + kw * kh * plane * z * unitBytes;
-            ::memset(dstZ, 0, src_width * src_height * batch * unitBytes);
-            for (int b = 0; b < batch; ++b) {
-                auto dstB = dstZ + b * src_width  * src_height * unitBytes;
-                auto srcB = srcZ + b * width * height * unitBytes;
-                for (int oy = 0; oy < height; ++oy) {
-                    for (int ox = 0; ox < width; ++ox) {
-                        int srcStartX = ox * strideX - padX;
-                        int srcStartY = oy * strideY - padY;
-
-                        int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
-                        int efy = ALIMIN(kh, UP_DIV(src_height - srcStartY, dilateY));
-
-                        int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
-                        int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX));
-
-                        auto dstStart = dstB + srcStartX * unitBytes + srcStartY * src_width * unitBytes;
-                        auto srcStart = srcB + unitBytes * (ox + oy * width);
-                        if (sfy >= efy || sfx >= efx) {
-                            continue;
-                        }
-
-                        for (int fy = sfy; fy < efy; ++fy) {
-                            auto dstY = dstStart + fy * unitBytes * dilateY * src_width;
-                            auto srcY = srcStart + fy * kw * plane * unitBytes;
-                            core->MNNAddC4WithStride((const float*)(srcY + sfx * plane * unitBytes), (float*)(dstY + sfx * dilateX * unitBytes), plane * core->pack, dilateX * core->pack, efx - sfx);
-                        }
+        ::memset(tempOutPtr, 0, outputSize);
+
+        int l = mSrcCount;
+        int h = kernelCount * core->pack;
+        auto weightPtr = weightTensor->host<uint8_t>();
+        for (int index=tId; index < tileCount; index+=threadNumber) {
+            int xStart = index * eP;
+            int xEnd = ALIMIN(xStart + eP, plane);
+            int xCount = xEnd-xStart;
+            if (xCount <= 0) {
+                continue;
+            }
+            size_t parameters[7];
+            parameters[0] = xCount * core->bytes;
+            parameters[1] = l;
+            parameters[2] = h;
+            parameters[3] = xCount * core->bytes * core->pack;
+            parameters[4] = 0;
+            parameters[5] = 0;
+            parameters[6] = 0;
+            const float* postParametersPtr = nullptr;
+            int32_t info[4];
+            int32_t stride[4];
+            stride[0] = xCount;
+            stride[1] = (int32_t)parameters[1];
+            stride[2] = 0;
+            stride[3] = 0;
+            info[0] = 1;
+            info[1] = plane;
+            info[2] = xCount;
+            info[3] = 1;
+            auto aStart = inputPtr + xStart * unitBytes;
+            core->MNNPackC4ForMatMul_A((float*)(gemmInputBufferPtr), (const float**)(&aStart), info, stride);
+            if (xCount == eP) {
+                core->MNNPackedMatMul((float*)(colBufferPtr), (float*)gemmInputBufferPtr, (float*)weightPtr, parameters, postParametersPtr, nullptr, nullptr, nullptr);
+            } else {
+                core->MNNPackedMatMulRemain((float*)(colBufferPtr), (float*)gemmInputBufferPtr, (float*)weightPtr, xCount, parameters, postParametersPtr, nullptr, nullptr, nullptr);
+            }
+            // Col2Im: add this tile's GEMM result (colBufferPtr) into the output buffer (tempOutPtr)
+            for (int z = 0; z < ocC4; ++z) {
+                auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
+                auto srcZ = colBufferPtr + kw * kh * xCount * z * unitBytes;
+                for (int x=0; x<xCount; ++x) {
+                    auto index = xStart + x;
+                    int b = index / (width * height);
+                    index = index % (width * height);
+                    int oy = index / width;
+                    int ox = index % width;
+                    int srcStartX = ox * strideX - padX;
+                    int srcStartY = oy * strideY - padY;
+                    
+                    int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
+                    int efy = ALIMIN(kh, UP_DIV(src_height - srcStartY, dilateY));
+                    
+                    int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
+                    int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX));
+                    
+                    auto dstStart = dstZ + b * src_width * src_height * unitBytes + srcStartX * unitBytes + srcStartY * src_width * unitBytes;
+                    auto srcStart = srcZ + x * unitBytes;
+                    if (sfy >= efy || sfx >= efx) {
+                        continue;
+                    }
+                    
+                    for (int fy = sfy; fy < efy; ++fy) {
+                        auto dstY = dstStart + fy * unitBytes * dilateY * src_width;
+                        auto srcY = srcStart + fy * kw * xCount * unitBytes;
+                        core->MNNAddC4WithStride((const float*)(srcY + sfx * xCount * unitBytes), (float*)(dstY + sfx * dilateX * unitBytes), xCount * core->pack, dilateX * core->pack, efx - sfx);
                     }
                 }
             }
-            core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr +  unitBytes * z), src_height * src_width * batch, 0, 0, 1, mPostParameters.data());
-            if (outi8) {
-                float scaleOne = scale;
-                float zeroOne  = zeroPoint;
-                gcore->MNNFloat2Int8((float*)dstZ, (int8_t*)(outputPtr + z * float2Int8_step * core->pack), float2Int8_step, &scaleOne, minValue, maxValue, &zeroOne, 0);
+        }
+    }, threadNumber);
+    auto second = std::make_pair([ocC4, src_height, src_width, threadNumber, batch, biasTensor, this, outputSize, core](uint8_t* outputPtr, int tId) {
+        auto unitBytes = core->pack * core->bytes;
+        auto biasPtr = biasTensor->host<uint8_t>();
+        for (int z = tId; z < ocC4; z+=threadNumber) {
+            auto dstZ = outputPtr + z * src_height * src_width * batch * unitBytes;
+            if (threadNumber > 1) {
+                for (int index=0; index<threadNumber-1; ++index) {
+                    auto src = mExtraOutput.ptr() + index * outputSize + z * src_height * src_width * batch * unitBytes;
+                    core->MNNMatrixAdd((float*)(dstZ), (float*)(src), (float*)(dstZ), src_height * src_width * batch, 0, 0, 0, 1);
+                }
             }
+            core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr +  unitBytes * z), src_height * src_width * batch, 0, 0, 1, mPostParameters.data());
         }
-    }, threadNumber));
-    if (outi8) {
-        allocator->free(outputFp32Ptr);
-    }
-    if (needReleaseTempInput) {
-        backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
-    }
-    backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
+
+    }, threadNumber);
+    mExecuteFuntion = {first, second};
     return NO_ERROR;
 }
 
 ErrorCode CPUDeconvolutionOrigin::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
     auto inputPtr = inputs[0]->host<uint8_t>();
     auto outputPtr = outputs[0]->host<uint8_t>();
-    if (mDeconvInt8Exe.get() != nullptr) {
-        mDeconvInt8Exe->onExecute({inputs[0], inputs[1]}, {mTempOutput.get()});
-    }
-    else {
-        mMatMul->onExecute();
-    }
-    for (auto& unit : mPostFunctions) {
+    for (auto& unit : mExecuteFuntion) {
         MNN_CONCURRENCY_BEGIN(tId, unit.second) {
             unit.first(outputPtr, (int)tId);
         }
@@ -482,15 +357,6 @@ class CPUDeconvolutionCreator : public CPUBackend::Creator {
                                 const MNN::Op* op, Backend* backend) const {
         auto convOp = op->main_as_Convolution2D();
         auto common = convOp->common();
-        if (backend->type() == MNN_FORWARD_CPU && inputs.size() == 1) {
-            if (common->strideY() > 1 || common->strideX() > 1) {
-                if (common->dilateX() == 1 && common->dilateY() == 1) {
-                    if (common->kernelX() / common->strideX() > 2 || common->kernelY() / common->strideY() > 2) {
-                        return new DeconvolutionWithStride(inputs[0], op, backend);
-                    }
-                }
-            }
-        }
         return new CPUDeconvolution(inputs[0], op, backend, inputs.size() > 1);
     }
 };
diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp
index 82f7168d4..bea9f164a 100644
--- a/source/backend/cpu/CPUDeconvolution.hpp
+++ b/source/backend/cpu/CPUDeconvolution.hpp
@@ -12,7 +12,6 @@
 #include "CPUConvolution.hpp"
 #include "compute/CommonOptFunction.h"
 #include "compute/StrassenMatmulComputor.hpp"
-#include "compute/GemmInt8Executor.hpp"
 #include "core/TensorUtils.hpp"
 namespace MNN {
 class CPUDeconvolutionBasic : public CPUConvolution {
@@ -44,11 +43,11 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic {
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
 
 private:
-    std::shared_ptr<StrassenMatrixComputor> mMatMul;
-    std::shared_ptr<GemmInt8Executor> mDeconvInt8Exe;
-    std::vector<std::pair<std::function<void(uint8_t*, int)>, int>> mPostFunctions;
-    std::shared_ptr<Tensor> mTempOutput;
-    std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
+    MemChunk mGemmOutput;
+    MemChunk mGemmInput;
+    MemChunk mExtraOutput;
+
+    std::vector<std::pair<std::function<void(uint8_t*, int)>, int>> mExecuteFuntion;
 };
 
 class CPUDeconvolution : public CPUDeconvolutionCommon {
diff --git a/source/backend/cpu/CPUEltwiseInt8.cpp b/source/backend/cpu/CPUEltwiseInt8.cpp
index 47362058e..91dd9cbb5 100644
--- a/source/backend/cpu/CPUEltwiseInt8.cpp
+++ b/source/backend/cpu/CPUEltwiseInt8.cpp
@@ -6,8 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
 
-#include "backend/cpu/CPUEltwiseInt8.hpp"
 #include "backend/cpu/CPUBackend.hpp"
+#ifdef MNN_SUPPORT_DEPRECATED_OP
+#include "backend/cpu/CPUEltwiseInt8.hpp"
 #include "core/Concurrency.h"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
@@ -129,7 +130,9 @@ class CPUEltwiseInt8Creator : public CPUBackend::Creator {
         return new CPUEltwiseInt8(backend, op);
     }
 };
-
-REGISTER_CPU_OP_CREATOR(CPUEltwiseInt8Creator, OpType_EltwiseInt8);
-
 } // namespace MNN
+#endif
+namespace MNN {
+REGISTER_CPU_OP_CREATOR_OLD(CPUEltwiseInt8Creator, OpType_EltwiseInt8);
+};
+
diff --git a/source/backend/cpu/CPUInstanceNorm.cpp b/source/backend/cpu/CPUInstanceNorm.cpp
index 851d97831..6ed4513e6 100644
--- a/source/backend/cpu/CPUInstanceNorm.cpp
+++ b/source/backend/cpu/CPUInstanceNorm.cpp
@@ -6,9 +6,10 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
 
+#include "backend/cpu/CPUBackend.hpp"
+#ifdef MNN_SUPPORT_DEPRECATED_OP
 #include "backend/cpu/CPUInstanceNorm.hpp"
 #include <math.h>
-#include "backend/cpu/CPUBackend.hpp"
 #include "core/Concurrency.h"
 #include <MNN/MNNDefine.h>
 #include "core/Macro.h"
@@ -106,7 +107,9 @@ class CPUInstanceNormCreator : public CPUBackend::Creator {
         return new CPUInstanceNorm(backend, op);
     }
 };
-
-REGISTER_CPU_OP_CREATOR(CPUInstanceNormCreator, OpType_InstanceNorm);
-
 } // namespace MNN
+#endif
+namespace MNN {
+REGISTER_CPU_OP_CREATOR_OLD(CPUInstanceNormCreator, OpType_InstanceNorm);
+};
+
diff --git a/source/backend/cpu/CPUMoments.cpp b/source/backend/cpu/CPUMoments.cpp
index 40c2cccf2..8ad50904e 100644
--- a/source/backend/cpu/CPUMoments.cpp
+++ b/source/backend/cpu/CPUMoments.cpp
@@ -6,9 +6,10 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
 
+#include "backend/cpu/CPUBackend.hpp"
+#ifdef MNN_SUPPORT_DEPRECATED_OP
 #include "backend/cpu/CPUMoments.hpp"
 #include <math.h>
-#include "backend/cpu/CPUBackend.hpp"
 #include "core/Concurrency.h"
 #include <MNN/MNNDefine.h>
 #include "core/Macro.h"
@@ -129,7 +130,9 @@ class CPUMomentsCreator : public CPUBackend::Creator {
         return new CPUMoments(backend, op);
     }
 };
-
-REGISTER_CPU_OP_CREATOR(CPUMomentsCreator, OpType_Moments);
-
 } // namespace MNN
+#endif
+namespace MNN {
+REGISTER_CPU_OP_CREATOR_OLD(CPUMomentsCreator, OpType_Moments);
+};
+
diff --git a/source/backend/cpu/CPUOPRegister.cpp b/source/backend/cpu/CPUOPRegister.cpp
index 37f868732..345f45ce5 100644
--- a/source/backend/cpu/CPUOPRegister.cpp
+++ b/source/backend/cpu/CPUOPRegister.cpp
@@ -78,6 +78,9 @@ extern void ___CPUTextureCreator__OpType_Texture__();
 #ifdef MNN_SUPPORT_TRANSFORMER_FUSE
 extern void ___CPUAttentionCreator__OpType_Attention__();
 #endif
+#ifdef MNN_BUILD_AUDIO
+extern void ___CPUStftCreator__OpType_Stft__();
+#endif
 void registerCPUOps() {
 ___CPUCropAndResizeCreator__OpType_CropAndResize__();
 ___CPUArgMaxCreator__OpType_ArgMax__();
@@ -156,5 +159,8 @@ ___CPUTextureCreator__OpType_Texture__();
 #ifdef MNN_SUPPORT_TRANSFORMER_FUSE
 ___CPUAttentionCreator__OpType_Attention__();
 #endif
+#ifdef MNN_BUILD_AUDIO
+___CPUStftCreator__OpType_Stft__();
+#endif
 }
 }
diff --git a/source/backend/cpu/CPURelu.cpp b/source/backend/cpu/CPURelu.cpp
index 073556464..71bc41f16 100644
--- a/source/backend/cpu/CPURelu.cpp
+++ b/source/backend/cpu/CPURelu.cpp
@@ -46,16 +46,53 @@ ErrorCode CPURelu::onExecute(const std::vector<Tensor*>& inputs, const std::vect
     auto& ob = outputs[0]->buffer();
 
     if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
+        auto core = static_cast<CPUBackend*>(backend())->int8Functions();
+        auto gcore = static_cast<CPUBackend*>(backend())->functions();
         const int8_t* srcO = (const int8_t*)ib.host;
+        int8_t* dstO       = (int8_t*)ob.host;
         auto inInfo = TensorUtils::getQuantInfo(inputs[0]);
         auto outInfo = TensorUtils::getQuantInfo(outputs[0]);
-        if (inInfo != outInfo) {
-            MNN_PRINT("this relu int8 implementation has error when input output quant info mismatch\n");
-        }
-        int8_t zeroPoint = int8_t(outInfo[1]);
-        int8_t* dstO       = (int8_t*)ob.host;
         auto size         = mRealSize;
         auto numberThread = ((CPUBackend*)backend())->threadNumber();
+
+        auto inputscale = inInfo[0];
+        auto inputzero = (ssize_t)inInfo[1];
+        auto outputzero = (ssize_t)outInfo[1];
+        auto outputscale = outInfo[0] > 0.f ? 1.0f / outInfo[0] : 0.f;
+        QuanPrePostParameters params;
+        params.maxValue = static_cast<ssize_t>(inInfo[3]);
+        params.minValue = static_cast<ssize_t>(inInfo[2]);
+        params.inputScale = &inputscale;
+        params.inputZeroPoint = &inputzero;
+        params.outputScale = &outputscale;
+        params.outputZeroPoint = &outputzero;
+        
+        if (((float*)mSlope.get())[0] != 0.f) {
+            // PRelu Int8
+            int sizeQuad     = size / gcore->pack;
+            int remain       = size % gcore->pack;
+            int sizeDivide = UP_DIV(sizeQuad, numberThread);
+            
+            if (sizeQuad > 0) {
+                MNN_CONCURRENCY_BEGIN(tId, numberThread) {
+                    
+                    int number = sizeDivide;
+                    if (tId == numberThread - 1) {
+                        number = sizeQuad - tId * sizeDivide;
+                    }
+                    core->MNNReluWithSlopeChannelInt8((int8_t*)(dstO + tId * gcore->pack * sizeDivide), srcO + tId * sizeDivide * gcore->pack, (const float*)(mSlope.get()), number, 1, &params, gcore->pack);
+                                    
+                }
+                MNN_CONCURRENCY_END();
+            }
+            if (remain > 0) {
+                ::memcpy(mCacheSrc.get(), srcO + sizeQuad * gcore->pack, remain);
+                core->MNNReluWithSlopeChannelInt8((int8_t*)mCacheDst.get(), (const int8_t*)(mCacheSrc.get()), (const float*)mSlope.get(), 1, 1, &params, gcore->pack);
+                ::memcpy(dstO + sizeQuad * gcore->pack, mCacheDst.get(), remain);
+            }
+            return NO_ERROR;
+        }
+        int8_t zeroPoint = int8_t(outInfo[1]);
         int sizeQuad     = size / 16;
         int remain       = sizeQuad * 16;
         int sizeDivide = sizeQuad / numberThread;
@@ -187,10 +224,6 @@ ErrorCode CPUPRelu::onResize(const std::vector<Tensor*>& inputs, const std::vect
         mQuanScalesOutput = {outputScale};
         mQuanZerosInput = {inputZero};
         mQuanZerosOutput = {outputZero};
-        auto p = mSlope.host<float>();
-        for (int i = 0; i < mSlope.buffer().dim[0].extent; ++i) {
-            p[i] = p[i] * inputScale * outputScale;
-        }
     }
     return NO_ERROR;
 }
@@ -198,42 +231,53 @@ ErrorCode CPUPRelu::onResize(const std::vector<Tensor*>& inputs, const std::vect
 ErrorCode CPUPRelu::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
     auto& ib            = inputs[0]->buffer();
     auto& ob            = outputs[0]->buffer();
-    int sizeQuad = 1;
-    for (int i=2; i<ib.dimensions; ++i) {
-        sizeQuad *= ib.dim[i].extent;
-    }
     auto core = static_cast<CPUBackend*>(backend())->functions();
     auto coreInt8 = static_cast<CPUBackend*>(backend())->int8Functions();
     const int channel   = ib.dim[1].extent;
     const int batch     = ib.dim[0].extent;
-    int pack = 4;
-    int depthQuad = UP_DIV(channel, core->pack);
-    const uint8_t* srcO   = (const uint8_t*)ib.host;
+    int pack = core->pack;
+    
+    const int8_t* srcO   = (const int8_t*)ib.host;
     uint8_t* dstO         = (uint8_t*)ob.host;
+    auto depthQuad = UP_DIV(channel, core->pack);
     auto totalCount = batch * depthQuad;
     auto numberThread = ((CPUBackend*)backend())->threadNumber();
+    auto sizeQuad = UP_DIV(depthQuad, numberThread);
+    auto sizeCount = sizeQuad * batch * inputs[0]->width() * inputs[0]->height() * core->pack;
+    
     if (mUseInt8) {
-        depthQuad = UP_DIV(channel, pack);
+        auto inputInfo = TensorUtils::getDescribe(inputs[0])->quantAttr;
+        auto outputInfo = TensorUtils::getDescribe(outputs[0])->quantAttr;
+        auto inzero = (ssize_t)inputInfo->zero;
+        auto outzero = (ssize_t)outputInfo->zero;
+        auto outscale = outputInfo->scale > 0 ? 1.f / outputInfo->scale : 0.f;
+        QuanPrePostParameters params;
+        params.maxValue = static_cast<ssize_t>(outputInfo->max);
+        params.minValue = static_cast<ssize_t>(outputInfo->min);
+        params.inputScale = &inputInfo->scale;
+        params.inputZeroPoint = &inzero;
+        params.outputScale = &outscale;
+        params.outputZeroPoint = &outzero;
         MNN_CONCURRENCY_BEGIN(tId, numberThread) {
-            QuanPrePostParameters params;
-            params.maxValue = static_cast<ssize_t>(TensorUtils::getDescribe(inputs[0])->quantAttr->max);
-            params.minValue = static_cast<ssize_t>(TensorUtils::getDescribe(inputs[0])->quantAttr->min);
-            params.inputScale = mQuanScalesInput.data();
-            params.inputZeroPoint = mQuanZerosInput.data();
-            params.outputScale = mQuanScalesOutput.data();
-            params.outputZeroPoint = mQuanZerosOutput.data();
-            for (int b=tId; b<totalCount; b+=numberThread) {
-                auto c = b / batch;
-                coreInt8->MNNReluWithSlopeChannelInt8((int8_t*)(dstO + sizeQuad * pack * b), (const int8_t*)(srcO + sizeQuad * pack * b), (const float*)(mSlope.host<uint8_t>() + core->bytes * pack * c), sizeQuad, 1, &params);
+            
+            
+            auto number = ALIMIN(sizeQuad, depthQuad - tId * sizeQuad);
+            if (number > 0) {
+                auto sizeQ = number * batch * inputs[0]->width() * inputs[0]->height();
+                coreInt8->MNNReluWithSlopeChannelInt8((int8_t*)(dstO + tId * sizeCount), srcO + tId * sizeCount, (const float*)(mSlope.host<uint8_t>() + tId * sizeQuad * pack * core->bytes), sizeQ / number, number, &params, core->pack);
             }
         }
         MNN_CONCURRENCY_END();
         return NO_ERROR;
     }
+    int hw = 1;
+    for (int i=2; i<ib.dimensions; ++i) {
+        hw *= ib.dim[i].extent;
+    }
     MNN_CONCURRENCY_BEGIN(tId, numberThread) {
         for (int b=tId; b<totalCount; b+=numberThread) {
             auto c = b / batch;
-            core->MNNReluWithSlopeChannel((float*)(dstO + sizeQuad * core->bytes * core->pack * b), (const float*)(srcO + sizeQuad * core->pack * core->bytes * b), (const float*)(mSlope.host<uint8_t>() + core->bytes * core->pack * c), sizeQuad, 1);
+            core->MNNReluWithSlopeChannel((float*)(dstO + hw * core->bytes * core->pack * b), (const float*)(srcO + hw * core->pack * core->bytes * b), (const float*)(mSlope.host<uint8_t>() + core->bytes * core->pack * c), hw, 1);
         }
     }
     MNN_CONCURRENCY_END();
diff --git a/source/backend/cpu/CPUStft.cpp b/source/backend/cpu/CPUStft.cpp
new file mode 100644
index 000000000..5e6d40b54
--- /dev/null
+++ b/source/backend/cpu/CPUStft.cpp
@@ -0,0 +1,93 @@
+//
+//  CPUStft.cpp
+//  MNN
+//
+//  Created by MNN on 2024/11/26.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef MNN_BUILD_AUDIO
+
+#include <algorithm>
+
+#include "backend/cpu/CPUStft.hpp"
+#include "backend/cpu/CPUBackend.hpp"
+#include "core/Concurrency.h"
+#include "core/TensorUtils.hpp"
+#include "core/Macro.h"
+#include "compute/CommonOptFunction.h"
+
+namespace MNN {
+
+// Records the STFT parameters; no memory is reserved until onResize.
+// NOTE(review): mAbs is stored but never read in this file - onExecute always calls MNNDftAbs. Confirm abs == false is not expected here.
+CPUStft::CPUStft(Backend* backend, int nfft, int hop_length, bool abs)
+    : Execution(backend), mNfft(nfft), mHopLength(hop_length), mAbs(abs) {
+    // nothing to do
+}
+
+// Describes mTmpFrames as a [threadNumber, mNfft] scratch tensor (one row per worker) and reserves it from the DYNAMIC pool.
+// Returns OUT_OF_MEMORY if the backend cannot provide the scratch buffer.
+ErrorCode CPUStft::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
+    auto cpuBn = static_cast<CPUBackend*>(backend());
+    mTmpFrames.buffer().dim[0].extent = cpuBn->threadNumber();
+    mTmpFrames.buffer().dim[1].extent = mNfft;
+    TensorUtils::getDescribe(&mTmpFrames)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
+    mTmpFrames.buffer().dimensions    = 2;
+    mTmpFrames.buffer().type          = inputs[0]->getType();
+    if (!backend()->onAcquireBuffer(&mTmpFrames, Backend::DYNAMIC)) {
+        // Without this check a failed allocation would only surface as a crash in onExecute.
+        return OUT_OF_MEMORY;
+    }
+    // Acquire-then-release inside onResize, kept from the original; presumably this lets the dynamic pool reuse the memory for later ops - TODO confirm.
+    backend()->onReleaseBuffer(&mTmpFrames, Backend::DYNAMIC);
+    return NO_ERROR;
+}
+
+// Runs MNNDftAbs once per output frame: frame i reads from sample + i * mHopLength and writes to output + i * col.
+// Frames are split into contiguous ranges, one per worker; the last worker also takes the remainder.
+ErrorCode CPUStft::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
+    const float* sample = inputs[0]->host<float>();
+    const float* window = inputs[1]->host<float>();
+    float* buffer = mTmpFrames.host<float>();
+    float* output = outputs[0]->host<float>();
+    auto outputShape = outputs[0]->shape();
+    int frames = outputShape[0];
+    int col = outputShape[1];
+    if (frames <= 0) {
+        // Empty output: nothing to compute, and the split below would otherwise divide by zero.
+        return NO_ERROR;
+    }
+    auto cpuBn = static_cast<CPUBackend*>(backend());
+    int threadNum = cpuBn->threadNumber();
+    // div frames to threadNum; never use more workers than there are frames
+    int threadNumber = std::min(threadNum, frames);
+    int sizeDivide = frames / threadNumber;
+    MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
+        int number = sizeDivide;
+        if (tId == threadNumber - 1) {
+            number = frames - tId * sizeDivide;
+        }
+        for (int i = tId * sizeDivide; i < tId * sizeDivide + number; ++i) {
+            // Each worker uses its own mNfft-float row of the scratch tensor.
+            MNNDftAbs(sample + i * mHopLength, window, output + i * col, buffer + tId * mNfft, mNfft);
+        }
+    };
+    MNN_CONCURRENCY_END();
+
+    return NO_ERROR;
+}
+
+// Creator that builds a CPUStft from the op's StftParam (n_fft, hop_length, abs).
+class CPUStftCreator : public CPUBackend::Creator {
+public:
+    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
+                                const MNN::Op* op, Backend* backend) const {
+        auto stft = op->main_as_StftParam();
+        return new CPUStft(backend, stft->n_fft(), stft->hop_length(), stft->abs());
+    }
+};
+
+REGISTER_CPU_OP_CREATOR_AUDIO(CPUStftCreator, OpType_Stft);
+} // namespace MNN
+#endif // MNN_BUILD_AUDIO
\ No newline at end of file
diff --git a/source/backend/cpu/CPUStft.hpp b/source/backend/cpu/CPUStft.hpp
new file mode 100644
index 000000000..e483a9b8c
--- /dev/null
+++ b/source/backend/cpu/CPUStft.hpp
@@ -0,0 +1,31 @@
+//
+//  CPUStft.hpp
+//  MNN
+//
+//  Created by MNN on 2024/11/26.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef MNN_BUILD_AUDIO
+#ifndef CPUStft_hpp
+#define CPUStft_hpp
+
+#include "core/Execution.hpp"
+
+namespace MNN {
+class CPUStft : public Execution { // CPU-backend execution that computes short-time Fourier transform frames
+public:
+    CPUStft(Backend *backend, int nfft, int hop_length, bool abs); // stores nfft / hop_length / abs in the members below
+    virtual ~CPUStft() = default;
+    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override; // sets up the mTmpFrames scratch tensor
+    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override; // runs MNNDftAbs once per output frame
+private:
+    int mNfft, mHopLength; // frame length passed to MNNDftAbs, and sample offset between consecutive frames
+    bool mAbs; // NOTE(review): stored but not read anywhere in CPUStft.cpp - confirm intended use
+    Tensor mTmpFrames; // scratch sized [threadNumber, mNfft]: one row per worker thread
+};
+
+} // namespace MNN
+
+#endif /* CPUStft_hpp */
+#endif // MNN_BUILD_AUDIO
\ No newline at end of file
diff --git a/source/backend/cpu/arm/arm32/MNNReluWithSlopeChannelInt8.S b/source/backend/cpu/arm/arm32/MNNReluWithSlopeChannelInt8.S
index 4595733b8..4c09f69c6 100644
--- a/source/backend/cpu/arm/arm32/MNNReluWithSlopeChannelInt8.S
+++ b/source/backend/cpu/arm/arm32/MNNReluWithSlopeChannelInt8.S
@@ -48,9 +48,9 @@ beq PReluEnd
 cmp r3, #0
 beq PReluEnd
 
-vmov.f32 q12, #0.5
-vmov.f32 q13, #-0.5
 .macro ROUND_TWO x0, x1
+    vmov.f32 q12, #0.5
+    vmov.f32 q13, #-0.5
     vcgt.f32 q10, \x0, #0
     vcgt.f32 q11, \x1, #0
     vbsl.f32 q10, q12, q13
@@ -62,6 +62,8 @@ vmov.f32 q13, #-0.5
 .endm
 
 .macro ROUND_ONE x0
+    vmov.f32 q12, #0.5
+    vmov.f32 q13, #-0.5
     vcgt.f32 q10, \x0, #0
     vbsl.f32 q10, q12, q13
     vadd.f32 \x0, q10, \x0
@@ -69,11 +71,13 @@ vmov.f32 q13, #-0.5
 .endm
 
 vld1.8 d30[0], [r8]
-vdup.8 d30, d30[0]  // inputZeroPoint
-
 vld1.8 d31[0], [r6]
+vdup.8 d30, d30[0]  // inputZeroPoint
 vdup.8 d31, d31[0]  // outputZeroPoint
 
+ldr r6, [r5, #0]    // inputScale
+ldr r8, [r5, #4]    // outputScale
+
 PReluZLoop:
 vld1.32 {q14}, [r2]!
 
@@ -93,17 +97,38 @@ vmovl.s16 q4, d3
 vmovl.s16 q5, d4
 vmovl.s16 q6, d5
 
-vclt.s8 q1, q0, #0
-
 vcvt.f32.s32 q3, q3
 vcvt.f32.s32 q4, q4
 vcvt.f32.s32 q5, q5
 vcvt.f32.s32 q6, q6
-
-vmul.f32 q3, q3, q14
-vmul.f32 q4, q4, q14
-vmul.f32 q5, q5, q14
-vmul.f32 q6, q6, q14
+// *input_scale
+vld1.f32 {d14[0]}, [r6]
+vld1.f32 {d14[1]}, [r8] // outputscale
+vmul.f32 q3, q3, d14[0]
+vmul.f32 q4, q4, d14[0]
+vmul.f32 q5, q5, d14[0]
+vmul.f32 q6, q6, d14[0]
+
+vclt.f32 q0, q3, #0
+vclt.f32 q1, q4, #0
+vclt.f32 q2, q5, #0
+vclt.f32 q12, q6, #0
+
+// *slope
+vmul.f32 q8, q3, q14
+vmul.f32 q9, q4, q14
+vmul.f32 q10, q5, q14
+vmul.f32 q11, q6, q14
+
+vbit.32 q3, q8, q0
+vbit.32 q4, q9, q1
+vbit.32 q5, q10, q2
+vbit.32 q6, q11, q12
+
+vmul.f32 q3, q3, d14[1]
+vmul.f32 q4, q4, d14[1]
+vmul.f32 q5, q5, d14[1]
+vmul.f32 q6, q6, d14[1]
 
 ROUND_TWO q3, q4
 ROUND_TWO q5, q6
@@ -122,8 +147,7 @@ vqmovn.s16 d19, q8
 vmax.s8 q9, q9, q10
 vmin.s8 q9, q9, q11
 
-vbit.8 q0, q9, q1
-vst1.8 {q0}, [r0]!
+vst1.8 {q9}, [r0]!
 
 sub r5, r5, #4
 cmp r5, #4
@@ -139,10 +163,18 @@ vmovl.s8 q1, d0
 vsubw.s8 q1, q1, d30
 
 vmovl.s16 q2, d2
-vclt.s8 d10, d0, #0
 
 vcvt.f32.s32 q2, q2
-vmul.f32 q2, q2, q14
+// *input_scale
+vld1.f32 {d14[0]}, [r6]
+vld1.f32 {d14[1]}, [r8] // outputscale
+vmul.f32 q2, q2, d14[0]
+vclt.f32 q4, q2, #0     // index
+// *slope
+vmul.f32 q3, q2, q14
+vbit q2, q3, q4
+// *output_scale
+vmul.f32 q2, q2, d14[1]
 
 ROUND_ONE q2
 
diff --git a/source/backend/cpu/arm/arm32/MNNWinogradMatrixProductLeft.S b/source/backend/cpu/arm/arm32/MNNWinogradMatrixProductLeft.S
deleted file mode 100644
index 4796a7091..000000000
--- a/source/backend/cpu/arm/arm32/MNNWinogradMatrixProductLeft.S
+++ /dev/null
@@ -1,225 +0,0 @@
-//
-//  MNNWinogradMatrixProductLeft.S
-//  MNN
-//
-//  Created by MNN on 2018/08/22.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNWinogradMatrixProductLeft
-//void MNNWinogradMatrixProductLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
-
-//Auto: r0: S, r1:B, r2: M, r3:w
-//Load From sp: r4:h, r5:k, r6:length
-
-push {r4-r8, r10, r11, lr} // avoid to touch platform-register r-9
-ldr r4, [sp, #32]
-ldr r5, [sp, #36]
-ldr r6, [sp, #40]
-
-//unitStepInFloat
-mov r8, #16 // 4*sizeof(float)
-mul r8, r6, r8
-
-//srcYUnitStep
-mul lr, r3, r8
-sub lr, lr, r8
-add r7, lr, r8
-
-//B's step
-mov r10, #4
-mul r10, r4, r10
-
-LoopY:
-    push {r0, r3}
-    LoopX:
-        push {r0, r1}
-        vmov.i32 q14, #0
-        mov r11, r6
-        LoopUnitSetZero:
-            vst1.32 {q14}, [r2]!
-            subs r11, r11, #1
-            bne LoopUnitSetZero
-        sub r2, r2, r8
-        mov r12, r5
-
-        LK7:
-        cmp r12, #7
-        blt LK4
-        push {r3-r7}
-        LoopK7:
-            vld1.32 {d0[0]}, [r1], r10
-            vld1.32 {d0[1]}, [r1], r10
-            vld1.32 {d1[0]}, [r1], r10
-            vld1.32 {d1[1]}, [r1], r10
-            vld1.32 {d2[0]}, [r1], r10
-            vld1.32 {d2[1]}, [r1], r10
-            vld1.32 {d3[0]}, [r1], r10
-            mov r11, r6
-            vmov.32 d30[0], r1
-
-            add r1, r0, r7
-            add r3, r1, r7
-            add r4, r3, r7
-            add r5, r4, r7
-            add r6, r5, r7
-            add r7, r6, r7
-
-            LoopUnitK7:
-                vld1.32 {q8}, [r2]
-                vld1.32 {q12}, [r0]!
-                vmla.f32 q8, q12, d0[0]
-                vld1.32 {q13}, [r1]!
-                vmul.f32 q9, q13, d0[1]
-                vld1.32 {q12}, [r3]!
-                vmla.f32 q8, q12, d1[0]
-                vld1.32 {q13}, [r4]!
-                vmla.f32 q9, q13, d1[1]
-                vld1.32 {q12}, [r5]!
-                vmla.f32 q8, q12, d2[0]
-                vld1.32 {q13}, [r6]!
-                vmla.f32 q9, q13, d2[1]
-                vld1.32 {q12}, [r7]!
-                vmla.f32 q8, q12, d3[0]
-
-                vadd.f32 q9, q8, q9
-                vst1.32 {q9}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnitK7
-            sub r2, r2, r8
-            sub r12, r12, #7
-            add r0, r7, lr
-            vmov.32 r1, d30[0]
-            cmp r12, #7
-            bge LoopK7
-        pop {r3-r7}
-
-        LK4:
-        cmp r12, #4
-        blt LK3
-        vmov.32 d30[1], r3
-        vmov.32 d31[0], r4
-        LoopK4:
-            vld1.32 {d0[0]}, [r1], r10
-            vld1.32 {d0[1]}, [r1], r10
-            vld1.32 {d1[0]}, [r1], r10
-            vld1.32 {d1[1]}, [r1], r10
-            mov r11, r6
-            vmov.32 d30[0], r1
-
-            add r1, r0, r7
-            add r3, r1, r7
-            add r4, r3, r7
-
-            LoopUnitK4:
-                vld1.32 {q8}, [r2]
-                vld1.32 {q12}, [r0]!
-                vmla.f32 q8, q12, d0[0]
-                vld1.32 {q13}, [r1]!
-                vmul.f32 q9, q13, d0[1]
-                vld1.32 {q12}, [r3]!
-                vmla.f32 q8, q12, d1[0]
-                vld1.32 {q13}, [r4]!
-                vmla.f32 q9, q13, d1[1]
-
-                vadd.f32 q9, q8, q9
-                vst1.32 {q9}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnitK4
-            sub r2, r2, r8
-            sub r12, r12, #4
-            add r0, r4, lr
-            vmov.32 r1, d30[0]
-            cmp r12, #4
-            bge LoopK4
-        vmov.32 r3, d30[1]
-        vmov.32 r4, d31[0]
-
-        LK3:
-        cmp r12, #3
-        blt LK1
-        vmov.32 d30[1], r3
-        vmov.32 d31[0], r4
-        LoopK3:
-            vld1.32 {d0[0]}, [r1], r10
-            vld1.32 {d0[1]}, [r1], r10
-            vld1.32 {d1[0]}, [r1], r10
-            mov r11, r6
-            vmov.32 d30[0], r1
-
-            add r1, r0, r7
-            add r3, r1, r7
-
-            LoopUnitK3:
-                vld1.32 {q8}, [r2]
-                vld1.32 {q12}, [r0]!
-                vmla.f32 q8, q12, d0[0]
-                vld1.32 {q13}, [r1]!
-                vmul.f32 q9, q13, d0[1]
-                vld1.32 {q12}, [r3]!
-                vmla.f32 q8, q12, d1[0]
-
-                vadd.f32 q9, q8, q9
-                vst1.32 {q9}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnitK3
-            sub r2, r2, r8
-            sub r12, r12, #3
-            add r0, r3, lr
-            vmov.32 r1, d30[0]
-            cmp r12, #3
-            bge LoopK3
-        vmov.32 r3, d30[1]
-        vmov.32 r4, d31[0]
-
-
-
-        LK1:
-        cmp r12, #0
-        beq LKEnd
-
-        LoopK:
-            vld1.32 {d30[0]}, [r1], r10
-
-            vdup.32 q15, d30[0]
-            mov r11, r6
-            LoopUnit:
-                vld1.32 {q0}, [r2]
-                vld1.32 {q1}, [r0]!
-                vmla.f32 q0, q1, q15
-
-                vst1.32 {q0}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnit
-            subs r12, r12, #1
-
-            sub r2, r2, r8
-            add r0, r0, lr
-            bne LoopK
-        LKEnd:
-        pop {r0, r1}
-        subs r3, r3, #1
-        add r0, r0, r8
-        add r2, r2, r8
-
-        bne LoopX
-    pop {r0, r3}
-    add r1, r1, #4 //sizeof(float)
-
-    subs r4, r4, #1
-    bne LoopY
-
-
-
-pop {r4-r8, r10, r11, pc}
-
-#endif
-#endif
diff --git a/source/backend/cpu/arm/arm32/MNNWinogradMatrixProductRight.S b/source/backend/cpu/arm/arm32/MNNWinogradMatrixProductRight.S
deleted file mode 100644
index b0a97197c..000000000
--- a/source/backend/cpu/arm/arm32/MNNWinogradMatrixProductRight.S
+++ /dev/null
@@ -1,223 +0,0 @@
-//
-//  MNNWinogradMatrixProductRight.S
-//  MNN
-//
-//  Created by MNN on 2018/08/22.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNWinogradMatrixProductRight
-//void MNNWinogradMatrixProductRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
-
-//Auto: r0: S, r1:B, r2: M, r3:w
-//Load From sp: r4:h, r5:k, r6:length
-
-push {r4-r8, r10, r11, lr} // avoid to touch platform-register r-9
-ldr r4, [sp, #32]
-ldr r5, [sp, #36]
-ldr r6, [sp, #40]
-
-//unitStepInFloat
-mov r8, #16 // 4*sizeof(float)
-mul r8, r6, r8
-
-//srcYUnitStep
-mul lr, r5, r8
-
-//B's step
-mov r10, #4
-mul r10, r4, r10
-
-LoopY:
-    push {r1, r3}
-    LoopX:
-        push {r0, r1}
-        vmov.i32 q14, #0
-        mov r11, r6
-        LoopUnitSetZero:
-            vst1.32 {q14}, [r2]!
-            subs r11, r11, #1
-            bne LoopUnitSetZero
-        sub r2, r2, r8
-        mov r12, r5
-
-        LK7:
-        cmp r12, #7
-        blt LK4
-        push {r3-r7}
-        LoopK7:
-            vld1.32 {d0[0]}, [r1], r10
-            vld1.32 {d0[1]}, [r1], r10
-            vld1.32 {d1[0]}, [r1], r10
-            vld1.32 {d1[1]}, [r1], r10
-            vld1.32 {d2[0]}, [r1], r10
-            vld1.32 {d2[1]}, [r1], r10
-            vld1.32 {d3[0]}, [r1], r10
-            mov r11, r6
-            vmov.32 d30[0], r1
-
-            add r1, r0, r8
-            add r3, r1, r8
-            add r4, r3, r8
-            add r5, r4, r8
-            add r6, r5, r8
-            add r7, r6, r8
-
-            LoopUnitK7:
-                vld1.32 {q8}, [r2]
-                vld1.32 {q12}, [r0]!
-                vmla.f32 q8, q12, d0[0]
-                vld1.32 {q13}, [r1]!
-                vmul.f32 q9, q13, d0[1]
-                vld1.32 {q12}, [r3]!
-                vmla.f32 q8, q12, d1[0]
-                vld1.32 {q13}, [r4]!
-                vmla.f32 q9, q13, d1[1]
-                vld1.32 {q12}, [r5]!
-                vmla.f32 q8, q12, d2[0]
-                vld1.32 {q13}, [r6]!
-                vmla.f32 q9, q13, d2[1]
-                vld1.32 {q12}, [r7]!
-                vmla.f32 q8, q12, d3[0]
-
-                vadd.f32 q9, q8, q9
-                vst1.32 {q9}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnitK7
-            sub r2, r2, r8
-            sub r12, r12, #7
-            mov r0, r7
-            vmov.32 r1, d30[0]
-            cmp r12, #7
-            bge LoopK7
-        pop {r3-r7}
-
-        LK4:
-        cmp r12, #4
-        blt LK3
-        vmov.32 d30[1], r3
-        vmov.32 d31[0], r4
-        LoopK4:
-            vld1.32 {d0[0]}, [r1], r10
-            vld1.32 {d0[1]}, [r1], r10
-            vld1.32 {d1[0]}, [r1], r10
-            vld1.32 {d1[1]}, [r1], r10
-            mov r11, r6
-            vmov.32 d30[0], r1
-
-            add r1, r0, r8
-            add r3, r1, r8
-            add r4, r3, r8
-
-            LoopUnitK4:
-                vld1.32 {q8}, [r2]
-                vld1.32 {q12}, [r0]!
-                vmla.f32 q8, q12, d0[0]
-                vld1.32 {q13}, [r1]!
-                vmul.f32 q9, q13, d0[1]
-                vld1.32 {q12}, [r3]!
-                vmla.f32 q8, q12, d1[0]
-                vld1.32 {q13}, [r4]!
-                vmla.f32 q9, q13, d1[1]
-
-                vadd.f32 q9, q8, q9
-                vst1.32 {q9}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnitK4
-            sub r2, r2, r8
-
-            sub r12, r12, #4
-
-            mov r0, r4
-            vmov.32 r1, d30[0]
-            cmp r12, #4
-            bge LoopK4
-        vmov.32 r3, d30[1]
-        vmov.32 r4, d31[0]
-
-        LK3:
-        cmp r12, #3
-        blt LK1
-        vmov.32 d30[1], r3
-        LoopK3:
-            vld1.32 {d0[0]}, [r1], r10
-            vld1.32 {d0[1]}, [r1], r10
-            vld1.32 {d1[0]}, [r1], r10
-            mov r11, r6
-            vmov.32 d30[0], r1
-
-            add r1, r0, r8
-            add r3, r1, r8
-
-            LoopUnitK3:
-                vld1.32 {q8}, [r2]
-                vld1.32 {q12}, [r0]!
-                vmla.f32 q8, q12, d0[0]
-                vld1.32 {q13}, [r1]!
-                vmul.f32 q9, q13, d0[1]
-                vld1.32 {q12}, [r3]!
-                vmla.f32 q8, q12, d1[0]
-
-                vadd.f32 q9, q8, q9
-                vst1.32 {q9}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnitK3
-            sub r2, r2, r8
-
-            sub r12, r12, #3
-
-            mov r0, r3
-            vmov.32 r1, d30[0]
-            cmp r12, #3
-            bge LoopK3
-        vmov.32 r3, d30[1]
-
-
-        LK1:
-        cmp r12, #0
-        beq LKEnd
-
-        LoopK:
-            vld1.32 {d30[0]}, [r1], r10
-
-            vdup.32 q15, d30[0]
-            mov r11, r6
-            LoopUnit:
-                vld1.32 {q0}, [r2]
-                vld1.32 {q1}, [r0]!
-                vmla.f32 q0, q1, q15
-
-                vst1.32 {q0}, [r2]!
-                subs r11, r11, #1
-                bne LoopUnit
-            subs r12, r12, #1
-
-            sub r2, r2, r8
-            bne LoopK
-        LKEnd:
-        pop {r0, r1}
-        subs r3, r3, #1
-        add r2, r2, r8
-        add r1, r1, #4 //sizeof(float)
-
-        bne LoopX
-    pop {r1, r3}
-    add r0, r0, lr
-
-    subs r4, r4, #1
-    bne LoopY
-
-
-
-pop {r4-r8, r10, r11, pc}
-
-#endif
-#endif
diff --git a/source/backend/cpu/arm/arm64/MNNReluWithSlopeChannelInt8.S b/source/backend/cpu/arm/arm64/MNNReluWithSlopeChannelInt8.S
index e1622504c..4128118f9 100644
--- a/source/backend/cpu/arm/arm64/MNNReluWithSlopeChannelInt8.S
+++ b/source/backend/cpu/arm/arm64/MNNReluWithSlopeChannelInt8.S
@@ -25,8 +25,10 @@ asm_function MNNReluWithSlopeChannelInt8
 // MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params)
 // Auto load:
 // x0: dst, x1: src, x2: slope, x3: planeNumber, x4: depthQuad, x5: params
-// Load from x5:  x8: inputZeroPoint, x9: outputZeroPoint, x10: minValue, x11: maxValue
+// Load from x5: x12: inputScale, x13: outputScale, x8: inputZeroPoint, x9: outputZeroPoint, x10: minValue, x11: maxValue
 
+ldr x12, [x5, #0]
+ldr x13, [x5, #8]
 ldr x8, [x5, #16]
 ldr x9, [x5, #24]
 ldr x10, [x5, #32]
@@ -43,10 +45,12 @@ beq End
 cmp x4, #0
 beq End
 
-ld1r {v29.8b}, [x8] // inputZeroPoint
-ld1r {v28.8b}, [x9] // outputZeroPoint
+ld1r {v29.16b}, [x8] // inputZeroPoint
+ld1r {v28.16b}, [x9] // outputZeroPoint
 dup v26.16b, w10
 dup v27.16b, w11
+ld1r {v24.4s}, [x12] // inputscale
+ld1r {v25.4s}, [x13] // outputscale
 /*
 Quant parameters
 */
@@ -60,7 +64,6 @@ ble PReluL1
 
 PReluL4Loop:
 ld1 {v0.16b}, [x1], #16
-cmlt v30.16b, v0.16b, #0 // mask0: x<0
 
 sxtl v1.8h, v0.8b
 sxtl2 v2.8h, v0.16b
@@ -76,10 +79,33 @@ scvtf v4.4s, v4.4s
 scvtf v5.4s, v5.4s
 scvtf v6.4s, v6.4s
 
-fmul v3.4s, v3.4s, v31.4s
-fmul v4.4s, v4.4s, v31.4s
-fmul v5.4s, v5.4s, v31.4s
-fmul v6.4s, v6.4s, v31.4s
+// input_scale
+fmul v3.4s, v3.4s, v24.4s
+fmul v4.4s, v4.4s, v24.4s
+fmul v5.4s, v5.4s, v24.4s
+fmul v6.4s, v6.4s, v24.4s
+
+fcmle v7.4s, v3.4s, #0
+fcmle v8.4s, v4.4s, #0
+fcmle v9.4s, v5.4s, #0
+fcmle v10.4s, v6.4s, #0
+
+// *slope
+fmul v11.4s, v3.4s, v31.4s
+fmul v12.4s, v4.4s, v31.4s
+fmul v13.4s, v5.4s, v31.4s
+fmul v14.4s, v6.4s, v31.4s
+
+bit v3.16b, v11.16b, v7.16b
+bit v4.16b, v12.16b, v8.16b
+bit v5.16b, v13.16b, v9.16b
+bit v6.16b, v14.16b, v10.16b
+
+// *output_scale
+fmul v3.4s, v3.4s, v25.4s
+fmul v4.4s, v4.4s, v25.4s
+fmul v5.4s, v5.4s, v25.4s
+fmul v6.4s, v6.4s, v25.4s
 
 fcvtas v3.4s, v3.4s
 fcvtas v4.4s, v4.4s
@@ -99,8 +125,7 @@ sqxtn2 v9.16b, v8.8h
 smax v9.16b, v9.16b, v26.16b
 smin v9.16b, v9.16b, v27.16b
 
-bit v0.16b, v9.16b, v30.16b
-st1 {v0.16b}, [x0], #16
+st1 {v9.16b}, [x0], #16
 
 sub x5, x5, #4
 cmp x5, #4
@@ -113,13 +138,20 @@ beq PReluL1End
 
 PReluL1Loop:
 ld1 {v0.s}[0], [x1], #4
-cmlt v30.8b, v0.8b, #0
 
 sxtl v1.8h, v0.8b
 ssubw v1.8h, v1.8h, v29.8b
 sxtl v1.4s, v1.4h
 scvtf v1.4s, v1.4s
-fmul v1.4s, v1.4s, v31.4s
+// *input_scale
+fmul v1.4s, v1.4s, v24.4s
+fcmle v7.4s, v1.4s, #0
+// *slope
+fmul v11.4s, v1.4s, v31.4s
+bit v1.16b, v11.16b, v7.16b
+// *output_scale
+fmul v1.4s, v1.4s, v25.4s
+
 fcvtas v1.4s, v1.4s
 sqxtn v1.4h, v1.4s
 saddw v1.8h, v1.8h, v28.8b
@@ -127,8 +159,7 @@ sqxtn v1.8b, v1.8h
 smax v1.8b, v1.8b, v26.8b
 smin v1.8b, v1.8b, v27.8b
 
-bit v0.8b, v1.8b, v30.8b
-st1 {v0.s}[0], [x0], #4
+st1 {v1.s}[0], [x0], #4
 subs x5, x5, #1
 bne PReluL1Loop
 
@@ -144,4 +175,4 @@ End:
     ldp d14, d15, [sp], #64
     ret
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/backend/cpu/arm/arm64/MNNWinogradMatrixProductLeft.S b/source/backend/cpu/arm/arm64/MNNWinogradMatrixProductLeft.S
deleted file mode 100644
index f013aac62..000000000
--- a/source/backend/cpu/arm/arm64/MNNWinogradMatrixProductLeft.S
+++ /dev/null
@@ -1,171 +0,0 @@
-//
-//  MNNWinogradMatrixProductLeft.S
-//  MNN
-//
-//  Created by MNN on 2018/08/22.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNWinogradMatrixProductLeft
-//void MNNWinogradMatrixProductLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
-
-//Auto: x0: S, x1:B, x2: M, x3:w, x4:h, x5:k, x6:length
-
-//unitStepInFloat
-mov x8, #16 // 4*sizeof(float)
-mul x8, x6, x8
-
-//srcYUnitStep
-mul x9, x3, x8
-sub x9, x9, x8
-add x7, x9, x8
-
-//B's step
-mov x10, #4
-mul x10, x4, x10
-
-LoopY:
-    mov v4.d[0], x0
-    mov v4.d[1], x3
-    LoopX:
-        mov v5.d[0], x0
-        mov v5.d[1], x1
-        movi v30.4s, #0
-        mov x11, x6
-        LoopUnitSetZero:
-            st1 {v30.4s}, [x2], #16
-            subs x11, x11, #1
-            bne LoopUnitSetZero
-        sub x2, x2, x8
-        mov x12, x5
-
-        LK4:
-        cmp x12, #4
-        blt LK3
-        mov v6.d[0], x3
-        mov v6.d[1], x4
-        LoopK4:
-            ld1 {v0.s}[0], [x1], x10
-            ld1 {v0.s}[1], [x1], x10
-            ld1 {v0.s}[2], [x1], x10
-            ld1 {v0.s}[3], [x1], x10
-            mov x11, x6
-            mov v7.d[0], x1
-
-            add x1, x0, x7
-            add x3, x1, x7
-            add x4, x3, x7
-
-            LoopUnitK4:
-                ld1 {v16.4s}, [x2]
-                ld1 {v20.4s}, [x0], #16
-                fmla v16.4s, v20.4s, v0.s[0]
-                ld1 {v21.4s}, [x1], #16
-                fmul v17.4s, v21.4s, v0.s[1]
-                ld1 {v20.4s}, [x3], #16
-                fmla v16.4s, v20.4s, v0.s[2]
-                ld1 {v21.4s}, [x4], #16
-                fmla v17.4s, v21.4s, v0.s[3]
-
-                fadd v17.4s, v16.4s, v17.4s
-                st1 {v17.4s}, [x2], #16
-                subs x11, x11, #1
-                bne LoopUnitK4
-            sub x2, x2, x8
-
-            sub x12, x12, #4
-
-            add x0, x4, x9
-            mov x1, v7.d[0]
-            cmp x12, #4
-            bge LoopK4
-        mov x3, v6.d[0]
-        mov x4, v6.d[1]
-
-        LK3:
-        cmp x12, #3
-        blt LK1
-        mov v6.d[0], x3
-        LoopK3:
-            ld1 {v0.s}[0], [x1], x10
-            ld1 {v0.s}[1], [x1], x10
-            ld1 {v0.s}[2], [x1], x10
-            mov x11, x6
-            mov v7.d[0], x1
-
-            add x1, x0, x7
-            add x3, x1, x7
-
-            LoopUnitK3:
-                ld1 {v16.4s}, [x2]
-                ld1 {v20.4s}, [x0], #16
-                fmla v16.4s, v20.4s, v0.s[0]
-                ld1 {v21.4s}, [x1], #16
-                fmul v17.4s, v21.4s, v0.s[1]
-                ld1 {v20.4s}, [x3], #16
-                fmla v16.4s, v20.4s, v0.s[2]
-
-                fadd v17.4s, v16.4s, v17.4s
-                st1 {v17.4s}, [x2], #16
-                subs x11, x11, #1
-                bne LoopUnitK3
-            sub x2, x2, x8
-
-            sub x12, x12, #3
-
-            add x0, x3, x9
-            mov x1, v7.d[0]
-            cmp x12, #3
-            bge LoopK3
-        mov x3, v6.d[0]
-
-
-        LK1:
-        cmp x12, #0
-        beq LKEnd
-
-        LoopK:
-            ld1 {v31.s}[0], [x1], x10
-
-            dup v31.4s, v31.s[0]
-            mov x11, x6
-            LoopUnit:
-                ld1 {v0.4s}, [x2]
-                ld1 {v1.4s}, [x0], #16
-                fmla v0.4s, v1.4s, v31.4s
-
-                st1 {v0.4s}, [x2], #16
-                subs x11, x11, #1
-                bne LoopUnit
-            subs x12, x12, #1
-
-            sub x2, x2, x8
-            add x0, x0, x9
-            bne LoopK
-        LKEnd:
-        mov x0, v5.d[0]
-        mov x1, v5.d[1]
-        subs x3, x3, #1
-        add x0, x0, x8
-        add x2, x2, x8
-
-        bne LoopX
-    mov x0, v4.d[0]
-    mov x3, v4.d[1]
-    add x1, x1, #4 //sizeof(float)
-
-    subs x4, x4, #1
-    bne LoopY
-
-
-
-    ret
-
-#endif
diff --git a/source/backend/cpu/arm/arm64/MNNWinogradMatrixProductRight.S b/source/backend/cpu/arm/arm64/MNNWinogradMatrixProductRight.S
deleted file mode 100644
index 5542e3a93..000000000
--- a/source/backend/cpu/arm/arm64/MNNWinogradMatrixProductRight.S
+++ /dev/null
@@ -1,164 +0,0 @@
-//
-//  MNNWinogradMatrixProductRight.S
-//  MNN
-//
-//  Created by MNN on 2018/08/22.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNWinogradMatrixProductRight
-//void MNNWinogradMatrixProductRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
-
-//Auto: x0: S, x1:B, x2: M, x3:w, x4:h, x5:k, x6:length
-
-//unitStepInFloat
-mov x8, #16 // 4*sizeof(float)
-mul x8, x6, x8
-
-//srcYUnitStep
-mul x9, x5, x8
-
-//B's step
-mov x10, #4
-mul x10, x4, x10
-
-LoopY:
-    mov v4.d[0], x1
-    mov v4.d[1], x3
-    LoopX:
-        mov v5.d[0], x0
-        mov v5.d[1], x1
-        movi v30.4s, #0
-        mov x11, x6
-        LoopUnitSetZero:
-            st1 {v30.4s}, [x2], #16
-            subs x11, x11, #1
-            bne LoopUnitSetZero
-        sub x2, x2, x8
-        mov x12, x5
-
-        LK4:
-        cmp x12, #4
-        blt LK3
-        mov v6.d[0], x3
-        mov v6.d[1], x4
-        LoopK4:
-            ld1 {v0.s}[0], [x1], x10
-            ld1 {v0.s}[1], [x1], x10
-            ld1 {v0.s}[2], [x1], x10
-            ld1 {v0.s}[3], [x1], x10
-            mov x11, x6
-            mov v7.d[0], x1
-
-            add x1, x0, x8
-            add x3, x1, x8
-            add x4, x3, x8
-
-            LoopUnitK4:
-                ld1 {v16.4s}, [x2]
-                ld1 {v20.4s}, [x0], #16
-                fmla v16.4s, v20.4s, v0.s[0]
-                ld1 {v21.4s}, [x1], #16
-                fmul v17.4s, v21.4s, v0.s[1]
-                ld1 {v20.4s}, [x3], #16
-                fmla v16.4s, v20.4s, v0.s[2]
-                ld1 {v21.4s}, [x4], #16
-                fmla v17.4s, v21.4s, v0.s[3]
-
-                fadd v17.4s, v16.4s, v17.4s
-                st1 {v17.4s}, [x2], #16
-                subs x11, x11, #1
-                bne LoopUnitK4
-            sub x2, x2, x8
-            sub x12, x12, #4
-            mov x0, x4
-
-            mov x1, v7.d[0]
-            cmp x12, #4
-            bge LoopK4
-        mov x3, v6.d[0]
-        mov x4, v6.d[1]
-
-        LK3:
-        cmp x12, #3
-        blt LK1
-        mov v6.d[0], x3
-        LoopK3:
-            ld1 {v0.s}[0], [x1], x10
-            ld1 {v0.s}[1], [x1], x10
-            ld1 {v0.s}[2], [x1], x10
-            mov x11, x6
-            mov v7.d[0], x1
-
-            add x1, x0, x8
-            add x3, x1, x8
-
-            LoopUnitK3:
-                ld1 {v16.4s}, [x2]
-                ld1 {v20.4s}, [x0], #16
-                fmla v16.4s, v20.4s, v0.s[0]
-                ld1 {v21.4s}, [x1], #16
-                fmul v17.4s, v21.4s, v0.s[1]
-                ld1 {v20.4s}, [x3], #16
-                fmla v16.4s, v20.4s, v0.s[2]
-
-                fadd v17.4s, v16.4s, v17.4s
-                st1 {v17.4s}, [x2], #16
-                subs x11, x11, #1
-                bne LoopUnitK3
-            sub x2, x2, x8
-            sub x12, x12, #3
-            mov x0, x4
-            mov x1, v7.d[0]
-            cmp x12, #3
-            bge LoopK3
-        mov x3, v6.d[0]
-
-        LK1:
-        cmp x12, #0
-        beq LKEnd
-
-        LoopK:
-            ld1 {v31.s}[0], [x1], x10
-
-            dup v31.4s, v31.s[0]
-            mov x11, x6
-            LoopUnit:
-                ld1 {v0.4s}, [x2]
-                ld1 {v1.4s}, [x0], #16
-                fmla v0.4s, v1.4s, v31.4s
-
-                st1 {v0.4s}, [x2], #16
-                subs x11, x11, #1
-                bne LoopUnit
-            subs x12, x12, #1
-
-            sub x2, x2, x8
-            bne LoopK
-        LKEnd:
-        mov x0, v5.d[0]
-        mov x1, v5.d[1]
-        subs x3, x3, #1
-        add x2, x2, x8
-        add x1, x1, #4 //sizeof(float)
-
-        bne LoopX
-    mov x1, v4.d[0]
-    mov x3, v4.d[1]
-    add x0, x0, x9
-
-    subs x4, x4, #1
-    bne LoopY
-
-
-
-    ret
-
-#endif
diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp
index 0eefd7bb9..734851753 100644
--- a/source/backend/cpu/compute/CommonOptFunction.cpp
+++ b/source/backend/cpu/compute/CommonOptFunction.cpp
@@ -23,6 +23,9 @@
 #include "../CPUBinary.hpp"
 #include "../CPUUnary.hpp"
 #include "../CPUPool.hpp"
+#ifndef M_PI
+#define M_PI 3.14159265358979323846 // fallback when <cmath> does not provide M_PI; full double precision instead of a 10-digit truncation
+#endif
 #define PACK 4
 #define FLOAT float
 using Vec = MNN::Math::Vec<float, 4>;
@@ -314,7 +317,7 @@ static void MNNSumByAxisLForMatmul_A(float* dest, int8_t* source, const float* s
         dest += (step * blockNum);
         realDstCount -= step;
         srcInt8 += col_buffer_unit_size;
-    } while(realDstCount > 0); 
+    } while(realDstCount > 0);
 }
 
 template<typename T>
@@ -3099,6 +3102,21 @@ void MNNSiLuLowp(float* dst, const float* src, size_t dataSize) {
 #endif
 }
 
+// |DFT| of one windowed frame: output[k] = |sum_n input[n]*window[n]*exp(-2*pi*i*k*n/nfft)| for k = 0..nfft/2; buffer is nfft floats of scratch.
+void MNNDftAbs(const float* input, const float* window, float* output, float* buffer, int nfft) {
+    for (int i = 0; i < nfft; ++i) { buffer[i] = input[i] * window[i]; }
+    for (int k = 0; k < nfft / 2 + 1; ++k) {
+        float real_sum = 0.f, imag_sum = 0.f;
+        for (int n = 0; n < nfft; ++n) {
+            // Reduce k*n modulo nfft (64-bit product) so the float angle stays below 2*pi instead of losing precision as it grows.
+            float angle = 2 * M_PI * ((static_cast<long long>(k) * n) % nfft) / nfft;
+            real_sum += buffer[n] * std::cos(angle);
+            imag_sum -= buffer[n] * std::sin(angle);
+        }
+        output[k] = std::sqrt(real_sum * real_sum + imag_sum * imag_sum);
+    }
+}
+
 static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions::MNNPackedSparseMatMul& packedSparseMatMul) {
     if(sparseBlockOC == 4) {
         packedSparseMatMul = MNNPackedSparseMatMulEpx4;
@@ -3202,7 +3220,7 @@ void MNNCoreFunctionInit() {
     gCoreFunction->MNNFp16ToFp8 = MNNFp16ToFp8;
     gCoreFunction->MNNFp8ToFp32 = MNNFp8ToFp32;
     gCoreFunction->MNNFp8ToFp16 = MNNFp8ToFp16;
-    
+
     // MatMul
     gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode;
     gCoreFunction->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A;
diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h
index 9dac6d66e..0159aa286 100644
--- a/source/backend/cpu/compute/CommonOptFunction.h
+++ b/source/backend/cpu/compute/CommonOptFunction.h
@@ -101,6 +101,7 @@ void MNNGeluCommon(float* dst, const float* src, size_t size);
 void MNNGeluStandardCommon(float* dst, const float* src, size_t size);
 void MNNSoftmax(float* dest, const float* source, size_t size);
 void MNNNorm(float* dest, const float* source, const float *gamma, const float *beta, float epsilon, size_t size, bool RMSNorm = false);
+void MNNDftAbs(const float* input, const float* window, float* output, float* buffer, int nfft);
 
 // Get Pack for MatMul's e , l , h , the pack number must be 1 or 4 * n
 void MNNGetMatMulPackMode(int* eP, int *lP, int* hP);
@@ -313,7 +314,7 @@ struct CoreFunctions {
     void(*MNNPoolingMax)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput,
                            int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth,
                            int strideHeight, int padWidth, int padHeight, int padType, int countType);
-    
+
     void(*MNNPoolingMaxWithRedice)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput,
                            int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth,
                            int strideHeight, int padWidth, int padHeight, int padType, int countType, int *RediceOutput);
diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp
index 7dd218564..da25f6f95 100644
--- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp
+++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp
@@ -29,7 +29,7 @@ void MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const
 void MNNMaxPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx);
 
 void MNNAvgPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);
-void MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params);
+void MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, const QuanPrePostParameters *params, size_t pack);
 #if defined(__aarch64__) // aarch32 sdot workaround
 void MNNGemmInt8AddBiasScale_ARMV82_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
                                         const QuanPostTreatParameters* post, size_t realDstCount);
@@ -1543,7 +1543,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
     }
 }
 
-static void MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params) {
+static void MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, const QuanPrePostParameters *params, size_t pack) {
 #ifdef MNN_USE_SSE
 float offset = 128.f;
 uint8_t* srcPtr = (uint8_t*)src;
@@ -1554,24 +1554,22 @@ const int8_t* srcPtr = src;
 int8_t* dstPtr = dst;
 #endif
     float mulVal = 0.f;
-    float inputScale = params->inputScale[0];
-    float outputScale = params->outputScale[0];
     float inputZero = static_cast<float>(params->inputZeroPoint[0]) + offset;
     float outputZero = static_cast<float>(params->outputZeroPoint[0]) + offset;
     int32_t minval = params->minValue + offset;
     int32_t maxval = params->maxValue + offset;
     for (int j = 0;j < depthQuad; ++j) {
-        const float* slopeZ = slope + 4 * j;
-        const auto srcZ = srcPtr + 4 * j * planeNumber;
-        auto dstZ = dstPtr + 4 * j * planeNumber;
+        const float* slopeZ = slope + pack * j;
+        const auto srcZ = srcPtr + pack * j * planeNumber;
+        auto dstZ = dstPtr + pack * j * planeNumber;
         for (int i = 0; i < planeNumber; ++i) {
-            for (int c = 0; c < 4; ++c) {
-                if ((float)srcZ[4 * i + c] < inputZero) {
-                    mulVal = (srcZ[4 * i + c] - inputZero) * slopeZ[c];
-                    dstZ[4 * i + c] = ALIMIN(ALIMAX(static_cast<int32_t>(roundf(mulVal)) + outputZero, minval), maxval);
-                } else {
-                    dstZ[4 * i + c] = srcZ[4 * i + c];
+            for (int c = 0; c < pack; ++c) {
+                float valInput = (static_cast<float>(srcZ[pack * i + c]) - inputZero) * params->inputScale[0];
+                if (valInput < 0) {
+                    valInput *= slopeZ[c];
                 }
+                auto mulVal = valInput * params->outputScale[0] + outputZero;
+                dstZ[pack * i + c] = ALIMIN(ALIMAX(static_cast<int32_t>(roundf(mulVal)), minval), maxval);
             }
         }
     }
diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.h b/source/backend/cpu/compute/Int8FunctionsOpt.h
index eb405e6e8..460a8dfcd 100644
--- a/source/backend/cpu/compute/Int8FunctionsOpt.h
+++ b/source/backend/cpu/compute/Int8FunctionsOpt.h
@@ -113,7 +113,7 @@ struct CoreInt8Functions {
     void (*MNNAvgPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);
     
     // Relu
-    void (*MNNReluWithSlopeChannelInt8)(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params);
+    void (*MNNReluWithSlopeChannelInt8)(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, const QuanPrePostParameters *params, size_t pack);
 };
 void MNNCoreInt8FunctionInit();
 CoreInt8Functions* MNNGetInt8CoreFunctions();
diff --git a/source/backend/cpu/compute/WinogradOptFunction.cpp b/source/backend/cpu/compute/WinogradOptFunction.cpp
index 23b83eab2..31a602077 100644
--- a/source/backend/cpu/compute/WinogradOptFunction.cpp
+++ b/source/backend/cpu/compute/WinogradOptFunction.cpp
@@ -16,77 +16,10 @@
 
 using Vec4 = MNN::Math::Vec<float, 4>;
 #define DEFAULT_UNIT 8
-extern "C" {
-void MNNWinogradMatrixProductLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k,
-                                  size_t length);
-void MNNWinogradMatrixProductRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k,
-                                   size_t length);
-}
-
-#ifndef MNN_USE_NEON
-
-// M = BT * S , M = w*h * l, S = w*k * l, B = h*k
-void MNNWinogradMatrixProductLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k,
-                                  size_t length) {
-    auto unitStep = 4 * length;
-    for (int y = 0; y < h; ++y) {
-        auto dstY = M + y * w * unitStep;
-        for (int x = 0; x < w; ++x) {
-            auto dstX = dstY + x * unitStep;
-            auto srcX = S + x * unitStep;
-            ::memset(dstX, 0, unitStep * sizeof(float));
-            for (int i = 0; i < k; ++i) {
-                auto b    = B[i * h + y];
-                auto srcY = srcX + i * w * unitStep;
-                if (0.0f == b) {
-                    continue;
-                }
-                for (int j = 0; j < unitStep; ++j) {
-                    dstX[j] += srcY[j] * b;
-                }
-            }
-        }
-    }
-}
 
-// M = S * B , M = w*h * l, S = k*h * l, B = w*k
-void MNNWinogradMatrixProductRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k,
-                                   size_t length) {
-    auto unitStep = 4 * length;
-    for (int y = 0; y < h; ++y) {
-        auto dstY = M + y * w * unitStep;
-        auto srcY = S + y * k * unitStep;
-
-        for (int x = 0; x < w; ++x) {
-            auto dstX = dstY + x * unitStep;
-            ::memset(dstX, 0, unitStep * sizeof(float));
-            for (int i = 0; i < k; ++i) {
-                auto srcX = srcY + i * unitStep;
-                auto b    = B[i * h + x];
-                if (0.0f == b) {
-                    continue;
-                }
-                for (int j = 0; j < unitStep; ++j) {
-                    dstX[j] += srcX[j] * b;
-                }
-            }
-        }
-    }
-}
-#endif
 
 namespace MNN {
 
-
-void WinogradFunction::productLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k,
-                                   size_t length) {
-    MNNWinogradMatrixProductLeft(S, B, M, w, h, k, length);
-}
-
-void WinogradFunction::productRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k,
-                                    size_t length) {
-    MNNWinogradMatrixProductRight(S, B, M, w, h, k, length);
-}
 int WinogradFunction::getPreferNumber() {
     return DEFAULT_UNIT;
 }
diff --git a/source/backend/cpu/compute/WinogradOptFunction.hpp b/source/backend/cpu/compute/WinogradOptFunction.hpp
index 579811cc5..8608a2fde 100644
--- a/source/backend/cpu/compute/WinogradOptFunction.hpp
+++ b/source/backend/cpu/compute/WinogradOptFunction.hpp
@@ -15,9 +15,6 @@
 namespace MNN {
 class WinogradFunction {
 public:
-    static void productLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
-    static void productRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
-
     static int getPreferNumber();
 
     typedef void (*TransformFunc)(const float* srcBlock, float* dstStart, size_t srcStep, size_t dstStep);
diff --git a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp
index 21c8bd408..fc82e6971 100644
--- a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp
+++ b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp
@@ -132,7 +132,6 @@ void MNNInt8FunctionInit() {
     auto core = MNN::MNNGetInt8CoreFunctions();
     core->MNNAvgPoolInt8 = MNNAvgPoolUint8;
     core->MNNMaxPoolInt8 = MNNMaxPoolInt8_;
-    core->MNNReluWithSlopeChannelInt8 = _SSE_MNNReluWithSlopeChannelInt8;
     if (cpuFlags & libyuv::kCpuHasSSE41) {
         core->MNNFloat2Int8 = _SSE_MNNFloat2Int8;
         core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat;
diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp
index 5f8653066..0867c7c07 100644
--- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp
+++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp
@@ -36,7 +36,7 @@ void _SSE_MNNAddC4WithStride(const float* source, float* dest, size_t srcStride,
 void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad);
 
 void _SSE_MNNGelu(float* dst, const float* src, size_t size, float* parameters);
-void _SSE_MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params);
+void _SSE_MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, const QuanPrePostParameters *params, size_t pack);
 
 void _SSE_MNNHardSwish(float* dst, const float* src, size_t size);
 
diff --git a/source/backend/cpu/x86_x64/sse/MathFunctions.cpp b/source/backend/cpu/x86_x64/sse/MathFunctions.cpp
index b9e857006..8238cb187 100644
--- a/source/backend/cpu/x86_x64/sse/MathFunctions.cpp
+++ b/source/backend/cpu/x86_x64/sse/MathFunctions.cpp
@@ -290,46 +290,3 @@ void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float
     }
 }
 
-void _SSE_MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params) {
-    uint8_t* dstO = (uint8_t*)dst;
-    uint8_t* srcO = (uint8_t*)src;
-    auto outputZero = _mm_set1_ps(static_cast<float>(params->outputZeroPoint[0]));
-    __m128 maxValue = _mm_set1_ps(params->maxValue);
-    __m128 minValue = _mm_set1_ps(params->minValue);
-    auto offset = _mm_set1_epi32(128);
-    auto zero = _mm_set1_epi32(0);
-    __m128 plus = _mm_set1_ps(0.5f);
-    __m128 minus = _mm_set1_ps(-0.5f);
-    __m128i zeroPointValue = _mm_set1_epi32(static_cast<int32_t>(params->inputZeroPoint[0]) + 128);
-    for (int j = 0;j < depthQuad; ++j) {
-        auto slopeZ = _mm_loadu_ps(slope + 4 * j);
-        const uint8_t* srcZ = srcO + 4 * j * planeNumber;
-        uint8_t* dstZ = dstO + 4 * j * planeNumber;
-        int32_t srcZ_ext[4] = {*(int32_t*)srcZ, 0, 0, 0};
-        for (int i = 0; i < planeNumber; ++i) {
-            // auto srcData8 = _mm_loadu_si32(srcZ);
-            auto srcData8 = _mm_castps_si128(_mm_loadu_ps((float*)srcZ_ext));
-            auto srcData16 = _mm_unpacklo_epi8(srcData8, zero);
-            auto srcData32 = _mm_unpacklo_epi16(srcData16, zero);
-            srcData32 = _mm_sub_epi32(srcData32, zeroPointValue);
-            auto srcDataf  = _mm_cvtepi32_ps(srcData32);
-            auto mask1 = _mm_cmplt_ps(srcDataf, _mm_castsi128_ps(zero));
-            auto mask0 = _mm_cmpge_ps(srcDataf, _mm_castsi128_ps(zero));
-            auto f = _mm_mul_ps(srcDataf, slopeZ);
-            f = _mm_add_ps(f, outputZero);
-            f = _mm_min_ps(f, maxValue);
-            f = _mm_max_ps(f, minValue);
-            auto r = _mm_add_ps(_mm_and_ps(srcDataf, mask0), _mm_and_ps(f, mask1));
-            auto m0 = _mm_cmplt_ps(r, _mm_castsi128_ps(zero));
-            m0 = _mm_blendv_ps(plus, minus, m0);
-            r = _mm_add_ps(r, m0);
-            // Round to zero
-            auto d0 = _mm_cvtps_epi32(_mm_round_ps(r, 3));
-            d0 = _mm_add_epi32(d0, offset);
-            d0 = _mm_packs_epi32(d0, d0);
-            d0 = _mm_packus_epi16(d0, d0);
-            *((int*)dstZ + i) = _mm_cvtsi128_si32(d0);
-        }
-    }
-}
-
diff --git a/source/backend/metal/AllShader.cpp b/source/backend/metal/AllShader.cpp
index 7cafcce6c..afb0c6fda 100644
--- a/source/backend/metal/AllShader.cpp
+++ b/source/backend/metal/AllShader.cpp
@@ -1428,7 +1428,6 @@ const char* shader_MetalDeconvolution_metal =
 " int output_height;\n"
 " int output_size;\n"
 " int output_slice;\n"
-" \n"
 " int kernel_x;\n"
 " int kernel_y;\n"
 " int kernel_size;\n"
@@ -1438,12 +1437,10 @@ const char* shader_MetalDeconvolution_metal =
 " int pad_y;\n"
 " int dilation_x;\n"
 " int dilation_y;\n"
-" \n"
 " int delta_ky;\n"
 " int delta_kx;\n"
 " int delta_iy;\n"
 " int delta_ix;\n"
-" int has_bias;\n"
 " int batch;\n"
 " conv_activation_type activation;\n"
 "};\n"
@@ -1494,8 +1491,8 @@ const char* shader_MetalDeconvolution_metal =
 " const device M4 *biasTerms [[buffer(4)]],\n"
 " uint3 gid [[thread_position_in_grid]]) {\n"
 " if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch*cst.output_slice) return;\n"
-" \n"
-" FLOAT4 result=FLOAT4(biasTerms[(int)(gid.z/cst.batch)]);\n"
+" int oz=(int)gid.z/cst.batch;\n"
+" FLOAT4 result=FLOAT4(biasTerms[oz]);\n"
 " \n"
 " int oy=(int)gid.y+cst.pad_y;\n"
 " int ox=(int)gid.x+cst.pad_x;\n"
@@ -1512,7 +1509,7 @@ const char* shader_MetalDeconvolution_metal =
 " int min_iy=(oy-max_ky*cst.dilation_y)/cst.stride_y;\n"
 " int min_ix=(ox-max_kx*cst.dilation_x)/cst.stride_x;\n"
 " \n"
-" auto z_wt=wt+(int)gid.z*cst.kernel_size;\n"
+" auto z_wt=wt+oz*cst.kernel_size;\n"
 " auto z_in=in+(int)gid.z*cst.input_size;\n"
 " for (auto ky=max_ky,iy=min_iy; ky >= min_ky; ky -= cst.delta_ky,iy += cst.delta_iy) {\n"
 " for (auto kx=max_kx,ix=min_ix; kx >= min_kx; kx -= cst.delta_kx,ix += cst.delta_ix) {\n"
@@ -1670,6 +1667,7 @@ const char* shader_MetalConvolution1x1_metal =
 " int batch;\n"
 " int block_size;\n"
 " conv_activation_type activation;\n"
+" float scale_coef;\n"
 "};\n"
 "kernel void conv1x1_g1z4(const device M4 *in [[buffer(0)]],\n"
 " device M4 *out [[buffer(1)]],\n"
@@ -1711,7 +1709,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device MNN::char4x4 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid [[thread_position_in_grid]]) {\n"
 " if ((int)gid.x*CONV_UNROLL >= cst.output_size || (int)gid.y >= cst.output_slice || (int)gid.z >= cst.batch) return;\n"
 " int rx=gid.x*CONV_UNROLL;\n"
@@ -1724,8 +1722,8 @@ const char* shader_MetalConvolution1x1_metal =
 " int computeSize=min(cst.output_size-rx,CONV_UNROLL);\n"
 " int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
 " for (int bi=0; bi<cst.block_size; ++bi) {\n"
-" FLOAT4 bs0=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0]);\n"
-" FLOAT4 bs1=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1]);\n"
+" FLOAT4 bs0=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 bs1=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
 " FLOAT4 scale=bs0;\n"
 " FLOAT4 dequant_bias=bs1;\n"
 " int zmin=bi*block;\n"
@@ -1759,7 +1757,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device uchar2 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid [[threadgroup_position_in_grid]],\n"
 " uint tiitg[[thread_index_in_threadgroup]],\n"
 " uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
@@ -1793,8 +1791,8 @@ const char* shader_MetalConvolution1x1_metal =
 " int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
 " for (int bi=0; bi<cst.block_size; ++bi) {\n"
 " // [N/4,cst.block_size,2/*scale_bias*/,N4]\n"
-" FLOAT4 scale=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+0]);\n"
-" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+1]);\n"
+" FLOAT4 scale=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
 " int zmin=bi*block;\n"
 " int zmax=min(zmin+block,cst.input_slice);\n"
 " for (int z=zmin+kl; z<zmax; z += 8) {\n"
@@ -1849,7 +1847,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device uchar2 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid [[threadgroup_position_in_grid]],\n"
 " uint tiitg[[thread_index_in_threadgroup]],\n"
 " uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
@@ -1884,8 +1882,8 @@ const char* shader_MetalConvolution1x1_metal =
 " int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
 " for (int bi=0; bi<cst.block_size; ++bi) {\n"
 " // [N/4,cst.block_size,2/*scale_bias*/,N4]\n"
-" FLOAT4 scale=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+0]);\n"
-" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+1]);\n"
+" FLOAT4 scale=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(idx_n4*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
 " int zmin=bi*block;\n"
 " int zmax=min(zmin+block,cst.input_slice);\n"
 " for (int z=zmin+kl; z<zmax; z += 2) {\n"
@@ -1945,7 +1943,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device uchar2 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid [[threadgroup_position_in_grid]],\n"
 " uint tiitg[[thread_index_in_threadgroup]],\n"
 " uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
@@ -1980,10 +1978,10 @@ const char* shader_MetalConvolution1x1_metal =
 " int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
 " for (int bi=0; bi<cst.block_size; ++bi) {\n"
 " // [N/4,cst.block_size,2/*scale_bias*/,N4]\n"
-" FLOAT4 scale0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+0]);\n"
-" FLOAT4 dequant_bias0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+1]);\n"
-" FLOAT4 scale1=FLOAT4(dequantScale[2*(idx_n41*cst.block_size+bi)+0]);\n"
-" FLOAT4 dequant_bias1=FLOAT4(dequantScale[2*(idx_n41*cst.block_size+bi)+1]);\n"
+" FLOAT4 scale0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 scale1=FLOAT4(dequantScale[2*(idx_n41*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias1=FLOAT4(dequantScale[2*(idx_n41*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
 " int zmin=bi*block;\n"
 " int zmax=min(zmin+block,cst.input_slice);\n"
 " for (int z=zmin+kl; z<zmax; z += 2) {\n"
@@ -2048,7 +2046,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device uchar2 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid [[threadgroup_position_in_grid]],\n"
 " uint tiitg[[thread_index_in_threadgroup]],\n"
 " uint tiisg[[thread_index_in_simdgroup]],\n"
@@ -2106,8 +2104,8 @@ const char* shader_MetalConvolution1x1_metal =
 " int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
 " for (int bi=0; bi<cst.block_size; ++bi) {\n"
 " // [N/4,cst.block_size,2/*scale_bias*/,N4]\n"
-" FLOAT4 scale0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+0]);\n"
-" FLOAT4 dequant_bias0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+1]);\n"
+" FLOAT4 scale0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias0=FLOAT4(dequantScale[2*(idx_n40*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
 " int zmin=bi*block;\n"
 " int zmax=min(zmin+block,cst.input_slice);\n"
 " \n"
@@ -2174,7 +2172,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device MNN::uchar4x2 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid [[thread_position_in_grid]]) {\n"
 " if ((int)gid.x*CONV_UNROLL >= cst.output_size || (int)gid.y >= cst.output_slice || (int)gid.z >= cst.batch) return;\n"
 " int rx=gid.x*CONV_UNROLL;\n"
@@ -2187,8 +2185,8 @@ const char* shader_MetalConvolution1x1_metal =
 " int computeSize=min(cst.output_size-rx,CONV_UNROLL);\n"
 " int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
 " for (int bi=0; bi<cst.block_size; ++bi) {\n"
-" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0]);\n"
-" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1]);\n"
+" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
 " int zmin=bi*block;\n"
 " int zmax=min(zmin+block,cst.input_slice);\n"
 " for (int z=zmin; z<zmax; z++) {\n"
@@ -2226,7 +2224,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device MNN::uchar4x2 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid[[threadgroup_position_in_grid]],\n"
 " uint tiisg[[thread_index_in_simdgroup]],\n"
 " uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
@@ -2250,8 +2248,8 @@ const char* shader_MetalConvolution1x1_metal =
 " int outer_index=(tiisg)/middle_step;\n"
 " \n"
 " for (int bi= outer_index; bi<cst.block_size; bi += outer_step) {\n"
-" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0]);\n"
-" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1]);\n"
+" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1])/(FLOAT)cst.scale_coef;\n"
 " int zmin=bi*block;\n"
 " int zmax=min(zmin+block,cst.input_slice);\n"
 " for (int z=zmin+middle_index; z<zmax; z += middle_step) {\n"
@@ -2279,7 +2277,7 @@ const char* shader_MetalConvolution1x1_metal =
 " constant conv1x1_constants& cst [[buffer(2)]],\n"
 " const device MNN::uchar4x2 *wt [[buffer(3)]],\n"
 " const device M4 *biasTerms [[buffer(4)]],\n"
-" const device float4 *dequantScale [[buffer(5)]],\n"
+" const device M4 *dequantScale [[buffer(5)]],\n"
 " uint3 gid[[threadgroup_position_in_grid]],\n"
 " uint tiisg[[thread_index_in_simdgroup]],\n"
 " uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
@@ -2306,10 +2304,10 @@ const char* shader_MetalConvolution1x1_metal =
 " \n"
 " for (int bi= outer_index; bi<cst.block_size; bi += outer_step) {\n"
 " const int quant_offset=2*(uz*cst.block_size+bi);\n"
-" FLOAT4 scale0=FLOAT4(dequantScale[quant_offset+0]);\n"
-" FLOAT4 dequant_bias0=FLOAT4(dequantScale[quant_offset+1]);\n"
-" FLOAT4 scale1=FLOAT4(dequantScale[quant_offset+(cst.block_size << 1)]);\n"
-" FLOAT4 dequant_bias1=FLOAT4(dequantScale[quant_offset+(cst.block_size << 1)+1]);\n"
+" FLOAT4 scale0=FLOAT4(dequantScale[quant_offset+0])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias0=FLOAT4(dequantScale[quant_offset+1])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 scale1=FLOAT4(dequantScale[quant_offset+(cst.block_size << 1)])/(FLOAT)cst.scale_coef;\n"
+" FLOAT4 dequant_bias1=FLOAT4(dequantScale[quant_offset+(cst.block_size << 1)+1])/(FLOAT)cst.scale_coef;\n"
 " int zmin=bi*block;\n"
 " int zmax=min(zmin+block,cst.input_slice);\n"
 " for (int z=zmin+middle_index; z<zmax; z += middle_step) {\n"
diff --git a/source/backend/metal/MetalAttention.mm b/source/backend/metal/MetalAttention.mm
index bcfdbf972..18db999e6 100644
--- a/source/backend/metal/MetalAttention.mm
+++ b/source/backend/metal/MetalAttention.mm
@@ -11,592 +11,12 @@
 #import "MetalCast.hpp"
 #import "MetalBackend.hpp"
 #import "MNNMetalContext.h"
+#import "MetalAttentionShader.hpp"
 #include "MNN_generated.h"
 
 #if MNN_METAL_ENABLED
 #ifdef MNN_SUPPORT_TRANSFORMER_FUSE
 
-static const char* gMatMulDivMask = R"metal(
-#include <metal_stdlib>
-#include <simd/simd.h>
-using namespace metal;
-struct Param {
-    int query_seq_len;
-    int key_seq_len;
-    int head_num;
-    int group;
-    int head_dim;
-    float scale;
-};
-#define SIMD_GROUP_WIDTH 32
-
-kernel void prefill(const device T* input0 [[buffer(0)]],
-    const device T* input1 [[buffer(1)]],
-    device T* output [[buffer(2)]],
-    device T* past_key [[buffer(3)]],
-#ifdef FLOAT_MASK
-    const device T* mask [[buffer(4)]],
-#else
-    const device int* mask [[buffer(4)]],
-#endif
-    constant Param& param [[buffer(5)]],
-#ifdef SIMD_GROUP_MATRIX
-    uint3 gid[[threadgroup_position_in_grid]],
-    uint tiitg[[thread_index_in_threadgroup]],
-    uint tiisg[[thread_index_in_simdgroup]],
-    uint sgitg[[simdgroup_index_in_threadgroup]]
-#else
-    uint3 gid[[thread_position_in_grid]]
-#endif
-) {
-#ifdef SIMD_GROUP_MATRIX
-
-    /*
-     // Read:
-     ftype 0~127   ---> input: [M16, K8]
-     ftype 128~255 ---> input: [K8, N16]
-     // Write:
-     ftype 0~255 ---> input: [N2, M2, M8, N8]
-     */
-    
-    simdgroup_float8x8 sga[2];
-    simdgroup_float8x8 sgb[2];
-    simdgroup_float8x8 sgd[4];
-    for (int i = 0; i < 4; i++){
-        sgd[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
-    }
-
-    int kl = tiitg % 2;// 0~1
-    int rcl = tiitg / 2;// 0~15
-
-    const int slq = gid.x; // q_seq_len/16 -> M/16
-    const int slk = gid.y; // k_seq_len/16 -> N/16
-    const int z = gid.z; // head_num
-
-    /** Q:
-     threadgroup: [M16, K8]
-     each thread: K4
-     layout: [M, B, K] -> [M/16, M16, B, K/8, K2, K4]
-     index : [slq, rcl, z, 0, kl, K4]
-     offset: ((slq * 16 + rcl) * B + z) * K + (0 * 2 + kl) * 4 + 0
-     */
-    /** K:
-     threadgroup: [K8, N16]
-     each thread: N4
-     layout: [N, B/G, K] -> [N/16, N16, B/G, K/8, K2, K4]
-     index : [slk, rcl, B/G, 0, kl, 0]
-     offset: ((slk * 16 + rcl) * B/G + z/G) * K + 0 * 8 + kl * 4 + 0
-     */
-    /** output:
-     threadgroup: [M16, N16]
-     each thread: N8
-     layout: [B, M, N] -> [B, M/16, M16, N/16, N2, N8]
-     index : [z, sl, rcl, kl, 0]
-     offset: (z * M + sl * 16 + rcl) * N + slk * 16 + kl * 8 + 0
-     */
-
-    int group = param.group;
-    int zin = z / param.group;
-    int q_seq_len = param.query_seq_len;
-    int k_seq_len = param.key_seq_len;
-    int head_num = param.head_num;
-    int head_dim = param.head_dim;
-    const int stride = head_num * head_dim / group;
-
-    threadgroup float sdata[256] = {0.f};
-
-    int idx_slq = slq * 16 + rcl < q_seq_len ? slq * 16 + rcl : q_seq_len - 1;
-    int idx_slk = slk * 16 + rcl < k_seq_len ? slk * 16 + rcl : k_seq_len - 1;
-
-    auto A_offset = input0 + (idx_slq * head_num + z) * head_dim + (0 * 2 + kl) * 4 + 0;
-    auto B_offset = input1 + (idx_slk * head_num / group + zin) * head_dim + 0 * 8 + kl * 4 + 0;
-       
-    for(int i = 0; i < head_dim; i += 8){
-        sdata[rcl * 8 + kl * 4 + 0] = A_offset[i + 0];
-        sdata[rcl * 8 + kl * 4 + 1] = A_offset[i + 1];
-        sdata[rcl * 8 + kl * 4 + 2] = A_offset[i + 2];
-        sdata[rcl * 8 + kl * 4 + 3] = A_offset[i + 3];
-        
-        sdata[128 + (kl * 4 + 0) * 16 + rcl] = B_offset[i + 0];
-        sdata[128 + (kl * 4 + 1) * 16 + rcl] = B_offset[i + 1];
-        sdata[128 + (kl * 4 + 2) * 16 + rcl] = B_offset[i + 2];
-        sdata[128 + (kl * 4 + 3) * 16 + rcl] = B_offset[i + 3];
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        simdgroup_load(sga[0], (const threadgroup float*)sdata, 8);
-        simdgroup_load(sga[1], ((const threadgroup float*)sdata) + 64, 8);
-        
-        simdgroup_load(sgb[0], ((const threadgroup float*)sdata) + 128, 16);
-        simdgroup_load(sgb[1], ((const threadgroup float*)sdata) + 136, 16);
-        
-        simdgroup_multiply_accumulate(sgd[0], sga[0], sgb[0], sgd[0]);
-        simdgroup_multiply_accumulate(sgd[1], sga[1], sgb[0], sgd[1]);
-        simdgroup_multiply_accumulate(sgd[2], sga[0], sgb[1], sgd[2]);
-        simdgroup_multiply_accumulate(sgd[3], sga[1], sgb[1], sgd[3]);
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    simdgroup_store(sgd[0], (threadgroup float*)sdata, 8);
-    simdgroup_store(sgd[1], (threadgroup float*)sdata + 64, 8);
-    simdgroup_store(sgd[2], (threadgroup float*)sdata + 128, 8);
-    simdgroup_store(sgd[3], (threadgroup float*)sdata + 192, 8);
-    
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // [N2, M2, M8, N8]
-    float Vscale = (float)param.scale;
-
-    auto xy_out = output + (z * q_seq_len + slq * 16 + rcl) * k_seq_len + slk * 16 + kl * 8 + 0;
-    if(slq * 16 + rcl < q_seq_len) {
-        if(slk * 16 + kl * 8 + 0 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 0] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 0))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 0))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[0] = out0;
-        }
-        if(slk * 16 + kl * 8 + 1 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 1] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 1))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 1))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[1] = out0;
-        }
-        if(slk * 16 + kl * 8 + 2 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 2] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 2))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 2))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[2] = out0;
-        }
-        if(slk * 16 + kl * 8 + 3 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 3] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 3))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 3))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[3] = out0;
-        }
-        if(slk * 16 + kl * 8 + 4 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 4] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 4))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 4))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[4] = out0;
-        }
-        if(slk * 16 + kl * 8 + 5 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 5] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 5))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 5))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[5] = out0;
-        }
-        if(slk * 16 + kl * 8 + 6 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 6] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 6))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 6))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[6] = out0;
-        }
-        if(slk * 16 + kl * 8 + 7 < k_seq_len) {
-            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 7] * Vscale;
-            #ifdef FLOAT_MASK
-                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 7))] + out0;
-            #else
-                out0 = mask[((slq * 16 + rcl) * key_seq_len + (slk * 16 + kl * 8 + 7))] == 0 ? -FLT_MAX : out0;
-            #endif
-            xy_out[7] = out0;
-        }
-    }
-
-#else
-    const int x = gid.x; // query_seq_len
-    const int y = gid.y; // head_num
-    const int z = gid.z; // key_seq_len
-
-    if (x >= param.query_seq_len || y >= param.head_num || z >= param.key_seq_len) {
-        return;
-    }
-    int group = param.group;
-    int query_seq_len = param.query_seq_len;
-    int key_seq_len = param.key_seq_len;
-    int head_num = param.head_num;
-    int head_dim = param.head_dim;
-    
-    const int offset = head_num * head_dim;
-    const int offset_head = y * head_dim;
-    const int offset_head_kv = (y / param.group) * head_dim;
-    const device T* A_offset = input0 + x * offset + offset_head;
-
-    float Vscale = (float)param.scale;
-
-    device const T* B_offset = input1 + z * offset / group + offset_head_kv;
-    const int output_offset = y * query_seq_len * key_seq_len;
-    float out0 = 0.0;
-    
-    for(int i = 0; i < head_dim; ++i){
-        float A = (float)(A_offset[i]);
-        float B = (float)(B_offset[i]);
-        out0 += B * A;
-    }
-    
-    out0 *= Vscale;
-    
-#ifdef FLOAT_MASK
-    out0 = mask[((x + 0) * key_seq_len + (z + 0))] + out0;
-#else
-    out0 = mask[((x + 0) * key_seq_len + (z + 0))] == 0 ? -FLT_MAX : out0;
-#endif
-    output[output_offset + x * key_seq_len + z] = (T)out0;
-#endif
-}
-
-kernel void decode(const device T* input0 [[buffer(0)]],
-    const device T* input1 [[buffer(1)]],
-    device T* output [[buffer(2)]],
-    device T* past_key [[buffer(3)]],
-#ifdef FLOAT_MASK
-    const device T* mask [[buffer(4)]],
-#else
-    const device int* mask [[buffer(4)]],
-#endif
-    constant Param& param [[buffer(5)]],
-#ifdef SIMD_GROUP_REDUCE
-    uint3 gid[[threadgroup_position_in_grid]],
-    uint  tiisg[[thread_index_in_simdgroup]],
-    uint  sgitg[[simdgroup_index_in_threadgroup]]
-#else
-    uint3 gid[[thread_position_in_grid]]
-#endif
-) {
-    const int x = gid.x; // query_seq_len
-    const int y = gid.y; // head_num
-    const int z = gid.z; // key_seq_len
-    if (x >= param.query_seq_len || y >= param.head_num || z >= param.key_seq_len) {
-        return;
-    }
-    int group = param.group;
-
-    int key_seq_len = param.key_seq_len;
-    int head_num = param.head_num;
-    int head_dim = param.head_dim;
-    
-    const int offset = head_num * head_dim;
-    const int offset_head = y * head_dim;
-    const int offset_head_kv = (y / param.group) * head_dim;
-    const device T* A_offset = input0 + x * offset + offset_head;
-    device T* Pastkey_offset = past_key + z * offset / group + offset_head_kv;
-    float Vscale = (float)param.scale;
-
-    const device T *B_offset = input1 + offset_head_kv;
-    float out = 0.0;
-
-#ifdef SIMD_GROUP_REDUCE
-    {
-        for(int i = tiisg; i < head_dim; i+=SIMD_GROUP_WIDTH){
-            float A = A_offset[i];
-            float B = (float)Pastkey_offset[i];
-            
-            out += A * B;
-        }
-    }
-    out = simd_sum(out);
-    if(tiisg == 0) {
-        out *= Vscale;
-        output[y * key_seq_len + z] = (T)out;
-    }
-#else
-    {
-        for(int i = 0; i < head_dim; i++){
-            float A = A_offset[i];
-            float B = (float)Pastkey_offset[i];
-            
-            out += A * B;
-        }
-    }
-    out *= Vscale;
-    output[y * key_seq_len + z] = (T)out;
-#endif
-}
-
-)metal";
-
-static const char* gCopyPastKV = R"metal(
-#include <metal_stdlib>
-using namespace metal;
-struct Param {
-    int head_count;
-    int kv_seq_len;
-    int src_offset;
-    int dst_offset;
-};
-kernel void copy(const device T* input0 [[buffer(0)]],
-    const device T* input1 [[buffer(1)]],
-    device T* output0 [[buffer(2)]],
-    device T* output1 [[buffer(3)]],
-    constant Param& param [[buffer(4)]],
-    uint3 gid[[thread_position_in_grid]]
-) {
-    const int x = gid.x; // head_num / group * head_dim / 4
-    const int y = gid.y; // kv_seq_len
-    if (x >= param.head_count || y >= param.kv_seq_len) {
-        return;
-    }
-    const int index = y * param.head_count + x;
-    output0[param.dst_offset + index] = input0[param.src_offset + index];
-    output1[param.dst_offset + index] = input1[param.src_offset + index];
-}
-)metal";
-
-static const char* gMatMulQKV = R"metal(
-
-#include <metal_stdlib>
-#include <simd/simd.h>
-using namespace metal;
-struct Param {
-    int query_seq_len;
-    int key_seq_len;
-    int head_num;
-    int group;
-    int head_dim;
-    float scale;
-};
-#define SIMD_GROUP_WIDTH 32
-kernel void prefill(const device T* input0 [[buffer(0)]],
-    const device T* input1 [[buffer(1)]],
-    device T* output [[buffer(2)]],
-    device T* past_value [[buffer(3)]],
-    constant Param& param [[buffer(4)]],
-#ifdef SIMD_GROUP_MATRIX
-    uint3 gid[[threadgroup_position_in_grid]],
-    uint tiitg[[thread_index_in_threadgroup]],
-    uint tiisg[[thread_index_in_simdgroup]],
-    uint sgitg[[simdgroup_index_in_threadgroup]]
-#else
-    uint3 gid[[thread_position_in_grid]]
-#endif
-) {
-#ifdef SIMD_GROUP_MATRIX
-    /*
-     // Read:
-     ftype 0~127   ---> input: [M16, K8]
-     ftype 128~255 ---> input: [K8, N16]
-     // Write:
-     ftype 0~255 ---> input: [N2, M2, M8, N8]
-     */
-    
-    simdgroup_float8x8 sga[2];
-    simdgroup_float8x8 sgb[2];
-    simdgroup_float8x8 sgd[4];
-    for (int i = 0; i < 4; i++){
-        sgd[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
-    }
-
-    int kl = tiitg % 2;// 0~1
-    int rcl = tiitg / 2;// 0~15
-
-    int nl = tiitg % 4;// 0~3
-    int kcl = tiitg / 4;// 0~7
-
-    const int sl = gid.x; // q_seq_len/16 -> M/16
-    const int hm = gid.y; // head_dim/16 -> N/16
-    const int z = gid.z; // head_num
-
-    /** QK:
-     threadgroup: [M16, K8]
-     each thread: K4
-     layout: [B, M, K] -> [B, M/16, M16, K/8, K2, K4]
-     index : [z, sl, rcl, ml, kl, K4]
-     offset: (z * M + sl * 16 + rcl) * K + (0 * 2 + kl) * 4 + 0
-     */
-    /** V:
-     threadgroup: [K8, N16]
-     each thread: N4
-     layout: [K, B/G, N] -> [K/8, K8, B/G, N/16, N4, N4]
-     index : [0, kcl, B/G, hm, nl, 0]
-     offset: ((0 * 8 + kcl) * B/G + z/G) * N + hm * 16 + nl * 4 + 0
-     */
-    /** output:
-     threadgroup: [M16, N16]
-     each thread: N8
-     layout: [M, B, N] -> [M/16, M16, B, N/16, N2, N8]
-     index : [sl, rcl, B, kl, 0]
-     offset: ((sl * 16 + rcl) * B + z) * N + hm * 16 + kl * 8 + 0
-     */
-
-    int group = param.group;
-    int zin = z / param.group;
-    int qk_seq_len = param.query_seq_len;
-    int value_seq_len = param.key_seq_len;
-    int head_num = param.head_num;
-    int head_dim = param.head_dim;
-    const int stride = head_num * head_dim / group;
-
-    threadgroup float sdata[256] = {0.f};
-
-    int idx_qk_sl = sl * 16 + rcl < qk_seq_len ? (sl * 16 + rcl) : qk_seq_len - 1;
-
-    auto A_offset = input0 + (z * qk_seq_len + idx_qk_sl) * value_seq_len + (0 * 2 + kl) * 4 + 0;
-    auto B_offset = input1 + ((0 * 8 + kcl) * head_num / group + zin) * head_dim + hm * 16 + nl * 4 + 0;
-       
-    for(int i = 0; i < value_seq_len; i += 8){
-        sdata[rcl * 8 + kl * 4 + 0] = (i + kl * 4 + 0 < value_seq_len) ? A_offset[i + 0] : 0.0;
-        sdata[rcl * 8 + kl * 4 + 1] = (i + kl * 4 + 1 < value_seq_len) ? A_offset[i + 1] : 0.0;
-        sdata[rcl * 8 + kl * 4 + 2] = (i + kl * 4 + 2 < value_seq_len) ? A_offset[i + 2] : 0.0;
-        sdata[rcl * 8 + kl * 4 + 3] = (i + kl * 4 + 3 < value_seq_len) ? A_offset[i + 3] : 0.0;
-        
-        sdata[128 + kcl * 16 + nl * 4 + 0] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 0 < head_dim) ? B_offset[i * stride + 0] : 0.0;
-        sdata[128 + kcl * 16 + nl * 4 + 1] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 1 < head_dim) ? B_offset[i * stride + 1] : 0.0;
-        sdata[128 + kcl * 16 + nl * 4 + 2] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 2 < head_dim) ? B_offset[i * stride + 2] : 0.0;
-        sdata[128 + kcl * 16 + nl * 4 + 3] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 3 < head_dim) ? B_offset[i * stride + 3] : 0.0;
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        simdgroup_load(sga[0], (const threadgroup float*)sdata, 8);
-        simdgroup_load(sga[1], ((const threadgroup float*)sdata) + 64, 8);
-        
-        simdgroup_load(sgb[0], ((const threadgroup float*)sdata) + 128, 16);
-        simdgroup_load(sgb[1], ((const threadgroup float*)sdata) + 136, 16);
-        
-        simdgroup_multiply_accumulate(sgd[0], sga[0], sgb[0], sgd[0]);
-        simdgroup_multiply_accumulate(sgd[1], sga[1], sgb[0], sgd[1]);
-        simdgroup_multiply_accumulate(sgd[2], sga[0], sgb[1], sgd[2]);
-        simdgroup_multiply_accumulate(sgd[3], sga[1], sgb[1], sgd[3]);
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    simdgroup_store(sgd[0], (threadgroup float*)sdata, 8);
-    simdgroup_store(sgd[1], (threadgroup float*)sdata + 64, 8);
-    simdgroup_store(sgd[2], (threadgroup float*)sdata + 128, 8);
-    simdgroup_store(sgd[3], (threadgroup float*)sdata + 192, 8);
-    
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // [N2, M2, M8, N8]
-    auto xy_out = output + ((sl * 16 + rcl) * head_num + z) * head_dim + hm * 16 + kl * 8 + 0;
-    if(sl * 16 + rcl < qk_seq_len) {
-        if(hm * 16 + kl * 8 + 0 < head_dim) {
-            xy_out[0] =  sdata[(kl * 16 + rcl) * 8 + 0];
-        }
-        if(hm * 16 + kl * 8 + 1 < head_dim) {
-            xy_out[1] =  sdata[(kl * 16 + rcl) * 8 + 1];
-        }
-        if(hm * 16 + kl * 8 + 2 < head_dim) {
-            xy_out[2] =  sdata[(kl * 16 + rcl) * 8 + 2];
-        }
-        if(hm * 16 + kl * 8 + 3 < head_dim) {
-            xy_out[3] =  sdata[(kl * 16 + rcl) * 8 + 3];
-        }
-        if(hm * 16 + kl * 8 + 4 < head_dim) {
-            xy_out[4] =  sdata[(kl * 16 + rcl) * 8 + 4];
-        }
-        if(hm * 16 + kl * 8 + 5 < head_dim) {
-            xy_out[5] =  sdata[(kl * 16 + rcl) * 8 + 5];
-        }
-        if(hm * 16 + kl * 8 + 6 < head_dim) {
-            xy_out[6] =  sdata[(kl * 16 + rcl) * 8 + 6];
-        }
-        if(hm * 16 + kl * 8 + 7 < head_dim) {
-            xy_out[7] =  sdata[(kl * 16 + rcl) * 8 + 7];
-        }
-    }
-
-#else
-    const int x = gid.x; // kv_seq_len
-    const int y = gid.y; // head_num
-    const int z = gid.z; // head_dim
-    if (x >= param.query_seq_len || y >= param.head_num || z >= param.head_dim) {
-        return;
-    }
-    int group = param.group;
-    int yin = y / param.group;
-    int qk_seq_len = param.query_seq_len;
-    int value_seq_len = param.key_seq_len;
-    int head_num = param.head_num;
-    int head_dim = param.head_dim;
-    const int stride = head_num * head_dim / group;
-    const int offset_head = yin * head_dim + z;
-
-    device const T *A_offset = input0 + (y * qk_seq_len + x) * value_seq_len;
-    device const T *B_offset = input1 + offset_head;
-    float out = 0.0;
-    
-    for(int i = 0; i < value_seq_len; ++i){
-        float A0 = (float)A_offset[i];
-        float B = (float)B_offset[i*stride];
-        out += A0 * B;
-    }
-    output[ x * stride * group + (y * head_dim + z)] = out;
-#endif
-}
-
-kernel void decode(const device T* input0 [[buffer(0)]],
-    const device T* input1 [[buffer(1)]],
-    device T* output [[buffer(2)]],
-    device T* past_value [[buffer(3)]],
-    constant Param& param [[buffer(4)]],
-#ifdef SIMD_GROUP_REDUCE
-    uint3 gid[[threadgroup_position_in_grid]],
-    uint  tiisg[[thread_index_in_simdgroup]],
-    uint  sgitg[[simdgroup_index_in_threadgroup]]
-#else
-    uint3 gid[[thread_position_in_grid]]
-#endif
-) {
-    const int x = gid.x; // query_seq_len
-    const int y = gid.y; // head_num
-    const int z = gid.z; // head_dim
-    if (x >= param.query_seq_len || y >= param.head_num || z >= param.head_dim) {
-        return;
-    }
-    int group = param.group;
-    int yin = y / param.group;
-
-    int value_seq_len = param.key_seq_len;
-    int head_num = param.head_num;
-    int head_dim = param.head_dim;
-    const int stride = head_num * head_dim / group;
-    const int offset_head = yin * head_dim + z;
-
-    device const T *A_offset = input0 + y * value_seq_len;
-    device T *Pastvalue_offset = past_value + offset_head;
-    float out = 0;
-    
-#ifdef SIMD_GROUP_REDUCE
-    for(int i = tiisg; i < value_seq_len; i+=SIMD_GROUP_WIDTH){
-        float A = (float)A_offset[i];
-        float B = (float)Pastvalue_offset[i * stride];
-        
-        out += A * B;
-    }
-    out = simd_sum(out);
-    if(tiisg == 0) {
-        output[(y * head_dim + z)] = (T)out;
-    }
-#else
-    for(int i = 0; i < value_seq_len; i++){
-        float A = (float)A_offset[i];
-        float B = (float)Pastvalue_offset[i * stride];
-        
-        out += A * B;
-    }
-    output[(y * head_dim + z)] = (T)out;
-#endif
-}
-)metal";
-
 namespace MNN {
 class AttentionBufExecution : public MetalExecution {
 public:
@@ -621,7 +41,8 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
 
 private:
     void _init();
-    void reallocKVCache();
+    void reallocKVCache(int history_len);
+    void compilerShader(const std::vector<Tensor *> &inputs);
     bool mKVCache;
     std::shared_ptr<SharedCache> mCache;
     float mScale;
@@ -639,6 +60,14 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
     id<MTLBuffer> mParamQKV;
     id<MTLBuffer> mParamSoftmax;
     id<MTLBuffer> mParamCopy;
+    
+private:
+    bool mQkSimdReduce = false;
+    bool mQkSimdMatrix = false;
+    bool mSftmSimdReduce = false;
+    bool mQkvSimdReduce = false;
+    bool mQkvSimdMatrix = false;
+    bool mUseHeadNum2 = false;
 };
 
 struct Param {
@@ -648,6 +77,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
     int group;
     int head_dim;
     float scale;
+    int max_kv_len;
 };
 AttentionBufExecution::AttentionBufExecution(Backend *backend, bool kv_cahce)
     : MetalExecution(backend) , mKVCache(kv_cahce) {
@@ -659,29 +89,48 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
     auto context = (__bridge MNNMetalContext *)mtbn->context();
     mParamQKV = [context newDeviceBuffer:sizeof(Param) access:CPUWriteOnly];
     mParamSoftmax = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
-    mParamCopy = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
+    mParamCopy = [context newDeviceBuffer:5 * sizeof(int) access:CPUWriteOnly];
     mTempQK.reset(Tensor::createDevice<float>({0, 0}));
     mTempSoftMax.reset(Tensor::createDevice<float>({0, 0}));
 }
 
-void AttentionBufExecution::reallocKVCache() {
-    if (!mKVCache || mCache->mPastLength < mCache->mMaxLength) {
+void AttentionBufExecution::reallocKVCache(int history_len) {
+    /*
+     Reallocation policy when the kv-cache is enabled:
+     decoding: past_len >= max_len, realloc and copy the past_len cache
+     prefill : max_len == 0 (first prefill), realloc and copy the history_len cache
+     prefill : max_len > 0 (not first prefill) && past_len >= max_len, realloc and copy the history_len cache. The current prompt is copied by the copy shader (not by this function)
+     prefill : max_len > 0 (not first prefill) && past_len < max_len && history_len != 0, no realloc and no need to copy the history_len cache; the current prompt is copied by the copy shader (not by this function)
+
+     */
+    if (!mKVCache) {
+        return;
+    }
+    
+    if (mIsDecode && mCache->mPastLength < mCache->mMaxLength) {
         return;
     }
 
+    // not first prefill (do reuse_kvcache) and total past_len < max_len
+    if(!mIsDecode && mCache->mMaxLength > 0 && mCache->mPastLength < mCache->mMaxLength && history_len != 0) {
+        return;
+    }
     auto mtbn = static_cast<MetalBackend *>(backend());
     int byte = 4;
     if(mtbn->useFp16InsteadFp32()) {
         byte = 2;
     }
-    bool needCopy = mCache->mMaxLength > 0;
+    bool needCopy = history_len > 0;
+
+    size_t old_size = mKvNumHead * history_len * mHeadDim * byte;
+    size_t old_piece_size = history_len * byte;
+    size_t old_piece_stride = mCache->mMaxLength * byte;
 
-    size_t old_size = mKvNumHead * mCache->mMaxLength * mHeadDim * byte;
     mCache->mMaxLength = mCache->mPastLength + mExpandChunk;
     // past_key: [1, numhead, headdim, maxlen]
     auto new_key = Tensor::createDevice<float>({mCache->mMaxLength, mKvNumHead, mHeadDim});
     // past_value: [1, numhead, maxlen, headdim]
-    auto new_value = Tensor::createDevice<float>({mCache->mMaxLength, mKvNumHead, mHeadDim});
+    auto new_value = Tensor::createDevice<float>({mKvNumHead, mHeadDim, mCache->mMaxLength});
     size_t size = mKvNumHead * mCache->mMaxLength * mHeadDim * byte;
     backend()->onAcquireBuffer(new_key, Backend::STATIC);
     backend()->onAcquireBuffer(new_value, Backend::STATIC);
@@ -696,67 +145,42 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         auto new_value_ptr = (uint8_t*)[newValueBuf.first contents] + newValueBuf.second;
         auto valueBuf = MetalBackend::getBuffer(mCache->mPastValue.get());
         auto value_ptr = (uint8_t*)[valueBuf.first contents] + valueBuf.second;
-        ::memcpy(new_value_ptr, value_ptr, old_size);
+        for(int i = 0; i <  mKvNumHead * mHeadDim; i++) {
+            ::memcpy(new_value_ptr + i * mCache->mMaxLength * byte, value_ptr + i * old_piece_stride, old_piece_size);
+        }
     }
     mCache->mPastKey.reset(new_key);
     mCache->mPastValue.reset(new_value);
 }
 
-
-void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
-
-    auto query = inputs[0];
-    auto key = inputs[1];
-    auto value = inputs[2];
+void AttentionBufExecution::compilerShader(const std::vector<Tensor *> &inputs) {
     auto mask = inputs[3];
     auto mtbn = static_cast<MetalBackend *>(backend());
-    auto context = (__bridge MNNMetalContext *)mtbn->context();
-    auto shape = query->shape();
-    int seq_len = shape[1];
-    mNumHead = shape[2];
-    mHeadDim = shape[3];
-    mScale = 1.0 / sqrt(mHeadDim);
-    mIsDecode = seq_len == 1;
-    if (mCache->mPastLength == 0 || seq_len > 1) {
-        mCache->mPastLength = seq_len;
-    }
-    mCache->mKv_seq_len = mCache->mPastLength;
-    if(mIsDecode){
-        mCache->mKv_seq_len = mCache->mPastLength + 1;
-    }
-    mKvNumHead = key->shape()[2];
-    
     auto rt = (MetalRuntime*)mtbn->runtime();
-    bool supportSimdReduce = rt->supportSimdGroupReduce();
-    bool supportSimdMatrix = rt->supportSimdGroupMatrix();
-
-    // decode and thread number not too large
-    bool qkSimdReduce = supportSimdReduce && seq_len == 1 && mCache->mKv_seq_len * mNumHead < mHeadDim * 32;
-    // loop_k can divide 8, thus avoid branch
-    bool qkSimdMatrix = supportSimdMatrix && seq_len >= 16 && mHeadDim % 8 == 0;
+    auto context = (__bridge MNNMetalContext *)mtbn->context();
 
-    bool sftmSimdReduce = supportSimdReduce;
-    bool qkvSimdReduce = supportSimdReduce && seq_len == 1 && mHeadDim * mNumHead < mCache->mKv_seq_len * 32;
-    bool qkvSimdMatrix = supportSimdMatrix && seq_len >= 16;
-    
     // Init Kernel
     bool float_mask = (mask->getType() == halide_type_of<float>());
     std::string T = "float";
-    std::string T4 = "float4";
     if (mtbn->useFp16InsteadFp32()) {
         T = "half";
-        T4 = "half4";
     }
     std::vector<std::string> qkKeys = {
         {"matmul_qk_div_mask", T}
     };
-    if(qkSimdReduce) {
+    if(mQkSimdReduce) {
         qkKeys.emplace_back("SIMD_GROUP_REDUCE");
     }
+    
+    // The total QK matmul thread count is large when decoding with a long kv sequence
+    mUseHeadNum2 = mIsDecode && mCache->mKv_seq_len > 1024;
+    if(mUseHeadNum2) {
+        qkKeys.emplace_back("HEAD_NUM_2");
+    }
     std::vector<std::string> qkvKeys = {
         {"matmul_qkv", T}
     };
-    if(qkvSimdReduce) {
+    if(mQkvSimdReduce) {
         qkvKeys.emplace_back("SIMD_GROUP_REDUCE");
     }
     std::vector<std::string> qkPrefillKeys = {
@@ -765,17 +189,17 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
     if (float_mask) {
         qkPrefillKeys.emplace_back("FLOAT_MASK");
     }
-    if(qkSimdMatrix) {
+    if(mQkSimdMatrix) {
         qkPrefillKeys.emplace_back("SIMD_GROUP_MATRIX");
     }
     std::vector<std::string> qkvPrefillKeys = {
         {"matmul_qkv", T, "FOR_PREFILL"}
     };
-    if(qkvSimdMatrix) {
+    if(mQkvSimdMatrix) {
         qkvPrefillKeys.emplace_back("SIMD_GROUP_MATRIX");
     }
     std::vector<std::string> copyPastKeys = {
-        {"pastkv_copy", T4}
+        {"pastkv_copy", T}
     };
     std::vector<std::vector<std::string>> keys = {
         qkKeys,
@@ -791,6 +215,13 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         gMatMulQKV,
         gCopyPastKV
     };
+    std::vector<std::string> shaders = {
+        "decode_qk",
+        "decode_qkv",
+        "prefill_qk",
+        "prefill_qkv",
+        "copy"
+    };
     std::vector<id<MTLComputePipelineState>> pipelines(keys.size());
     for (int i=0; i<keys.size(); ++i) {
         auto pipeline = rt->findPipeline(keys[i]);
@@ -803,14 +234,8 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
                 [dic setValue:@"1" forKey:@(keys[i][j].c_str())];;
             }
             option.preprocessorMacros = dic;
-            if(std::find(keys[i].begin(), keys[i].end(), "FOR_PREFILL") != keys[i].end()) {
-                pipeline = mtbn->makeComputePipelineWithSourceOption(sources[i], "prefill", option);
-            } else if(i == 4){
-                pipeline = mtbn->makeComputePipelineWithSourceOption(sources[i], "copy", option);
-
-            } else {
-                pipeline = mtbn->makeComputePipelineWithSourceOption(sources[i], "decode", option);
-            }
+            
+            pipeline = mtbn->makeComputePipelineWithSourceOption(sources[i], shaders[i].c_str(), option);
             rt->insertPipeline(keys[i], pipeline);
         }
         pipelines[i] = pipeline;
@@ -826,15 +251,65 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
     MNN_ASSERT(nil != mKernelPrefill_qkv);
     MNN_ASSERT(nil != mKernel_copy);
 
-    if(sftmSimdReduce) {
+    if(mSftmSimdReduce) {
         mKernel_softmax = [context pipelineWithName:@"softmax_plane_sg" fp16:mtbn->useFp16InsteadFp32()];
     } else {
         mKernel_softmax = [context pipelineWithName:@"softmax_plane" fp16:mtbn->useFp16InsteadFp32()];
     }
 
+}
+
+void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
+
+    auto query = inputs[0];
+    auto key = inputs[1];
+    auto value = inputs[2];
+    auto mask = inputs[3];
+    auto mtbn = static_cast<MetalBackend *>(backend());
+    auto context = (__bridge MNNMetalContext *)mtbn->context();
+    auto shape = query->shape();
+    int seq_len = shape[1];
+    mNumHead = shape[2];
+    mHeadDim = shape[3];
+    mScale = 1.0 / sqrt(mHeadDim);
+    mIsDecode = seq_len == 1;
+
+    int history_len = mCache->mPastLength;
+    // first prefill set history_len to 0
+    if(!mIsDecode && mask->length(2) == mask->length(3)) {
+        history_len = 0;
+    }
+    if (!mIsDecode) {
+        mCache->mPastLength = mask->length(3);
+    }
+    mCache->mKv_seq_len = mCache->mPastLength;
+    if(mIsDecode){
+        mCache->mKv_seq_len = mCache->mPastLength + 1;
+    }
+    mKvNumHead = key->shape()[2];
+    
+    auto rt = (MetalRuntime*)mtbn->runtime();
+    bool supportSimdReduce = rt->supportSimdGroupReduce();
+    bool supportSimdMatrix = rt->supportSimdGroupMatrix();
+
+    // decode only (seq_len == 1); the thread count is no longer part of this condition
+    mQkSimdReduce = supportSimdReduce && seq_len == 1;
+    // loop_k can divide 8, thus avoid branch
+    mQkSimdMatrix = supportSimdMatrix && seq_len >= 16 && mHeadDim % 8 == 0;
+
+    mSftmSimdReduce = supportSimdReduce;
+    mQkvSimdReduce = supportSimdReduce && seq_len == 1 && mHeadDim * mNumHead < mCache->mKv_seq_len * 32;
+    mQkvSimdMatrix = supportSimdMatrix && seq_len >= 16;
+    
+    // start to compile attention shaders
+    compilerShader(inputs);
+    
     int group_size = mNumHead / mKvNumHead;
 
-    reallocKVCache();
+    // kv-cache realloc function
+    reallocKVCache(history_len);
+    
+    // temp tensor alloc memory
     bool needMalloc = mTempQK->length(0) != mNumHead;
     if (mIsDecode) {
         if (mTempQK->length(1) != mCache->mMaxLength) {
@@ -844,19 +319,20 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         mTempQK->setLength(1, mCache->mMaxLength);
         mTempSoftMax->setLength(0, mNumHead);
         mTempSoftMax->setLength(1, mCache->mMaxLength);
+
     } else {
-        if (mTempQK->length(1) != mCache->mPastLength * mCache->mPastLength) {
+        if (mTempQK->length(1) != seq_len * mCache->mPastLength) {
             needMalloc = true;
         }
         mTempQK->setLength(0, mNumHead);
-        mTempQK->setLength(1, mCache->mPastLength * mCache->mPastLength);
+        mTempQK->setLength(1, seq_len * mCache->mPastLength);
         mTempSoftMax->setLength(0, mNumHead);
-        mTempSoftMax->setLength(1, mCache->mPastLength * mCache->mPastLength);
+        mTempSoftMax->setLength(1, seq_len * mCache->mPastLength);
     }
     if (needMalloc) {
         auto res = backend()->onAcquireBuffer(mTempQK.get(), Backend::STATIC) && backend()->onAcquireBuffer(mTempSoftMax.get(), Backend::STATIC);
         if (!res) {
-            MNN_ERROR("MNN::Metal: OUT_OF_MEMORY when execute attention metal\n");
+            MNN_ERROR("MNN::Metal: OUT_OF_MEMORY when execute attention metal %d\n", mCache->mPastLength);
             return;
         }
     }
@@ -870,6 +346,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         param->head_num = mNumHead;
         param->group = group_size;
         param->query_seq_len = seq_len;
+        param->max_kv_len = mCache->mMaxLength;
     }
     // For softmax parameter
     int inside, outside;
@@ -878,7 +355,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         outside = mNumHead;
     } else {
         inside = 1;
-        outside = mCache->mKv_seq_len * mNumHead;
+        outside = seq_len * mNumHead;
     }
     int axis = mCache->mKv_seq_len;
     {
@@ -889,22 +366,35 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         softmax[2] = outside;
         softmax[3] = 0;
     }
+    
     // Run Copy Kernel
     {
         auto copyp = (int*)mParamCopy.contents;
-        copyp[0] = mKvNumHead * mHeadDim / 4;
+        copyp[0] = mKvNumHead * mHeadDim;
         
         int copy_line;
         if(mIsDecode) {
+            /*
+             each decode fill one kv_seq.
+             Key -> K-Cache :   [1, mKvNumHead, mHeadDim] -> [mCache->mKv_seq_len + 1, mKvNumHead, mHeadDim]
+             Value -> V-Cache : [1, mKvNumHead, mHeadDim] -> [mKvNumHead, mHeadDim, mCache->mKv_seq_len + 1]
+             */
             copyp[1] = 1;
-            copyp[2] = 0;
+            copyp[2] = mCache->mMaxLength;
             copyp[3] = (mCache->mKv_seq_len - 1) * copyp[0];
+            copyp[4] = mCache->mKv_seq_len - 1;
             copy_line = 1;
         } else {
-            copyp[1] = mCache->mKv_seq_len;
-            copyp[2] = 0;
-            copyp[3] = 0;
-            copy_line = mCache->mKv_seq_len;
+            /*
+             prefill copy: copies seq_len rows; offsets derive from history_len (0 on the first prefill).
+             Key -> K-Cache :   [mCache->mKv_seq_len, mKvNumHead, mHeadDim] -> [mCache->mKv_seq_len, mKvNumHead, mHeadDim]
+             Value -> V-Cache : [mCache->mKv_seq_len, mKvNumHead, mHeadDim] -> [mKvNumHead, mHeadDim, mCache->mMaxLength (fill when decode)]
+             */
+            copyp[1] = seq_len;
+            copyp[2] = mCache->mMaxLength;
+            copyp[3] = history_len * copyp[0];
+            copyp[4] = history_len;
+            copy_line = seq_len;
         }
 
         id<MTLComputePipelineState> pipeline = mKernel_copy;
@@ -916,7 +406,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         [encoder setBuffer:mParamCopy offset:0 atIndex:4];
         
         std::pair<MTLSize, MTLSize> gl;
-        gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(mKvNumHead * mHeadDim / 4, copy_line, 1)];
+        gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(mKvNumHead * mHeadDim, copy_line, 1)];
 
         [encoder dispatchThreadgroups:gl.first threadsPerThreadgroup:gl.second];
 
@@ -931,21 +421,27 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         }
         [encoder setComputePipelineState:pipeline];
         MetalBackend::setTensor(query, encoder, 0);
-        MetalBackend::setTensor(key, encoder, 1);
-        MetalBackend::setTensor(mTempQK.get(), encoder, 2);
-        MetalBackend::setTensor(mCache->mPastKey.get(), encoder, 3);
-        MetalBackend::setTensor(mask, encoder, 4);
-        [encoder setBuffer:mParamQKV offset:0 atIndex:5];
+        MetalBackend::setTensor(mTempQK.get(), encoder, 1);
+        MetalBackend::setTensor(mCache->mPastKey.get(), encoder, 2);
+        MetalBackend::setTensor(mask, encoder, 3);
+        [encoder setBuffer:mParamQKV offset:0 atIndex:4];
 
+        int decode_grid_y = mNumHead;
+        if(mUseHeadNum2) {
+            decode_grid_y = (decode_grid_y + 1) / 2;
+        }
         std::pair<MTLSize, MTLSize> gl;
-        if(qkSimdReduce) {
-            gl = std::make_pair(MTLSizeMake(seq_len, mNumHead, mCache->mKv_seq_len), MTLSizeMake(32, 1, 1));
-        } else if(qkSimdMatrix) {
+        if(mQkSimdReduce) {
+            gl = std::make_pair(MTLSizeMake(seq_len, decode_grid_y, mCache->mKv_seq_len), MTLSizeMake(32, 1, 1));
+        } else if(mQkSimdMatrix) {
             gl = std::make_pair(MTLSizeMake(UP_DIV(seq_len, 16), UP_DIV(mCache->mKv_seq_len, 16), mNumHead), MTLSizeMake(32, 1, 1));
+        } else if(mIsDecode){
+            gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(seq_len, decode_grid_y, mCache->mKv_seq_len)];
         } else {
             gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(seq_len, mNumHead, mCache->mKv_seq_len)];
         }
         [encoder dispatchThreadgroups:gl.first threadsPerThreadgroup:gl.second];
+
     }
     // Run Softmax Kernel
     {
@@ -956,13 +452,14 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
 
         int thread_group_size = 32;
         std::pair<MTLSize, MTLSize> gl;
-        if(sftmSimdReduce) {
+        if(mSftmSimdReduce) {
             gl = std::make_pair(MTLSizeMake(inside, outside, 1), MTLSizeMake(thread_group_size, 1, 1));
         } else {
             gl = [context computeBestGroupAndLocal: mKernel_softmax threads:MTLSizeMake(inside, outside, 1)];
         }
 
         [encoder dispatchThreadgroups:gl.first threadsPerThreadgroup:gl.second];
+
     }
     // Run QKV Kernel
     {
@@ -974,27 +471,26 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
         }
         [encoder setComputePipelineState:pipeline];
         MetalBackend::setTensor(mTempSoftMax.get(), encoder, 0);
-        MetalBackend::setTensor(value, encoder, 1);
-        MetalBackend::setTensor(outputs[0], encoder, 2);
-        MetalBackend::setTensor(mCache->mPastValue.get(), encoder, 3);
-        [encoder setBuffer:mParamQKV offset:0 atIndex:4];
+        MetalBackend::setTensor(outputs[0], encoder, 1);
+        MetalBackend::setTensor(mCache->mPastValue.get(), encoder, 2);
+        [encoder setBuffer:mParamQKV offset:0 atIndex:3];
         std::pair<MTLSize, MTLSize> gl;
-        if(qkvSimdReduce) {
+        if(mQkvSimdReduce) {
             gl = std::make_pair(MTLSizeMake(seq_len, mNumHead, mHeadDim), MTLSizeMake(32, 1, 1));
-        } else if(qkvSimdMatrix){
+        } else if(mQkvSimdMatrix){
             gl = std::make_pair(MTLSizeMake(UP_DIV(seq_len, 16), UP_DIV(mHeadDim, 16), mNumHead), MTLSizeMake(32, 1, 1));
-            //printf("qk:%d %d %d, softmax:%d %d %d, qkv:%d %d %d\n", seq_len, mNumHead, mCache->mKv_seq_len, inside, outside, 1, seq_len, mNumHead, mHeadDim);
         } else {
             gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(seq_len, mNumHead, mHeadDim)];
         }
         [encoder dispatchThreadgroups:gl.first threadsPerThreadgroup:gl.second];
+
     }
     // Update status
     if(mIsDecode){
         mCache->mPastLength += 1;
         mCache->mKv_seq_len = mCache->mPastLength + 1;
     }
-    //printf("qk:%d %d %d, softmax:%d %d %d, qkv:%d %d %d\n", seq_len, mNumHead, mCache->mKv_seq_len, inside, outside, 1, seq_len, mNumHead, mHeadDim);
+
     return;
 }
 
diff --git a/source/backend/metal/MetalAttentionShader.hpp b/source/backend/metal/MetalAttentionShader.hpp
new file mode 100644
index 000000000..9209be406
--- /dev/null
+++ b/source/backend/metal/MetalAttentionShader.hpp
@@ -0,0 +1,636 @@
+//
+//  MetalAttentionShader.hpp
+//  MNN
+//
+//  Created by MNN on 2024/12/03.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#if MNN_METAL_ENABLED
+#ifdef MNN_SUPPORT_TRANSFORMER_FUSE
+
+const char* gMatMulDivMask = R"metal(
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+struct Param { // constants for prefill_qk / decode_qk, bound at buffer(4)
+    int query_seq_len; // upper bound for the query index (gid.x)
+    int key_seq_len; // upper bound for the key index; also the row length of the score output
+    int head_num; // number of query heads
+    int group; // query heads per key head: kernels address past_key with head / group
+    int head_dim; // length of each Q.K dot product
+    float scale; // multiplied into every Q.K dot product
+    int max_kv_len; // not read by the kernels in this string
+};
+#define SIMD_GROUP_WIDTH 32
+
+kernel void prefill_qk(const device T* input0 [[buffer(0)]], // query, addressed as [q_seq, head, head_dim]
+    device T* output [[buffer(1)]], // masked, scaled scores, addressed as [head, q_seq, k_seq]
+    device T* past_key [[buffer(2)]], // keys, addressed as [k_seq, head / group, head_dim]
+#ifdef FLOAT_MASK
+    const device T* mask [[buffer(3)]], // added to the scaled score, addressed as [q_seq, k_seq]
+#else
+    const device int* mask [[buffer(3)]], // a 0 entry replaces the score with -FLT_MAX, addressed as [q_seq, k_seq]
+#endif
+    constant Param& param [[buffer(4)]],
+#ifdef SIMD_GROUP_MATRIX
+    uint3 gid[[threadgroup_position_in_grid]],
+    uint tiitg[[thread_index_in_threadgroup]],
+    uint tiisg[[thread_index_in_simdgroup]],
+    uint sgitg[[simdgroup_index_in_threadgroup]]
+#else
+    uint3 gid[[thread_position_in_grid]]
+#endif
+) {
+#ifdef SIMD_GROUP_MATRIX
+
+    /*
+     // Read:
+     ftype 0~127   ---> input: [M16, K8]
+     ftype 128~255 ---> input: [K8, N16]
+     // Write:
+     ftype 0~255 ---> output: [N2, M2, M8, N8]
+     */
+    
+    simdgroup_float8x8 sga[2];
+    simdgroup_float8x8 sgb[2];
+    simdgroup_float8x8 sgd[4];
+    for (int i = 0; i < 4; i++){
+        sgd[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    }
+
+    int kl = tiitg % 2;// 0~1 (ranges assume 32 threads per threadgroup)
+    int rcl = tiitg / 2;// 0~15
+
+    const int slq = gid.x; // q_seq_len/16 -> M/16
+    const int slk = gid.y; // k_seq_len/16 -> N/16
+    const int z = gid.z; // head_num
+
+    /** Q:
+     threadgroup: [M16, K8]
+     each thread: K4
+     layout: [M, B, K] -> [M/16, M16, B, K/8, K2, K4]
+     index : [slq, rcl, z, 0, kl, K4]
+     offset: ((slq * 16 + rcl) * B + z) * K + (0 * 2 + kl) * 4 + 0
+     */
+    /** K:
+     threadgroup: [K8, N16]
+     each thread: K4 (four consecutive K values of key row rcl)
+     layout: [N, B/G, K] -> [N/16, N16, B/G, K/8, K2, K4]
+     index : [slk, rcl, z/G, 0, kl, 0]
+     offset: ((slk * 16 + rcl) * B/G + z/G) * K + 0 * 8 + kl * 4 + 0
+     */
+    /** output:
+     threadgroup: [M16, N16]
+     each thread: N8
+     layout: [B, M, N] -> [B, M/16, M16, N/16, N2, N8]
+     index : [z, slq, rcl, slk, kl, 0]
+     offset: (z * M + slq * 16 + rcl) * N + slk * 16 + kl * 8 + 0
+     */
+
+    int group = param.group;
+    int zin = z / param.group;
+    int q_seq_len = param.query_seq_len;
+    int k_seq_len = param.key_seq_len;
+    int head_num = param.head_num;
+    int head_dim = param.head_dim;
+
+    threadgroup float sdata[256] = {0.f}; // staging for the A/B tiles, later reused for the 16x16 result
+
+    int idx_slq = slq * 16 + rcl < q_seq_len ? slq * 16 + rcl : q_seq_len - 1; // clamp to the last valid query row so tail tiles load in bounds
+    int idx_slk = slk * 16 + rcl < k_seq_len ? slk * 16 + rcl : k_seq_len - 1; // same clamp for key rows
+
+    auto A_offset = input0 + (idx_slq * head_num + z) * head_dim + (0 * 2 + kl) * 4 + 0;
+    auto B_offset = past_key + (idx_slk * head_num / group + zin) * head_dim + 0 * 8 + kl * 4 + 0;
+
+    for(int i = 0; i < head_dim; i += 8){ // no tail check here: presumably head_dim % 8 == 0 - confirm on the host side
+        sdata[rcl * 8 + kl * 4 + 0] = A_offset[i + 0];
+        sdata[rcl * 8 + kl * 4 + 1] = A_offset[i + 1];
+        sdata[rcl * 8 + kl * 4 + 2] = A_offset[i + 2];
+        sdata[rcl * 8 + kl * 4 + 3] = A_offset[i + 3];
+        
+        sdata[128 + (kl * 4 + 0) * 16 + rcl] = B_offset[i + 0]; // stored transposed so the B tile is [K8, N16]
+        sdata[128 + (kl * 4 + 1) * 16 + rcl] = B_offset[i + 1];
+        sdata[128 + (kl * 4 + 2) * 16 + rcl] = B_offset[i + 2];
+        sdata[128 + (kl * 4 + 3) * 16 + rcl] = B_offset[i + 3];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        simdgroup_load(sga[0], (const threadgroup float*)sdata, 8); // A rows 0~7
+        simdgroup_load(sga[1], ((const threadgroup float*)sdata) + 64, 8); // A rows 8~15
+        
+        simdgroup_load(sgb[0], ((const threadgroup float*)sdata) + 128, 16); // B columns 0~7
+        simdgroup_load(sgb[1], ((const threadgroup float*)sdata) + 136, 16); // B columns 8~15
+        
+        simdgroup_multiply_accumulate(sgd[0], sga[0], sgb[0], sgd[0]);
+        simdgroup_multiply_accumulate(sgd[1], sga[1], sgb[0], sgd[1]);
+        simdgroup_multiply_accumulate(sgd[2], sga[0], sgb[1], sgd[2]);
+        simdgroup_multiply_accumulate(sgd[3], sga[1], sgb[1], sgd[3]);
+        threadgroup_barrier(mem_flags::mem_threadgroup); // tiles are overwritten by the next iteration
+    }
+
+    simdgroup_store(sgd[0], (threadgroup float*)sdata, 8);
+    simdgroup_store(sgd[1], (threadgroup float*)sdata + 64, 8);
+    simdgroup_store(sgd[2], (threadgroup float*)sdata + 128, 8);
+    simdgroup_store(sgd[3], (threadgroup float*)sdata + 192, 8);
+    
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // sdata now holds the result as [N2, M2, M8, N8]; thread (rcl, kl) writes 8 columns of row rcl
+    float Vscale = (float)param.scale;
+
+    auto xy_out = output + (z * q_seq_len + slq * 16 + rcl) * k_seq_len + slk * 16 + kl * 8 + 0;
+    if(slq * 16 + rcl < q_seq_len) {
+        if(slk * 16 + kl * 8 + 0 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 0] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 0))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 0))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[0] = out0;
+        }
+        if(slk * 16 + kl * 8 + 1 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 1] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 1))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 1))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[1] = out0;
+        }
+        if(slk * 16 + kl * 8 + 2 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 2] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 2))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 2))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[2] = out0;
+        }
+        if(slk * 16 + kl * 8 + 3 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 3] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 3))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 3))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[3] = out0;
+        }
+        if(slk * 16 + kl * 8 + 4 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 4] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 4))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 4))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[4] = out0;
+        }
+        if(slk * 16 + kl * 8 + 5 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 5] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 5))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 5))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[5] = out0;
+        }
+        if(slk * 16 + kl * 8 + 6 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 6] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 6))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 6))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[6] = out0;
+        }
+        if(slk * 16 + kl * 8 + 7 < k_seq_len) {
+            auto out0 =  sdata[(kl * 16 + rcl) * 8 + 7] * Vscale;
+            #ifdef FLOAT_MASK
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 7))] + out0;
+            #else
+                out0 = mask[((slq * 16 + rcl) * k_seq_len + (slk * 16 + kl * 8 + 7))] == 0 ? -FLT_MAX : out0;
+            #endif
+            xy_out[7] = out0;
+        }
+    }
+
+#else
+    const int x = gid.x; // query_seq_len
+    const int y = gid.y; // head_num
+    const int z = gid.z; // key_seq_len
+
+    if (x >= param.query_seq_len || y >= param.head_num || z >= param.key_seq_len) {
+        return;
+    }
+    int group = param.group;
+    int query_seq_len = param.query_seq_len;
+    int key_seq_len = param.key_seq_len;
+    int head_num = param.head_num;
+    int head_dim = param.head_dim;
+    
+    const int offset = head_num * head_dim; // elements per query position
+    const int offset_head = y * head_dim;
+    const int offset_head_kv = (y / group) * head_dim; // key head shared by `group` query heads
+    const device T* A_offset = input0 + x * offset + offset_head;
+
+    float Vscale = (float)param.scale;
+
+    device const T* B_offset = past_key + z * offset / group + offset_head_kv;
+    const int output_offset = y * query_seq_len * key_seq_len;
+    float out0 = 0.0;
+    
+    for(int i = 0; i < head_dim; ++i){ // one thread computes one full Q.K dot product in float
+        float A = (float)(A_offset[i]);
+        float B = (float)(B_offset[i]);
+        out0 += B * A;
+    }
+    
+    out0 *= Vscale;
+    
+#ifdef FLOAT_MASK
+    out0 = mask[((x + 0) * key_seq_len + (z + 0))] + out0;
+#else
+    out0 = mask[((x + 0) * key_seq_len + (z + 0))] == 0 ? -FLT_MAX : out0;
+#endif
+    output[output_offset + x * key_seq_len + z] = (T)out0;
+#endif
+}
+
+kernel void decode_qk(const device T* input0 [[buffer(0)]], // query, addressed as [q_seq, head, head_dim]
+    device T* output [[buffer(1)]], // scaled scores, written at head * key_seq_len + key index
+    device T* past_key [[buffer(2)]], // keys, addressed as [k_seq, head / group, head_dim]
+#ifdef FLOAT_MASK
+    const device T* mask [[buffer(3)]], // bound but never read in this kernel body
+#else
+    const device int* mask [[buffer(3)]], // bound but never read in this kernel body
+#endif
+    constant Param& param [[buffer(4)]],
+#ifdef SIMD_GROUP_REDUCE
+    uint3 gid[[threadgroup_position_in_grid]], // one threadgroup per output element; lanes split head_dim
+    uint  tiisg[[thread_index_in_simdgroup]],
+    uint  sgitg[[simdgroup_index_in_threadgroup]]
+#else
+    uint3 gid[[thread_position_in_grid]]
+#endif
+) {
+    int x = gid.x; // query_seq_len
+    int y = gid.y; // head_num
+    int z = gid.z; // key_seq_len
+
+#ifdef HEAD_NUM_2
+    y = y * 2; // each grid-y slot covers heads y and y + 1
+#endif
+    if (x >= param.query_seq_len || y >= param.head_num || z >= param.key_seq_len) {
+        return;
+    }
+    int group = param.group;
+
+    int key_seq_len = param.key_seq_len;
+    int head_num = param.head_num;
+    int head_dim = param.head_dim;
+    
+    const int offset = head_num * head_dim; // elements per query position
+    const int offset_head = y * head_dim;
+    const int offset_head_kv = (y / param.group) * head_dim; // key head shared by `group` query heads
+    const device T* A_offset = input0 + x * offset + offset_head;
+    device T* Pastkey_offset = past_key + z * offset / group + offset_head_kv;
+    float Vscale = (float)param.scale;
+    float out = 0.0;
+
+#ifdef HEAD_NUM_2
+    const device T* A_offset_1 = A_offset + head_dim; // query row of head y + 1
+    device T* Pastkey_offset_1 = past_key + z * offset / group + ((y+1) / param.group) * head_dim;
+    float out_1 = 0.0;
+#endif
+
+#ifdef SIMD_GROUP_REDUCE
+    for(int i = tiisg; i < head_dim; i+=SIMD_GROUP_WIDTH){ // each lane accumulates every 32nd element
+        float A = A_offset[i];
+        float B = (float)Pastkey_offset[i];
+        
+        out += A * B;
+    }
+
+#ifdef HEAD_NUM_2
+    if(y + 1 < param.head_num) { // skip the second head when head_num is odd
+        for(int i = tiisg; i < head_dim; i+=SIMD_GROUP_WIDTH){
+            float A = A_offset_1[i];
+            float B = (float)Pastkey_offset_1[i];
+            
+            out_1 += A * B;
+        }
+    }
+#endif
+    out = simd_sum(out);
+
+#ifdef HEAD_NUM_2
+    if(y + 1 < param.head_num) {
+        out_1 = simd_sum(out_1);
+        if(tiisg == 1) { // lane 1 stores the second head while lane 0 stores the first
+            out_1 *= Vscale;
+            output[(y+1) * key_seq_len + z] = (T)out_1;
+        }
+    }
+#endif
+    if(tiisg == 0) {
+        out *= Vscale;
+        output[y * key_seq_len + z] = (T)out; // index has no x term; presumably query_seq_len == 1 in decode - confirm
+    }
+
+#else
+    {
+        for(int i = 0; i < head_dim; i++){ // one thread computes the whole dot product
+            float A = A_offset[i];
+            float B = (float)Pastkey_offset[i];
+            
+            out += A * B;
+        }
+    }
+    out *= Vscale;
+    output[y * key_seq_len + z] = (T)out; // index has no x term; presumably query_seq_len == 1 in decode - confirm
+
+#ifdef HEAD_NUM_2
+    if(y + 1 < param.head_num) { // skip the second head when head_num is odd
+        for(int i = 0; i < head_dim; i++){
+            float A = A_offset_1[i];
+            float B = (float)Pastkey_offset_1[i];
+            
+            out_1 += A * B;
+        }
+        out_1 *= Vscale;
+        output[(y+1) * key_seq_len + z] = (T)out_1;
+    }
+#endif
+
+#endif
+}
+
+)metal";
+
+const char* gCopyPastKV = R"metal(
+#include <metal_stdlib>
+using namespace metal;
+struct Param { // constants for the copy kernel, bound at buffer(4)
+    int head_count; // upper bound for gid.x and the row length of both inputs
+    int q_seq_len; // upper bound for gid.y (number of rows copied)
+    int max_kv_len; // stride between consecutive x rows in output1
+    int dst_k_offset; // element offset added to every output0 index
+    int dst_v_offset; // element offset added to every output1 index
+};
+kernel void copy(const device T* input0 [[buffer(0)]], // source for output0, addressed as [y, x]
+    const device T* input1 [[buffer(1)]], // source for output1, addressed as [y, x]
+    device T* output0 [[buffer(2)]], // presumably the key cache - confirm against the host binding
+    device T* output1 [[buffer(3)]], // presumably the value cache - confirm against the host binding
+    constant Param& param [[buffer(4)]],
+    uint3 gid[[thread_position_in_grid]] // one thread copies one element of each input
+) {
+    const int x = gid.x; // head_num / group * head_dim
+    const int y = gid.y; // q_seq_len
+    if (x >= param.head_count || y >= param.q_seq_len) {
+        return;
+    }
+    const int index = y * param.head_count + x;
+    output0[param.dst_k_offset + index] = input0[index]; // same [y, x] layout, shifted by dst_k_offset
+    output1[param.dst_v_offset + x * param.max_kv_len + y] = input1[index]; // transposed: x selects the row, y the column
+}
+)metal";
+
+const char* gMatMulQKV = R"metal(
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+struct Param { // constants for prefill_qkv / decode_qkv, bound at buffer(3)
+    int query_seq_len; // upper bound for the query index (gid.x)
+    int key_seq_len; // length of each softmax row, i.e. the reduction length
+    int head_num; // number of query heads
+    int group; // query heads per value head: kernels address past_value with head / group
+    int head_dim; // upper bound for the output column index
+    float scale; // not read by the kernels in this string
+    int max_kv_len; // stride between consecutive head_dim rows of past_value
+};
+#define SIMD_GROUP_WIDTH 32
+kernel void prefill_qkv(const device T* input0 [[buffer(0)]], // softmax output, addressed as [head, q_seq, kv_seq]
+    device T* output [[buffer(1)]], // result, addressed as [q_seq, head, head_dim]
+    device T* past_value [[buffer(2)]], // values, addressed as [head / group, head_dim, max_kv_len]
+    constant Param& param [[buffer(3)]],
+#ifdef SIMD_GROUP_MATRIX
+    uint3 gid[[threadgroup_position_in_grid]],
+    uint tiitg[[thread_index_in_threadgroup]],
+    uint tiisg[[thread_index_in_simdgroup]],
+    uint sgitg[[simdgroup_index_in_threadgroup]]
+#else
+    uint3 gid[[thread_position_in_grid]]
+#endif
+) {
+#ifdef SIMD_GROUP_MATRIX
+    /*
+     // Read:
+     ftype 0~127   ---> input: [M16, K8]
+     ftype 128~255 ---> input: [K8, N16]
+     // Write:
+     ftype 0~255 ---> output: [N2, M2, M8, N8]
+     */
+    
+    simdgroup_float8x8 sga[2];
+    simdgroup_float8x8 sgb[2];
+    simdgroup_float8x8 sgd[4];
+    for (int i = 0; i < 4; i++){
+        sgd[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    }
+
+    int kl = tiitg % 2;// 0~1 (ranges assume 32 threads per threadgroup)
+    int rcl = tiitg / 2;// 0~15
+
+    int nl = tiitg % 4;// 0~3
+    int kcl = tiitg / 4;// 0~7
+
+    const int sl = gid.x; // q_seq_len/16 -> M/16
+    const int hm = gid.y; // head_dim/16 -> N/16
+    const int z = gid.z; // head_num
+
+    /** QK:
+     threadgroup: [M16, K8]
+     each thread: K4
+     layout: [B, M, K] -> [B, M/16, M16, K/8, K2, K4]
+     index : [z, sl, rcl, 0, kl, K4]
+     offset: (z * M + sl * 16 + rcl) * K + (0 * 2 + kl) * 4 + 0
+     */
+    /** V:
+     threadgroup: [K8, N16]
+     each thread: N4
+     layout: [B/G, N, max_kv_len] -> [B/G, N/16, N4, N4, K/8, K8]
+     index : [z/G, hm, nl, 0, 0, kcl]
+     offset: (z/G * N + hm * 16 + nl * 4 + 0) * max_kv_len + (0 * 8 + kcl)
+     */
+    /** output:
+     threadgroup: [M16, N16]
+     each thread: N8
+     layout: [M, B, N] -> [M/16, M16, B, N/16, N2, N8]
+     index : [sl, rcl, z, hm, kl, 0]
+     offset: ((sl * 16 + rcl) * B + z) * N + hm * 16 + kl * 8 + 0
+     */
+
+    int group = param.group;
+    int zin = z / group;
+    int q_seq_len = param.query_seq_len;
+    int value_seq_len = param.key_seq_len;
+    int head_num = param.head_num;
+    int head_dim = param.head_dim;
+
+    threadgroup float sdata[256] = {0.f}; // staging for the A/B tiles, later reused for the 16x16 result
+
+    int idx_qk_sl = sl * 16 + rcl < q_seq_len ? (sl * 16 + rcl) : q_seq_len - 1; // clamp to the last valid query row so tail tiles load in bounds
+
+    auto A_offset = input0 + (z * q_seq_len + idx_qk_sl) * value_seq_len + (0 * 2 + kl) * 4 + 0;
+    auto B_offset = past_value + (zin * head_dim + hm * 16 + nl * 4 + 0) * param.max_kv_len + (0 * 8 + kcl);
+    
+
+    for(int i = 0; i < value_seq_len; i += 8){ // tail elements past value_seq_len / head_dim are loaded as 0
+        sdata[rcl * 8 + kl * 4 + 0] = (i + kl * 4 + 0 < value_seq_len) ? A_offset[i + 0] : 0.0;
+        sdata[rcl * 8 + kl * 4 + 1] = (i + kl * 4 + 1 < value_seq_len) ? A_offset[i + 1] : 0.0;
+        sdata[rcl * 8 + kl * 4 + 2] = (i + kl * 4 + 2 < value_seq_len) ? A_offset[i + 2] : 0.0;
+        sdata[rcl * 8 + kl * 4 + 3] = (i + kl * 4 + 3 < value_seq_len) ? A_offset[i + 3] : 0.0;
+        
+        sdata[128 + kcl * 16 + nl * 4 + 0] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 0 < head_dim) ? B_offset[i + 0 * param.max_kv_len] : 0.0;
+        sdata[128 + kcl * 16 + nl * 4 + 1] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 1 < head_dim) ? B_offset[i + 1 * param.max_kv_len] : 0.0;
+        sdata[128 + kcl * 16 + nl * 4 + 2] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 2 < head_dim) ? B_offset[i + 2 * param.max_kv_len] : 0.0;
+        sdata[128 + kcl * 16 + nl * 4 + 3] = (i + kcl < value_seq_len && hm * 16 + nl * 4 + 3 < head_dim) ? B_offset[i + 3 * param.max_kv_len] : 0.0;
+
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        simdgroup_load(sga[0], (const threadgroup float*)sdata, 8); // A rows 0~7
+        simdgroup_load(sga[1], ((const threadgroup float*)sdata) + 64, 8); // A rows 8~15
+        
+        simdgroup_load(sgb[0], ((const threadgroup float*)sdata) + 128, 16); // B columns 0~7
+        simdgroup_load(sgb[1], ((const threadgroup float*)sdata) + 136, 16); // B columns 8~15
+        
+        simdgroup_multiply_accumulate(sgd[0], sga[0], sgb[0], sgd[0]);
+        simdgroup_multiply_accumulate(sgd[1], sga[1], sgb[0], sgd[1]);
+        simdgroup_multiply_accumulate(sgd[2], sga[0], sgb[1], sgd[2]);
+        simdgroup_multiply_accumulate(sgd[3], sga[1], sgb[1], sgd[3]);
+        threadgroup_barrier(mem_flags::mem_threadgroup); // tiles are overwritten by the next iteration
+    }
+
+    simdgroup_store(sgd[0], (threadgroup float*)sdata, 8);
+    simdgroup_store(sgd[1], (threadgroup float*)sdata + 64, 8);
+    simdgroup_store(sgd[2], (threadgroup float*)sdata + 128, 8);
+    simdgroup_store(sgd[3], (threadgroup float*)sdata + 192, 8);
+    
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // sdata now holds the result as [N2, M2, M8, N8]; thread (rcl, kl) writes 8 columns of row rcl
+    auto xy_out = output + ((sl * 16 + rcl) * head_num + z) * head_dim + hm * 16 + kl * 8 + 0;
+    if(sl * 16 + rcl < q_seq_len) {
+        if(hm * 16 + kl * 8 + 0 < head_dim) {
+            xy_out[0] =  sdata[(kl * 16 + rcl) * 8 + 0];
+        }
+        if(hm * 16 + kl * 8 + 1 < head_dim) {
+            xy_out[1] =  sdata[(kl * 16 + rcl) * 8 + 1];
+        }
+        if(hm * 16 + kl * 8 + 2 < head_dim) {
+            xy_out[2] =  sdata[(kl * 16 + rcl) * 8 + 2];
+        }
+        if(hm * 16 + kl * 8 + 3 < head_dim) {
+            xy_out[3] =  sdata[(kl * 16 + rcl) * 8 + 3];
+        }
+        if(hm * 16 + kl * 8 + 4 < head_dim) {
+            xy_out[4] =  sdata[(kl * 16 + rcl) * 8 + 4];
+        }
+        if(hm * 16 + kl * 8 + 5 < head_dim) {
+            xy_out[5] =  sdata[(kl * 16 + rcl) * 8 + 5];
+        }
+        if(hm * 16 + kl * 8 + 6 < head_dim) {
+            xy_out[6] =  sdata[(kl * 16 + rcl) * 8 + 6];
+        }
+        if(hm * 16 + kl * 8 + 7 < head_dim) {
+            xy_out[7] =  sdata[(kl * 16 + rcl) * 8 + 7];
+        }
+    }
+
+#else
+    const int x = gid.x; // query_seq_len
+    const int y = gid.y; // head_num
+    const int z = gid.z; // head_dim
+    if (x >= param.query_seq_len || y >= param.head_num || z >= param.head_dim) {
+        return;
+    }
+    int group = param.group;
+    int yin = y / group; // value head shared by `group` query heads
+    int q_seq_len = param.query_seq_len;
+    int value_seq_len = param.key_seq_len;
+    int head_num = param.head_num;
+    int head_dim = param.head_dim;
+    const int stride = head_num * head_dim / group;
+    const int offset_head = yin * head_dim + z;
+
+    device const T *A_offset = input0 + (y * q_seq_len + x) * value_seq_len;
+    device const T *B_offset = past_value + offset_head * param.max_kv_len;
+    float out = 0.0;
+    
+    for(int i = 0; i < value_seq_len; ++i){ // one thread reduces one softmax row against one value column
+        float A0 = (float)A_offset[i];
+        float B = (float)B_offset[i];
+        out += A0 * B;
+    }
+    output[ x * stride * group + (y * head_dim + z)] = out;
+#endif
+}
+
+kernel void decode_qkv(const device T* input0 [[buffer(0)]], // softmax output, one row of key_seq_len values per head
+    device T* output [[buffer(1)]], // result, written at head * head_dim + column
+    device T* past_value [[buffer(2)]], // values, addressed as [head / group, head_dim, max_kv_len]
+    constant Param& param [[buffer(3)]],
+#ifdef SIMD_GROUP_REDUCE
+    uint3 gid[[threadgroup_position_in_grid]], // one threadgroup per output element; lanes split the reduction
+    uint  tiisg[[thread_index_in_simdgroup]],
+    uint  sgitg[[simdgroup_index_in_threadgroup]]
+#else
+    uint3 gid[[thread_position_in_grid]]
+#endif
+) {
+    const int x = gid.x; // query_seq_len (only bounds-checked; presumably always 0 in decode - confirm)
+    const int y = gid.y; // head_num
+    const int z = gid.z; // head_dim
+    if (x >= param.query_seq_len || y >= param.head_num || z >= param.head_dim) {
+        return;
+    }
+
+    int yin = y / param.group; // value head shared by `group` query heads
+    int value_seq_len = param.key_seq_len;
+
+    int head_dim = param.head_dim;
+
+    const int offset_head = (yin * head_dim + z) * param.max_kv_len;
+
+    device const T *A_offset = input0 + y * value_seq_len;
+    device T *Pastvalue_offset = past_value + offset_head;
+    float out = 0;
+    
+#ifdef SIMD_GROUP_REDUCE
+    for(int i = tiisg; i < value_seq_len; i+=SIMD_GROUP_WIDTH){ // each lane accumulates every 32nd element
+        float A = (float)A_offset[i];
+        float B = (float)Pastvalue_offset[i];
+        
+        out += A * B;
+    }
+    out = simd_sum(out);
+    if(tiisg == 0) { // a single lane stores the reduced value
+        output[(y * head_dim + z)] = (T)out;
+    }
+#else
+    for(int i = 0; i < value_seq_len; i++){ // one thread performs the whole reduction
+        float A = (float)A_offset[i];
+        float B = (float)Pastvalue_offset[i];
+        
+        out += A * B;
+    }
+    output[(y * head_dim + z)] = (T)out;
+#endif
+}
+)metal";
+
+#endif/* MNN_SUPPORT_TRANSFORMER_FUSE */
+#endif
+
diff --git a/source/backend/metal/MetalConvolution1x1.hpp b/source/backend/metal/MetalConvolution1x1.hpp
index 672d433b6..bda5a483f 100644
--- a/source/backend/metal/MetalConvolution1x1.hpp
+++ b/source/backend/metal/MetalConvolution1x1.hpp
@@ -23,7 +23,7 @@ class MetalConvolution1x1 : public MetalConvolutionCommon {
     virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
     virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder)  override;
 private:
-    MetalConvolution1x1(Backend *backend, const MNN::Op *op, std::shared_ptr<MNN::Tensor> weight, std::shared_ptr<MNN::Tensor> bias, std::shared_ptr<MNN::Tensor> dequantScale, int dequantBits);
+    MetalConvolution1x1(Backend *backend, const MNN::Op *op, std::shared_ptr<MNN::Tensor> weight, std::shared_ptr<MNN::Tensor> bias, std::shared_ptr<MNN::Tensor> dequantScale, int dequantBits, float scaleCoef);
     id<MTLComputePipelineState> mPipeline;
     std::pair<MTLSize, MTLSize> mThreads;
 };
diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm
index 10ec73c9c..1d0c8d7c4 100644
--- a/source/backend/metal/MetalConvolution1x1.mm
+++ b/source/backend/metal/MetalConvolution1x1.mm
@@ -31,11 +31,12 @@
     loadWeight(op, ldInt8Weight);
 }
 
-MetalConvolution1x1::MetalConvolution1x1(Backend *backend, const MNN::Op *op, std::shared_ptr<MNN::Tensor> weight, std::shared_ptr<MNN::Tensor> bias, std::shared_ptr<MNN::Tensor> dequantScale, int dequantBits) : MetalConvolutionCommon(backend, op, bias) {
+MetalConvolution1x1::MetalConvolution1x1(Backend *backend, const MNN::Op *op, std::shared_ptr<MNN::Tensor> weight, std::shared_ptr<MNN::Tensor> bias, std::shared_ptr<MNN::Tensor> dequantScale, int dequantBits, float scaleCoef) : MetalConvolutionCommon(backend, op, bias) {
     mWeight = weight;
     mBias = bias;
     mDequantScaleBias = dequantScale;
     mDequantBits = dequantBits;
+    mScaleCoef = scaleCoef;
 }
 
 
@@ -46,7 +47,7 @@
     if (nullptr == dst) {
         return true;
     }
-    *dst = new MetalConvolution1x1(bn, op, mWeight, mBias, mDequantScaleBias, mDequantBits);
+    *dst = new MetalConvolution1x1(bn, op, mWeight, mBias, mDequantScaleBias, mDequantBits, mScaleCoef);
     return true;
 }
 
@@ -72,12 +73,26 @@
     auto context = (__bridge MNNMetalContext *)backend->context();
     int blockSize = 1;
     if (mDequantScaleBias.get()) {
-        blockSize = (int)(mDequantScaleBias->usize() /sizeof(float) / oc_4 / 2 / 4);
+        int bytes = sizeof(float);
+        if(backend->useFp16InsteadFp32()) {
+            bytes = sizeof(__fp16);
+        }
+        blockSize = (int)(mDequantScaleBias->usize() / bytes / oc_4 / 2 / 4);
     }
     // create const buffer
-    int constants[] = {is, ic_4, ow, oh, os, oc_4, oc, ob, blockSize, mActivationType};
-    mConstBuffer = backend->getConstBuffer(sizeof(constants));
-    ::memcpy(mConstBuffer.contents, constants, sizeof(constants));
+    mConstBuffer = backend->getConstBuffer(sizeof(Param));
+    auto param = (Param *)mConstBuffer.contents;
+    param->input_size = is;
+    param->input_slice = ic_4;
+    param->output_width = ow;
+    param->output_height = oh;
+    param->output_size = os;
+    param->output_slice = oc_4;
+    param->output_channel = oc;
+    param->batch = ob;
+    param->block_size = blockSize;
+    param->activation = mActivationType;
+    param->scale_coef = mScaleCoef;
 
     MetalRuntime* rt = (MetalRuntime *)backend->runtime();
     if (mDequantScaleBias.get()) {
diff --git a/source/backend/metal/MetalConvolutionCommon.hpp b/source/backend/metal/MetalConvolutionCommon.hpp
index a391d65e2..ac1175a2e 100644
--- a/source/backend/metal/MetalConvolutionCommon.hpp
+++ b/source/backend/metal/MetalConvolutionCommon.hpp
@@ -26,8 +26,21 @@ class MetalConvolutionCommon : public MetalExecution {
 
     virtual std::shared_ptr<MNN::Tensor> weightTransform(int group, int oc, int ic, int kh, int kw, const float *src, bool int8Weight = false, bool int4Weight = false);
 
-private:
-
+protected:
+    struct Param { // written field by field into mConstBuffer by MetalConvolution1x1; layout presumably must match the shader-side constants - confirm
+        int input_size; // set from `is` - presumably input width * height; confirm
+        int input_slice; // set from `ic_4` - presumably input channels in groups of 4; confirm
+        int output_width; // set from `ow`
+        int output_height; // set from `oh`
+        int output_size; // set from `os` - presumably output width * height; confirm
+        int output_slice; // set from `oc_4` - presumably output channels in groups of 4; confirm
+        int output_channel; // set from `oc`
+        int batch; // set from `ob`
+        int block_size; // derived from the dequant scale/bias tensor size; 1 when there is no such tensor
+        int activation; // set from mActivationType
+        float scale_coef; // set from mScaleCoef, the factor getDequantScale applied to the stored scales
+    };
+    
 protected:
     int mKernelX        = 0;
     int mKernelY        = 0;
@@ -42,6 +55,7 @@ class MetalConvolutionCommon : public MetalExecution {
     std::shared_ptr<MNN::Tensor> mBias;
     std::shared_ptr<MNN::Tensor> mDequantScaleBias;
     int mDequantBits;
+    float mScaleCoef = 1.0f; // dequant scale factor; 1.0 (no scaling) unless loadWeight sets it on the int8 path
     id<MTLBuffer> mConstBuffer = nil;
 };
 
diff --git a/source/backend/metal/MetalConvolutionCommon.mm b/source/backend/metal/MetalConvolutionCommon.mm
index f47464209..e5b918935 100644
--- a/source/backend/metal/MetalConvolutionCommon.mm
+++ b/source/backend/metal/MetalConvolutionCommon.mm
@@ -97,7 +97,8 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
     }
 }
 
-static std::shared_ptr<MNN::Tensor> getDequantScale(const float* scale, int size, MetalBackend *backend, bool asymmetric, int oc) {
+template<typename DType>
+static std::pair<std::shared_ptr<MNN::Tensor>, float> getDequantScale(const float* scale, int size, MetalBackend *backend, bool asymmetric, int oc) {
     int totalCount = 0;
     if (asymmetric) {
         totalCount = size / 2;
@@ -106,15 +107,32 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
     }
     int blockSize = totalCount / oc;
     int alignOutputCount = ALIGN_UP4(oc);
-    std::shared_ptr<MNN::Tensor> dequantScale(MNN::Tensor::createDevice<uint8_t>({alignOutputCount, blockSize, (int)(sizeof(float) * 2)}));
+    std::shared_ptr<MNN::Tensor> dequantScale(MNN::Tensor::createDevice<uint8_t>({alignOutputCount, blockSize, (int)(sizeof(DType) * 2)}));
     bool res = backend->onAcquireBuffer(dequantScale.get(), Backend::STATIC);
     if (!res) {
         MNN_ERROR("Buffer allocated error!\n");
-        return nullptr;
+        return std::make_pair(nullptr, 1.0);
     }
     auto buffer0 = MetalBackend::getBuffer(dequantScale.get());
-    auto dst_scale = (float*)((uint8_t*)[buffer0.first contents] + buffer0.second);
+    DType* dst_scale = (DType*)((uint8_t*)[buffer0.first contents] + buffer0.second);
     ::memset(dst_scale, 0, dequantScale->usize());
+    
+    // Factor that maps the largest |scale| / |bias| onto the fp16 maximum (65504); it
+    // stays 1.0 for float storage and is returned to the caller alongside the tensor.
+    float coef = 1.0;
+    if(std::is_same<DType, __fp16>::value) {
+        // `scale` holds (bias, scale) pairs when asymmetric and one scale per block
+        // otherwise, so scan only the values that exist instead of always assuming pairs.
+        const int valueCount = oc * blockSize * (asymmetric ? 2 : 1);
+        float max_data = 0.0;
+        for (int i=0; i<valueCount; ++i) {
+            max_data = ALIMAX(max_data, (float)fabs(scale[i]));
+        }
+        // Leave coef at 1.0 when every value is zero; dividing would produce inf.
+        if(max_data > 0.0f) {
+            coef = 65504.0 / max_data;
+        }
+    }
     if (asymmetric) {
         for (int z=0; z<oc; ++z) {
             int zo = z / 4;
@@ -125,8 +143,8 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
             for (int bi=0; bi<blockSize; ++bi) {
                 float s = srcZ[2*bi+1];
                 float b = srcZ[2*bi+0];
-                dstSZ[bi * 8] = s;
-                dstBZ[bi * 8] = b;
+                dstSZ[bi * 8] = (DType)(s * coef);
+                dstBZ[bi * 8] = (DType)(b * coef);
             }
         }
     } else {
@@ -139,12 +157,12 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
             for (int bi=0; bi<blockSize; ++bi) {
                 float s = srcZ[bi];
                 float b = 0.0f;
-                dstSZ[bi * 8] = s;
+                dstSZ[bi * 8] = (DType)(s * coef);
                 dstBZ[bi * 8] = b;
             }
         }
     }
-    return dequantScale;
+    return std::make_pair(dequantScale, coef);
 }
 void MetalConvolutionCommon::loadWeight(const MNN::Op *op, bool loadWeightInt8) {
     auto conv = op->main_as_Convolution2D();
@@ -166,12 +184,20 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
         ic = size / kw / kh / (oc / group);
     }
 
-    // convert
+    // convert
     if (loadWeightInt8 && qnt->weight.get() != nullptr) {
         auto backend = static_cast<MetalBackend *>(this->backend());
         mWeight = weightTransform(group, oc, ic, kh, kw, (float*)qnt->weight.get(), !qnt->canUseInt4, qnt->canUseInt4);
-        auto dequantParams = getDequantScale(qnt->alpha.get(), qnt->alpha.size(), backend, qnt->asymmetric, oc);
-        mDequantScaleBias = dequantParams;
+        if(backend->useFp16InsteadFp32()) {
+            auto dequantParams = getDequantScale<__fp16>(qnt->alpha.get(), qnt->alpha.size(), backend, qnt->asymmetric, oc);
+            mDequantScaleBias = dequantParams.first;
+            mScaleCoef = dequantParams.second;
+        } else {
+            auto dequantParams = getDequantScale<float>(qnt->alpha.get(), qnt->alpha.size(), backend, qnt->asymmetric, oc);
+            mDequantScaleBias = dequantParams.first;
+            mScaleCoef = dequantParams.second;
+        }
+
         mDequantBits = qnt->canUseInt4 ? 4:8;
     } else if (qnt && qnt->weightFloat.size() > 0) {
         mWeight = weightTransform(group, oc, ic, kh, kw, qnt->weightFloat.get(), false, false);
diff --git a/source/backend/metal/MetalDeconvolution.hpp b/source/backend/metal/MetalDeconvolution.hpp
index 5dc8dc4ed..c6066d646 100644
--- a/source/backend/metal/MetalDeconvolution.hpp
+++ b/source/backend/metal/MetalDeconvolution.hpp
@@ -24,16 +24,7 @@ class MetalDeconvolution : public MetalExecution {
 private:
     bool mDepthwise  = false;
     int mGroup       = 0;
-    int mKernelX     = 0;
-    int mKernelY     = 0;
     PadMode mPadMode = PadMode_CAFFE;
-    int mPadX        = 0;
-    int mPadY        = 0;
-    int mStrideX     = 0;
-    int mStrideY     = 0;
-    int mDilateX     = 0;
-    int mDilateY     = 0;
-    int mActivationType = 0;
 
     const MNN::Op *mOp = nullptr;
 
diff --git a/source/backend/metal/MetalDeconvolution.mm b/source/backend/metal/MetalDeconvolution.mm
index 4338d9e30..af3a6d9da 100755
--- a/source/backend/metal/MetalDeconvolution.mm
+++ b/source/backend/metal/MetalDeconvolution.mm
@@ -14,7 +14,33 @@
 
 #if MNN_METAL_ENABLED
 namespace MNN {
-
+struct deconv_constants { // CPU-side layout of mConstBuffer (bound at buffer index 2 in onEncode); keep field order in step with the shader's deconv_constants
+    int input_width;   // iw, written in onResize
+    int input_height;  // ih, written in onResize
+    int input_size;    // iw * ih
+    int input_slice;   // iz, written in onResize
+    int output_width;  // ow, written in onResize
+    int output_height; // oh, written in onResize
+    int output_size;   // ow * oh
+    int output_slice;  // oz, written in onResize
+    
+    int kernel_x;      // common->kernelX(), written once in the constructor
+    int kernel_y;      // common->kernelY()
+    int kernel_size;   // kernelX * kernelY
+    int stride_x;      // common->strideX()
+    int stride_y;      // common->strideY()
+    int pad_x;         // padX, written in onResize (unlike its neighbours)
+    int pad_y;         // padY, written in onResize
+    int dilation_x;    // common->dilateX()
+    int dilation_y;    // common->dilateY()
+    
+    int delta_ky;      // leastCommonMultiple(dilateY, strideY) / dilateY
+    int delta_kx;      // leastCommonMultiple(dilateX, strideX) / dilateX
+    int delta_iy;      // delta_ky * dilateY / strideY
+    int delta_ix;      // delta_kx * dilateX / strideX
+    int batch;         // ob, written in onResize
+    int activation;    // 0 = none, 1 = relu, 2 = relu6 (from common->relu()/relu6())
+};
 static int leastCommonMultiple(int m, int n) {
     int a = m, b = n;
     while(a != b){
@@ -130,17 +156,7 @@ void weightForDeconv(std::shared_ptr<MNN::Tensor> t, bool depthwise, const Convo
     auto common  = deconv->common();
     mOp          = op;
     mDepthwise   = op->type() == MNN::OpType_DeconvolutionDepthwise;
-    mGroup       = common->group();
-    mKernelX     = common->kernelX();
-    mKernelY     = common->kernelY();
     mPadMode     = common->padMode();
-    mPadX        = common->padX();
-    mPadY        = common->padY();
-    mStrideX     = common->strideX();
-    mStrideY     = common->strideY();
-    mDilateX     = common->dilateX();
-    mDilateY     = common->dilateY();
-    mActivationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);
 
     // forcy downgrade to float like what CPU does
     std::shared_ptr<ConvolutionCommon::Int8Common> qnt = NULL;
@@ -167,9 +183,13 @@ void weightForDeconv(std::shared_ptr<MNN::Tensor> t, bool depthwise, const Convo
         mValid = false;
         return;
     }
+    auto weightBuffer = MetalBackend::getBuffer(mWeight.get());
+    auto ptr = (uint8_t*)weightBuffer.first.contents + weightBuffer.second;
     if (mtbn->useFp16InsteadFp32()) {
+        ::memset(ptr, 0, weightSize * sizeof(int16_t));
         weightForDeconv<__fp16>(mWeight, mDepthwise, deconv, qnt.get());
     } else {
+        ::memset(ptr, 0, weightSize * sizeof(float));
         weightForDeconv<float>(mWeight, mDepthwise, deconv, qnt.get());
     }
     mBias = biasForDeconv(backend, deconv, mtbn->useFp16InsteadFp32());
@@ -182,6 +202,24 @@ void weightForDeconv(std::shared_ptr<MNN::Tensor> t, bool depthwise, const Convo
     } else {
         mPipeline = [context pipelineWithName:@"deconv" fp16:mtbn->useFp16InsteadFp32()];
     }
+    mConstBuffer = [context newDeviceBuffer:sizeof(deconv_constants) access:CPUWriteOnly];
+    auto param = (deconv_constants*)mConstBuffer.contents;
+    
+    mGroup       = common->group();
+    param->kernel_x = common->kernelX();
+    param->kernel_y = common->kernelY();
+    param->kernel_size = common->kernelX() * common->kernelY();
+    param->stride_x = common->strideX();
+    param->stride_y = common->strideY();
+    param->dilation_x = common->dilateX();
+    param->dilation_y = common->dilateY();
+    param->activation = common->relu() ? 1 : (common->relu6() ? 2 : 0);
+    auto deltaKy = leastCommonMultiple(common->dilateY(), common->strideY()) / common->dilateY();
+    auto deltaKx = leastCommonMultiple(common->dilateX(), common->strideX()) / common->dilateX();
+    param->delta_kx = deltaKx;
+    param->delta_ky = deltaKy;
+    param->delta_iy = deltaKy * common->dilateY() / common->strideY();
+    param->delta_ix = deltaKx * common->dilateX() / common->strideX();
 }
 
 ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@@ -197,46 +235,28 @@ void weightForDeconv(std::shared_ptr<MNN::Tensor> t, bool depthwise, const Convo
     const int padY = pad.second;
 
     // const buffer
-    auto deltaKy = leastCommonMultiple(mDilateY, mStrideY) / mDilateY;
-    auto deltaKx = leastCommonMultiple(mDilateX, mStrideX) / mDilateX;
-
-    int consts[] = {
-        iw,
-        ih,
-        iw * ih,
-        iz,
-        ow,
-        oh,
-        ow * oh,
-        oz,
-        mKernelX,
-        mKernelY,
-        mKernelX * mKernelY,
-        mStrideX,
-        mStrideY,
-        padX,
-        padY,
-        mDilateX,
-        mDilateY,
-        deltaKy,
-        deltaKx,
-        deltaKy * mDilateY / mStrideY,
-        deltaKx * mDilateX / mStrideX,
-        1,
-        ob,
-        mActivationType
-    };
-    mConstBuffer = [context newDeviceBuffer:sizeof(consts) bytes:consts access:CPUWriteOnly];
+    auto param = (deconv_constants*)mConstBuffer.contents;
+    param->input_width = iw;
+    param->input_height = ih;
+    param->input_size = iw * ih;
+    param->input_slice = iz;
+    param->output_width = ow;
+    param->output_height = oh;
+    param->output_size = ow * oh;
+    param->output_slice = oz;
+    param->batch = ob;
+    param->pad_x = padX;
+    param->pad_y = padY;
 
     mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob)];
     return NO_ERROR;
 }
 
 void MetalDeconvolution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
-        auto input = inputs[0], output = outputs[0];
+    auto input = inputs[0], output = outputs[0];
     [encoder setComputePipelineState:mPipeline];
-    [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
-    [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
+    MetalBackend::setTensor(input, encoder, 0);
+    MetalBackend::setTensor(output, encoder, 1);
     [encoder setBuffer:mConstBuffer offset:0 atIndex:2];
     MetalBackend::setTensor(mWeight.get(), encoder, 3);
     MetalBackend::setTensor(mBias.get(), encoder, 4);
diff --git a/source/backend/metal/shader/MetalConvolution1x1.metal b/source/backend/metal/shader/MetalConvolution1x1.metal
index 962d95447..07ac77b3e 100644
--- a/source/backend/metal/shader/MetalConvolution1x1.metal
+++ b/source/backend/metal/shader/MetalConvolution1x1.metal
@@ -32,6 +32,7 @@ struct conv1x1_constants {
     int batch;
     int block_size;
     conv_activation_type activation;
+    float scale_coef;
 };
 
 kernel void conv1x1_g1z4(const device ftype4 *in            [[buffer(0)]],
@@ -76,7 +77,7 @@ kernel void conv1x1_g1z4_w8(const device ftype4 *in            [[buffer(0)]],
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device MNN::char4x4 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid                          [[thread_position_in_grid]]) {
     if ((int)gid.x * CONV_UNROLL >= cst.output_size || (int)gid.y >= cst.output_slice || (int)gid.z >= cst.batch) return;
 
@@ -90,8 +91,8 @@ kernel void conv1x1_g1z4_w8(const device ftype4 *in            [[buffer(0)]],
     int computeSize = min(cst.output_size - rx, CONV_UNROLL);
     int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
     for (int bi=0; bi<cst.block_size; ++bi) {
-        FLOAT4 bs0 = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]);
-        FLOAT4 bs1 = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]);
+        FLOAT4 bs0 = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 bs1 = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
         FLOAT4 scale = bs0;
         FLOAT4 dequant_bias = bs1;
         int zmin = bi * block;
@@ -127,7 +128,7 @@ kernel void conv1x1_gemm_16x16_w4(const device ftype4 *in            [[buffer(0)
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device uchar2 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid                          [[threadgroup_position_in_grid]],
                             uint                  tiitg[[thread_index_in_threadgroup]],
                             uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -161,8 +162,8 @@ kernel void conv1x1_gemm_16x16_w4(const device ftype4 *in            [[buffer(0)
     int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
     for (int bi=0; bi<cst.block_size; ++bi) {
         // [N/4, cst.block_size, 2/*scale_bias*/, N4]
-        FLOAT4 scale = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 0]);
-        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 1]);
+        FLOAT4 scale = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
         int zmin = bi * block;
         int zmax = min(zmin + block, cst.input_slice);
 
@@ -220,7 +221,7 @@ kernel void conv1x1_gemm_32x16_w4(const device ftype4 *in            [[buffer(0)
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device uchar2 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid                          [[threadgroup_position_in_grid]],
                             uint                  tiitg[[thread_index_in_threadgroup]],
                             uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -258,8 +259,8 @@ kernel void conv1x1_gemm_32x16_w4(const device ftype4 *in            [[buffer(0)
     int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
     for (int bi=0; bi<cst.block_size; ++bi) {
         // [N/4, cst.block_size, 2/*scale_bias*/, N4]
-        FLOAT4 scale = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 0]);
-        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 1]);
+        FLOAT4 scale = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (idx_n4 * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
         int zmin = bi * block;
         int zmax = min(zmin + block, cst.input_slice);
 
@@ -324,7 +325,7 @@ kernel void conv1x1_gemm_16x32_w4(const device ftype4 *in            [[buffer(0)
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device uchar2 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid                          [[threadgroup_position_in_grid]],
                             uint                  tiitg[[thread_index_in_threadgroup]],
                             uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -360,10 +361,10 @@ kernel void conv1x1_gemm_16x32_w4(const device ftype4 *in            [[buffer(0)
     int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
     for (int bi=0; bi<cst.block_size; ++bi) {
         // [N/4, cst.block_size, 2/*scale_bias*/, N4]
-        FLOAT4 scale0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 0]);
-        FLOAT4 dequant_bias0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 1]);
-        FLOAT4 scale1 = FLOAT4(dequantScale[2 * (idx_n41 * cst.block_size + bi) + 0]);
-        FLOAT4 dequant_bias1 = FLOAT4(dequantScale[2 * (idx_n41 * cst.block_size + bi) + 1]);
+        FLOAT4 scale0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
+        FLOAT4 scale1 = FLOAT4(dequantScale[2 * (idx_n41 * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias1 = FLOAT4(dequantScale[2 * (idx_n41 * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
         int zmin = bi * block;
         int zmax = min(zmin + block, cst.input_slice);
 
@@ -434,7 +435,7 @@ kernel void conv1x1_gemm_32x64_w4(const device ftype2 *in            [[buffer(0)
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device uchar2 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid                          [[threadgroup_position_in_grid]],
                             uint                  tiitg[[thread_index_in_threadgroup]],
                             uint                  tiisg[[thread_index_in_simdgroup]],
@@ -494,8 +495,8 @@ kernel void conv1x1_gemm_32x64_w4(const device ftype2 *in            [[buffer(0)
     int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
     for (int bi=0; bi<cst.block_size; ++bi) {
         // [N/4, cst.block_size, 2/*scale_bias*/, N4]
-        FLOAT4 scale0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 0]);
-        FLOAT4 dequant_bias0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 1]);
+        FLOAT4 scale0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias0 = FLOAT4(dequantScale[2 * (idx_n40 * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
 
         int zmin = bi * block;
         int zmax = min(zmin + block, cst.input_slice);
@@ -566,7 +567,7 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in            [[buffer(0)]],
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device MNN::uchar4x2 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid                          [[thread_position_in_grid]]) {
     if ((int)gid.x * CONV_UNROLL >= cst.output_size || (int)gid.y >= cst.output_slice || (int)gid.z >= cst.batch) return;
 
@@ -580,8 +581,8 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in            [[buffer(0)]],
     int computeSize = min(cst.output_size - rx, CONV_UNROLL);
     int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
     for (int bi=0; bi<cst.block_size; ++bi) {
-        FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]);
-        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]);
+        FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
         int zmin = bi * block;
         int zmax = min(zmin + block, cst.input_slice);
         for (int z = zmin; z < zmax; z++) {
@@ -621,7 +622,7 @@ kernel void conv1x1_gemv_g8_w4(const device ftype4 *in            [[buffer(0)]],
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device MNN::uchar4x2 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid[[threadgroup_position_in_grid]],
                             uint  tiisg[[thread_index_in_simdgroup]],
                             uint  sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -647,8 +648,8 @@ kernel void conv1x1_gemv_g8_w4(const device ftype4 *in            [[buffer(0)]],
     int outer_index  = (tiisg) / middle_step;
     
     for (int bi= outer_index; bi<cst.block_size; bi += outer_step) {
-        FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]);
-        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]);
+        FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]) / (FLOAT)cst.scale_coef;
         int zmin = bi * block;
         int zmax = min(zmin + block, cst.input_slice);
         for (int z = zmin + middle_index; z < zmax; z += middle_step) {
@@ -683,7 +684,7 @@ kernel void conv1x1_gemv_g16_w4(const device ftype4 *in            [[buffer(0)]]
                             constant conv1x1_constants& cst    [[buffer(2)]],
                             const device MNN::uchar4x2 *wt      [[buffer(3)]],
                             const device ftype4 *biasTerms     [[buffer(4)]],
-                            const device float4 *dequantScale  [[buffer(5)]],
+                            const device ftype4 *dequantScale  [[buffer(5)]],
                             uint3 gid[[threadgroup_position_in_grid]],
                             uint  tiisg[[thread_index_in_simdgroup]],
                             uint  sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -712,10 +713,10 @@ kernel void conv1x1_gemv_g16_w4(const device ftype4 *in            [[buffer(0)]]
     
     for (int bi= outer_index; bi<cst.block_size; bi += outer_step) {
         const int quant_offset = 2 * (uz * cst.block_size + bi);
-        FLOAT4 scale0 = FLOAT4(dequantScale[quant_offset + 0]);
-        FLOAT4 dequant_bias0 = FLOAT4(dequantScale[quant_offset + 1]);
-        FLOAT4 scale1 = FLOAT4(dequantScale[quant_offset + (cst.block_size << 1)]);
-        FLOAT4 dequant_bias1 = FLOAT4(dequantScale[quant_offset + (cst.block_size << 1) + 1]);
+        FLOAT4 scale0 = FLOAT4(dequantScale[quant_offset + 0]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias0 = FLOAT4(dequantScale[quant_offset + 1]) / (FLOAT)cst.scale_coef;
+        FLOAT4 scale1 = FLOAT4(dequantScale[quant_offset + (cst.block_size << 1)]) / (FLOAT)cst.scale_coef;
+        FLOAT4 dequant_bias1 = FLOAT4(dequantScale[quant_offset + (cst.block_size << 1) + 1]) / (FLOAT)cst.scale_coef;
         int zmin = bi * block;
         int zmax = min(zmin + block, cst.input_slice);
         for (int z = zmin + middle_index; z < zmax; z += middle_step) {
diff --git a/source/backend/metal/shader/MetalDeconvolution.metal b/source/backend/metal/shader/MetalDeconvolution.metal
index 133ca74fa..87d59dd99 100644
--- a/source/backend/metal/shader/MetalDeconvolution.metal
+++ b/source/backend/metal/shader/MetalDeconvolution.metal
@@ -8,7 +8,6 @@ struct deconv_constants {
     int output_height;
     int output_size;
     int output_slice;
-    
     int kernel_x;
     int kernel_y;
     int kernel_size;
@@ -18,12 +17,10 @@ struct deconv_constants {
     int pad_y;
     int dilation_x;
     int dilation_y;
-    
     int delta_ky;
     int delta_kx;
     int delta_iy;
     int delta_ix;
-    int has_bias;
     int batch;
     conv_activation_type activation;
 };
@@ -77,8 +74,8 @@ kernel void deconv_depthwise(const device ftype4 *in        [[buffer(0)]],
                              const device ftype4 *biasTerms [[buffer(4)]],
                              uint3 gid                    [[thread_position_in_grid]]) {
     if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return;
-    
-    FLOAT4 result = FLOAT4(biasTerms[(int)(gid.z / cst.batch)]);
+    int oz = (int)gid.z / cst.batch;
+    FLOAT4 result = FLOAT4(biasTerms[oz]);
     
     int oy = (int)gid.y + cst.pad_y;
     int ox = (int)gid.x + cst.pad_x;
@@ -95,7 +92,7 @@ kernel void deconv_depthwise(const device ftype4 *in        [[buffer(0)]],
         int min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y;
         int min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x;
         
-        auto z_wt = wt + (int)gid.z * cst.kernel_size;
+        auto z_wt = wt + oz * cst.kernel_size;
         auto z_in = in + (int)gid.z * cst.input_size;
         for (auto ky = max_ky, iy = min_iy; ky >= min_ky; ky -= cst.delta_ky, iy += cst.delta_iy) {
             for (auto kx = max_kx, ix = min_ix; kx >= min_kx; kx -= cst.delta_kx, ix += cst.delta_ix) {
diff --git a/source/backend/opencl/core/BufferConvertor.cpp b/source/backend/opencl/core/BufferConvertor.cpp
index 1f6abd82b..57139bb93 100644
--- a/source/backend/opencl/core/BufferConvertor.cpp
+++ b/source/backend/opencl/core/BufferConvertor.cpp
@@ -574,6 +574,80 @@ bool convertBufferToBuffer(Tensor *input, Tensor *output, OpenCLRuntime *runtime
     return true;
 }
 
+// Runs a "glmem_convert" kernel between a tensor's shared cl_mem (TensorUtils::getSharedMem) and MNN's own OpenCL image/buffer; returns false if no direction is given, the kernel is missing, or the enqueue fails. memType is unused here.
+bool convertBetweenAHDandCLmem(const Tensor *input, const Tensor *output, OpenCLRuntime *runtime, int memType, bool toDevice, bool toHost) {
+    if (!toDevice && !toHost) {
+        MNN_PRINT("convertBetweenAHDandCLmem only support toDevice or toHost!\n");
+        return false;
+    }
+    const bool useImage = runtime->getGpuMemType() == IMAGE;
+    std::set<std::string> buildOptions;
+    if (useImage) {
+        buildOptions.emplace("-DUSE_IMAGE");
+    }
+    buildOptions.emplace("-DINPUT_FORMAT=" + std::to_string(TensorUtils::getDescribe(input)->dimensionFormat));
+    buildOptions.emplace("-DOUTPUT_FORMAT=" + std::to_string(TensorUtils::getDescribe(output)->dimensionFormat));
+
+    // toDevice takes precedence over toHost when both are set; the shape comes from the non-shared tensor.
+    std::vector<int> outputShape;
+    std::shared_ptr<KernelWrap> kernelW;
+    if (toDevice) {
+        buildOptions.emplace("-DSHARED_TO_CL");
+        kernelW = runtime->buildKernelWithCache("glmem_convert", "gl_to_cl", buildOptions, nullptr, output);
+        outputShape = tensorShapeFormat(output);
+    } else {
+        buildOptions.emplace("-DCL_TO_SHARED");
+        kernelW = runtime->buildKernelWithCache("glmem_convert", "cl_to_gl", buildOptions, input, nullptr);
+        outputShape = tensorShapeFormat(input);
+    }
+    if (nullptr == kernelW) { // defensive: presumably null when the kernel could not be built - TODO confirm
+        MNN_PRINT("convertBetweenAHDandCLmem: build glmem_convert kernel failed!\n");
+        return false;
+    }
+    int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W
+    uint32_t gws[3] = {static_cast<uint32_t>(UP_DIV(shape[3], 4)),
+                       static_cast<uint32_t>(UP_DIV(shape[1], 4)),
+                       static_cast<uint32_t>(shape[0] * shape[2])};
+    auto Kernel = kernelW->get();
+    uint32_t idx = 0;
+    cl_int ret = CL_SUCCESS;
+    ret |= Kernel.setArg(idx++, gws[0]);
+    ret |= Kernel.setArg(idx++, gws[1]);
+    ret |= Kernel.setArg(idx++, gws[2]);
+    // Source argument: the shared cl_mem when uploading, otherwise the tensor's own image/buffer.
+    if (toDevice) {
+        ret |= Kernel.setArg(idx++, *((CLSharedMemReleaseBuffer*)TensorUtils::getSharedMem(input))->getMem());
+    } else if (useImage) {
+        ret |= Kernel.setArg(idx++, openCLImage(input));
+    } else {
+        ret |= Kernel.setArg(idx++, openCLBuffer(input));
+    }
+    // Destination argument, selected the same way from toHost.
+    if (toHost) {
+        ret |= Kernel.setArg(idx++, *((CLSharedMemReleaseBuffer*)TensorUtils::getSharedMem(output))->getMem());
+    } else if (useImage) {
+        ret |= Kernel.setArg(idx++, openCLImage(output));
+    } else {
+        ret |= Kernel.setArg(idx++, openCLBuffer(output));
+    }
+    ret |= Kernel.setArg(idx++, sizeof(shape), shape);
+    MNN_CHECK_CL_SUCCESS(ret, "setArg glmem_convert");
+
+    // Local size is 16 x (max / 16) x 1; each global dimension is rounded up to a multiple of it.
+    const uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(kernelW));
+    const std::vector<uint32_t> lws = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1};
+    cl::Event event;
+    cl_int res = runtime->commandQueue().enqueueNDRangeKernel(Kernel, cl::NullRange,
+                     cl::NDRange(ROUND_UP(gws[0], lws[0]), ROUND_UP(gws[1], lws[1]), ROUND_UP(gws[2], lws[2])),
+                     cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    MNN_CHECK_CL_SUCCESS(res, "glmem_convert");
+    if (res != CL_SUCCESS) {
+        return false; // a failed enqueue leaves no valid event to wait on
+    }
+    event.wait();
+    return true;
+}
+
 } // namespace OpenCL
 } // namespace MNN
 #endif /* MNN_OPENCL_BUFFER_CLOSED */
diff --git a/source/backend/opencl/core/BufferConvertor.hpp b/source/backend/opencl/core/BufferConvertor.hpp
index b1843226e..f14d21e80 100644
--- a/source/backend/opencl/core/BufferConvertor.hpp
+++ b/source/backend/opencl/core/BufferConvertor.hpp
@@ -14,6 +14,7 @@
 #include "core/Macro.h"
 #include <MNN/Tensor.hpp>
 #include "backend/opencl/core/OpenCLRunningUtils.hpp"
+#include "backend/opencl/core/OpenCLBackend.hpp"
 
 namespace MNN {
 namespace OpenCL {
@@ -33,6 +34,7 @@ bool convertNC4HW4BufferBetweenNC16HW16Buffer(const Tensor *input, Tensor *outpu
 #endif
 
 bool convertBufferToBuffer(Tensor *input, Tensor *output, OpenCLRuntime *runtime, bool toDevice, bool toHost, bool needWait = false, bool svmFlag = false);
+bool convertBetweenAHDandCLmem(const Tensor *input, const Tensor *output, OpenCLRuntime *runtime, int memType, bool toDevice, bool toHost);
                                        
 class BufferConvertor {
 public:
diff --git a/source/backend/opencl/core/BufferPool.cpp b/source/backend/opencl/core/BufferPool.cpp
index 89e09fcc5..4398d4bd1 100644
--- a/source/backend/opencl/core/BufferPool.cpp
+++ b/source/backend/opencl/core/BufferPool.cpp
@@ -28,7 +28,6 @@ cl::Buffer* BufferPool::alloc(size_t size, bool separate) {
         return nullptr;
     }
     mAllBuffer.insert(std::make_pair(node->buffer.get(), node));
-
     return node->buffer.get();
 }
 
diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp
index 23cb43b2e..132915c6c 100644
--- a/source/backend/opencl/core/OpenCLBackend.cpp
+++ b/source/backend/opencl/core/OpenCLBackend.cpp
@@ -333,8 +333,8 @@ Backend::MemObj* OpenCLBackend::onAcquire(const Tensor* nativeTensor, StorageTyp
     if(mOpenCLRuntime->getGpuMemType() == BUFFER) {
         size_t size;
         float typeSize = getBytes(nativeTensor);
-        if (nativeTensor->dimensions() >= 2) {
-            auto alignC = ROUND_UP(C, 8);
+        if (MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(nativeTensor)->dimensionFormat && nativeTensor->dimensions() >= 2) {
+            auto alignC = ROUND_UP(C, 4);
             // increment of height and width
             auto hR = ROUND_UP(H + 3, 4) - H;
             auto wR = ROUND_UP(W + 3, 4) - W;
@@ -353,7 +353,6 @@ Backend::MemObj* OpenCLBackend::onAcquire(const Tensor* nativeTensor, StorageTyp
         }
         // Align when int4 memory
         size = ROUND_UP(size, 2);
-        
         if (storageType == DYNAMIC_SEPERATE) {
             auto buffer = mBufferPool->alloc(size*typeSize, true);
             ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
@@ -593,32 +592,53 @@ bool OpenCLBackend::isCreateError() const {
     return mIsCreateError;
 }
 
-void OpenCLBackend::_allocHostBuffer(int length, const Tensor* srcTensor) const {
+bool OpenCLBackend::_allocHostBuffer(int length, const Tensor* srcTensor) const { // returns false when no usable buffer could be prepared
     auto memType = srcTensor->buffer().flags;
-    if (nullptr != mHostBuffer.second && length <= mHostBuffer.first && memType != MNN_FORWARD_OPENCL && memType != MNN_FORWARD_OPENGL) {
-        return;
-    }
-    if(memType == MNN_FORWARD_OPENCL){
-        mDeviceBuffer = (cl::Buffer*)srcTensor->buffer().device;
+    if (nullptr != mHostBuffer.second && length <= mHostBuffer.first && memType != MNN_MEMORY_AHARDWAREBUFFER) {
+        return true; // existing host staging buffer is already large enough
     }
+    cl_int error = CL_SUCCESS;
 #ifdef  __ANDROID__
-    else if(memType == MNN_FORWARD_OPENGL && mOpenCLRuntime->isSupportGL()){
-        cl_int error;
-        mDeviceTexture.reset(new cl::ImageGL(mOpenCLRuntime->context(), CL_MEM_READ_WRITE, GL_TEXTURE_2D, 0, (cl_GLuint)srcTensor->buffer().device, &error));
-        std::vector<cl::Memory> map = {*mDeviceTexture.get()};
-        mOpenCLRuntime->commandQueue().enqueueAcquireGLObjects(&map, NULL);
-    }
+    if(MNN_MEMORY_AHARDWAREBUFFER == memType){
+        if (!mOpenCLRuntime->isSupportAHD()){
+            MNN_ERROR("This device not support AHardWareBuffer\n");
+            return false;
+        }
+        CLSharedMemReleaseBuffer *sharedMem = (CLSharedMemReleaseBuffer*)TensorUtils::getSharedMem(srcTensor);
+        if(sharedMem == nullptr || srcTensor->buffer().device != sharedMem->getSharedId()){ // nothing cached for this AHardwareBuffer yet
+            cl::Buffer *imported = nullptr;
+            if(mOpenCLRuntime->getGpuType() == MALI){
+                const cl_import_properties_arm properties[] = {CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM, 0};
+                imported = new cl::Buffer(mOpenCLRuntime->context(), (cl_mem_flags)CL_MEM_READ_WRITE, properties, (void*)srcTensor->buffer().device, CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM, &error);
+            }else if(mOpenCLRuntime->getGpuType() == ADRENO){
+                cl_mem_ahardwarebuffer_host_ptr myAHBmem = {0};
+                myAHBmem.ext_host_ptr.allocation_type = CL_MEM_ANDROID_AHARDWAREBUFFER_HOST_PTR_QCOM;
+                myAHBmem.ext_host_ptr.host_cache_policy = CL_MEM_HOST_WRITEBACK_QCOM;
+                myAHBmem.ahb_ptr = (AHardwareBuffer*)srcTensor->buffer().device;
+                imported = new cl::Buffer(mOpenCLRuntime->context(), (cl_mem_flags)(CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM), 0, &myAHBmem, &error);
+            } else{
+                MNN_ERROR("This device not support AHardWareBuffer\n");
+                return false;
+            }
+            if (error != CL_SUCCESS) {
+                delete imported; // never cache a failed import: a later call with the same id would skip re-import and succeed
+                MNN_ERROR("Alloc mAHardWareBuffer error, code:%d \n", error);
+                return false;
+            }
+            TensorUtils::setSharedMem(srcTensor, new CLSharedMemReleaseBuffer(srcTensor->buffer().device, imported));
+        }
+    } else
 #endif
-    else{
+    {
         MNN_ASSERT(length > 0);
-        cl_int res;
         mHostBuffer.first = length;
-        mHostBuffer.second.reset(new cl::Buffer(mOpenCLRuntime->context(), (cl_mem_flags)(CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (size_t)length, NULL, &res));
-        if (nullptr == mHostBuffer.second.get() || res != CL_SUCCESS) {
-            MNN_ERROR("Alloc mHostBuffer %d error, code:%d \n", length, res);
-            return;
+        mHostBuffer.second.reset(new cl::Buffer(mOpenCLRuntime->context(), (cl_mem_flags)(CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (size_t)length, NULL, &error));
+        if (nullptr == mHostBuffer.second.get() || error != CL_SUCCESS) {
+            MNN_ERROR("Alloc mHostBuffer %d error, code:%d \n", length, error);
+            return false; // NOTE(review): mHostBuffer stays populated here, so the size check at the top may accept it next time
         }
     }
+    return true;
 }
 
 void OpenCLBackend::copyFromDeviceInt8(const Tensor* srcTensor, const Tensor* dstTensor) const{
@@ -674,15 +694,15 @@ int OpenCLBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTe
 }
 
 void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTensor, MNN_DATA_FORMAT data_format, bool svmFlag, int memtype) const {
+#ifdef  __ANDROID__
+    if(MNN_MEMORY_AHARDWAREBUFFER == memtype){
+        convertBetweenAHDandCLmem(const_cast<Tensor*>(srcTensor), const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), memtype, false, true);
+        return;
+    }
+#endif
 #ifndef MNN_OPENCL_BUFFER_CLOSED
     if(mOpenCLRuntime->getGpuMemType() == BUFFER)
     {
-        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
-            OpenCL::convertNC4HW4BufferToImage(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
-            std::vector<cl::Memory> map = {openCLImage(dstTensor)};
-            mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL);
-            return;
-        }
 #ifdef MNN_SUPPORT_INTEL_SUBGROUP
         int cPack = TensorUtils::getTensorChannelPack(srcTensor);
         if (cPack == 16 && mOpenCLRuntime->isSupportedIntelSubgroup()) {
@@ -710,17 +730,6 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens
     else
 #endif /* MNN_OPENCL_BUFFER_CLOSED */
     {
-        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
-            std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(srcTensor);
-
-            mOpenCLRuntime.get()->commandQueue().enqueueCopyImage(
-                    openCLImage(srcTensor), openCLImage(dstTensor),
-                    {0, 0, 0}, {0, 0, 0},
-                    {(size_t)bufferShape[2]* UP_DIV(bufferShape[3], 4), (size_t)bufferShape[0]*bufferShape[1], 1});
-            std::vector<cl::Memory> map = {openCLImage(dstTensor)};
-            mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL);
-            return;
-        }
         switch (data_format) {
             case MNN_DATA_FORMAT_NHWC:
                 OpenCL::convertImageToNHWCBuffer(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
@@ -748,8 +757,7 @@ void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* dstTen
                        && (srcDimensionFormat == dstDimensionFormat || srcTensor->dimensions() <= 1)
                        && MNN::MNN_DATA_FORMAT_NC4HW4 != dstDimensionFormat && MNN_DATA_FORMAT_NC4HW4 != srcDimensionFormat
                        && (getDataType(srcTensor) == getDataType(dstTensor))
-                       && memType != MNN_FORWARD_OPENCL 
-                       && memType != MNN_FORWARD_OPENGL;
+                       && memType != MNN_MEMORY_AHARDWAREBUFFER;
     if (mOpenCLRuntime->isSupportedFP16()) { // Fp16
         if (dstTensor->getType().code == halide_type_float) {
             directCopy = false;
@@ -792,15 +800,15 @@ void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* dstTen
 
 void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor, MNN_DATA_FORMAT data_format, bool svmFlag, int memtype) const {
     // Format: Host -> OpenCL
+#ifdef  __ANDROID__
+    if(MNN_MEMORY_AHARDWAREBUFFER == memtype){
+        convertBetweenAHDandCLmem(const_cast<Tensor*>(srcTensor), const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), memtype, true, false);
+        return;
+    }
+#endif
     #ifndef MNN_OPENCL_BUFFER_CLOSED
     if(mOpenCLRuntime->getGpuMemType() == BUFFER)
     {
-        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
-            OpenCL::convertImageToNC4HW4Buffer(srcTensor, const_cast<Tensor*>(dstTensor),mOpenCLRuntime.get(), false, svmFlag);
-            std::vector<cl::Memory> map = {openCLImage(srcTensor)};
-            mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL);
-            return;
-        }
 #ifdef MNN_SUPPORT_INTEL_SUBGROUP
         int cPack = TensorUtils::getTensorChannelPack(dstTensor);
         if (cPack == 16 && mOpenCLRuntime->isSupportedIntelSubgroup()) {
@@ -821,17 +829,6 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor
     else
     #endif /* MNN_OPENCL_BUFFER_CLOSED */
     {
-        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
-            std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(dstTensor);
-
-            mOpenCLRuntime.get()->commandQueue().enqueueCopyImage(
-                    openCLImage(srcTensor), openCLImage(dstTensor),
-                    {0, 0, 0}, {0, 0, 0},
-                    {(size_t)bufferShape[2]* UP_DIV(bufferShape[3], 4), (size_t)bufferShape[0]*bufferShape[1], 1});
-            std::vector<cl::Memory> map = {openCLImage(srcTensor)};
-            mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL);
-            return;
-        }
         if (MNN_DATA_FORMAT_NHWC == data_format) {
             OpenCL::convertNHWCBufferToImage(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
         } else if (MNN_DATA_FORMAT_NCHW == data_format) {
@@ -868,8 +865,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso
                        && (srcDimensionFormat == dstDimensionFormat || srcTensor->dimensions() <= 1)
                        && MNN_DATA_FORMAT_NC4HW4 != dstDimensionFormat && MNN_DATA_FORMAT_NC4HW4 != srcDimensionFormat
                        && (getDataType(srcTensor) == getDataType(dstTensor))
-                       && memType != MNN_FORWARD_OPENCL
-                       && memType != MNN_FORWARD_OPENGL;
+                       && memType != MNN_MEMORY_AHARDWAREBUFFER;
     if (mOpenCLRuntime->isSupportedFP16()) { // Fp16
         if (dstTensor->getType().code == halide_type_float) {
             directCopy = false;
@@ -901,15 +897,13 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso
     #else
     auto res = mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
     if(res != CL_SUCCESS) {
-	MNN_ERROR("OpenCL enqueue write error:%d\n", res);
-	return;
+        MNN_ERROR("OpenCL enqueue write error:%d\n", res);
+        return;
     }
     #endif
 
     //Covert format
     mCLRuntime->convertToDevice((const Tensor*)&interTensor, dstTensor, srcDimensionFormat, false);
-
-    return;
 }
 
 void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{
@@ -918,33 +912,21 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst
     if(MNN_FORWARD_CPU == srcMemtype && MNN_FORWARD_CPU == dstMemtype){
         mCLRuntime->copyBetweenDevice(srcTensor, dstTensor);
     } else {
-        const Tensor* copyTensor = MNN_FORWARD_CPU != srcMemtype ? srcTensor : dstTensor;
-        MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(copyTensor)->dimensionFormat;
-        int memType = MNN_FORWARD_CPU != srcMemtype ? srcMemtype : dstMemtype;
-        if(MNN_FORWARD_OPENCL != memType && MNN_FORWARD_OPENGL != memType){
-            MNN_PRINT("Unsupport ForwardType %d for OpenCL backend!\n", memType);
-            return;
-        }
-        if(mOpenCLRuntime->isSupportGL() && MNN_FORWARD_OPENGL == memType){
-            MNN_PRINT("This Device can not find OpenCL GL_EXTENTION function!\n");
+        const Tensor* hostTensor = MNN_FORWARD_CPU != srcMemtype ? srcTensor : dstTensor;
+        const Tensor* deviceTensor = MNN_FORWARD_CPU == srcMemtype ? srcTensor : dstTensor;
+        MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(deviceTensor)->dimensionFormat;
+        
+        bool alloc_error = _allocHostBuffer(0, hostTensor);
+        if(false == alloc_error){
+            MNN_ERROR("Alloc _allocHostBuffer error\n");
             return;
         }
-        _allocHostBuffer(0, copyTensor);
-
-        MNN::Tensor interTensor(copyTensor, copyTensor->getDimensionType(), false);
-        TensorUtils::getDescribe(&interTensor)->dimensionFormat = data_format;
-        if(MNN_FORWARD_OPENCL == memType ){
-            interTensor.buffer().device = (uint64_t)mDeviceBuffer;
-        }else if(MNN_FORWARD_OPENGL == memType){
-            interTensor.buffer().device = (uint64_t)mDeviceTexture.get();
-        }else{
-            interTensor.buffer().device = (uint64_t)mHostBuffer.second.get();
-        }
+        
         //Covert format
         if(MNN_FORWARD_CPU != srcMemtype){
-            mCLRuntime->convertToDevice((const Tensor*)&interTensor, dstTensor, data_format, false, srcMemtype);
+            mCLRuntime->convertToDevice(hostTensor, deviceTensor, data_format, false, srcMemtype);
         }else{
-            mCLRuntime->convertFromDevice(srcTensor, (const Tensor*)&interTensor, data_format, false, dstMemtype);
+            mCLRuntime->convertFromDevice(deviceTensor, hostTensor, data_format, false, dstMemtype);
         }
     }
 }
diff --git a/source/backend/opencl/core/OpenCLBackend.hpp b/source/backend/opencl/core/OpenCLBackend.hpp
index 3f0abcefb..1d4a51ece 100644
--- a/source/backend/opencl/core/OpenCLBackend.hpp
+++ b/source/backend/opencl/core/OpenCLBackend.hpp
@@ -153,7 +153,7 @@ class OpenCLBackend : public Backend {
     void copyToDeviceInt8(const Tensor* srcTensor, const Tensor* dstTensor) const;
     void copyBetweenDevice(const Tensor* srcTensor, const Tensor* dstTensor) const;
 
-    void _allocHostBuffer(int length, const Tensor* srcTensor) const;
+    bool _allocHostBuffer(int length, const Tensor* srcTensor) const;
 
     const CLRuntime* mCLRuntime;
 
@@ -171,8 +171,6 @@ class OpenCLBackend : public Backend {
     std::shared_ptr<OpenCLRuntime> mOpenCLRuntime;
 
     mutable std::pair<int, std::shared_ptr<cl::Buffer>> mHostBuffer;
-    mutable cl::Buffer *mDeviceBuffer = nullptr;
-    mutable std::shared_ptr<cl::Image> mDeviceTexture;
     BackendConfig::PrecisionMode mPrecision;
     BackendConfig::MemoryMode mMemory;
     bool mIsCreateError{false};
@@ -233,6 +231,26 @@ class TypedCreator : public OpenCLBackend::Creator {
     }
 };
 
+// Owns a cl::Buffer created for externally shared memory, tagged with the id it was created from.
+class CLSharedMemReleaseBuffer : public Backend::MemObj {
+public:
+    CLSharedMemReleaseBuffer(uint64_t sharedId, cl::Buffer *bId) : mSharedId(sharedId), mBuffer(bId) {
+    }
+    // Sole owner of mBuffer: a copy would delete the same cl::Buffer twice, so copying is disabled.
+    CLSharedMemReleaseBuffer(const CLSharedMemReleaseBuffer&) = delete;
+    CLSharedMemReleaseBuffer& operator=(const CLSharedMemReleaseBuffer&) = delete;
+    virtual ~CLSharedMemReleaseBuffer() {
+        delete mBuffer;
+    }
+    // Id supplied at construction.
+    uint64_t getSharedId() const { return mSharedId; }
+    // Borrowed pointer; the cl::Buffer remains owned by this object.
+    cl::Buffer *getMem() const { return mBuffer; }
+private:
+    uint64_t mSharedId;
+    cl::Buffer *mBuffer;
+};
+
 } // namespace OpenCL
 } // namespace MNN
 #endif  /* OpenCLBackend_hpp */
diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp
index af7cbef36..e25aef71d 100644
--- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp
+++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp
@@ -159,62 +159,47 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
             }
             const std::string extensions = platforms[0].getInfo<CL_PLATFORM_EXTENSIONS>();
             bool isPriorityHint = (extensions.find("cl_khr_priority_hints") != std::string::npos);
-
+            std::vector<cl_context_properties> context_properties;
+            if(mGpuType == ADRENO && !isPriorityHint){
+                context_properties.push_back(CL_CONTEXT_PERF_HINT_QCOM);
+                context_properties.push_back(CL_PERF_HINT_HIGH_QCOM);
+                context_properties.push_back(CL_CONTEXT_PRIORITY_HINT_QCOM);
+                context_properties.push_back(CL_PRIORITY_HINT_LOW_QCOM);
+                mIsDeviceSupportedLowPower = true;
+            }
+            #ifdef ARM_OPENCL_PRINTF_DEBUG
+            context_properties.push_back(CL_PRINTF_CALLBACK_ARM);
+            context_properties.push_back((cl_context_properties)callback);
+            context_properties.push_back(CL_PRINTF_BUFFERSIZE_ARM);
+            context_properties.push_back(0x1000);
+            #endif
+            std::string deviceextensions = mFirstGPUDevicePtr.get()->getInfo<CL_DEVICE_EXTENSIONS>();
+#ifdef MNN_USE_LIB_WRAPPER
+            mIsSupportAHD = (getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_import_memory_android_hardware_buffer")
+                 && mGpuType == MALI && OpenCLSymbolsOperator::getOpenclSymbolsPtr()->getFuncAddress(platforms[platformId](), "clImportMemoryARM"))
+                 || (mGpuType == ADRENO && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_qcom_android_ahardwarebuffer_host_ptr"));
+#endif
             if(nullptr != contextPtr){
-                if(nullptr != glShared && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_khr_gl_sharing")){
-                    std::vector<cl_context_properties> context_properties;
-                    context_properties.reserve(7);
-                    context_properties.push_back(CL_GL_CONTEXT_KHR);
-                    context_properties.push_back((cl_context_properties)contextPtr);
-                    context_properties.push_back(CL_EGL_DISPLAY_KHR);
-                    context_properties.push_back((cl_context_properties)glShared);
-                    context_properties.push_back(CL_CONTEXT_PLATFORM);
-                    context_properties.push_back((cl_context_properties)platforms[platformId]());
-                    context_properties.push_back(0);
-                    mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), context_properties.data(), nullptr, nullptr, &res));
-                }
-                else{
-                    mContext = std::shared_ptr<cl::Context>((cl::Context*)contextPtr, [](void* ptr) {
-                        // Do nothing
-                    });
-                }
+                mContext = std::shared_ptr<cl::Context>((cl::Context*)contextPtr, [](void* ptr) {
+                    // Do nothing
+                });
             }else{
-                if(mGpuType == ADRENO && !isPriorityHint){
-                    std::vector<cl_context_properties> context_properties;
-                    context_properties.reserve(5);
-                    context_properties.push_back(CL_CONTEXT_PERF_HINT_QCOM);
-                    context_properties.push_back(CL_PERF_HINT_HIGH_QCOM);
-                    context_properties.push_back(CL_CONTEXT_PRIORITY_HINT_QCOM);
-                    context_properties.push_back(CL_PRIORITY_HINT_LOW_QCOM);
+                if(context_properties.size() > 0){
                     context_properties.push_back(0);
                     mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), context_properties.data(), nullptr, nullptr, &res));
-                    mIsDeviceSupportedLowPower = true;
                 }else{
-                    #ifdef ARM_OPENCL_PRINTF_DEBUG
-                    cl_context_properties context_properties[] =
-                    {
-                        CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platformId](),
-                        CL_PRINTF_CALLBACK_ARM, (cl_context_properties)callback,
-                        CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
-                        0
-                    };
-                    mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), context_properties, nullptr, nullptr, &res));
-                    #else
                     mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), nullptr, nullptr, nullptr, &res));
-                    #endif
-                }
-                
-                MNN_CHECK_CL_SUCCESS(res, "context");
-                if (res != CL_SUCCESS) {
-                    mIsCreateError = true;
-                    return;
                 }
             }
+            MNN_CHECK_CL_SUCCESS(res, "context");
+            if (res != CL_SUCCESS) {
+                mIsCreateError = true;
+                return;
+            }
             
             mIsDeviceSupportedLowPower = (mIsDeviceSupportedLowPower || isPriorityHint);
             
             #ifdef MNN_USE_LIB_WRAPPER
-            mIsSupportGL = !OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isGlError();
             if(isPriorityHint)
             {
                 if(true == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isPropError())
@@ -646,7 +631,7 @@ std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernelWithCache(const std::strin
                 buildOptionsStr += " -DCONVERT_OUTPUT16=convert_int16";
                 buildOptionsStr += " -DWI_DATA=write_imagei";
             } else {
-                MNN_PRINT("opencl input datatype not support, bit:%d\n", output->getType().bits);
+                MNN_PRINT("opencl output datatype not support, bit:%d\n", output->getType().bits);
                 MNN_ASSERT(false);
             }
         } else if(output->getType().code == halide_type_uint){
@@ -668,7 +653,7 @@ std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernelWithCache(const std::strin
                 buildOptionsStr += " -DCONVERT_OUTPUT16=convert_uint16";
                 buildOptionsStr += " -DWI_DATA=write_imageui";
             } else {
-                MNN_PRINT("opencl input datatype not support, bit:%d\n", output->getType().bits);
+                MNN_PRINT("opencl output datatype not support, bit:%d\n", output->getType().bits);
                 MNN_ASSERT(false);
             }
         } else {
diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp
index b5dfa5918..ac7c31f83 100644
--- a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp
+++ b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp
@@ -110,9 +110,9 @@ class OpenCLRuntime {
         return mCLVersion;
     }
 	uint32_t getPrecisionLevel() const;
-    bool isSupportGL(){
-    	return mIsSupportGL;
-	}
+    bool isSupportAHD() const { // read-only query of the mIsSupportAHD flag
+        return mIsSupportAHD;
+    }
 #ifdef MNN_OPENCL_SVM_ENABLE
     cl_device_svm_capabilities getSvmCapabilities() {
         return mSvmCapabilities;
@@ -215,7 +215,7 @@ class OpenCLRuntime {
     bool mSupportDotInt8 = false;
     bool mSupportDotAccInt8 = false;
     bool mSupportedIntelSubgroup = false;
-    bool mIsSupportGL = true;
+    bool mIsSupportAHD = false;
     GpuType mGpuType;
     MaliAr mMaliAr;
     float mCLVersion = 1.0f;
diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
index a83bfaa8f..b46952dfd 100644
--- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
+++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
@@ -121,12 +121,24 @@ bool OpenCLSymbols::isPropError() {
 bool OpenCLSymbols::isQcomError() {
     return mQcomError;
 }
-
-bool OpenCLSymbols::isGlError() {
-    return mGlError;
+    
+bool OpenCLSymbols::getFuncAddress(cl_platform_id platform, const char *func_name){
+    // Resolves the clImportMemoryARM extension entry point and stores it in this object.
+    // The platform-aware loader is tried first; the platform-less loader is used only when the former is missing.
+    // NOTE(review): func_name is not consulted, the looked-up symbol is fixed - confirm this is intended.
+    void *entry = nullptr;
+    if(clGetExtensionFunctionAddressForPlatform != nullptr){
+        entry = clGetExtensionFunctionAddressForPlatform(platform, "clImportMemoryARM");
+    }else if(clGetExtensionFunctionAddress != nullptr){
+        entry = clGetExtensionFunctionAddress("clImportMemoryARM");
+    } else{
+        // No extension loader was loaded: leave clImportMemoryARM untouched.
+        return false;
+    }
+    clImportMemoryARM = reinterpret_cast<clImportMemoryARMFunc>(entry);
+    return clImportMemoryARM != nullptr;
+}
 
-
 bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
 #if defined(_WIN32)
     handle_ = LoadLibraryA(library_path.c_str());
@@ -203,15 +215,7 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
     if(func_name == nullptr){ \
         mQcomError = true; \
     }
-
-#define MNN_LOAD_GL_PTR(func_name) func_name = reinterpret_cast<func_name##Func>(dlsym(handle_, #func_name)); \
-    if(func_name == nullptr && loadOpenCLPointer != nullptr){ \
-        func_name = reinterpret_cast<func_name##Func>(loadOpenCLPointer(#func_name)); \
-    } \
-    if(func_name == nullptr){ \
-        mGlError = true; \
-    }
-
+    
 #endif
 
     MNN_LOAD_FUNCTION_PTR(clGetPlatformIDs);
@@ -261,10 +265,8 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
     MNN_LOAD_FUNCTION_PTR(clEnqueueCopyImage);
     MNN_LOAD_FUNCTION_PTR(clEnqueueReadImage);
     MNN_LOAD_FUNCTION_PTR(clEnqueueWriteImage);
-    MNN_LOAD_GL_PTR(clCreateFromGLBuffer);
-    MNN_LOAD_GL_PTR(clCreateFromGLTexture);
-    MNN_LOAD_GL_PTR(clEnqueueAcquireGLObjects);
-    MNN_LOAD_GL_PTR(clEnqueueReleaseGLObjects);
+    MNN_LOAD_FUNCTION_PTR(clGetExtensionFunctionAddress);
+    MNN_LOAD_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform);
 
     MNN_LOAD_PROP_PTR(clCreateCommandQueueWithProperties);
     MNN_LOAD_SVM_PTR(clSVMAlloc);
@@ -671,49 +673,6 @@ cl_int CL_API_CALL clEnqueueCopyImage(cl_command_queue queue,
     return func(queue, src_image, dst_image, src_origin, dst_origin, region, num_events_in_wait_list, event_wait_list, event);
 }
 
-cl_mem CL_API_CALL clCreateFromGLBuffer(cl_context context,
-                                        cl_mem_flags flags,
-                                        cl_GLuint bufobj,
-                                        int *errcode_ret){
-    auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateFromGLBuffer;
-    MNN_CHECK_NOTNULL(func);
-    return func(context, flags, bufobj, errcode_ret);
-}
-
-cl_mem CL_API_CALL clCreateFromGLTexture(cl_context context,
-                                         cl_mem_flags flags,
-                                         cl_GLenum target,
-                                         cl_GLint miplevel,
-                                         cl_GLuint texture,
-                                         cl_int *errcode_ret){
-    auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateFromGLTexture;
-    MNN_CHECK_NOTNULL(func);
-    return func(context, flags, target, miplevel, texture, errcode_ret);
-
-}
-
-cl_int CL_API_CALL clEnqueueAcquireGLObjects(cl_command_queue command_queue,
-                                             cl_uint num_objects,
-                                             const cl_mem *mem_objects,
-                                             cl_uint num_events_in_wait_list,
-                                             const cl_event *event_wait_list,
-                                             cl_event *event){
-    auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueAcquireGLObjects;
-    MNN_CHECK_NOTNULL(func);
-    return func(command_queue, num_objects, mem_objects, num_events_in_wait_list, event_wait_list, event);
-}
-
-cl_int CL_API_CALL clEnqueueReleaseGLObjects(cl_command_queue command_queue,
-                                             cl_uint num_objects,
-                                             const cl_mem *mem_objects,
-                                             cl_uint num_events_in_wait_list,
-                                             const cl_event *event_wait_list,
-                                             cl_event *event){
-    auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueReleaseGLObjects;
-    MNN_CHECK_NOTNULL(func);
-    return func(command_queue, num_objects, mem_objects, num_events_in_wait_list, event_wait_list, event);
-}
-
 // clCreateCommandQueueWithProperties wrapper
 cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(cl_context context, cl_device_id device, const cl_queue_properties *properties, cl_int *errcode_ret) {
     auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateCommandQueueWithProperties;
@@ -799,5 +758,22 @@ clEnqueueRecordingSVMQCOM(cl_command_queue command_queue, cl_recording_qcom reco
     return func(command_queue, recording, num_args, arg_array, num_svm_args, arg_svm_array, num_global_offsets, global_offset_array, num_global_workgroups, global_workgroup_array, num_local_workgroups, local_workgroups_array, num_non_arg_objs, non_arg_obj_array, num_events_in_wait_list, event_wait_list, event);
 }
 
+void * CL_API_CALL clGetExtensionFunctionAddress(const char *func_name){
+    const auto symbols = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr();
+    MNN_CHECK_NOTNULL(symbols->clGetExtensionFunctionAddress); // the loaded entry point is asserted before forwarding
+    return symbols->clGetExtensionFunctionAddress(func_name);
+}
+
+void * CL_API_CALL clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, const char *func_name){
+    const auto symbols = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr();
+    MNN_CHECK_NOTNULL(symbols->clGetExtensionFunctionAddressForPlatform); // the loaded entry point is asserted before forwarding
+    return symbols->clGetExtensionFunctionAddressForPlatform(platform, func_name);
+}
+
+cl_mem CL_API_CALL clImportMemoryARM(cl_context context, cl_mem_flags flags, const cl_import_properties_arm *properties, void *memory, size_t size, cl_int *errcode_ret){
+    const auto symbols = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr();
+    MNN_CHECK_NOTNULL(symbols->clImportMemoryARM); // pointer is filled in by OpenCLSymbols::getFuncAddress, not at library load
+    return symbols->clImportMemoryARM(context, flags, properties, memory, size, errcode_ret);
+}
 
 #endif //MNN_USE_LIB_WRAPPER
diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp
index ba39a8c30..0b3fecc29 100644
--- a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp
+++ b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp
@@ -31,6 +31,10 @@
 #endif
 
 #include "CL/cl_ext_qcom.h"
+#include "CL/cl_ext.h"
+#ifdef __ANDROID__
+#include <android/hardware_buffer.h>
+#endif
 
 #define MNN_CHECK_NOTNULL(X) MNN_ASSERT(X != NULL)
 
@@ -53,7 +57,7 @@ class OpenCLSymbols {
     bool isSvmError();
     bool isPropError();
     bool isQcomError();
-    bool isGlError();
+    bool getFuncAddress(cl_platform_id platform, const char *func_name);
     
     using clGetPlatformIDsFunc        = cl_int (CL_API_CALL *)(cl_uint, cl_platform_id *, cl_uint *);
     using clGetPlatformInfoFunc       = cl_int (CL_API_CALL *)(cl_platform_id, cl_platform_info, size_t, void *, size_t *);
@@ -148,10 +152,6 @@ class OpenCLSymbols {
                                                    size_t param_value_size, void *param_value,
                                                    size_t *param_value_size_ret);
     using clGetImageInfoFunc           = cl_int (CL_API_CALL *)(cl_mem, cl_image_info, size_t, void *, size_t *);
-    using clCreateFromGLBufferFunc     = cl_mem (CL_API_CALL *)(cl_context, cl_mem_flags, cl_GLuint, int *);
-    using clCreateFromGLTextureFunc     = cl_mem (CL_API_CALL *)(cl_context, cl_mem_flags, cl_GLenum, cl_GLint, cl_GLuint, cl_int*);
-    using clEnqueueAcquireGLObjectsFunc = cl_int (CL_API_CALL *)(cl_command_queue, cl_uint, const cl_mem *, cl_uint, const cl_event *, cl_event *);
-    using clEnqueueReleaseGLObjectsFunc = cl_int (CL_API_CALL *)(cl_command_queue, cl_uint, const cl_mem *, cl_uint, const cl_event *, cl_event *);
     using clReleaseDeviceFunc = cl_int (CL_API_CALL *)(cl_device_id);
     using clRetainDeviceFunc = cl_int (CL_API_CALL *)(cl_device_id);
 
@@ -176,6 +176,10 @@ class OpenCLSymbols {
                                                      size_t, const cl_offset_qcom*, size_t, const cl_workgroup_qcom*, size_t, const cl_workgroup_qcom*,
                                                      size_t, const cl_array_kernel_exec_info_qcom*, cl_uint, const cl_event*, cl_event*);
     
+    using clGetExtensionFunctionAddressFunc = void *(CL_API_CALL *)(const char *);
+    using clGetExtensionFunctionAddressForPlatformFunc = void *(CL_API_CALL *)(cl_platform_id, const char *);
+    using clImportMemoryARMFunc = cl_mem (CL_API_CALL *)(cl_context, cl_mem_flags, const cl_import_properties_arm*, void*, size_t, cl_int*);
+    
 #define MNN_CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr
 
     MNN_CL_DEFINE_FUNC_PTR(clGetPlatformIDs);
@@ -225,10 +229,6 @@ class OpenCLSymbols {
     MNN_CL_DEFINE_FUNC_PTR(clGetImageInfo);
     MNN_CL_DEFINE_FUNC_PTR(clEnqueueReadImage);
     MNN_CL_DEFINE_FUNC_PTR(clEnqueueWriteImage);
-    MNN_CL_DEFINE_FUNC_PTR(clCreateFromGLBuffer);
-    MNN_CL_DEFINE_FUNC_PTR(clCreateFromGLTexture);
-    MNN_CL_DEFINE_FUNC_PTR(clEnqueueAcquireGLObjects);
-    MNN_CL_DEFINE_FUNC_PTR(clEnqueueReleaseGLObjects);
     
     MNN_CL_DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
     MNN_CL_DEFINE_FUNC_PTR(clSVMAlloc);
@@ -243,6 +243,9 @@ class OpenCLSymbols {
     MNN_CL_DEFINE_FUNC_PTR(clRetainRecordingQCOM);
     MNN_CL_DEFINE_FUNC_PTR(clEnqueueRecordingQCOM);
     MNN_CL_DEFINE_FUNC_PTR(clEnqueueRecordingSVMQCOM);
+    MNN_CL_DEFINE_FUNC_PTR(clGetExtensionFunctionAddress);
+    MNN_CL_DEFINE_FUNC_PTR(clGetExtensionFunctionAddressForPlatform);
+    MNN_CL_DEFINE_FUNC_PTR(clImportMemoryARM);
 
 #undef MNN_CL_DEFINE_FUNC_PTR
 
@@ -258,7 +261,6 @@ class OpenCLSymbols {
     bool mPropError{false};
     bool mQcomError{false};
     bool mCL_12Error{false};
-    bool mGlError{false};
 };
 
 class OpenCLSymbolsOperator {
diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp
index db3fb2d38..00f48dbce 100644
--- a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp
@@ -204,7 +204,7 @@ ConvBufExecution::ConvBufExecution(const std::vector<Tensor *> &inputs, const st
             }
             mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, ptrCL);
 
-            mResource->mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
+            mResource->mFilter.reset(Tensor::createDevice<float>({filterImageShape[1] * 4 * filterImageShape[0]}));
             mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC);
             MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
 
@@ -458,8 +458,8 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             std::pair<int, int> min_cost(INT_MAX, 0);//(min_time, min_index)
             for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
                 std::set<std::string> buildOption = mResource->mBuildOptions;
-                if(outputShape.at(3) % itemC[knl_idx] != 0){
-                    buildOption.emplace("-DCHANNEL_LEAVE");
+                if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
+                    buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
                 }
                 if((outputShape.at(2) % itemW[knl_idx]) != 0){
                     buildOption.emplace("-DBLOCK_LEAVE");
@@ -496,13 +496,12 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
                 }
             }
 
-            std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
             int min_index  = min_cost.second;
             mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
 
             std::set<std::string> buildOption = mResource->mBuildOptions;
-            if(outputShape.at(3) % itemC[min_index] != 0){
-                buildOption.emplace("-DCHANNEL_LEAVE");
+            if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
+                buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
             }
             if((outputShape.at(2) % itemW[min_index]) != 0){
                 buildOption.emplace("-DBLOCK_LEAVE");
diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp
index 1ce568cbd..21db9895e 100644
--- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp
+++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp
@@ -265,8 +265,8 @@ void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor
     // MNN_PRINT("Checking kernel %d.\n", knlCheck);
     for (int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
         std::set<std::string> buildOption = mResource->mBuildOptions;
-        if(outputShape.at(3) % itemC[knl_idx] != 0){
-            buildOption.emplace("-DCHANNEL_LEAVE");
+        if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
+            buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
         }
         if((outputShape.at(2) % itemW[knl_idx]) != 0 || (outputShape.at(1) % itemH[knl_idx]) != 0){
             buildOption.emplace("-DBLOCK_LEAVE");
@@ -313,8 +313,8 @@ void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor
     mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
 
     std::set<std::string> buildOption = mResource->mBuildOptions;
-    if(outputShape.at(3) % itemC[min_index] != 0){
-        buildOption.emplace("-DCHANNEL_LEAVE");
+    if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
+        buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
     }
     if((outputShape.at(2) % itemW[min_index]) != 0 || (outputShape.at(1) % itemH[min_index]) != 0){
         buildOption.emplace("-DBLOCK_LEAVE");
diff --git a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp
index 44af28f35..1dac90fc3 100644
--- a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp
@@ -160,7 +160,11 @@ ErrorCode DepthwiseConvBufExecution::onEncode(const std::vector<Tensor *> &input
         std::vector<uint32_t> localWorkSize[total_kernel];
         std::pair<int, int> min_cost(INT_MAX, 0);//(min_time, min_index)
         for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
-            kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("depthwise_conv2d_buf", kernelName[knl_idx], mResource->mBuildOptions);
+            std::set<std::string> buildOption = mResource->mBuildOptions;
+            if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
+                buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+            }
+            kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("depthwise_conv2d_buf", kernelName[knl_idx], buildOption);
             uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
                         
             globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))};
@@ -196,7 +200,11 @@ ErrorCode DepthwiseConvBufExecution::onEncode(const std::vector<Tensor *> &input
         int min_index  = min_cost.second;
         mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
         
-        unit.kernel     = mOpenCLBackend->getOpenCLRuntime()->buildKernel("depthwise_conv2d_buf", kernelName[min_index], mResource->mBuildOptions);
+        std::set<std::string> buildOption = mResource->mBuildOptions;
+        if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
+            buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+        }
+        unit.kernel     = mOpenCLBackend->getOpenCLRuntime()->buildKernel("depthwise_conv2d_buf", kernelName[min_index], buildOption);
         
         uint32_t idx = 0;
         cl_int ret = CL_SUCCESS;
diff --git a/source/backend/opencl/execution/cl/buffer_convert_buf.cl b/source/backend/opencl/execution/cl/buffer_convert_buf.cl
index 6a4b4e220..ece688d8c 100644
--- a/source/backend/opencl/execution/cl/buffer_convert_buf.cl
+++ b/source/backend/opencl/execution/cl/buffer_convert_buf.cl
@@ -74,7 +74,7 @@ __kernel void buffer_copy_to_buffer(GLOBAL_SIZE_2_DIMS
 #endif
 }
 
-// convert kernel : from buffer(oihw) to image(oc/4 h w , ic oc4)
+// convert kernel : from buffer(oihw) to image(ic, oc/4, h, w, oc4)
 __kernel void conv2d_filter_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS
                                             __global const FLOAT *input_ptr,
                                             __private const int output_channel,
diff --git a/source/backend/opencl/execution/cl/conv_2d.cl b/source/backend/opencl/execution/cl/conv_2d.cl
index 2b0bbad14..c87cb749d 100644
--- a/source/backend/opencl/execution/cl/conv_2d.cl
+++ b/source/backend/opencl/execution/cl/conv_2d.cl
@@ -459,6 +459,7 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __read_only image2d_t input,
     for (int in_channel_block_idx = 0; in_channel_block_idx < in_channel_block; ++in_channel_block_idx) {
 #if (defined USE_LOW_BIT_WEIGHT_INT8) || (defined USE_LOW_BIT_WEIGHT_INT4)
         int kindex = (in_channel_block_idx * 4) / blockDim * out_channel_blocks * 8;
+        // already packed to 16, no boundary protection needed
         COMPUTE_FLOAT8 ScaleOffset0 = CONVERT_COMPUTE_FLOAT8(vload8(output_channel_idx, dequantScaleOffset + kindex));
         COMPUTE_FLOAT4 scale0 = (COMPUTE_FLOAT4)(ScaleOffset0.s0, ScaleOffset0.s2, ScaleOffset0.s4, ScaleOffset0.s6);
         COMPUTE_FLOAT4 offset0 = (COMPUTE_FLOAT4)(ScaleOffset0.s1, ScaleOffset0.s3, ScaleOffset0.s5, ScaleOffset0.s7);
@@ -476,7 +477,11 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __read_only image2d_t input,
 
 #if (defined USE_LOW_BIT_WEIGHT_INT8)
         FLOAT16 weightsInt80 = CONVERT_FLOAT16(vload16(0, kernel_ptr + weight_ic_offset + in_channel_block_idx * weight_oc_offset));
+        #ifdef CHANNEL_BOUNDARY_PROTECT
+        FLOAT16 weightsInt81 = output_channel_idx + 1 >= out_channel_blocks ? (FLOAT16)0 : CONVERT_FLOAT16(vload16(0, kernel_ptr + 16 + weight_ic_offset + in_channel_block_idx * weight_oc_offset));
+        #else
         FLOAT16 weightsInt81 = CONVERT_FLOAT16(vload16(0, kernel_ptr + 16 + weight_ic_offset + in_channel_block_idx * weight_oc_offset));
+        #endif
         FLOAT4 weights0 = CONVERT_FLOAT4(weightsInt80.s0123) * scale0 + offset0;
         FLOAT4 weights1 = CONVERT_FLOAT4(weightsInt80.s4567) * scale0 + offset0;
         FLOAT4 weights2 = CONVERT_FLOAT4(weightsInt80.s89ab) * scale0 + offset0;
@@ -541,10 +546,17 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __read_only image2d_t input,
         weights2 = vload4(weights_width_base + 2, weights + weight_offset);
         weights3 = vload4(weights_width_base + 3, weights + weight_offset);
 
+        #ifdef CHANNEL_BOUNDARY_PROTECT
+        weights4 = output_channel_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base, weights + weight_offset1);
+        weights5 = output_channel_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base + 1, weights + weight_offset1);
+        weights6 = output_channel_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base + 2, weights + weight_offset1);
+        weights7 = output_channel_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base + 3, weights + weight_offset1);
+        #else
         weights4 = vload4(weights_width_base, weights + weight_offset1);
         weights5 = vload4(weights_width_base + 1, weights + weight_offset1);
         weights6 = vload4(weights_width_base + 2, weights + weight_offset1);
         weights7 = vload4(weights_width_base + 3, weights + weight_offset1);
+        #endif
 #else
         weights0 = RI_F(weights, SAMPLER, (int2)(weights_width_base + 0, output_channel_idx));
         weights1 = RI_F(weights, SAMPLER, (int2)(weights_width_base + 1, output_channel_idx));
@@ -1081,10 +1093,18 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS __read_only image2d_t input,
                 weights1 = mad(CONVERT_FLOAT4(charWeight1), scale0, offset0);
                 weights2 = mad(CONVERT_FLOAT4(charWeight2), scale0, offset0);
                 weights3 = mad(CONVERT_FLOAT4(charWeight3), scale0, offset0);
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                charWeight0 = out_channel_block_idx + 1 >= out_channel_blocks ? (char4)0 : vload4(0, kernel_ptr+weight_offset+weight_oc_offset);
+                charWeight1 = out_channel_block_idx + 1 >= out_channel_blocks ? (char4)0 : vload4(0, kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset);
+                charWeight2 = out_channel_block_idx + 1 >= out_channel_blocks ? (char4)0 : vload4(0, kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*2);
+                charWeight3 = out_channel_block_idx + 1 >= out_channel_blocks ? (char4)0 : vload4(0, kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                
+                #else
                 charWeight0 = vload4(0, kernel_ptr+weight_offset+weight_oc_offset);
                 charWeight1 = vload4(0, kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset);
                 charWeight2 = vload4(0, kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*2);
                 charWeight3 = vload4(0, kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                #endif
                 weights4 = mad(CONVERT_FLOAT4(charWeight0), scale1, offset1);
                 weights5 = mad(CONVERT_FLOAT4(charWeight1), scale1, offset1);
                 weights6 = mad(CONVERT_FLOAT4(charWeight2), scale1, offset1);
@@ -1153,10 +1173,18 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS __read_only image2d_t input,
                 weights1 = vload4(0, weights+weight_offset+weight_ic_offset);
                 weights2 = vload4(0, weights+weight_offset+weight_ic_offset*2);
                 weights3 = vload4(0, weights+weight_offset+weight_ic_offset*3);
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                // When the second output-channel block (index + 1) is not below out_channel_blocks, use zero weights instead of loading.
+                weights4 = out_channel_block_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0, weights+weight_offset + weight_oc_offset);
+                weights5 = out_channel_block_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0, weights+weight_offset+weight_ic_offset + weight_oc_offset);
+                weights6 = out_channel_block_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0, weights+weight_offset+weight_ic_offset*2 + weight_oc_offset);
+                weights7 = out_channel_block_idx + 1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0, weights+weight_offset+weight_ic_offset*3 + weight_oc_offset);
+                #else
                 weights4 = vload4(0, weights+weight_offset + weight_oc_offset);
                 weights5 = vload4(0, weights+weight_offset+weight_ic_offset + weight_oc_offset);
                 weights6 = vload4(0, weights+weight_offset+weight_ic_offset*2 + weight_oc_offset);
                 weights7 = vload4(0, weights+weight_offset+weight_ic_offset*3 + weight_oc_offset);
+                #endif
                 weight_offset += 4;
 #else
                 weights0 = RI_F(weights, SAMPLER, (int2)(weights_x_idx + 0, weights_y_idx));
diff --git a/source/backend/opencl/execution/cl/conv_2d_buf.cl b/source/backend/opencl/execution/cl/conv_2d_buf.cl
index d3d34e5f4..744324e14 100644
--- a/source/backend/opencl/execution/cl/conv_2d_buf.cl
+++ b/source/backend/opencl/execution/cl/conv_2d_buf.cl
@@ -200,25 +200,33 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = out_c_w_idx / out_w_blocks;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = out_c_w_idx % out_w_blocks;
     const int out_b_idx = out_b_h_idx / out_h;//equal to in_b_idx
     const int out_h_idx = out_b_h_idx % out_h;//equal to in_h_idx
 
     const int out_w4_idx = mul24(out_w_idx, 4);
-    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx<<1, bias_ptr));
+    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias_ptr));
     COMPUTE_FLOAT4 out1 = out0;
     COMPUTE_FLOAT4 out2 = out0;
     COMPUTE_FLOAT4 out3 = out0;
     
-    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4((out_c_idx<<1)+1, bias_ptr));
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
+    COMPUTE_FLOAT4 out5 = out4;
+    COMPUTE_FLOAT4 out6 = out4;
+    COMPUTE_FLOAT4 out7 = out4;
+    #else
+    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
     COMPUTE_FLOAT4 out5 = out4;
     COMPUTE_FLOAT4 out6 = out4;
     COMPUTE_FLOAT4 out7 = out4;
+    #endif
 
     const int intput_width_idx0 = out_w4_idx;
     int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + intput_width_idx0)<<2;
-    int offset = out_c_idx*8;
+    int offset = out_c_idx_0*4;
     const int inp_add = out_b*out_h*out_w*4;
 
     for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) {
@@ -229,6 +237,7 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
         COMPUTE_FLOAT4 in2 = CONVERT_COMPUTE_FLOAT4(vload4(2, input+inp_offset));
         COMPUTE_FLOAT4 in3 = CONVERT_COMPUTE_FLOAT4(vload4(3, input+inp_offset));
         
+        // output_channel is packed to at least 8, so no boundary protection is needed
         COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset));
         COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset));
         COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack));
@@ -306,7 +315,7 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
     out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    const int out_offset = (((out_b_idx + out_c_idx*2*out_b)*out_h + out_h_idx)* out_w + out_w4_idx)*4;
+    const int out_offset = (((out_b_idx + out_c_idx_0*out_b)*out_h + out_h_idx)* out_w + out_w4_idx)*4;
 
     __global FLOAT * _tempoutput = output + out_offset;
     __global FLOAT * _tempoutput1 = _tempoutput + 4*out_h*out_w*out_b;
@@ -323,8 +332,8 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
     } else if (remain == 1) {
         vstore4(CONVERT_FLOAT4(out0), 0, _tempoutput);
     }
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx*2+1 >= out_c_block) {
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_block) {
         return;
     }
 #endif
@@ -340,8 +349,8 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
     }
 #else
     vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, _tempoutput);
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx*2+1 >= out_c_block) {
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_block) {
         return;
     }
 #endif
@@ -368,21 +377,26 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = out_c_w_idx / out_w_blocks;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = out_c_w_idx % out_w_blocks;
     const int out_b_idx = out_b_h_idx / out_h;//equal to in_b_idx
     const int out_h_idx = out_b_h_idx % out_h;//equal to in_h_idx
     
     const int out_w2_idx = mul24(out_w_idx, 2);
-    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx<<1, bias_ptr));
+    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias_ptr));
     COMPUTE_FLOAT4 out1 = out0;
     
-    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4((out_c_idx<<1)+1, bias_ptr));
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
+    #else
+    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
+    #endif
     COMPUTE_FLOAT4 out5 = out4;
 
     const int intput_width_idx0 = out_w2_idx;
     int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + intput_width_idx0)<<2;
-    int offset = out_c_idx*8;
+    int offset = out_c_idx_0*4;
     const int inp_add = out_b*out_h*out_w*4;
     for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) {
         
@@ -437,7 +451,7 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
     out5 = clamp(out5, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    const int out_offset = (((out_b_idx + out_c_idx*2*out_b)*out_h + out_h_idx)* out_w + out_w2_idx)*4;
+    const int out_offset = (((out_b_idx + out_c_idx_0*out_b)*out_h + out_h_idx)* out_w + out_w2_idx)*4;
 
 
     __global FLOAT * _tempoutput = output + out_offset;
@@ -450,8 +464,8 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
     } else if (remain == 1) {
         vstore4(CONVERT_FLOAT4(out0), 0, _tempoutput);
     }
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx*2+1 >= out_c_block) {
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_block) {
         return;
     }
 #endif
@@ -462,8 +476,8 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
     }
 #else
     vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, _tempoutput);
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx*2+1 >= out_c_block) {
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_block) {
         return;
     }
 #endif
@@ -1071,16 +1085,21 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = out_c_w_idx % out_w_blocks;
     const int out_b_idx = out_b_h_idx / out_h_blocks;//equal to in_b_idx
     const int out_h_idx = (out_b_h_idx % out_h_blocks) << 2;
     
-    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
+    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
     COMPUTE_FLOAT4 out1 = out0;
     COMPUTE_FLOAT4 out2 = out0;
     COMPUTE_FLOAT4 out3 = out0;
-    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx + 1, bias));
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
+    #else
+    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
+    #endif
     COMPUTE_FLOAT4 out5 = out4;
     COMPUTE_FLOAT4 out6 = out4;
     COMPUTE_FLOAT4 out7 = out4;
@@ -1100,12 +1119,12 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS
     const int weight_ic_offset = out_c_blocks * weight_oc_offset;
     const int in_hw_size = in_hw.x * in_hw.y;
     for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
-        //weights  NC4HW4  [1,  4*icC4,  ocC4*kh*kw,  1] xic4
-        //index:   [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
+        //weights  NC4HW4   [ic/4, ic_4, oc/4, kh*kw, oc_4]
+        //index:   [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
         const int inp_offset_base = (out_b_idx + in_c_idx * batch) * in_hw.x * in_hw.y * 4;
 
         for(int iy = 0; iy < filter_hw.x; iy++) {
-            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
+            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
             const int in_h0_idx = (iy * dilate_hw.x + in_h0_idx_base) * in_hw.y;
             const int in_h1_idx = (iy * dilate_hw.x + in_h1_idx_base) * in_hw.y;
             const int in_h2_idx = (iy * dilate_hw.x + in_h2_idx_base) * in_hw.y;
@@ -1142,11 +1161,18 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS
                 out3 = mad(in3.z, weight2, out3);
                 out3 = mad(in3.w, weight3, out3);
 
+                // weight: [ic/4, ic_4, oc/4, kh*kw, oc_4]
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                weight0 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
+                weight1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
+                weight2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
+                weight3 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
+                #else
                 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
                 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
                 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
                 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
-
+                #endif
                 out4 = mad(in0.x, weight0, out4);
                 out4 = mad(in0.y, weight1, out4);
                 out4 = mad(in0.z, weight2, out4);
@@ -1193,7 +1219,7 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS
     out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
 #ifdef BLOCK_LEAVE
     const int remain = out_hw.x - out_h_idx;
     if(remain >= 4){
@@ -1211,12 +1237,12 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS
     }else if(remain == 1){
         vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     }
-    #ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
     #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     if(remain >= 4){
         vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
         vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
@@ -1237,12 +1263,12 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS
     vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
     vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
     vstore4(CONVERT_FLOAT4(out3), 3 * out_hw.y, output+out_offset);
-    #ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
     #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
     vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
     vstore4(CONVERT_FLOAT4(out6), 2 * out_hw.y, output+out_offset);
@@ -1273,16 +1299,21 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = out_c_w_idx % out_w_blocks;
     const int out_b_idx = out_b_h_idx / out_h_blocks;//equal to in_b_idx
     const int out_h_idx = (out_b_h_idx % out_h_blocks) << 1;
     
-    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
+    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
     COMPUTE_FLOAT4 out1 = out0;
-    COMPUTE_FLOAT4 out2 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx + 1, bias));
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    COMPUTE_FLOAT4 out2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
+    #else
+    COMPUTE_FLOAT4 out2 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
+    #endif
     COMPUTE_FLOAT4 out3 = out2;
-
+    
     const int in_w_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
 
     const int in_h0_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
@@ -1298,11 +1329,11 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS
     // weight: [ic/4, oc, 4], loop: ic/4
     for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
         //weights  NC4HW4  [1,  4*icC4,  ocC4*kh*kw,  1] xic4
-        //index:   [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
+        //index:   [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
         const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4;
 
         for(int iy = 0; iy < filter_hw.x; iy++) {
-            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
+            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
             const int in_h0_idx = (iy * dilate_hw.x + in_h0_idx_base) * in_hw.y;
             const int in_h1_idx = (iy * dilate_hw.x + in_h1_idx_base) * in_hw.y;
 
@@ -1324,11 +1355,17 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS
                 out1 = mad(in1.z, weight2, out1);
                 out1 = mad(in1.w, weight3, out1);
                 
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                weight0 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
+                weight1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
+                weight2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
+                weight3 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
+                #else
                 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
                 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
                 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
                 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
-                
+                #endif
                 out2 = mad(in0.x, weight0, out2);
                 out2 = mad(in0.y, weight1, out2);
                 out2 = mad(in0.z, weight2, out2);
@@ -1357,7 +1394,7 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS
     out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
 #ifdef BLOCK_LEAVE
     const int remain = out_hw.x - out_h_idx;
     if(remain >= 2){
@@ -1366,12 +1403,12 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS
     }else if(remain == 1){
         vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     }
-    #ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
     #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     if(remain >= 2){
         vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset);
         vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset);
@@ -1381,12 +1418,12 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS
 #else
     vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
-    #ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
     #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset);
     vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset);
 #endif
@@ -1415,17 +1452,21 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = (out_c_w_idx % out_w_blocks) << 2;
     const int out_b_idx = out_b_h_idx / out_hw.x;//equal to in_b_idx
     const int out_h_idx = out_b_h_idx % out_hw.x;
     
-    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
+    COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
     COMPUTE_FLOAT4 out1 = out0;
     COMPUTE_FLOAT4 out2 = out0;
     COMPUTE_FLOAT4 out3 = out0;
-    
-    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx + 1, bias));
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
+    #else
+    COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
+    #endif
     COMPUTE_FLOAT4 out5 = out4;
     COMPUTE_FLOAT4 out6 = out4;
     COMPUTE_FLOAT4 out7 = out4;
@@ -1445,8 +1486,8 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS
     const int weight_ic_offset = out_c_blocks * weight_oc_offset;
     for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
         //weights  NC4HW4  [1,  4*icC4,  ocC4*kh*kw,  1] xic4
-        //index:   [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
-        int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4;
+        //index:   [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
+        int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4;
 
         for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) {
             const int inp_offset_base = (((out_b_idx + in_c_idx * batch) * in_hw.x + iy) * in_hw.y + 0) * 4;
@@ -1487,11 +1528,17 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS
                 out3 = mad(in3.z, weight2, out3);
                 out3 = mad(in3.w, weight3, out3);
                 
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                weight0 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
+                weight1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
+                weight2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
+                weight3 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
+                #else
                 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
                 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
                 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
                 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
-                
+                #endif
                 out4 = mad(in0.x, weight0, out4);
                 out4 = mad(in0.y, weight1, out4);
                 out4 = mad(in0.z, weight2, out4);
@@ -1538,7 +1585,7 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS
     out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
 #ifdef BLOCK_LEAVE
     const int remain = out_hw.y - out_w_idx;
     if(remain >= 4){
@@ -1551,10 +1598,10 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS
     }else if(remain == 1){
         vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     }
-    #ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks)return;
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks)return;
     #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     if(remain >= 4){
         vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset);
     }else if(remain == 3){
@@ -1567,10 +1614,10 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS
     }
 #else
     vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
-    #ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks)return;
+    #ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks)return;
     #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset);
 #endif
 }
diff --git a/source/backend/opencl/execution/cl/conv_2d_int_buf.cl b/source/backend/opencl/execution/cl/conv_2d_int_buf.cl
index e42398c63..f482f578d 100644
--- a/source/backend/opencl/execution/cl/conv_2d_int_buf.cl
+++ b/source/backend/opencl/execution/cl/conv_2d_int_buf.cl
@@ -10,7 +10,7 @@
     }
 
 #define MOD_NUM 15
-#ifdef INPUT_CHANNEL_LEAVE
+#ifdef INPUT_CHANNEL_BOUNDARY_PROTECT
     #define PADZEROSVEC(k, channel, data0, data1, data2, data3) \
         data0 = (k << 2) < channel ? data0 : 0; \
         data1 = (k << 2) + 1 < channel ? data1 : 0; \
@@ -674,17 +674,19 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = out_c_w_idx % out_w_blocks;
     const int out_b_idx = out_b_h_idx / out_h_blocks;//equal to in_b_idx
     const int out_h_idx = (out_b_h_idx % out_h_blocks) << 2;
     
-    COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
+    COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
     COMPUTE_FLOAT4 out0 = bias0;
     COMPUTE_FLOAT4 out1 = bias0;
     COMPUTE_FLOAT4 out2 = bias0;
     COMPUTE_FLOAT4 out3 = bias0;
-    COMPUTE_FLOAT4 bias1 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx + 1, bias));
+    // bias is aligned to 8, so no boundary protection is needed
+    COMPUTE_FLOAT4 bias1 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
     COMPUTE_FLOAT4 out4 = bias1;
     COMPUTE_FLOAT4 out5 = bias1;
     COMPUTE_FLOAT4 out6 = bias1;
@@ -706,18 +708,22 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS
     const int in_hw_size = in_hw.x * in_hw.y;
     for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
         int kindex = (in_c_idx * 4) / blockDim * out_c_blocks * 8;
-        COMPUTE_FLOAT8 ScaleOffset0 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx, dequantScaleOffset + kindex));
-        COMPUTE_FLOAT8 ScaleOffset1 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx + 1, dequantScaleOffset + kindex));
+        COMPUTE_FLOAT8 ScaleOffset0 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_0, dequantScaleOffset + kindex));
+        #ifdef CHANNEL_BOUNDARY_PROTECT
+        COMPUTE_FLOAT8 ScaleOffset1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT8)0 : CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1, dequantScaleOffset + kindex));
+        #else
+        COMPUTE_FLOAT8 ScaleOffset1 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1, dequantScaleOffset + kindex));
+        #endif
         COMPUTE_FLOAT4 scale0 = (COMPUTE_FLOAT4)(ScaleOffset0.s0, ScaleOffset0.s2, ScaleOffset0.s4, ScaleOffset0.s6);
         COMPUTE_FLOAT4 offset0 = (COMPUTE_FLOAT4)(ScaleOffset0.s1, ScaleOffset0.s3, ScaleOffset0.s5, ScaleOffset0.s7);
         COMPUTE_FLOAT4 scale1 = (COMPUTE_FLOAT4)(ScaleOffset1.s0, ScaleOffset1.s2, ScaleOffset1.s4, ScaleOffset1.s6);
         COMPUTE_FLOAT4 offset1 = (COMPUTE_FLOAT4)(ScaleOffset1.s1, ScaleOffset1.s3, ScaleOffset1.s5, ScaleOffset1.s7);
         //weights  NC4HW4  [1,  4*icC4,  ocC4*kh*kw,  1] xic4
-        //index:   [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
+        //index:   [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
         const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4;
 
         for(int iy = 0; iy < filter_hw.x; iy++) {
-            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
+            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
             const int in_h0_idx = (iy * dilate_hw.x + in_h0_idx_base) * in_hw.y;
             const int in_h1_idx = (iy * dilate_hw.x + in_h1_idx_base) * in_hw.y;
             const int in_h2_idx = (iy * dilate_hw.x + in_h2_idx_base) * in_hw.y;
@@ -791,10 +797,17 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS
                 out3 = mad(in3.w, weight3, out3);
 
 #if (defined USE_LOW_BIT_WEIGHT_INT8)
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                charWeight0 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset);
+                charWeight1 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset);
+                charWeight2 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2);
+                charWeight3 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                #else
                 charWeight0 = vload4(0, weight+weight_offset+weight_oc_offset);
                 charWeight1 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset);
                 charWeight2 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2);
                 charWeight3 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                #endif
                 weight0 = CONVERT_COMPUTE_FLOAT4(charWeight0) * scale1 + offset1;
                 weight1 = CONVERT_COMPUTE_FLOAT4(charWeight1) * scale1 + offset1;
                 weight2 = CONVERT_COMPUTE_FLOAT4(charWeight2) * scale1 + offset1;
@@ -878,7 +891,7 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS
     out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
 #ifdef BLOCK_LEAVE
     const int remain = out_hw.x - out_h_idx;
     if(remain >= 4){
@@ -896,12 +909,12 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS
     }else if(remain == 1){
         vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     }
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
 #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + out_c_idx_1*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     if(remain >= 4){
         vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
         vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
@@ -922,12 +935,12 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS
     vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
     vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
     vstore4(CONVERT_FLOAT4(out3), 3 * out_hw.y, output+out_offset);
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
 #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + out_c_idx_1*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
     vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
     vstore4(CONVERT_FLOAT4(out6), 2 * out_hw.y, output+out_offset);
@@ -964,15 +977,17 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = out_c_w_idx % out_w_blocks;
     const int out_b_idx = out_b_h_idx / out_h_blocks;//equal to in_b_idx
     const int out_h_idx = (out_b_h_idx % out_h_blocks) << 1;
 
-    COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
+    COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
     COMPUTE_FLOAT4 out0 = bias0;
     COMPUTE_FLOAT4 out1 = bias0;
-    COMPUTE_FLOAT4 bias1 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx + 1, bias));
+    // bias is aligned to 8, so no boundary protection is needed
+    COMPUTE_FLOAT4 bias1 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
     COMPUTE_FLOAT4 out2 = bias1;
     COMPUTE_FLOAT4 out3 = bias1;
 
@@ -991,18 +1006,22 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS
     // weight: [ic/4, oc, 4], loop: ic/4
     for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
         int kindex = (in_c_idx * 4) / blockDim * out_c_blocks * 8;
-        COMPUTE_FLOAT8 ScaleOffset0 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx, dequantScaleOffset + kindex));
-        COMPUTE_FLOAT8 ScaleOffset1 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx + 1, dequantScaleOffset + kindex));
+        COMPUTE_FLOAT8 ScaleOffset0 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_0, dequantScaleOffset + kindex));
+        #ifdef CHANNEL_BOUNDARY_PROTECT
+        COMPUTE_FLOAT8 ScaleOffset1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT8)0 : CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1, dequantScaleOffset + kindex));
+        #else
+        COMPUTE_FLOAT8 ScaleOffset1 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1, dequantScaleOffset + kindex));
+        #endif
         COMPUTE_FLOAT4 scale0 = (COMPUTE_FLOAT4)(ScaleOffset0.s0, ScaleOffset0.s2, ScaleOffset0.s4, ScaleOffset0.s6);
         COMPUTE_FLOAT4 offset0 = (COMPUTE_FLOAT4)(ScaleOffset0.s1, ScaleOffset0.s3, ScaleOffset0.s5, ScaleOffset0.s7);
         COMPUTE_FLOAT4 scale1 = (COMPUTE_FLOAT4)(ScaleOffset1.s0, ScaleOffset1.s2, ScaleOffset1.s4, ScaleOffset1.s6);
         COMPUTE_FLOAT4 offset1 = (COMPUTE_FLOAT4)(ScaleOffset1.s1, ScaleOffset1.s3, ScaleOffset1.s5, ScaleOffset1.s7);
         //weights  NC4HW4  [1,  4*icC4,  ocC4*kh*kw,  1] xic4
-        //index:   [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
+        //index:   [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
         const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4;
 
         for(int iy = 0; iy < filter_hw.x; iy++) {
-            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
+            int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
             const int in_h0_idx = (iy * dilate_hw.x + in_h0_idx_base) * in_hw.y;
             const int in_h1_idx = (iy * dilate_hw.x + in_h1_idx_base) * in_hw.y;
 
@@ -1060,10 +1079,17 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS
                 out1 = mad(in1.w, weight3, out1);
                 
 #if (defined USE_LOW_BIT_WEIGHT_INT8)
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                charWeight0 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset);
+                charWeight1 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset);
+                charWeight2 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2);
+                charWeight3 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                #else
                 charWeight0 = vload4(0, weight+weight_offset+weight_oc_offset);
                 charWeight1 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset);
                 charWeight2 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2);
                 charWeight3 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                #endif
                 weight0 = CONVERT_COMPUTE_FLOAT4(charWeight0) * scale1 + offset1;
                 weight1 = CONVERT_COMPUTE_FLOAT4(charWeight1) * scale1 + offset1;
                 weight2 = CONVERT_COMPUTE_FLOAT4(charWeight2) * scale1 + offset1;
@@ -1128,7 +1154,7 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS
     out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
 #ifdef BLOCK_LEAVE
     const int remain = out_hw.x - out_h_idx;
     if(remain >= 2){
@@ -1137,12 +1163,12 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS
     }else if(remain == 1){
         vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     }
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
 #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + out_c_idx_1*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     if(remain >= 2){
         vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset);
         vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset);
@@ -1152,12 +1178,12 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS
 #else
     vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks){
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks){
         return;
     }
 #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + out_c_idx_1*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset);
     vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset);
 #endif
@@ -1192,17 +1218,19 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS
 
     DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
 
-    const int out_c_idx = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
+    const int out_c_idx_1 = out_c_idx_0 + 1;
     const int out_w_idx = (out_c_w_idx % out_w_blocks) << 2;
     const int out_b_idx = out_b_h_idx / out_hw.x;//equal to in_b_idx
     const int out_h_idx = out_b_h_idx % out_hw.x;
     
-    COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
+    COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
     COMPUTE_FLOAT4 out0 = bias0;
     COMPUTE_FLOAT4 out1 = bias0;
     COMPUTE_FLOAT4 out2 = bias0;
     COMPUTE_FLOAT4 out3 = bias0;
-    COMPUTE_FLOAT4 bias1 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx + 1, bias));
+    // bias is aligned to 8, so no boundary protection is needed
+    COMPUTE_FLOAT4 bias1 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
     COMPUTE_FLOAT4 out4 = bias1;
     COMPUTE_FLOAT4 out5 = bias1;
     COMPUTE_FLOAT4 out6 = bias1;
@@ -1223,15 +1251,19 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS
     const int weight_ic_offset = out_c_blocks * weight_oc_offset;
     for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
         int kindex = (in_c_idx * 4) / blockDim * out_c_blocks * 8;
-        COMPUTE_FLOAT8 ScaleOffset0 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx, dequantScaleOffset + kindex));
-        COMPUTE_FLOAT8 ScaleOffset1 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx + 1, dequantScaleOffset + kindex));
+        COMPUTE_FLOAT8 ScaleOffset0 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_0, dequantScaleOffset + kindex));
+        #ifdef CHANNEL_BOUNDARY_PROTECT
+        COMPUTE_FLOAT8 ScaleOffset1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT8)0 : CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1, dequantScaleOffset + kindex));
+        #else
+        COMPUTE_FLOAT8 ScaleOffset1 = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1, dequantScaleOffset + kindex));
+        #endif
         COMPUTE_FLOAT4 scale0 = (COMPUTE_FLOAT4)(ScaleOffset0.s0, ScaleOffset0.s2, ScaleOffset0.s4, ScaleOffset0.s6);
         COMPUTE_FLOAT4 offset0 = (COMPUTE_FLOAT4)(ScaleOffset0.s1, ScaleOffset0.s3, ScaleOffset0.s5, ScaleOffset0.s7);
         COMPUTE_FLOAT4 scale1 = (COMPUTE_FLOAT4)(ScaleOffset1.s0, ScaleOffset1.s2, ScaleOffset1.s4, ScaleOffset1.s6);
         COMPUTE_FLOAT4 offset1 = (COMPUTE_FLOAT4)(ScaleOffset1.s1, ScaleOffset1.s3, ScaleOffset1.s5, ScaleOffset1.s7);
         //weights  NC4HW4  [1,  4*icC4,  ocC4*kh*kw,  1] xic4
-        //index:   [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
-        int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4;
+        //index:   [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
+        int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4;
 
         for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) {
             const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4;
@@ -1309,10 +1341,17 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS
                 out3 = mad(in3.w, weight3, out3);
                 
 #if (defined USE_LOW_BIT_WEIGHT_INT8)
+                #ifdef CHANNEL_BOUNDARY_PROTECT
+                charWeight0 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset);
+                charWeight1 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset);
+                charWeight2 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2);
+                charWeight3 = out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                #else
                 charWeight0 = vload4(0, weight+weight_offset+weight_oc_offset);
                 charWeight1 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset);
                 charWeight2 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2);
                 charWeight3 = vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3);
+                #endif
                 weight0 = CONVERT_COMPUTE_FLOAT4(charWeight0) * scale1 + offset1;
                 weight1 = CONVERT_COMPUTE_FLOAT4(charWeight1) * scale1 + offset1;
                 weight2 = CONVERT_COMPUTE_FLOAT4(charWeight2) * scale1 + offset1;
@@ -1396,7 +1435,7 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS
     out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
 #endif
 
-    int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
 #ifdef BLOCK_LEAVE
     const int remain = out_hw.y - out_w_idx;
     if(remain >= 4){
@@ -1409,10 +1448,10 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS
     }else if(remain == 1){
         vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
     }
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks)return;
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks)return;
 #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + out_c_idx_1*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     if(remain >= 4){
         vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset);
     }else if(remain == 3){
@@ -1425,10 +1464,10 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS
     }
 #else
     vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
-#ifdef CHANNEL_LEAVE
-    if(out_c_idx + 1 >= out_c_blocks)return;
+#ifdef CHANNEL_BOUNDARY_PROTECT
+    if(out_c_idx_1 >= out_c_blocks)return;
 #endif
-    out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
+    out_offset = (((out_b_idx + out_c_idx_1*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
     vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset);
 #endif
 }
diff --git a/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl b/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl
index c32400af9..12cac5dfc 100644
--- a/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl
+++ b/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl
@@ -303,14 +303,18 @@ void depthwise_conv2d_s1_c8h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input,
             COMPUTE_FLOAT4 inValue2 = (in_w_start_2+kw < 0 || in_w_start_2+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+2, input+inp_offset_c0));
             COMPUTE_FLOAT4 inValue3 = (in_w_start_3+kw < 0 || in_w_start_3+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+3, input+inp_offset_c0));
 
-            COMPUTE_FLOAT4 inValue4 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset_c1));
-            COMPUTE_FLOAT4 inValue5 = (in_w_start_1+kw < 0 || in_w_start_1+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1, input+inp_offset_c1));
-            COMPUTE_FLOAT4 inValue6 = (in_w_start_2+kw < 0 || in_w_start_2+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+2, input+inp_offset_c1));
-            COMPUTE_FLOAT4 inValue7 = (in_w_start_3+kw < 0 || in_w_start_3+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+3, input+inp_offset_c1));
+            COMPUTE_FLOAT4 inValue4 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset_c1));
+            COMPUTE_FLOAT4 inValue5 = (in_w_start_1+kw < 0 || in_w_start_1+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1, input+inp_offset_c1));
+            COMPUTE_FLOAT4 inValue6 = (in_w_start_2+kw < 0 || in_w_start_2+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+2, input+inp_offset_c1));
+            COMPUTE_FLOAT4 inValue7 = (in_w_start_3+kw < 0 || in_w_start_3+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+3, input+inp_offset_c1));
             
             //NC4HW4 [1, filterShape.x*filterShape.y, 1, channelBlocks] x oc4
             //index: [0, filterIdx,                   0, inChannelBlockIdx]
             COMPUTE_FLOAT4 weights_0 = CONVERT_COMPUTE_FLOAT4(vload4(0, filter+(filter_idx*c_blocks+c_idx+0)*4));
+            /*
+              weight:[kh*kw, oc/4, oc_4], memory is aligned to 8,
+              so no boundary protection is needed
+              */
             COMPUTE_FLOAT4 weights_1 = CONVERT_COMPUTE_FLOAT4(vload4(0, filter+(filter_idx*c_blocks+c_idx+1)*4));
 
             outValue0 = mad(inValue0, weights_0, outValue0);
@@ -435,12 +439,16 @@ void depthwise_conv2d_s1_c8h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input,
             COMPUTE_FLOAT4 inValue0 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset_c0));
             COMPUTE_FLOAT4 inValue1 = (in_w_start_1+kw < 0 || in_w_start_1+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1, input+inp_offset_c0));
 
-            COMPUTE_FLOAT4 inValue4 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset_c1));
-            COMPUTE_FLOAT4 inValue5 = (in_w_start_1+kw < 0 || in_w_start_1+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1, input+inp_offset_c1));
+            COMPUTE_FLOAT4 inValue4 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset_c1));
+            COMPUTE_FLOAT4 inValue5 = (in_w_start_1+kw < 0 || in_w_start_1+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1, input+inp_offset_c1));
 
             //NC4HW4 [1, filterShape.x*filterShape.y, 1, channelBlocks] x oc4
             //index: [0, filterIdx,                   0, inChannelBlockIdx]
             COMPUTE_FLOAT4 weights_0 = CONVERT_COMPUTE_FLOAT4(vload4(0, filter+(filter_idx*c_blocks+c_idx+0)*4));
+            /*
+              weight:[kh*kw, oc/4, oc_4], memory is aligned to 8,
+              so no boundary protection is needed
+              */
             COMPUTE_FLOAT4 weights_1 = CONVERT_COMPUTE_FLOAT4(vload4(0, filter+(filter_idx*c_blocks+c_idx+1)*4));
 
             outValue0 = mad(inValue0, weights_0, outValue0);
diff --git a/source/backend/opencl/execution/cl/glmem_convert.cl b/source/backend/opencl/execution/cl/glmem_convert.cl
new file mode 100644
index 000000000..8288ab9fa
--- /dev/null
+++ b/source/backend/opencl/execution/cl/glmem_convert.cl
@@ -0,0 +1,211 @@
+#ifdef MNN_SUPPORT_FP16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+// GLOBAL_SIZE_3_DIMS declares the leading kernel arguments that carry the logical 3-D global size; DEAL_NON_UNIFORM_DIM3 makes work-items outside that size return early.
+#define GLOBAL_SIZE_3_DIMS __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
+#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3)                       \
+    if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
+        return;                                                     \
+    }
+// Layout tags; the kernels compare them against OUTPUT_FORMAT / INPUT_FORMAT in their #if chains.
+#define MNN_DATA_FORMAT_NCHW 0
+#define MNN_DATA_FORMAT_NHWC 1
+#define MNN_DATA_FORMAT_NC4HW4 2
+#define MNN_DATA_FORMAT_C4NHW4 3
+// CAT expands its arguments before pasting, so OUTPUT_TYPE2 / OUTPUT_TYPE3 are the 2- and 3-wide vector forms of whatever OUTPUT_TYPE is defined as.
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+#define OUTPUT_TYPE2 CAT(OUTPUT_TYPE, 2)
+#define OUTPUT_TYPE3 CAT(OUTPUT_TYPE, 3)
+__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; // unnormalized coords, clamp addressing, nearest filtering
+
+#ifdef SHARED_TO_CL
+__kernel void gl_to_cl(GLOBAL_SIZE_3_DIMS
+                                    __global uchar *input_ptr,   // source, read as uchar4 texels
+                                    #ifdef USE_IMAGE
+                                    __write_only image2d_t output_ptr,
+                                    #else
+                                    __global OUTPUT_TYPE *output_ptr,
+                                    #endif
+                                    __private const int4 shape // N C H W
+) {
+    // Each work-item reads 4 consecutive uchar4 texels (4 w positions x 4 channels) from input_ptr and writes them to output_ptr as an image or as an NCHW / NHWC / NC4HW4 buffer.
+    int wblock  = get_global_id(0);
+    int cblock = get_global_id(1);
+    int nh = get_global_id(2);
+
+    DEAL_NON_UNIFORM_DIM3(wblock, cblock, nh);
+    const int w = wblock << 2;
+    const int h = nh % shape.z;
+    const int c = cblock << 2;
+    const int n = nh / shape.z;
+    // Source addressing: row idy starts at byte offset idy * shape.w * 4 and is indexed in uchar4 units by idx.
+    int idx = c * shape.w + w;    // NOTE(review): c is already cblock*4, so this is 4*cblock*W + w, not cblock*W + w - confirm for cblock > 0
+    int idy = nh;    // n*h
+    const int offset = idy * shape.w * 4;
+    OUTPUT_TYPE4 in0 = CONVERT_OUTPUT4(vload4(idx, input_ptr + offset));
+    OUTPUT_TYPE4 in1 = CONVERT_OUTPUT4(vload4(idx + 1, input_ptr + offset));
+    OUTPUT_TYPE4 in2 = CONVERT_OUTPUT4(vload4(idx + 2, input_ptr + offset));
+    OUTPUT_TYPE4 in3 = CONVERT_OUTPUT4(vload4(idx + 3, input_ptr + offset));
+    // Store phase: inK is the pixel at w+K with .x/.y/.z/.w = channels c..c+3; columns past shape.w and channels past shape.y are skipped.
+#ifdef USE_IMAGE
+    WI_DATA(output_ptr, (int2)(idx, idy), in0);
+    if(w + 1 >= shape.w) return;
+    WI_DATA(output_ptr, (int2)(idx+1, idy), in1);
+    if(w + 2 >= shape.w) return;
+    WI_DATA(output_ptr, (int2)(idx+2, idy), in2);
+    if(w + 3 >= shape.w) return;
+    WI_DATA(output_ptr, (int2)(idx+3, idy), in3);
+#else
+    #if OUTPUT_FORMAT == MNN_DATA_FORMAT_NCHW
+    int output_offset = ((n * shape.y + c) * shape.z + h) * shape.w + w;
+    int stride = shape.z * shape.w;   // elements per channel plane
+    int remain = shape.w - w;         // valid columns left in this row
+    if(remain >= 4){
+        vstore4((OUTPUT_TYPE4)(in0.x, in1.x, in2.x, in3.x), 0, output_ptr + output_offset);
+        if(c + 1 >= shape.y) return;
+        vstore4((OUTPUT_TYPE4)(in0.y, in1.y, in2.y, in3.y), 0, output_ptr + output_offset + stride);
+        if(c + 2 >= shape.y) return;
+        vstore4((OUTPUT_TYPE4)(in0.z, in1.z, in2.z, in3.z), 0, output_ptr + output_offset + stride + stride);
+        if(c + 3 >= shape.y) return;
+        vstore4((OUTPUT_TYPE4)(in0.w, in1.w, in2.w, in3.w), 0, output_ptr + output_offset + stride + stride + stride);
+    } else if(remain == 3){
+        vstore3((OUTPUT_TYPE3)(in0.x, in1.x, in2.x), 0, output_ptr + output_offset);
+        if(c + 1 >= shape.y) return;
+        vstore3((OUTPUT_TYPE3)(in0.y, in1.y, in2.y), 0, output_ptr + output_offset + stride);
+        if(c + 2 >= shape.y) return;
+        vstore3((OUTPUT_TYPE3)(in0.z, in1.z, in2.z), 0, output_ptr + output_offset + stride + stride);
+        if(c + 3 >= shape.y) return;
+        vstore3((OUTPUT_TYPE3)(in0.w, in1.w, in2.w), 0, output_ptr + output_offset + stride + stride + stride);
+    } else if(remain == 2){
+        vstore2((OUTPUT_TYPE2)(in0.x, in1.x), 0, output_ptr + output_offset);
+        if(c + 1 >= shape.y) return;
+        vstore2((OUTPUT_TYPE2)(in0.y, in1.y), 0, output_ptr + output_offset + stride);
+        if(c + 2 >= shape.y) return;
+        vstore2((OUTPUT_TYPE2)(in0.z, in1.z), 0, output_ptr + output_offset + stride + stride);
+        if(c + 3 >= shape.y) return;
+        vstore2((OUTPUT_TYPE2)(in0.w, in1.w), 0, output_ptr + output_offset + stride + stride + stride);
+    }else if(remain == 1){
+        output_ptr[output_offset] = in0.x;
+        if(c + 1 >= shape.y) return;
+        output_ptr[output_offset + stride] = in0.y;
+        if(c + 2 >= shape.y) return;
+        output_ptr[output_offset + stride + stride] = in0.z;
+        if(c + 3 >= shape.y) return;
+        output_ptr[output_offset + stride + stride + stride] = in0.w;
+    }
+    #elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NHWC
+    int output_offset = ((n * shape.z + h) * shape.w + w) * shape.y + c;
+    int remain = shape.y - c;         // valid channels left for this pixel
+    if(remain >= 4){
+        vstore4(CONVERT_OUTPUT4(in0), 0, output_ptr + output_offset);
+        if(w + 1 >= shape.w) return;
+        vstore4(CONVERT_OUTPUT4(in1), 0, output_ptr + output_offset + shape.y);
+        if(w + 2 >= shape.w) return;
+        vstore4(CONVERT_OUTPUT4(in2), 0, output_ptr + output_offset + shape.y + shape.y);
+        if(w + 3 >= shape.w) return;
+        vstore4(CONVERT_OUTPUT4(in3), 0, output_ptr + output_offset + shape.y + shape.y + shape.y);
+    } else if(remain == 3){
+        vstore3((OUTPUT_TYPE3)(in0.x, in0.y, in0.z), 0, output_ptr + output_offset);
+        if(w + 1 >= shape.w) return;
+        vstore3((OUTPUT_TYPE3)(in1.x, in1.y, in1.z), 0, output_ptr + output_offset + shape.y);
+        if(w + 2 >= shape.w) return;
+        vstore3((OUTPUT_TYPE3)(in2.x, in2.y, in2.z), 0, output_ptr + output_offset + shape.y + shape.y);
+        if(w + 3 >= shape.w) return;
+        vstore3((OUTPUT_TYPE3)(in3.x, in3.y, in3.z), 0, output_ptr + output_offset + shape.y + shape.y + shape.y);
+    } else if(remain == 2){
+        vstore2((OUTPUT_TYPE2)(in0.x, in0.y), 0, output_ptr + output_offset);
+        if(w + 1 >= shape.w) return;
+        vstore2((OUTPUT_TYPE2)(in1.x, in1.y), 0, output_ptr + output_offset + shape.y);
+        if(w + 2 >= shape.w) return;
+        vstore2((OUTPUT_TYPE2)(in2.x, in2.y), 0, output_ptr + output_offset + shape.y + shape.y);
+        if(w + 3 >= shape.w) return;
+        vstore2((OUTPUT_TYPE2)(in3.x, in3.y), 0, output_ptr + output_offset + shape.y + shape.y + shape.y);
+    }else if(remain == 1){ // one channel left: store .x of each of the four pixels in turn
+        output_ptr[output_offset] = in0.x;
+        if(w + 1 >= shape.w) return;
+        output_ptr[output_offset + shape.y] = in1.x;
+        if(w + 2 >= shape.w) return;
+        output_ptr[output_offset + shape.y + shape.y] = in2.x;
+        if(w + 3 >= shape.w) return;
+        output_ptr[output_offset + shape.y + shape.y + shape.y] = in3.x;
+    }
+    #elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4
+    int output_offset = (((cblock * shape.x + n) * shape.z + h) * shape.w + w) * 4;
+    vstore4(in0, 0, output_ptr + output_offset);
+    if(w + 1 >= shape.w) return;
+    vstore4(in1, 0, output_ptr + output_offset + 4);
+    if(w + 2 >= shape.w) return;
+    vstore4(in2, 0, output_ptr + output_offset + 8);
+    if(w + 3 >= shape.w) return;
+    vstore4(in3, 0, output_ptr + output_offset + 12);
+    #endif
+#endif
+}
+#endif
+
+#ifdef CL_TO_SHARED
+__kernel void cl_to_gl(GLOBAL_SIZE_3_DIMS
+                                    #ifdef USE_IMAGE
+                                    __read_only image2d_t input_ptr,
+                                    #else
+                                    __global INPUT_TYPE *input_ptr,
+                                    #endif
+                                    __global uchar *output_ptr,
+                                    __private const int4 shape // N C H W
+) {
+    // One work-item converts a 4 (width) x 4 (channel) tile of the CL tensor into uchar4 texels in output_ptr.
+    const int w_blk = get_global_id(0);   // block of 4 columns
+    const int c_blk = get_global_id(1);   // block of 4 channels
+    const int nh    = get_global_id(2);   // fused batch*height row
+    // Work-items outside the logical global size do nothing.
+    DEAL_NON_UNIFORM_DIM3(w_blk, c_blk, nh);
+    const int col   = w_blk * 4;
+    const int ch    = c_blk * 4;
+    const int batch = nh / shape.z;
+    const int row   = nh % shape.z;
+    // Texel coordinates used by the image read path and by the uchar4 stores at the end.
+    const int idx = ch * shape.w + col;
+    const int idy = nh;
+#ifdef USE_IMAGE
+    INPUT_TYPE4 px0 = RI_DATA(input_ptr, SAMPLER, (int2)(idx, idy));
+    INPUT_TYPE4 px1 = RI_DATA(input_ptr, SAMPLER, (int2)(idx + 1, idy));
+    INPUT_TYPE4 px2 = RI_DATA(input_ptr, SAMPLER, (int2)(idx + 2, idy));
+    INPUT_TYPE4 px3 = RI_DATA(input_ptr, SAMPLER, (int2)(idx + 3, idy));
+#else
+    #if INPUT_FORMAT == MNN_DATA_FORMAT_NCHW
+    const int plane = shape.z * shape.w;
+    __global INPUT_TYPE *src = input_ptr + ((batch * shape.y + ch) * shape.z + row) * shape.w + col;
+    // Load 4 consecutive columns from each of 4 channel planes, then transpose into per-pixel vectors.
+    const INPUT_TYPE4 r0 = vload4(0, src);
+    const INPUT_TYPE4 r1 = vload4(0, src + plane);
+    const INPUT_TYPE4 r2 = vload4(0, src + 2 * plane);
+    const INPUT_TYPE4 r3 = vload4(0, src + 3 * plane);
+    INPUT_TYPE4 px0 = (INPUT_TYPE4)(r0.x, r1.x, r2.x, r3.x);
+    INPUT_TYPE4 px1 = (INPUT_TYPE4)(r0.y, r1.y, r2.y, r3.y);
+    INPUT_TYPE4 px2 = (INPUT_TYPE4)(r0.z, r1.z, r2.z, r3.z);
+    INPUT_TYPE4 px3 = (INPUT_TYPE4)(r0.w, r1.w, r2.w, r3.w);
+    #elif INPUT_FORMAT == MNN_DATA_FORMAT_NHWC
+    __global INPUT_TYPE *src = input_ptr + ((batch * shape.z + row) * shape.w + col) * shape.y + ch;
+    INPUT_TYPE4 px0 = vload4(0, src);
+    INPUT_TYPE4 px1 = vload4(0, src + shape.y);
+    INPUT_TYPE4 px2 = vload4(0, src + 2 * shape.y);
+    INPUT_TYPE4 px3 = vload4(0, src + 3 * shape.y);
+    #elif INPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4
+    __global INPUT_TYPE *src = input_ptr + (((c_blk * shape.x + batch) * shape.z + row) * shape.w + col) * 4;
+    INPUT_TYPE4 px0 = vload4(0, src);
+    INPUT_TYPE4 px1 = vload4(1, src);
+    INPUT_TYPE4 px2 = vload4(2, src);
+    INPUT_TYPE4 px3 = vload4(3, src);
+    #endif
+#endif
+    __global uchar *dst_row = output_ptr + idy * shape.w * 4;
+    vstore4(convert_uchar4(px0), idx, dst_row);
+    if (shape.w <= w_blk * 4 + 1) return;
+    vstore4(convert_uchar4(px1), idx + 1, dst_row);
+    if (shape.w <= w_blk * 4 + 2) return;
+    vstore4(convert_uchar4(px2), idx + 2, dst_row);
+    if (shape.w <= w_blk * 4 + 3) return;
+    vstore4(convert_uchar4(px3), idx + 3, dst_row);
+}
+#endif
diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc
index b66986c17..a809072dc 100644
--- a/source/backend/opencl/execution/cl/opencl_program.cc
+++ b/source/backend/opencl/execution/cl/opencl_program.cc
@@ -384,6 +384,7 @@ const char* conv_2d =
 " for (int in_channel_block_idx=0; in_channel_block_idx<in_channel_block; ++in_channel_block_idx) {\n"
 "#if (defined USE_LOW_BIT_WEIGHT_INT8) || (defined USE_LOW_BIT_WEIGHT_INT4)\n"
 " int kindex=(in_channel_block_idx*4)/blockDim*out_channel_blocks*8;\n"
+" // already pack to 16,no need boundry protect\n"
 " COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(vload8(output_channel_idx,dequantScaleOffset+kindex));\n"
 " COMPUTE_FLOAT4 scale0=(COMPUTE_FLOAT4)(ScaleOffset0.s0,ScaleOffset0.s2,ScaleOffset0.s4,ScaleOffset0.s6);\n"
 " COMPUTE_FLOAT4 offset0=(COMPUTE_FLOAT4)(ScaleOffset0.s1,ScaleOffset0.s3,ScaleOffset0.s5,ScaleOffset0.s7);\n"
@@ -400,7 +401,11 @@ const char* conv_2d =
 " in3=RI_F(input,SAMPLER,(int2)(input_width_base+intput_width_idx3,input_height_block_idx));\n"
 "#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " FLOAT16 weightsInt80=CONVERT_FLOAT16(vload16(0,kernel_ptr+weight_ic_offset+in_channel_block_idx*weight_oc_offset));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" FLOAT16 weightsInt81=output_channel_idx+1 >= out_channel_blocks ? (FLOAT16)0 : CONVERT_FLOAT16(vload16(0,kernel_ptr+16+weight_ic_offset+in_channel_block_idx*weight_oc_offset));\n"
+" #else\n"
 " FLOAT16 weightsInt81=CONVERT_FLOAT16(vload16(0,kernel_ptr+16+weight_ic_offset+in_channel_block_idx*weight_oc_offset));\n"
+" #endif\n"
 " FLOAT4 weights0=CONVERT_FLOAT4(weightsInt80.s0123)*scale0+offset0;\n"
 " FLOAT4 weights1=CONVERT_FLOAT4(weightsInt80.s4567)*scale0+offset0;\n"
 " FLOAT4 weights2=CONVERT_FLOAT4(weightsInt80.s89ab)*scale0+offset0;\n"
@@ -464,10 +469,17 @@ const char* conv_2d =
 " weights1=vload4(weights_width_base+1,weights+weight_offset);\n"
 " weights2=vload4(weights_width_base+2,weights+weight_offset);\n"
 " weights3=vload4(weights_width_base+3,weights+weight_offset);\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" weights4=output_channel_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base,weights+weight_offset1);\n"
+" weights5=output_channel_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base+1,weights+weight_offset1);\n"
+" weights6=output_channel_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base+2,weights+weight_offset1);\n"
+" weights7=output_channel_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(weights_width_base+3,weights+weight_offset1);\n"
+" #else\n"
 " weights4=vload4(weights_width_base,weights+weight_offset1);\n"
 " weights5=vload4(weights_width_base+1,weights+weight_offset1);\n"
 " weights6=vload4(weights_width_base+2,weights+weight_offset1);\n"
 " weights7=vload4(weights_width_base+3,weights+weight_offset1);\n"
+" #endif\n"
 "#else\n"
 " weights0=RI_F(weights,SAMPLER,(int2)(weights_width_base+0,output_channel_idx));\n"
 " weights1=RI_F(weights,SAMPLER,(int2)(weights_width_base+1,output_channel_idx));\n"
@@ -979,10 +991,18 @@ const char* conv_2d =
 " weights1=mad(CONVERT_FLOAT4(charWeight1),scale0,offset0);\n"
 " weights2=mad(CONVERT_FLOAT4(charWeight2),scale0,offset0);\n"
 " weights3=mad(CONVERT_FLOAT4(charWeight3),scale0,offset0);\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" charWeight0=out_channel_block_idx+1 >= out_channel_blocks ? (char4)0 : vload4(0,kernel_ptr+weight_offset+weight_oc_offset);\n"
+" charWeight1=out_channel_block_idx+1 >= out_channel_blocks ? (char4)0 : vload4(0,kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset);\n"
+" charWeight2=out_channel_block_idx+1 >= out_channel_blocks ? (char4)0 : vload4(0,kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
+" charWeight3=out_channel_block_idx+1 >= out_channel_blocks ? (char4)0 : vload4(0,kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" \n"
+" #else\n"
 " charWeight0=vload4(0,kernel_ptr+weight_offset+weight_oc_offset);\n"
 " charWeight1=vload4(0,kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset);\n"
 " charWeight2=vload4(0,kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
 " charWeight3=vload4(0,kernel_ptr+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" #endif\n"
 " weights4=mad(CONVERT_FLOAT4(charWeight0),scale1,offset1);\n"
 " weights5=mad(CONVERT_FLOAT4(charWeight1),scale1,offset1);\n"
 " weights6=mad(CONVERT_FLOAT4(charWeight2),scale1,offset1);\n"
@@ -1051,10 +1071,18 @@ const char* conv_2d =
 " weights1=vload4(0,weights+weight_offset+weight_ic_offset);\n"
 " weights2=vload4(0,weights+weight_offset+weight_ic_offset*2);\n"
 " weights3=vload4(0,weights+weight_offset+weight_ic_offset*3);\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n" // zero the second output-channel block's weights when it lies past out_channel_blocks
+" \n"
+" weights4=out_channel_block_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0,weights+weight_offset+weight_oc_offset);\n"
+" weights5=out_channel_block_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0,weights+weight_offset+weight_ic_offset+weight_oc_offset);\n"
+" weights6=out_channel_block_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0,weights+weight_offset+weight_ic_offset*2+weight_oc_offset);\n"
+" weights7=out_channel_block_idx+1 >= out_channel_blocks ? (FLOAT4)0 : vload4(0,weights+weight_offset+weight_ic_offset*3+weight_oc_offset);\n"
+" #else\n"
 " weights4=vload4(0,weights+weight_offset+weight_oc_offset);\n"
 " weights5=vload4(0,weights+weight_offset+weight_ic_offset+weight_oc_offset);\n"
 " weights6=vload4(0,weights+weight_offset+weight_ic_offset*2+weight_oc_offset);\n"
 " weights7=vload4(0,weights+weight_offset+weight_ic_offset*3+weight_oc_offset);\n"
+" #endif\n"
 " weight_offset += 4;\n"
 "#else\n"
 " weights0=RI_F(weights,SAMPLER,(int2)(weights_x_idx+0,weights_y_idx));\n"
@@ -4303,7 +4331,7 @@ const char* conv_2d_int_buf =
 "#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n"
 "#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n"
 "#define MOD_NUM 15\n"
-"#ifdef INPUT_CHANNEL_LEAVE\n"
+"#ifdef INPUT_CHANNEL_BOUNDARY_PROTECT\n"
 " #define PADZEROSVEC(k, channel, data0, data1, data2, data3) "" data0 = (k << 2) < channel ? data0 : 0; "" data1 = (k << 2) + 1 < channel ? data1 : 0; "" data2 = (k << 2) + 2 < channel ? data2 : 0; "" data3=(k << 2)+3<channel ? data3 : 0;\n"
 "#else\n"
 " #define PADZEROSVEC(k,channel,data0,data1,data2,data3)\n"
@@ -4916,17 +4944,19 @@ const char* conv_2d_int_buf =
 " const int out_c_w_idx=get_global_id(0); //c/4 w\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
 " const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n"
 " const int out_h_idx=(out_b_h_idx % out_h_blocks) << 2;\n"
 " \n"
-" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
+" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
 " COMPUTE_FLOAT4 out0=bias0;\n"
 " COMPUTE_FLOAT4 out1=bias0;\n"
 " COMPUTE_FLOAT4 out2=bias0;\n"
 " COMPUTE_FLOAT4 out3=bias0;\n"
-" COMPUTE_FLOAT4 bias1=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx+1,bias));\n"
+" // bias align to 8,no need boundry protect\n"
+" COMPUTE_FLOAT4 bias1=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
 " COMPUTE_FLOAT4 out4=bias1;\n"
 " COMPUTE_FLOAT4 out5=bias1;\n"
 " COMPUTE_FLOAT4 out6=bias1;\n"
@@ -4946,17 +4976,21 @@ const char* conv_2d_int_buf =
 " const int in_hw_size=in_hw.x*in_hw.y;\n"
 " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
 " int kindex=(in_c_idx*4)/blockDim*out_c_blocks*8;\n"
-" COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx,dequantScaleOffset+kindex));\n"
-" COMPUTE_FLOAT8 ScaleOffset1=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx+1,dequantScaleOffset+kindex));\n"
+" COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_0,dequantScaleOffset+kindex));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT8 ScaleOffset1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT8)0 : CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1,dequantScaleOffset+kindex));\n"
+" #else\n"
+" COMPUTE_FLOAT8 ScaleOffset1=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1,dequantScaleOffset+kindex));\n"
+" #endif\n"
 " COMPUTE_FLOAT4 scale0=(COMPUTE_FLOAT4)(ScaleOffset0.s0,ScaleOffset0.s2,ScaleOffset0.s4,ScaleOffset0.s6);\n"
 " COMPUTE_FLOAT4 offset0=(COMPUTE_FLOAT4)(ScaleOffset0.s1,ScaleOffset0.s3,ScaleOffset0.s5,ScaleOffset0.s7);\n"
 " COMPUTE_FLOAT4 scale1=(COMPUTE_FLOAT4)(ScaleOffset1.s0,ScaleOffset1.s2,ScaleOffset1.s4,ScaleOffset1.s6);\n"
 " COMPUTE_FLOAT4 offset1=(COMPUTE_FLOAT4)(ScaleOffset1.s1,ScaleOffset1.s3,ScaleOffset1.s5,ScaleOffset1.s7);\n"
 " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
-" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
+" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
 " const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n"
 " for(int iy=0; iy<filter_hw.x; iy++) {\n"
-" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
+" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
 " const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n"
 " const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n"
 " const int in_h2_idx=(iy*dilate_hw.x+in_h2_idx_base)*in_hw.y;\n"
@@ -5026,10 +5060,17 @@ const char* conv_2d_int_buf =
 " out3=mad(in3.z,weight2,out3);\n"
 " out3=mad(in3.w,weight3,out3);\n"
 "#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" charWeight0=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset);\n"
+" charWeight1=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
+" charWeight2=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
+" charWeight3=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" #else\n"
 " charWeight0=vload4(0,weight+weight_offset+weight_oc_offset);\n"
 " charWeight1=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
 " charWeight2=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
 " charWeight3=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" #endif\n"
 " weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale1+offset1;\n"
 " weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale1+offset1;\n"
 " weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale1+offset1;\n"
@@ -5109,7 +5150,7 @@ const char* conv_2d_int_buf =
 " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 "#ifdef BLOCK_LEAVE\n"
 " const int remain=out_hw.x-out_h_idx;\n"
 " if(remain >= 4){\n"
@@ -5127,12 +5168,12 @@ const char* conv_2d_int_buf =
 " }else if(remain == 1){\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " }\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 "#endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " if(remain >= 4){\n"
 " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
@@ -5153,12 +5194,12 @@ const char* conv_2d_int_buf =
 " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 "#endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n"
@@ -5192,14 +5233,16 @@ const char* conv_2d_int_buf =
 " const int out_c_w_idx=get_global_id(0); //c/4 w\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
 " const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n"
 " const int out_h_idx=(out_b_h_idx % out_h_blocks) << 1;\n"
-" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
+" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
 " COMPUTE_FLOAT4 out0=bias0;\n"
 " COMPUTE_FLOAT4 out1=bias0;\n"
-" COMPUTE_FLOAT4 bias1=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx+1,bias));\n"
+" // bias align to 8,no need boundry protect\n"
+" COMPUTE_FLOAT4 bias1=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
 " COMPUTE_FLOAT4 out2=bias1;\n"
 " COMPUTE_FLOAT4 out3=bias1;\n"
 " const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
@@ -5216,17 +5259,21 @@ const char* conv_2d_int_buf =
 " // weight: [ic/4,oc,4],loop: ic/4\n"
 " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
 " int kindex=(in_c_idx*4)/blockDim*out_c_blocks*8;\n"
-" COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx,dequantScaleOffset+kindex));\n"
-" COMPUTE_FLOAT8 ScaleOffset1=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx+1,dequantScaleOffset+kindex));\n"
+" COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_0,dequantScaleOffset+kindex));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT8 ScaleOffset1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT8)0 : CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1,dequantScaleOffset+kindex));\n"
+" #else\n"
+" COMPUTE_FLOAT8 ScaleOffset1=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1,dequantScaleOffset+kindex));\n"
+" #endif\n"
 " COMPUTE_FLOAT4 scale0=(COMPUTE_FLOAT4)(ScaleOffset0.s0,ScaleOffset0.s2,ScaleOffset0.s4,ScaleOffset0.s6);\n"
 " COMPUTE_FLOAT4 offset0=(COMPUTE_FLOAT4)(ScaleOffset0.s1,ScaleOffset0.s3,ScaleOffset0.s5,ScaleOffset0.s7);\n"
 " COMPUTE_FLOAT4 scale1=(COMPUTE_FLOAT4)(ScaleOffset1.s0,ScaleOffset1.s2,ScaleOffset1.s4,ScaleOffset1.s6);\n"
 " COMPUTE_FLOAT4 offset1=(COMPUTE_FLOAT4)(ScaleOffset1.s1,ScaleOffset1.s3,ScaleOffset1.s5,ScaleOffset1.s7);\n"
 " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
-" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
+" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
 " const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n"
 " for(int iy=0; iy<filter_hw.x; iy++) {\n"
-" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
+" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
 " const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n"
 " const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n"
 " for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n"
@@ -5283,10 +5330,17 @@ const char* conv_2d_int_buf =
 " out1=mad(in1.w,weight3,out1);\n"
 " \n"
 "#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" charWeight0=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset);\n"
+" charWeight1=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
+" charWeight2=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
+" charWeight3=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" #else\n"
 " charWeight0=vload4(0,weight+weight_offset+weight_oc_offset);\n"
 " charWeight1=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
 " charWeight2=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
 " charWeight3=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" #endif\n"
 " weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale1+offset1;\n"
 " weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale1+offset1;\n"
 " weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale1+offset1;\n"
@@ -5348,7 +5402,7 @@ const char* conv_2d_int_buf =
 " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 "#ifdef BLOCK_LEAVE\n"
 " const int remain=out_hw.x-out_h_idx;\n"
 " if(remain >= 2){\n"
@@ -5357,12 +5411,12 @@ const char* conv_2d_int_buf =
 " }else if(remain == 1){\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " }\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 "#endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " if(remain >= 2){\n"
 " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n"
@@ -5372,12 +5426,12 @@ const char* conv_2d_int_buf =
 "#else\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 "#endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n"
 "#endif\n"
@@ -5409,17 +5463,19 @@ const char* conv_2d_int_buf =
 " const int out_c_w_idx=get_global_id(0); //c/4 w\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n"
 " const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
 " const int out_h_idx=out_b_h_idx % out_hw.x;\n"
 " \n"
-" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
+" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
 " COMPUTE_FLOAT4 out0=bias0;\n"
 " COMPUTE_FLOAT4 out1=bias0;\n"
 " COMPUTE_FLOAT4 out2=bias0;\n"
 " COMPUTE_FLOAT4 out3=bias0;\n"
-" COMPUTE_FLOAT4 bias1=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx+1,bias));\n"
+" // bias align to 8,no need boundry protect\n"
+" COMPUTE_FLOAT4 bias1=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
 " COMPUTE_FLOAT4 out4=bias1;\n"
 " COMPUTE_FLOAT4 out5=bias1;\n"
 " COMPUTE_FLOAT4 out6=bias1;\n"
@@ -5438,15 +5494,19 @@ const char* conv_2d_int_buf =
 " const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n"
 " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
 " int kindex=(in_c_idx*4)/blockDim*out_c_blocks*8;\n"
-" COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx,dequantScaleOffset+kindex));\n"
-" COMPUTE_FLOAT8 ScaleOffset1=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx+1,dequantScaleOffset+kindex));\n"
+" COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_0,dequantScaleOffset+kindex));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT8 ScaleOffset1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT8)0 : CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1,dequantScaleOffset+kindex));\n"
+" #else\n"
+" COMPUTE_FLOAT8 ScaleOffset1=CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx_1,dequantScaleOffset+kindex));\n"
+" #endif\n"
 " COMPUTE_FLOAT4 scale0=(COMPUTE_FLOAT4)(ScaleOffset0.s0,ScaleOffset0.s2,ScaleOffset0.s4,ScaleOffset0.s6);\n"
 " COMPUTE_FLOAT4 offset0=(COMPUTE_FLOAT4)(ScaleOffset0.s1,ScaleOffset0.s3,ScaleOffset0.s5,ScaleOffset0.s7);\n"
 " COMPUTE_FLOAT4 scale1=(COMPUTE_FLOAT4)(ScaleOffset1.s0,ScaleOffset1.s2,ScaleOffset1.s4,ScaleOffset1.s6);\n"
 " COMPUTE_FLOAT4 offset1=(COMPUTE_FLOAT4)(ScaleOffset1.s1,ScaleOffset1.s3,ScaleOffset1.s5,ScaleOffset1.s7);\n"
 " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
-" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
-" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
+" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
+" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
 " for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
 " const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
 " for(int fw=0; fw<filter_hw.y; fw++) {\n"
@@ -5519,10 +5579,17 @@ const char* conv_2d_int_buf =
 " out3=mad(in3.w,weight3,out3);\n"
 " \n"
 "#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" charWeight0=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset);\n"
+" charWeight1=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
+" charWeight2=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
+" charWeight3=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" #else\n"
 " charWeight0=vload4(0,weight+weight_offset+weight_oc_offset);\n"
 " charWeight1=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
 " charWeight2=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
 " charWeight3=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
+" #endif\n"
 " weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale1+offset1;\n"
 " weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale1+offset1;\n"
 " weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale1+offset1;\n"
@@ -5603,7 +5670,7 @@ const char* conv_2d_int_buf =
 " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 "#ifdef BLOCK_LEAVE\n"
 " const int remain=out_hw.y-out_w_idx;\n"
 " if(remain >= 4){\n"
@@ -5616,10 +5683,10 @@ const char* conv_2d_int_buf =
 " }else if(remain == 1){\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " }\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks)return;\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks)return;\n"
 "#endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " if(remain >= 4){\n"
 " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
 " }else if(remain == 3){\n"
@@ -5632,10 +5699,10 @@ const char* conv_2d_int_buf =
 " }\n"
 "#else\n"
 " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks)return;\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks)return;\n"
 "#endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
 "#endif\n"
 "}\n"
@@ -8316,14 +8383,18 @@ const char* depthwise_conv2d_buf =
 " COMPUTE_FLOAT4 inValue1=(in_w_start_1+kw<0 || in_w_start_1+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1,input+inp_offset_c0));\n"
 " COMPUTE_FLOAT4 inValue2=(in_w_start_2+kw<0 || in_w_start_2+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+2,input+inp_offset_c0));\n"
 " COMPUTE_FLOAT4 inValue3=(in_w_start_3+kw<0 || in_w_start_3+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+3,input+inp_offset_c0));\n"
-" COMPUTE_FLOAT4 inValue4=(in_w_start_0+kw<0 || in_w_start_0+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset_c1));\n"
-" COMPUTE_FLOAT4 inValue5=(in_w_start_1+kw<0 || in_w_start_1+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1,input+inp_offset_c1));\n"
-" COMPUTE_FLOAT4 inValue6=(in_w_start_2+kw<0 || in_w_start_2+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+2,input+inp_offset_c1));\n"
-" COMPUTE_FLOAT4 inValue7=(in_w_start_3+kw<0 || in_w_start_3+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+3,input+inp_offset_c1));\n"
+" COMPUTE_FLOAT4 inValue4=(in_w_start_0+kw<0 || in_w_start_0+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset_c1));\n"
+" COMPUTE_FLOAT4 inValue5=(in_w_start_1+kw<0 || in_w_start_1+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1,input+inp_offset_c1));\n"
+" COMPUTE_FLOAT4 inValue6=(in_w_start_2+kw<0 || in_w_start_2+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+2,input+inp_offset_c1));\n"
+" COMPUTE_FLOAT4 inValue7=(in_w_start_3+kw<0 || in_w_start_3+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+3,input+inp_offset_c1));\n"
 " \n"
 " //NC4HW4 [1,filterShape.x*filterShape.y,1,channelBlocks] x oc4\n"
 " //index: [0,filterIdx,0,inChannelBlockIdx]\n"
 " COMPUTE_FLOAT4 weights_0=CONVERT_COMPUTE_FLOAT4(vload4(0,filter+(filter_idx*c_blocks+c_idx+0)*4));\n"
+" /*\n"
+" weight:[kh*kw,oc/4,oc_4],memory align to 8\n"
+" no need for boundary protection\n"
+" */\n"
 " COMPUTE_FLOAT4 weights_1=CONVERT_COMPUTE_FLOAT4(vload4(0,filter+(filter_idx*c_blocks+c_idx+1)*4));\n"
 " outValue0=mad(inValue0,weights_0,outValue0);\n"
 " outValue1=mad(inValue1,weights_0,outValue1);\n"
@@ -8435,11 +8506,15 @@ const char* depthwise_conv2d_buf =
 " const int filter_idx=mad24(kh,filter_hw.y,kw);\n"
 " COMPUTE_FLOAT4 inValue0=(in_w_start_0+kw<0 || in_w_start_0+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset_c0));\n"
 " COMPUTE_FLOAT4 inValue1=(in_w_start_1+kw<0 || in_w_start_1+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1,input+inp_offset_c0));\n"
-" COMPUTE_FLOAT4 inValue4=(in_w_start_0+kw<0 || in_w_start_0+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset_c1));\n"
-" COMPUTE_FLOAT4 inValue5=(in_w_start_1+kw<0 || in_w_start_1+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1,input+inp_offset_c1));\n"
+" COMPUTE_FLOAT4 inValue4=(in_w_start_0+kw<0 || in_w_start_0+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset_c1));\n"
+" COMPUTE_FLOAT4 inValue5=(in_w_start_1+kw<0 || in_w_start_1+kw >= in_hw.y || c_idx+1 >= c_blocks) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+1,input+inp_offset_c1));\n"
 " //NC4HW4 [1,filterShape.x*filterShape.y,1,channelBlocks] x oc4\n"
 " //index: [0,filterIdx,0,inChannelBlockIdx]\n"
 " COMPUTE_FLOAT4 weights_0=CONVERT_COMPUTE_FLOAT4(vload4(0,filter+(filter_idx*c_blocks+c_idx+0)*4));\n"
+" /*\n"
+" weight:[kh*kw,oc/4,oc_4],memory align to 8\n"
+" no need for boundary protection\n"
+" */\n"
 " COMPUTE_FLOAT4 weights_1=CONVERT_COMPUTE_FLOAT4(vload4(0,filter+(filter_idx*c_blocks+c_idx+1)*4));\n"
 " outValue0=mad(inValue0,weights_0,outValue0);\n"
 " outValue1=mad(inValue1,weights_0,outValue1);\n"
@@ -8801,6 +8876,206 @@ const char* depthwise_conv2d_buf =
 "}\n"
 ;
 #endif
+const char* glmem_convert = // OpenCL C source text holding the gl_to_cl and cl_to_gl kernels.
+"#ifdef MNN_SUPPORT_FP16\n"
+"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+"#endif\n"
+"#define GLOBAL_SIZE_3_DIMS __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
+"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
+"#define MNN_DATA_FORMAT_NCHW 0\n"
+"#define MNN_DATA_FORMAT_NHWC 1\n"
+"#define MNN_DATA_FORMAT_NC4HW4 2\n"
+"#define MNN_DATA_FORMAT_C4NHW4 3\n"
+"#define __CAT(x,y) x##y\n"
+"#define CAT(x,y) __CAT(x,y)\n"
+"#define OUTPUT_TYPE2 CAT(OUTPUT_TYPE,2)\n"
+"#define OUTPUT_TYPE3 CAT(OUTPUT_TYPE,3)\n"
+"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
+"#ifdef SHARED_TO_CL\n" // gl_to_cl: reads four uchar4 vectors (width positions w..w+3) and stores them as image pixels or in the OUTPUT_FORMAT buffer layout.
+"__kernel void gl_to_cl(GLOBAL_SIZE_3_DIMS\n"
+" __global uchar *input_ptr,\n"
+" #ifdef USE_IMAGE\n"
+" __write_only image2d_t output_ptr,\n"
+" #else\n"
+" __global OUTPUT_TYPE *output_ptr,\n"
+" #endif\n"
+" __private const int4 shape // N C H W\n"
+") {\n"
+" int wblock=get_global_id(0);\n"
+" int cblock=get_global_id(1);\n"
+" int nh=get_global_id(2);\n"
+" DEAL_NON_UNIFORM_DIM3(wblock,cblock,nh);\n"
+" const int w=wblock << 2;\n"
+" const int h=nh % shape.z;\n"
+" const int c=cblock << 2;\n"
+" const int n=nh/shape.z;\n"
+" \n"
+" int idx=c*shape.w+w; // c/4*w\n"
+" int idy=nh; // n*h\n"
+" const int offset=idy*shape.w*4;\n"
+" OUTPUT_TYPE4 in0=CONVERT_OUTPUT4(vload4(idx,input_ptr+offset));\n" // inK holds the 4 channel values (c..c+3) for width position w+K.
+" OUTPUT_TYPE4 in1=CONVERT_OUTPUT4(vload4(idx+1,input_ptr+offset));\n"
+" OUTPUT_TYPE4 in2=CONVERT_OUTPUT4(vload4(idx+2,input_ptr+offset));\n"
+" OUTPUT_TYPE4 in3=CONVERT_OUTPUT4(vload4(idx+3,input_ptr+offset));\n"
+"#ifdef USE_IMAGE\n"
+" WI_DATA(output_ptr,(int2)(idx,idy),in0);\n"
+" if(w+1 >= shape.w) return;\n"
+" WI_DATA(output_ptr,(int2)(idx+1,idy),in1);\n"
+" if(w+2 >= shape.w) return;\n"
+" WI_DATA(output_ptr,(int2)(idx+2,idy),in2);\n"
+" if(w+3 >= shape.w) return;\n"
+" WI_DATA(output_ptr,(int2)(idx+3,idy),in3);\n"
+"#else\n"
+" #if OUTPUT_FORMAT == MNN_DATA_FORMAT_NCHW\n"
+" int output_offset=((n*shape.y+c)*shape.z+h)*shape.w+w;\n"
+" int stride=shape.z*shape.w;\n"
+" int remain=shape.w-w;\n"
+" if(remain >= 4){\n"
+" vstore4((OUTPUT_TYPE4)(in0.x,in1.x,in2.x,in3.x),0,output_ptr+output_offset);\n"
+" if(c+1 >= shape.y) return;\n"
+" vstore4((OUTPUT_TYPE4)(in0.y,in1.y,in2.y,in3.y),0,output_ptr+output_offset+stride);\n"
+" if(c+2 >= shape.y) return;\n"
+" vstore4((OUTPUT_TYPE4)(in0.z,in1.z,in2.z,in3.z),0,output_ptr+output_offset+stride+stride);\n"
+" if(c+3 >= shape.y) return;\n"
+" vstore4((OUTPUT_TYPE4)(in0.w,in1.w,in2.w,in3.w),0,output_ptr+output_offset+stride+stride+stride);\n"
+" } else if(remain == 3){\n"
+" vstore3((OUTPUT_TYPE3)(in0.x,in1.x,in2.x),0,output_ptr+output_offset);\n"
+" if(c+1 >= shape.y) return;\n"
+" vstore3((OUTPUT_TYPE3)(in0.y,in1.y,in2.y),0,output_ptr+output_offset+stride);\n"
+" if(c+2 >= shape.y) return;\n"
+" vstore3((OUTPUT_TYPE3)(in0.z,in1.z,in2.z),0,output_ptr+output_offset+stride+stride);\n"
+" if(c+3 >= shape.y) return;\n"
+" vstore3((OUTPUT_TYPE3)(in0.w,in1.w,in2.w),0,output_ptr+output_offset+stride+stride+stride);\n"
+" } else if(remain == 2){\n"
+" vstore2((OUTPUT_TYPE2)(in0.x,in1.x),0,output_ptr+output_offset);\n"
+" if(c+1 >= shape.y) return;\n"
+" vstore2((OUTPUT_TYPE2)(in0.y,in1.y),0,output_ptr+output_offset+stride);\n"
+" if(c+2 >= shape.y) return;\n"
+" vstore2((OUTPUT_TYPE2)(in0.z,in1.z),0,output_ptr+output_offset+stride+stride);\n"
+" if(c+3 >= shape.y) return;\n"
+" vstore2((OUTPUT_TYPE2)(in0.w,in1.w),0,output_ptr+output_offset+stride+stride+stride);\n"
+" }else if(remain == 1){\n"
+" output_ptr[output_offset]=in0.x;\n"
+" if(c+1 >= shape.y) return;\n"
+" output_ptr[output_offset+stride]=in0.y;\n"
+" if(c+2 >= shape.y) return;\n"
+" output_ptr[output_offset+stride+stride]=in0.z;\n"
+" if(c+3 >= shape.y) return;\n"
+" output_ptr[output_offset+stride+stride+stride]=in0.w;\n"
+" }\n"
+" #elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NHWC\n"
+" int output_offset=((n*shape.z+h)*shape.w+w)*shape.y+c;\n"
+" int remain=shape.y-c;\n"
+" if(remain >= 4){\n"
+" vstore4(CONVERT_OUTPUT4(in0),0,output_ptr+output_offset);\n"
+" if(w+1 >= shape.w) return;\n"
+" vstore4(CONVERT_OUTPUT4(in1),0,output_ptr+output_offset+shape.y);\n"
+" if(w+2 >= shape.w) return;\n"
+" vstore4(CONVERT_OUTPUT4(in2),0,output_ptr+output_offset+shape.y+shape.y);\n"
+" if(w+3 >= shape.w) return;\n"
+" vstore4(CONVERT_OUTPUT4(in3),0,output_ptr+output_offset+shape.y+shape.y+shape.y);\n"
+" } else if(remain == 3){\n"
+" vstore3((OUTPUT_TYPE3)(in0.x,in0.y,in0.z),0,output_ptr+output_offset);\n"
+" if(w+1 >= shape.w) return;\n"
+" vstore3((OUTPUT_TYPE3)(in1.x,in1.y,in1.z),0,output_ptr+output_offset+shape.y);\n"
+" if(w+2 >= shape.w) return;\n"
+" vstore3((OUTPUT_TYPE3)(in2.x,in2.y,in2.z),0,output_ptr+output_offset+shape.y+shape.y);\n"
+" if(w+3 >= shape.w) return;\n"
+" vstore3((OUTPUT_TYPE3)(in3.x,in3.y,in3.z),0,output_ptr+output_offset+shape.y+shape.y+shape.y);\n"
+" } else if(remain == 2){\n"
+" vstore2((OUTPUT_TYPE2)(in0.x,in0.y),0,output_ptr+output_offset);\n"
+" if(w+1 >= shape.w) return;\n"
+" vstore2((OUTPUT_TYPE2)(in1.x,in1.y),0,output_ptr+output_offset+shape.y);\n"
+" if(w+2 >= shape.w) return;\n"
+" vstore2((OUTPUT_TYPE2)(in2.x,in2.y),0,output_ptr+output_offset+shape.y+shape.y);\n"
+" if(w+3 >= shape.w) return;\n"
+" vstore2((OUTPUT_TYPE2)(in3.x,in3.y),0,output_ptr+output_offset+shape.y+shape.y+shape.y);\n"
+" }else if(remain == 1){\n" // One channel left: store only the .x component of each width position.
+" output_ptr[output_offset]=in0.x;\n"
+" if(w+1 >= shape.w) return;\n"
+" output_ptr[output_offset+shape.y]=in1.x;\n"
+" if(w+2 >= shape.w) return;\n"
+" output_ptr[output_offset+shape.y+shape.y]=in2.x;\n" // Position w+2 comes from in2 (was in1.x: copy-paste bug).
+" if(w+3 >= shape.w) return;\n"
+" output_ptr[output_offset+shape.y+shape.y+shape.y]=in3.x;\n" // Position w+3 comes from in3 (was in1.x: copy-paste bug).
+" }\n"
+" #elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4\n"
+" int output_offset=(((cblock*shape.x+n)*shape.z+h)*shape.w+w)*4;\n"
+" vstore4(in0,0,output_ptr+output_offset);\n"
+" if(w+1 >= shape.w) return;\n"
+" vstore4(in1,0,output_ptr+output_offset+4);\n"
+" if(w+2 >= shape.w) return;\n"
+" vstore4(in2,0,output_ptr+output_offset+8);\n"
+" if(w+3 >= shape.w) return;\n"
+" vstore4(in3,0,output_ptr+output_offset+12);\n"
+" #endif\n"
+"#endif\n"
+"}\n"
+"#endif\n"
+"#ifdef CL_TO_SHARED\n" // cl_to_gl: loads four 4-wide vectors from the image or the INPUT_FORMAT buffer and stores them as uchar4.
+"__kernel void cl_to_gl(GLOBAL_SIZE_3_DIMS\n"
+" #ifdef USE_IMAGE\n"
+" __read_only image2d_t input_ptr,\n"
+" #else\n"
+" __global INPUT_TYPE *input_ptr,\n"
+" #endif\n"
+" __global uchar *output_ptr,\n"
+" __private const int4 shape // N C H W\n"
+") {\n"
+" int wblock=get_global_id(0);\n"
+" int cblock=get_global_id(1);\n"
+" int nh=get_global_id(2);\n"
+" DEAL_NON_UNIFORM_DIM3(wblock,cblock,nh);\n"
+" const int w=wblock << 2;\n"
+" const int h=nh % shape.z;\n"
+" const int c=cblock << 2;\n"
+" const int n=nh/shape.z;\n"
+" \n"
+" int idx=c*shape.w+w; // c/4*w\n"
+" int idy=nh; // n*h\n"
+"#ifdef USE_IMAGE\n"
+" INPUT_TYPE4 in0=RI_DATA(input_ptr,SAMPLER,(int2)(idx,idy));\n"
+" INPUT_TYPE4 in1=RI_DATA(input_ptr,SAMPLER,(int2)(idx+1,idy));\n"
+" INPUT_TYPE4 in2=RI_DATA(input_ptr,SAMPLER,(int2)(idx+2,idy));\n"
+" INPUT_TYPE4 in3=RI_DATA(input_ptr,SAMPLER,(int2)(idx+3,idy));\n"
+"#else\n"
+" #if INPUT_FORMAT == MNN_DATA_FORMAT_NCHW\n"
+" int input_offset=((n*shape.y+c)*shape.z+h)*shape.w+w;\n"
+" int stride=shape.z*shape.w;\n"
+" INPUT_TYPE4 tmp0,tmp1,tmp2,tmp3;\n"
+" tmp0=vload4(0,input_ptr+input_offset);\n"
+" tmp1=vload4(0,input_ptr+input_offset+stride);\n"
+" tmp2=vload4(0,input_ptr+input_offset+stride+stride);\n"
+" tmp3=vload4(0,input_ptr+input_offset+stride+stride+stride);\n"
+" INPUT_TYPE4 in0=(INPUT_TYPE4)(tmp0.x,tmp1.x,tmp2.x,tmp3.x);\n" // Transpose: tmpK is channel c+K over 4 width positions; inK is width position w+K over 4 channels.
+" INPUT_TYPE4 in1=(INPUT_TYPE4)(tmp0.y,tmp1.y,tmp2.y,tmp3.y);\n"
+" INPUT_TYPE4 in2=(INPUT_TYPE4)(tmp0.z,tmp1.z,tmp2.z,tmp3.z);\n"
+" INPUT_TYPE4 in3=(INPUT_TYPE4)(tmp0.w,tmp1.w,tmp2.w,tmp3.w);\n"
+" #elif INPUT_FORMAT == MNN_DATA_FORMAT_NHWC\n"
+" int input_offset=((n*shape.z+h)*shape.w+w)*shape.y+c;\n"
+" INPUT_TYPE4 in0=vload4(0,input_ptr+input_offset);\n"
+" INPUT_TYPE4 in1=vload4(0,input_ptr+input_offset+shape.y);\n"
+" INPUT_TYPE4 in2=vload4(0,input_ptr+input_offset+shape.y+shape.y);\n"
+" INPUT_TYPE4 in3=vload4(0,input_ptr+input_offset+shape.y+shape.y+shape.y);\n"
+" #elif INPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4\n"
+" int input_offset=(((cblock*shape.x+n)*shape.z+h)*shape.w+w)*4;\n"
+" INPUT_TYPE4 in0=vload4(0,input_ptr+input_offset);\n"
+" INPUT_TYPE4 in1=vload4(0,input_ptr+input_offset+4);\n"
+" INPUT_TYPE4 in2=vload4(0,input_ptr+input_offset+8);\n"
+" INPUT_TYPE4 in3=vload4(0,input_ptr+input_offset+12);\n"
+" #endif\n"
+"#endif\n"
+" const int offset=idy*shape.w*4;\n"
+" vstore4(convert_uchar4(in0),idx,output_ptr+offset);\n"
+" if(w+1 >= shape.w) return;\n"
+" vstore4(convert_uchar4(in1),idx+1,output_ptr+offset);\n"
+" if(w+2 >= shape.w) return;\n"
+" vstore4(convert_uchar4(in2),idx+2,output_ptr+offset);\n"
+" if(w+3 >= shape.w) return;\n"
+" vstore4(convert_uchar4(in3),idx+3,output_ptr+offset);\n"
+"}\n"
+"#endif\n"
+;
 #ifndef MNN_OPENCL_BUFFER_CLOSED
 const char* winogradTransform_buf = 
 "#ifdef MNN_SUPPORT_FP16\n"
@@ -13607,23 +13882,31 @@ const char* conv_2d_buf =
 " const int out_c_w_idx=get_global_id(0); //c/8 w/4\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=out_c_w_idx/out_w_blocks;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
 " const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n"
 " const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n"
 " const int out_w4_idx=mul24(out_w_idx,4);\n"
-" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx<<1,bias_ptr));\n"
+" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias_ptr));\n"
 " COMPUTE_FLOAT4 out1=out0;\n"
 " COMPUTE_FLOAT4 out2=out0;\n"
 " COMPUTE_FLOAT4 out3=out0;\n"
 " \n"
-" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4((out_c_idx<<1)+1,bias_ptr));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
 " COMPUTE_FLOAT4 out5=out4;\n"
 " COMPUTE_FLOAT4 out6=out4;\n"
 " COMPUTE_FLOAT4 out7=out4;\n"
+" #else\n"
+" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
+" COMPUTE_FLOAT4 out5=out4;\n"
+" COMPUTE_FLOAT4 out6=out4;\n"
+" COMPUTE_FLOAT4 out7=out4;\n"
+" #endif\n"
 " const int intput_width_idx0=out_w4_idx;\n"
 " int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n"
-" int offset=out_c_idx*8;\n"
+" int offset=out_c_idx_0*4;\n"
 " const int inp_add=out_b*out_h*out_w*4;\n"
 " for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n"
 " \n"
@@ -13632,6 +13915,7 @@ const char* conv_2d_buf =
 " COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(2,input+inp_offset));\n"
 " COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4(vload4(3,input+inp_offset));\n"
 " \n"
+" // output channels are packed to at least 8, so no boundary protection is needed\n"
 " COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n"
 " COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset));\n"
 " COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n"
@@ -13705,7 +13989,7 @@ const char* conv_2d_buf =
 " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" const int out_offset=(((out_b_idx+out_c_idx*2*out_b)*out_h+out_h_idx)* out_w+out_w4_idx)*4;\n"
+" const int out_offset=(((out_b_idx+out_c_idx_0*out_b)*out_h+out_h_idx)* out_w+out_w4_idx)*4;\n"
 " __global FLOAT*_tempoutput=output+out_offset;\n"
 " __global FLOAT*_tempoutput1=_tempoutput+4*out_h*out_w*out_b;\n"
 "#ifdef BLOCK_LEAVE\n"
@@ -13720,8 +14004,8 @@ const char* conv_2d_buf =
 " } else if (remain == 1) {\n"
 " vstore4(CONVERT_FLOAT4(out0),0,_tempoutput);\n"
 " }\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx*2+1 >= out_c_block) {\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_block) {\n"
 " return;\n"
 " }\n"
 "#endif\n"
@@ -13737,8 +14021,8 @@ const char* conv_2d_buf =
 " }\n"
 "#else\n"
 " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,_tempoutput);\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx*2+1 >= out_c_block) {\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_block) {\n"
 " return;\n"
 " }\n"
 "#endif\n"
@@ -13760,20 +14044,25 @@ const char* conv_2d_buf =
 " const int out_c_w_idx=get_global_id(0); //c/8 w/4\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=out_c_w_idx/out_w_blocks;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
 " const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n"
 " const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n"
 " \n"
 " const int out_w2_idx=mul24(out_w_idx,2);\n"
-" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx<<1,bias_ptr));\n"
+" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias_ptr));\n"
 " COMPUTE_FLOAT4 out1=out0;\n"
 " \n"
-" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4((out_c_idx<<1)+1,bias_ptr));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
+" #else\n"
+" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
+" #endif\n"
 " COMPUTE_FLOAT4 out5=out4;\n"
 " const int intput_width_idx0=out_w2_idx;\n"
 " int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n"
-" int offset=out_c_idx*8;\n"
+" int offset=out_c_idx_0*4;\n"
 " const int inp_add=out_b*out_h*out_w*4;\n"
 " for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n"
 " \n"
@@ -13822,7 +14111,7 @@ const char* conv_2d_buf =
 " out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" const int out_offset=(((out_b_idx+out_c_idx*2*out_b)*out_h+out_h_idx)* out_w+out_w2_idx)*4;\n"
+" const int out_offset=(((out_b_idx+out_c_idx_0*out_b)*out_h+out_h_idx)* out_w+out_w2_idx)*4;\n"
 " __global FLOAT*_tempoutput=output+out_offset;\n"
 " __global FLOAT*_tempoutput1=_tempoutput+4*out_h*out_w*out_b;\n"
 "#ifdef BLOCK_LEAVE\n"
@@ -13832,8 +14121,8 @@ const char* conv_2d_buf =
 " } else if (remain == 1) {\n"
 " vstore4(CONVERT_FLOAT4(out0),0,_tempoutput);\n"
 " }\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx*2+1 >= out_c_block) {\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_block) {\n"
 " return;\n"
 " }\n"
 "#endif\n"
@@ -13844,8 +14133,8 @@ const char* conv_2d_buf =
 " }\n"
 "#else\n"
 " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n"
-"#ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx*2+1 >= out_c_block) {\n"
+"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_block) {\n"
 " return;\n"
 " }\n"
 "#endif\n"
@@ -14383,16 +14672,21 @@ const char* conv_2d_buf =
 " const int out_c_w_idx=get_global_id(0); //c/4 w\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
 " const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n"
 " const int out_h_idx=(out_b_h_idx % out_h_blocks) << 2;\n"
 " \n"
-" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
+" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
 " COMPUTE_FLOAT4 out1=out0;\n"
 " COMPUTE_FLOAT4 out2=out0;\n"
 " COMPUTE_FLOAT4 out3=out0;\n"
-" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx+1,bias));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
+" #else\n"
+" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
+" #endif\n"
 " COMPUTE_FLOAT4 out5=out4;\n"
 " COMPUTE_FLOAT4 out6=out4;\n"
 " COMPUTE_FLOAT4 out7=out4;\n"
@@ -14410,11 +14704,11 @@ const char* conv_2d_buf =
 " const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n"
 " const int in_hw_size=in_hw.x*in_hw.y;\n"
 " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
-" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
-" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
+" //weights NC4HW4 [ic/4,ic_4,oc/4,kh*kw,oc_4]\n"
+" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
 " const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n"
 " for(int iy=0; iy<filter_hw.x; iy++) {\n"
-" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
+" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
 " const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n"
 " const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n"
 " const int in_h2_idx=(iy*dilate_hw.x+in_h2_idx_base)*in_hw.y;\n"
@@ -14448,10 +14742,18 @@ const char* conv_2d_buf =
 " out3=mad(in3.y,weight1,out3);\n"
 " out3=mad(in3.z,weight2,out3);\n"
 " out3=mad(in3.w,weight3,out3);\n"
+" // weight: [ic/4,ic_4,oc/4,kh*kw,oc_4]\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
+" weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
+" weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
+" weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
+" #else\n"
 " weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
 " weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
 " weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
 " weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
+" #endif\n"
 " out4=mad(in0.x,weight0,out4);\n"
 " out4=mad(in0.y,weight1,out4);\n"
 " out4=mad(in0.z,weight2,out4);\n"
@@ -14496,7 +14798,7 @@ const char* conv_2d_buf =
 " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 "#ifdef BLOCK_LEAVE\n"
 " const int remain=out_hw.x-out_h_idx;\n"
 " if(remain >= 4){\n"
@@ -14514,12 +14816,12 @@ const char* conv_2d_buf =
 " }else if(remain == 1){\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " }\n"
-" #ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 " #endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " if(remain >= 4){\n"
 " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
@@ -14540,12 +14842,12 @@ const char* conv_2d_buf =
 " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n"
-" #ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 " #endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n"
@@ -14573,15 +14875,21 @@ const char* conv_2d_buf =
 " const int out_c_w_idx=get_global_id(0); //c/4 w\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
 " const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n"
 " const int out_h_idx=(out_b_h_idx % out_h_blocks) << 1;\n"
 " \n"
-" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
+" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
 " COMPUTE_FLOAT4 out1=out0;\n"
-" COMPUTE_FLOAT4 out2=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx+1,bias));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT4 out2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
+" #else\n"
+" COMPUTE_FLOAT4 out2=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
+" #endif\n"
 " COMPUTE_FLOAT4 out3=out2;\n"
+" \n"
 " const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
 " const int in_h0_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
 " const int in_h1_idx_base=in_h0_idx_base+stride_hw.x;\n"
@@ -14596,10 +14904,10 @@ const char* conv_2d_buf =
 " // weight: [ic/4,oc,4],loop: ic/4\n"
 " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
 " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
-" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
+" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
 " const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n"
 " for(int iy=0; iy<filter_hw.x; iy++) {\n"
-" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
+" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
 " const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n"
 " const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n"
 " for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n"
@@ -14620,11 +14928,17 @@ const char* conv_2d_buf =
 " out1=mad(in1.z,weight2,out1);\n"
 " out1=mad(in1.w,weight3,out1);\n"
 " \n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
+" weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
+" weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
+" weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
+" #else\n"
 " weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
 " weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
 " weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
 " weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
-" \n"
+" #endif\n"
 " out2=mad(in0.x,weight0,out2);\n"
 " out2=mad(in0.y,weight1,out2);\n"
 " out2=mad(in0.z,weight2,out2);\n"
@@ -14651,7 +14965,7 @@ const char* conv_2d_buf =
 " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 "#ifdef BLOCK_LEAVE\n"
 " const int remain=out_hw.x-out_h_idx;\n"
 " if(remain >= 2){\n"
@@ -14660,12 +14974,12 @@ const char* conv_2d_buf =
 " }else if(remain == 1){\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " }\n"
-" #ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 " #endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " if(remain >= 2){\n"
 " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n"
@@ -14675,12 +14989,12 @@ const char* conv_2d_buf =
 "#else\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
-" #ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks){\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks){\n"
 " return;\n"
 " }\n"
 " #endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n"
 " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n"
 "#endif\n"
@@ -14706,17 +15020,21 @@ const char* conv_2d_buf =
 " const int out_c_w_idx=get_global_id(0); //c/4 w\n"
 " const int out_b_h_idx=get_global_id(1); //b h\n"
 " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
-" const int out_c_idx=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
+" const int out_c_idx_1=out_c_idx_0+1;\n"
 " const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n"
 " const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
 " const int out_h_idx=out_b_h_idx % out_hw.x;\n"
 " \n"
-" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
+" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
 " COMPUTE_FLOAT4 out1=out0;\n"
 " COMPUTE_FLOAT4 out2=out0;\n"
 " COMPUTE_FLOAT4 out3=out0;\n"
-" \n"
-" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx+1,bias));\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
+" #else\n"
+" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
+" #endif\n"
 " COMPUTE_FLOAT4 out5=out4;\n"
 " COMPUTE_FLOAT4 out6=out4;\n"
 " COMPUTE_FLOAT4 out7=out4;\n"
@@ -14734,8 +15052,8 @@ const char* conv_2d_buf =
 " const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n"
 " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
 " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
-" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
-" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
+" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
+" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
 " for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
 " const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
 " for(int fw=0; fw<filter_hw.y; fw++) {\n"
@@ -14771,11 +15089,17 @@ const char* conv_2d_buf =
 " out3=mad(in3.z,weight2,out3);\n"
 " out3=mad(in3.w,weight3,out3);\n"
 " \n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
+" weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
+" weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
+" weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
+" #else\n"
 " weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
 " weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
 " weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
 " weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
-" \n"
+" #endif\n"
 " out4=mad(in0.x,weight0,out4);\n"
 " out4=mad(in0.y,weight1,out4);\n"
 " out4=mad(in0.z,weight2,out4);\n"
@@ -14820,7 +15144,7 @@ const char* conv_2d_buf =
 " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
 "#endif\n"
-" int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 "#ifdef BLOCK_LEAVE\n"
 " const int remain=out_hw.y-out_w_idx;\n"
 " if(remain >= 4){\n"
@@ -14833,10 +15157,10 @@ const char* conv_2d_buf =
 " }else if(remain == 1){\n"
 " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
 " }\n"
-" #ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks)return;\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks)return;\n"
 " #endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " if(remain >= 4){\n"
 " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
 " }else if(remain == 3){\n"
@@ -14849,10 +15173,10 @@ const char* conv_2d_buf =
 " }\n"
 "#else\n"
 " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
-" #ifdef CHANNEL_LEAVE\n"
-" if(out_c_idx+1 >= out_c_blocks)return;\n"
+" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
+" if(out_c_idx_1 >= out_c_blocks)return;\n"
 " #endif\n"
-" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
+" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
 " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
 "#endif\n"
 "}\n"
@@ -18853,7 +19177,7 @@ const char* buffer_convert_buf =
 " }\n"
 "#endif\n"
 "}\n"
-"// convert kernel : from buffer(oihw) to image(oc/4 h w ,ic oc4)\n"
+"// convert kernel : from buffer(oihw) to image(ic,oc/4,h,w,oc4)\n"
 "__kernel void conv2d_filter_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS\n"
 " __global const FLOAT *input_ptr,\n"
 " __private const int output_channel,\n"
diff --git a/source/backend/opencl/execution/cl/opencl_source_map.hpp b/source/backend/opencl/execution/cl/opencl_source_map.hpp
index 5f6861718..a347025c2 100644
--- a/source/backend/opencl/execution/cl/opencl_source_map.hpp
+++ b/source/backend/opencl/execution/cl/opencl_source_map.hpp
@@ -71,6 +71,7 @@ extern const char* unary_buf;
 #ifndef MNN_OPENCL_BUFFER_CLOSED
 extern const char* depthwise_conv2d_buf;
 #endif
+extern const char* glmem_convert;
 #ifndef MNN_OPENCL_BUFFER_CLOSED
 extern const char* winogradTransform_buf;
 #endif
@@ -242,6 +243,7 @@ const std::map<std::string, const char*> OpenCLProgramMap =
 #ifndef MNN_OPENCL_BUFFER_CLOSED
   { "depthwise_conv2d_buf", depthwise_conv2d_buf },
 #endif
+  { "glmem_convert", glmem_convert },
 #ifndef MNN_OPENCL_BUFFER_CLOSED
   { "winogradTransform_buf", winogradTransform_buf },
 #endif
diff --git a/source/backend/opencl/execution/image/ConvExecution.cpp b/source/backend/opencl/execution/image/ConvExecution.cpp
index d2f6d288a..2f3a8e4a4 100644
--- a/source/backend/opencl/execution/image/ConvExecution.cpp
+++ b/source/backend/opencl/execution/image/ConvExecution.cpp
@@ -25,12 +25,12 @@ ConvCommonExecution::ConvCommonExecution(const Convolution2D *conv2dParams, Back
     int biasSize             = conv2dParams->bias()->size();
     const float *biasDataPtr = conv2dParams->bias()->data();
     
-    int buffer_size = ALIGN_UP4(biasSize) * sizeof(float);
+    int buffer_size = ALIGN_UP8(biasSize) * sizeof(float);
     cl::Buffer biasBuffer(runtime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
     cl_int error;
     auto biasPtrCL = runtime->commandQueue().enqueueMapBuffer(biasBuffer, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
     if(biasPtrCL != nullptr && error == CL_SUCCESS){
-        ::memset(biasPtrCL, 0, ALIGN_UP4(biasSize) * sizeof(float));
+        ::memset(biasPtrCL, 0, ALIGN_UP8(biasSize) * sizeof(float));
         ::memcpy(biasPtrCL, biasDataPtr, biasSize * sizeof(float));
     }else{
         MNN_ERROR("Map error biasPtrCL == nullptr \n");
@@ -328,7 +328,11 @@ ErrorCode ConvExecution::onEncode(const std::vector<Tensor *> &inputs, const std
             std::pair<int, int> min_cost(INT_MAX, 0);//(min_time, min_index)
             
             for(int knl_idx = 0; knl_idx < 1; knl_idx++) {
-                kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], mResource->mBuildOptions);
+                std::set<std::string> buildOption = mResource->mBuildOptions;
+                if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
+                    buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+                }
+                kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], buildOption);
                 uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
                 
                 globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))};
@@ -363,7 +367,11 @@ ErrorCode ConvExecution::onEncode(const std::vector<Tensor *> &inputs, const std
             int min_index  = min_cost.second;
             //printf("min_index = %d  %d\n", min_index, min_cost.first);
             mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
-            unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], mResource->mBuildOptions);
+            std::set<std::string> buildOption = mResource->mBuildOptions;
+            if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
+                buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+            }
+            unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], buildOption);
             
             uint32_t idx = 0;
             unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);
@@ -407,7 +415,11 @@ ErrorCode ConvExecution::onEncode(const std::vector<Tensor *> &inputs, const std
         std::pair<int, int> min_cost(INT_MAX, 0);//(min_time, min_index)
         
         for(int knl_idx = 0; knl_idx < total_kernel; knl_idx++) {
-            kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], mResource->mBuildOptions);
+            std::set<std::string> buildOption = mResource->mBuildOptions;
+            if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
+                buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+            }
+            kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], buildOption);
             uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
             
             globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))};
@@ -446,7 +458,11 @@ ErrorCode ConvExecution::onEncode(const std::vector<Tensor *> &inputs, const std
         }
         int min_index  = min_cost.second;
         mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
-        unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], mResource->mBuildOptions);
+        std::set<std::string> buildOption = mResource->mBuildOptions;
+        if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
+            buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+        }
+        unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], buildOption);
         
         uint32_t idx            = 0;
         cl_int ret = CL_SUCCESS;
diff --git a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp
index ec8e9f3e2..97dd8a770 100644
--- a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp
+++ b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp
@@ -239,6 +239,9 @@ void ConvLowMemoryExecution::tune1x1CaseLowMemory(Tensor * input, Tensor * outpu
         if(inputChannels % 4 != 0){
             buildOption.emplace("-DINPUT_CHANNEL_LEAVE");
         }
+        if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
+            buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+        }
         kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], buildOption);
         uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
         
@@ -277,6 +280,9 @@ void ConvLowMemoryExecution::tune1x1CaseLowMemory(Tensor * input, Tensor * outpu
     if(inputChannels % 4 != 0){
         buildOption.emplace("-DINPUT_CHANNEL_LEAVE");
     }
+    if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
+        buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+    }
     unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], buildOption);
     uint32_t idx = 0;
     ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);
@@ -338,6 +344,9 @@ void ConvLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor * o
         if(inputChannels % 4 != 0){
             buildOption.emplace("-DINPUT_CHANNEL_LEAVE");
         }
+        if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
+            buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+        }
         kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], buildOption);
         uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
 
@@ -379,6 +388,9 @@ void ConvLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor * o
     if(inputChannels % 4 != 0){
         buildOption.emplace("-DINPUT_CHANNEL_LEAVE");
     }
+    if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
+        buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
+    }
     unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], buildOption);
 
     uint32_t idx            = 0;
diff --git a/source/backend/vulkan/buffer/execution/VulkanPRelu.cpp b/source/backend/vulkan/buffer/execution/VulkanPRelu.cpp
index 3181524e9..751d24934 100644
--- a/source/backend/vulkan/buffer/execution/VulkanPRelu.cpp
+++ b/source/backend/vulkan/buffer/execution/VulkanPRelu.cpp
@@ -80,7 +80,7 @@ class VulkanReluCreator : public VulkanBackend::Creator {
 public:
     virtual VulkanBasicExecution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor*>& outputs, const MNN::Op *op, Backend *bn) const override {
         if (1 == op->main_as_PRelu()->slopeCount()) {
-            return new VulkanUnary("RELU", bn, op->main_as_PRelu()->slope()->data()[0]);
+            return new VulkanUnary("RELU", bn, false, op->main_as_PRelu()->slope()->data()[0]);
         }
         return new VulkanPrelu(bn, op);
     }
diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp
index c8280ceef..4b62bc93a 100644
--- a/source/core/Interpreter.cpp
+++ b/source/core/Interpreter.cpp
@@ -193,6 +193,9 @@ void Interpreter::setExternalFile(const char* file, size_t flag) {
 }
 
 ErrorCode Interpreter::updateCacheFile(Session *session, int flag) {
+    if (mNet->cacheFile.empty()) {
+        return NOT_SUPPORT;
+    }
     std::lock_guard<std::mutex> _l(mNet->lock);
 
     // Backend_Auto and no Async work, then don't need updateCache
diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp
index 0197fb965..11cd172b1 100644
--- a/source/core/OpCommonUtils.cpp
+++ b/source/core/OpCommonUtils.cpp
@@ -692,6 +692,9 @@ static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flat
 }
 Execution* OpCommonUtils::createExecutionWithExternal(Backend* backend, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                               const MNN::Op* op, FileLoader* externalFile, std::shared_ptr<BufferStorage>& tmpstore) {
+#ifdef MNN_BUILD_MINI
+    return backend->onCreate(inputs, outputs, op);
+#else
     bool hasExternal = false;
     switch (op->main_type()) {
         case OpParameter_Convolution2D:
@@ -735,6 +738,7 @@ Execution* OpCommonUtils::createExecutionWithExternal(Backend* backend, const st
         }
     }
     return execution;
+#endif
 }
 
 void OpCommonUtils::loadExternalDatas(FileLoader* fileloader, std::vector<char*> addrs,  const int64_t* external) {
diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp
index 8bb123a41..12807a42f 100644
--- a/source/core/Pipeline.cpp
+++ b/source/core/Pipeline.cpp
@@ -27,7 +27,7 @@ static bool _supportQuant(const Op* op, const std::vector<Tensor*>& inputs, cons
     switch (otype) {
         case OpType_Convolution:
         case OpType_ConvolutionDepthwise:
-        case OpType_Deconvolution:
+//        case OpType_Deconvolution:
             if (inputs.size() > 1) {
                 return false;
             }
diff --git a/source/core/Session.cpp b/source/core/Session.cpp
index 1d32a8194..8f5d21248 100644
--- a/source/core/Session.cpp
+++ b/source/core/Session.cpp
@@ -234,15 +234,6 @@ ErrorCode Session::runWithCallBack(const TensorCallBackWithInfo& before, const T
     return NO_ERROR;
 }
 
-void Session::_clearCache() {
-    for (auto& t : mInfo.allTensors) {
-        auto describe = TensorUtils::getDescribe(t.get());
-        if (describe->usage == Tensor::InsideDescribe::TRAINABLE || describe->usage == Tensor::InsideDescribe::CONSTANT) {
-            continue;
-        }
-        describe->regions.clear();
-    }
-}
 
 ErrorCode Session::resize() {
 #ifdef LOG_VERBOSE
diff --git a/source/core/Session.hpp b/source/core/Session.hpp
index 8461a21f3..a7c38d019 100644
--- a/source/core/Session.hpp
+++ b/source/core/Session.hpp
@@ -147,7 +147,6 @@ class MNN_PUBLIC Session {
     }
 
 private:
-    void _clearCache();
     void _setUpTensorInfo(const Schedule::ScheduleInfo& info);
 
 private:
diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp
index 01398fb34..5c9830568 100644
--- a/source/core/TensorUtils.cpp
+++ b/source/core/TensorUtils.cpp
@@ -487,7 +487,7 @@ static bool _ClipDst(int* stride, int srcOffset, int dstOffset, const int* srcSi
      dx=sx-xo -> [max(0, -xo), max(0, min(sxr-xo, dxr))]
      dy,dz compute the same
      **/
-    
+
     int offsetBias = dstOffset - srcOffset;
     if (sizeNum == 0) {
         // All stride is zero, then size will be all one
@@ -903,4 +903,14 @@ void TensorUtils::setTensorPad(const Tensor* tensor, int left, int right, int bo
     srcDes->mPads.top = std::max(srcDes->mPads.top, top);
 }
 
+void TensorUtils::setSharedMem(const Tensor *tensor, Backend::MemObj *mem){
+    // Store mem in the mSharedMem slot of the tensor's describe.
+    TensorUtils::getDescribe(tensor)->mSharedMem = mem;
+}
+
+Backend::MemObj* TensorUtils::getSharedMem(const Tensor* tensor){
+    // Raw pointer held by the describe's mSharedMem slot (whatever .get() yields for an unset slot).
+    return TensorUtils::getDescribe(tensor)->mSharedMem.get();
+}
+
 } // namespace MNN
diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp
index 442b3184a..268c8be00 100644
--- a/source/core/TensorUtils.hpp
+++ b/source/core/TensorUtils.hpp
@@ -124,6 +124,8 @@ struct Tensor::InsideDescribe {
         pad mPads;
         // For isMutable = false Tensor , determine whether the content can be convert to main backend
         uint32_t stageMask = 0;
+        // Use for shared memory
+        SharedPtr<Backend::MemObj> mSharedMem;
     };
     std::shared_ptr<NativeInsideDescribe> mContent;
     SharedPtr<Backend::MemObj> mem;
@@ -224,6 +226,10 @@ class MNN_PUBLIC TensorUtils {
     static void setTensorSupportPack(const Tensor* tensor, bool flag);
 
     static void setTensorPad(const Tensor* tensor, int left, int right, int bottom, int top);
+    
+    static void setSharedMem(const Tensor* tensor, Backend::MemObj *mem);
+    
+    static Backend::MemObj* getSharedMem(const Tensor* tensor);
 };
 } // namespace MNN
 
diff --git a/source/shape/ShapeConcat.cpp b/source/shape/ShapeConcat.cpp
index 3eb2675b3..8eba40670 100644
--- a/source/shape/ShapeConcat.cpp
+++ b/source/shape/ShapeConcat.cpp
@@ -14,7 +14,7 @@ class ConcatSizeComputer : public SizeComputer {
     virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                                const std::vector<Tensor*>& outputs) const override {
         MNN_ASSERT(1 == outputs.size());
-        MNN_ASSERT(inputs.size() >= 2);
+        // MNN_ASSERT(inputs.size() >= 2);
         auto& ob      = outputs[0]->buffer();
         int basicAxis = 0;
         if (op->type() == OpType_Concat) {
diff --git a/source/shape/ShapeRegister.cpp b/source/shape/ShapeRegister.cpp
index f917bf39a..f4ed83802 100644
--- a/source/shape/ShapeRegister.cpp
+++ b/source/shape/ShapeRegister.cpp
@@ -122,6 +122,9 @@ extern void ___FmhaV2SizeComputer__OpType_FmhaV2__();
 extern void ___FmhcaSizeComputer__OpType_Fmhca__();
 extern void ___AttentionSizeComputer__OpType_Attention__();
 #endif
+#ifdef MNN_BUILD_AUDIO
+extern void ___StftOpComputer__OpType_Stft__();
+#endif
 void registerShapeOps() {
 ___ShapeSizeComputer__OpType_Shape__();
 ___ShapeRasterComputer__OpType_Raster__();
@@ -244,5 +247,8 @@ ___FmhaV2SizeComputer__OpType_FmhaV2__();
 ___FmhcaSizeComputer__OpType_Fmhca__();
 ___AttentionSizeComputer__OpType_Attention__();
 #endif
+#ifdef MNN_BUILD_AUDIO
+___StftOpComputer__OpType_Stft__();
+#endif
 }
 }
diff --git a/source/shape/ShapeStft.cpp b/source/shape/ShapeStft.cpp
new file mode 100644
index 000000000..59847ad62
--- /dev/null
+++ b/source/shape/ShapeStft.cpp
@@ -0,0 +1,52 @@
+//
+//  ShapeStft.cpp
+//  MNN
+//
+//  Created by MNN on 2024/11/26.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef MNN_BUILD_AUDIO
+
+#include "shape/SizeComputer.hpp"
+#include "core/Macro.h"
+#include "core/TensorUtils.hpp"
+
+namespace MNN {
+
+// Shape inference for OpType_Stft. The input is consumed as a flat run of
+// elementSize() samples; the output is 2-D: [frames, n_fft / 2 + 1].
+class StftOpComputer : public SizeComputer {
+    // Fills outputs[0] and returns true. Returns false instead of producing a
+    // bogus shape when the op has no StftParam, when n_fft or hop_length is not
+    // positive, or when the input holds fewer than n_fft samples.
+    virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
+                               const std::vector<Tensor*>& outputs) const override {
+        if (inputs.empty() || outputs.empty()) {
+            return false;
+        }
+        auto stft = op->main_as_StftParam();
+        if (nullptr == stft) {
+            return false;
+        }
+        const int sample_length = inputs[0]->elementSize();
+        const int n_fft = stft->n_fft();
+        const int hop_length = stft->hop_length();
+        // hop_length is the divisor below; a window longer than the signal would
+        // give a non-positive frame count (or, through truncation, a spurious 1).
+        if (n_fft <= 0 || hop_length <= 0 || sample_length < n_fft) {
+            return false;
+        }
+        const int frames = (sample_length - n_fft) / hop_length + 1;
+        outputs[0]->buffer().dimensions = 2;
+        outputs[0]->setLength(0, frames);
+        outputs[0]->setLength(1, n_fft / 2 + 1);
+        outputs[0]->buffer().type = inputs[0]->getType();
+        TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
+        return true;
+    }
+};
+
+REGISTER_SHAPE_AUDIO(StftOpComputer, OpType_Stft);
+} // namespace MNN
+#endif // MNN_BUILD_AUDIO
diff --git a/source/shape/SizeComputer.hpp b/source/shape/SizeComputer.hpp
index 996561d51..7a1c95312 100644
--- a/source/shape/SizeComputer.hpp
+++ b/source/shape/SizeComputer.hpp
@@ -186,4 +186,13 @@ class SizeComputerRegister {
 
 #endif
 
+#ifdef MNN_BUILD_AUDIO
+// Emits ___<name>__<op>__(), which allocates one <name> and inserts it into SizeComputerSuite under <op>.
+#define REGISTER_SHAPE_AUDIO(name, op)                      \
+    void ___##name##__##op##__() {                          \
+        name* _computer = new name;                         \
+        SizeComputerSuite::get()->insert(_computer, op);    \
+    }
+#endif
+
 #endif
diff --git a/test.sh b/test.sh
index 7204117a8..168f06c40 100755
--- a/test.sh
+++ b/test.sh
@@ -167,7 +167,7 @@ android_static_build() {
     -DMNN_INTERNAL=ON \
     -DMNN_USE_LOGCAT=false \
     -DMNN_BUILD_BENCHMARK=ON \
-    -DANDROID_NATIVE_API_LEVEL=android-21  \
+    -DANDROID_NATIVE_API_LEVEL=android-26  \
     -DMNN_BUILD_FOR_ANDROID_COMMAND=true \
     -DMNN_OPENGL=true \
     -DMNN_BUILD_TRAIN=true \
@@ -198,7 +198,7 @@ android_static_build() {
     -DMNN_USE_LOGCAT=false \
     -DMNN_BUILD_BENCHMARK=ON \
     -DMNN_INTERNAL=ON \
-    -DANDROID_NATIVE_API_LEVEL=android-21  \
+    -DANDROID_NATIVE_API_LEVEL=android-26  \
     -DMNN_BUILD_FOR_ANDROID_COMMAND=true \
     -DMNN_OPENGL=true \
     -DMNN_BUILD_TRAIN=true \
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f128825a6..9aa84590a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -4,6 +4,10 @@ if(MNN_WITH_PLUGIN)
   list(APPEND TEST_DEPS plugin_matmul)
 endif()
 
+if (CMAKE_SYSTEM_NAME MATCHES "^Android")
+  list(APPEND TEST_DEPS android)
+endif()
+
 if(APPLE)
   file(GLOB_RECURSE Files ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/*.mm)
 else()
diff --git a/test/op/DeconvolutionTest.cpp b/test/op/DeconvolutionTest.cpp
index e7443064b..4e1e4d4ea 100644
--- a/test/op/DeconvolutionTest.cpp
+++ b/test/op/DeconvolutionTest.cpp
@@ -9,6 +9,7 @@
 #include <MNN/expr/Expr.hpp>
 #include <MNN/expr/ExprCreator.hpp>
 #include <string>
+#include <sstream>
 #include <vector>
 #include "MNNTestSuite.h"
 #include "TestUtils.h"
@@ -17,6 +18,56 @@ using namespace std;
 using namespace MNN;
 using namespace MNN::Express;
 
+// Naive reference transposed convolution: input NCHW [batch, ic, ih, iw], weight [ic, oc, kh, kw], output NCHW.
+// output is cleared when the computed spatial size is not positive.
+static void reference_deconv2d(const float* input, const std::vector<float>& weight,
+                             const std::vector<float>& bias, std::vector<float>& output, int batch, int ic, int oc,
+                             int ih, int iw, int pad_h, int pad_w, int kh, int kw, int stridew, int strideh,
+                             int dilation) {
+    const int ow = (iw - 1) * stridew + dilation * (kw - 1) + 1 - pad_w * 2;
+    const int oh = (ih - 1) * strideh + dilation * (kh - 1) + 1 - pad_h * 2;
+    if (oh <= 0 || ow <= 0) {
+        output.clear();
+        return;
+    }
+    const int plane = oh * ow;
+    output.resize(batch * oc * plane);
+    // Seed every output plane with its channel bias.
+    for (int n = 0; n < batch; ++n) {
+        for (int co = 0; co < oc; ++co) {
+            float* dst = output.data() + (n * oc + co) * plane;
+            for (int i = 0; i < plane; ++i) {
+                dst[i] = bias[co];
+            }
+        }
+    }
+    // Scatter each input element into the outputs it touches; accumulation order: batch, oc, iy, ix, ic, ky, kx.
+    for (int n = 0; n < batch; ++n) {
+        for (int co = 0; co < oc; ++co) {
+            float* dst = output.data() + (n * oc + co) * plane;
+            for (int iy = 0; iy < ih; ++iy) {
+                for (int ix = 0; ix < iw; ++ix) {
+                    for (int ci = 0; ci < ic; ++ci) {
+                        const float x = input[((n * ic + ci) * ih + iy) * iw + ix];
+                        const float* kernel = weight.data() + (ci * oc + co) * kh * kw;
+                        for (int ky = 0; ky < kh; ++ky) {
+                            const int oy = iy * strideh + ky * dilation - pad_h;
+                            for (int kx = 0; kx < kw; ++kx) {
+                                const int ox = ix * stridew + kx * dilation - pad_w;
+                                if (ox < 0 || ox >= ow || oy < 0 || oy >= oh) {
+                                    continue;
+                                }
+                                dst[oy * ow + ox] += x * kernel[ky * kw + kx];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
 static PadMode _convertPadMode(PaddingMode mode) {
     switch (mode) {
         case CAFFE:
@@ -72,7 +123,52 @@ class DeconvolutionCommonTest : public MNNTestCase {
     virtual ~DeconvolutionCommonTest() = default;
 
 protected:
-    static bool test(MNNForwardType type, const std::string& device_name, const std::string& test_op_name,
+    static bool test(const std::string& test_op_name, // overload that generates its own data and checks _Deconv against reference_deconv2d
+                    int batch, int ic, int oc, int ih, int iw, int pad_h, int pad_w, int kh,
+                    int kw, int stride, int dilation, int group, int precision) {
+        int ow  = (iw - 1) * stride + dilation * (kw - 1) + 1 - pad_w * 2;
+        int oh = (ih - 1) * stride + dilation * (kh - 1) + 1 - pad_h * 2;
+        if (ow <=0 || oh <= 0) { // configuration yields an empty output: nothing to compare, reported as passed
+            return true;
+        }
+        auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of<float>());
+        auto inputPtr = input->writeMap<float>();
+        {
+            int size = input->getInfo()->size;
+            for (int i=0; i<size; ++i) {
+                inputPtr[i] = (float)((i+1) % 10) / 100.0f; // deterministic pattern 0.01 .. 0.09, 0.00, repeating
+            }
+        }
+        std::vector<float> weightData(ic*oc*kh*kw);
+        for (int i=0; i<weightData.size(); ++i) {
+            weightData[i] = (float)(10-(i%10)) / 10.0f; // 1.0, 0.9, ..., 0.1, repeating
+        }
+        std::vector<float> biasData(oc);
+        for (int i=0; i<oc; ++i) {
+            biasData[i] = (float)(5-(i%10)) / 10.0f; // 0.5, 0.4, ..., -0.4, repeating
+        }
+        std::vector<float> rightOutData;
+        reference_deconv2d(inputPtr, weightData, biasData, rightOutData, batch, ic, oc, ih, iw, pad_h, pad_w, kh, kw, stride, stride, dilation); // expected values, computed from the NCHW data before input is re-bound below
+        input = _Convert(input, NC4HW4);
+        auto output = _Deconv(std::move(weightData), std::move(biasData), input, {ic, oc}, {kw, kh}, VALID,
+                              {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false);
+        output = _Convert(output, NCHW);
+        if (rightOutData.size() != output->getInfo()->size) { // element count of the op output must match the reference
+            FUNC_PRINT(1);
+            return false;
+        }
+        // NOTE(review): errorScale below loosens the tolerance 20x for precision values above Precision_High;
+        // presumably those are the reduced-precision modes -- confirm against BackendConfig.
+        // Relative error below 0.5% (times errorScale) is considered correct.
+        auto outputPtr = output->readMap<float>();
+        float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 20;
+        if (!checkVectorByRelativeError<float>(outputPtr, rightOutData.data(), rightOutData.size(), 0.005 * errorScale)) {
+            MNN_ERROR("%s test failed!\n", test_op_name.c_str());
+            return false;
+        }
+        return true;
+    }
+    static bool test(const std::string& test_op_name,
                     vector<float>& inputData, vector<float>& weightData, vector<float>& biasData, vector<float>& rightOutData,
                     int batch, int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh,
                     int kw, int stride, int dilation, int group, int precision) {
@@ -87,7 +183,7 @@ class DeconvolutionCommonTest : public MNNTestCase {
         auto outputPtr = output->readMap<float>();
         float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 20;
         if (!checkVectorByRelativeError<float>(outputPtr, rightOutData.data(), rightOutData.size(), 0.005 * errorScale)) {
-            MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str());
+            MNN_ERROR("%s test failed!\n", test_op_name.c_str());
             return false;
         }
         return true;
@@ -122,6 +218,60 @@ class DeconvolutionCommonTestInt8 : public MNNTestCase {
         return true;
     }
 };
+// Exhaustive deconvolution sweep (group fixed to 1) over batch, channels, input size, kernel, dilation, stride and pad.
+class DeconvolutionFullTest : public DeconvolutionCommonTest {
+public:
+    virtual ~DeconvolutionFullTest() = default;
+    virtual bool run(int precision) {
+        if (MNN_FORWARD_OPENCL == getCurrentType()) {
+            MNN_ERROR("Currently opencl run deconvolution has error, skip it\n");
+            return true;
+        }
+        // Values swept for output channels, input channels and the (square) input size.
+        // Returns false at the first failing configuration, true once every combination passed.
+        const std::vector<int> ocSize = {
+            1, 3, 10, 17
+        };
+        const std::vector<int> icSize = {
+            1, 4, 3, 8, 11
+        };
+        const std::vector<int> isSize = {
+            1, 7, 9, 13
+        };
+
+        for (int batch = 1; batch <= 2; batch++) {
+            for (auto oc : ocSize) {
+                for (auto ic : icSize) {
+                    for (auto is : isSize) {
+                        int ih = is;
+                        int iw = is;
+                        for (int kw = 1; kw <= 7 && kw <= is; kw+=2) {
+                            for (int kh = 1; kh <= 7 && kh <= is; kh+=3) {
+                                for (int d = 1; d <= 2; d++) {
+                                    for (int s = 1; s <= 2; s++) {
+                                        int stride = s;
+                                        for (int p = 0; p <= 1; p++) {
+                                            std::ostringstream name;
+                                            int pad_w = p;
+                                            int pad_h = p;
+                                            name << "Deconvolution: " << batch <<","<< oc <<","<<ic<<","<<is<<"["<< kw <<","<<kh <<"][s:" << s << ",p:"
+                                            <<p << ",d:"<<d<<"]";
+                                            bool succ = DeconvolutionCommonTest::test(name.str(),batch, ic, oc, ih, iw, pad_h, pad_w, kh, kw, stride, d, 1, precision);
+                                            if (!succ) {
+                                                return false;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        return true;
+    }
+};
 
 class DeconvolutionTest : public DeconvolutionCommonTest {
 public:
@@ -169,7 +319,7 @@ class DeconvolutionTest : public DeconvolutionCommonTest {
             int group = 1, batch = 1;
             int pad_w = 0, pad_h = 0;
 
-            bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "DeconvolutionTest0", data_a, weight, bias, data_c,
+            bool succ = DeconvolutionCommonTest::test("DeconvolutionTest0", data_a, weight, bias, data_c,
                                                       batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw,
                                                       stride, dilation, group, precision);
             if (!succ) {
@@ -217,7 +367,7 @@ class DeconvolutionTest : public DeconvolutionCommonTest {
             int group = 1, batch = 1;
             int pad_w = 1, pad_h = 1;
 
-            bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "Deconv", data_a, weight, bias, data_c,
+            bool succ = DeconvolutionCommonTest::test("Deconv", data_a, weight, bias, data_c,
                                                       batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw,
                                                       stride, dilation, group, precision);
             if (!succ) {
@@ -265,7 +415,7 @@ class DeconvolutionTest : public DeconvolutionCommonTest {
             int group = 1, batch = 1;
             int pad_w = 0, pad_h = 0;
 
-            bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "Deconv", data_a, weight, bias, data_c,
+            bool succ = DeconvolutionCommonTest::test("Deconv", data_a, weight, bias, data_c,
                                                       batch, ic, oc, ih, iw, PadMode_SAME, pad_h, pad_w, kh, kw,
                                                       stride, dilation, group, precision);
             if (!succ) {
@@ -510,6 +660,8 @@ class DeconvolutionInt8Test : public DeconvolutionCommonTestInt8 {
         return true;
     }
 };
+MNNTestSuiteRegister(DeconvolutionFullTest, "op/Deconvolutionfull");
 MNNTestSuiteRegister(DeconvolutionTest, "op/Deconvolution");
 MNNTestSuiteRegister(DeconvolutionInt8Test, "op/DeconvolutionInt8");
 
+
diff --git a/test/op/MomentsTest.cpp b/test/op/MomentsTest.cpp
index d51528a78..1ddcbe782 100644
--- a/test/op/MomentsTest.cpp
+++ b/test/op/MomentsTest.cpp
@@ -16,6 +16,7 @@ class MomentsTest : public MNNTestCase {
 public:
     virtual ~MomentsTest() = default;
     virtual bool run(int precision) {
+#ifdef MNN_SUPPORT_DEPRECATED_OP
         auto input = _Input({1, 4, 4, 1}, NCHW);
         input->setName("input_tensor");
         // set input data
@@ -39,6 +40,7 @@ class MomentsTest : public MNNTestCase {
             MNN_ERROR("MomentsTest test failed!\n");
             return false;
         }
+#endif
         return true;
     }
 };
diff --git a/test/op/PReLUTest.cpp b/test/op/PReLUTest.cpp
index f6a3d1365..62b8bab95 100644
--- a/test/op/PReLUTest.cpp
+++ b/test/op/PReLUTest.cpp
@@ -43,7 +43,7 @@ class PreluTestInt8 : public MNNTestCase {
         auto input = _Input({1, 12, 4, 2}, NCHW);
         input->setName("input_tensor");
         // set input data
-        input->writeScaleMap(0.03567, 1.0);
+        input->writeScaleMap(0.02745, -18.714);
         const float inpudata[] = {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0,
                                   2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
                                   -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0,
@@ -57,7 +57,7 @@ class PreluTestInt8 : public MNNTestCase {
                                   -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0,
                                   4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0};
         auto inputPtr          = input->writeMap<float>();
-        memcpy(inputPtr, inpudata, 4 * sizeof(float));
+        memcpy(inputPtr, inpudata, 96 * sizeof(float));
         input->unMap();
         input                                   = _Convert(input, NC4HW4);
         auto output                             = _PRelu(input, {3.0, 1.5, 1.5, 1.5, 3.0, 1.5, 1.5, 1.5, 3.0, 1.5, 1.5, 1.5});
@@ -75,10 +75,32 @@ class PreluTestInt8 : public MNNTestCase {
                                                    -4.5, -4.5, -4.5, -4.5, -4.5, -4.5, -4.5, -4.5,
                                                    4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0
                                                    };
-        output->writeScaleMap(0.03567, 1.0);
+        output->writeScaleMap(0.03333, 7.f);
         auto gotOutput                          = output->readMap<float>();
-        if (!checkVector<float>(gotOutput, expectedOutput.data(), 4, 0.05)) {
-            MNN_ERROR("PreluTest test failed!\n");
+        if (!checkVector<float>(gotOutput, expectedOutput.data(), 96, 0.1)) {
+            MNN_ERROR("PreluTest test 1 failed!\n");
+            return false;
+        }
+        // prelu: one slope
+        auto output1 = _PRelu(input, {3.0});
+        output1      = _Convert(output1, NCHW);
+        const std::vector<float> expectedOutput1 = {-3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0,
+                                                   2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
+                                                   -9.0, -9.0, -9.0, -9.0, -9.0, -9.0, -9.0, -9.0,
+                                                   4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0,
+                                                   -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0,
+                                                   2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
+                                                   -9.0, -9.0, -9.0, -9.0, -9.0, -9.0, -9.0, -9.0,
+                                                   4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0,
+                                                   -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0,
+                                                   2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
+                                                   -9.0, -9.0, -9.0, -9.0, -9.0, -9.0, -9.0, -9.0,
+                                                   4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0,
+                                                   };
+        output1->writeScaleMap(0.05098, 48.54);
+        auto gotOutput1 = output1->readMap<float>();
+        if (!checkVector<float>(gotOutput1, expectedOutput1.data(), 96, 0.1)) {
+            MNN_ERROR("PreluTest test 2 failed!\n");
             return false;
         }
         return true;
diff --git a/test/op/StftTest.cpp b/test/op/StftTest.cpp
new file mode 100644
index 000000000..894b849c0
--- /dev/null
+++ b/test/op/StftTest.cpp
@@ -0,0 +1,62 @@
+//
+//  StftTest.cpp
+//  MNNTests
+//
+//  Created by MNN on 2024/11/27.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef MNN_BUILD_AUDIO
+#include <MNN/expr/Expr.hpp>
+#include <MNN/expr/ExprCreator.hpp>
+#include "MNNTestSuite.h"
+#include "TestUtils.h"
+
+using namespace MNN::Express;
+class StftTest : public MNNTestCase {
+public:
+    virtual ~StftTest() = default;
+    virtual bool run(int precision) {
+        /*
+        Reference values below were produced with this PyTorch snippet:
+            import torch
+            freq, sample_rate, duration = 5, 100, 0.2
+            t = torch.arange(0, duration, 1.0 / sample_rate)
+            sine_wave = torch.sin(2 * torch.pi * freq * t)
+            n_fft, hop_length, win_length = 8, 4, 8
+            window = torch.hann_window(win_length)
+            stft_result = torch.stft(sine_wave, n_fft=n_fft, hop_length=hop_length,
+                                     win_length=win_length, window=window, center=False)
+            magnitude = torch.abs(stft_result).transpose(1, 0)
+        */
+        auto signal = _Input({ 20 }, NCHW);
+        auto window = _Input({  8 }, NCHW);
+        signal->setName("signal");
+        window->setName("window");
+        const float signalData[] = {
+            0.000, 0.309,  0.588,  0.809,  0.951,  1.000,  0.951,  0.809,  0.588,  0.309,
+            0.000, -0.309, -0.588, -0.809, -0.951, -1.000, -0.951, -0.809, -0.588, -0.309
+        };
+        const float windowData[] = { 0.000, 0.146, 0.500, 0.854, 1.000, 0.854, 0.500, 0.146 };
+        auto signalPtr           = signal->writeMap<float>();
+        auto windowPtr           = window->writeMap<float>();
+        memcpy(signalPtr, signalData, 20 * sizeof(float));
+        memcpy(windowPtr, windowData,  8 * sizeof(float));
+        auto output                  = _Stft(signal, window, 8, 4); // presumably n_fft = 8, hop_length = 4 as in the snippet above - confirm against _Stft
+        const float expectedOutput[] = { // 20 magnitudes; per the snippet: 4 frames x 5 frequency bins, frame-major
+            3.428, 1.958, 0.203, 0.029, 0.013, 2.119, 1.501, 0.261, 0.041, 0.008,
+            2.119, 1.501, 0.261, 0.041, 0.008, 3.428, 1.958, 0.203, 0.029, 0.013
+        };
+        auto gotOutput = output->readMap<float>();
+        for (int i = 0; i < 20; ++i) {
+            auto diff = ::fabsf(gotOutput[i] - expectedOutput[i]);
+            if (diff > 0.01) { // absolute tolerance; the reference values are rounded to 3 decimals
+                MNN_ERROR("StftTest test failed: %f - %f!\n", expectedOutput[i], gotOutput[i]);
+                return false;
+            }
+        }
+        return true;
+    }
+};
+MNNTestSuiteRegister(StftTest, "op/stft");
+#endif // MNN_BUILD_AUDIO
\ No newline at end of file
diff --git a/test/sharedmem/AhardWareBufferTest.cpp b/test/sharedmem/AhardWareBufferTest.cpp
new file mode 100644
index 000000000..002a8caa4
--- /dev/null
+++ b/test/sharedmem/AhardWareBufferTest.cpp
@@ -0,0 +1,311 @@
+//
+//  AhardWareBufferTest.cpp
+//  MNNTests
+//
+//  Created by MNN on 2019/09/10.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __ANDROID__
+#include <dlfcn.h>
+#include <MNN/expr/ExprCreator.hpp>
+#include "MNNTestSuite.h"
+#include "MNN_generated.h"
+#include <MNN/expr/Module.hpp>
+#include "TestUtils.h"
+#include <android/hardware_buffer.h>
+#define MNN_OPEN_TIME_TRACE
+#include <MNN/AutoTime.hpp>
+
+using namespace MNN;
+using namespace MNN::Express;
+
+/*
+Ref from 
+https://android.googlesource.com/platform/external/libchrome/+/refs/tags/aml_res_331314010/base/android/android_hardware_buffer_compat.h
+*/
+using PFAHardwareBuffer_allocate = int (*)(const AHardwareBuffer_Desc* desc,
+                                            AHardwareBuffer** outBuffer);
+using PFAHardwareBuffer_acquire = void (*)(AHardwareBuffer* buffer);
+using PFAHardwareBuffer_describe = void (*)(const AHardwareBuffer* buffer,
+                                            AHardwareBuffer_Desc* outDesc);
+using PFAHardwareBuffer_lock = int (*)(AHardwareBuffer* buffer,
+                                       uint64_t usage,
+                                       int32_t fence,
+                                       const ARect* rect,
+                                       void** outVirtualAddress);
+using PFAHardwareBuffer_recvHandleFromUnixSocket =
+    int (*)(int socketFd, AHardwareBuffer** outBuffer);
+using PFAHardwareBuffer_release = void (*)(AHardwareBuffer* buffer);
+using PFAHardwareBuffer_sendHandleToUnixSocket =
+    int (*)(const AHardwareBuffer* buffer, int socketFd);
+using PFAHardwareBuffer_unlock = int (*)(AHardwareBuffer* buffer,
+                                         int32_t* fence);
+
+class AndroidHardwareBufferCompat { // thin wrapper over AHardwareBuffer_* functions held as function pointers
+ public:
+  bool IsSupportAvailable() const { // always reports true; no runtime probe is performed here
+    return true;
+  }
+  AndroidHardwareBufferCompat(); // fills every function pointer below
+  int Allocate(const AHardwareBuffer_Desc* desc, AHardwareBuffer** outBuffer);
+  void Acquire(AHardwareBuffer* buffer);
+  void Describe(const AHardwareBuffer* buffer, AHardwareBuffer_Desc* outDesc);
+  int Lock(AHardwareBuffer* buffer,
+           uint64_t usage,
+           int32_t fence,
+           const ARect* rect,
+           void** out_virtual_address);
+  int RecvHandleFromUnixSocket(int socketFd, AHardwareBuffer** outBuffer);
+  void Release(AHardwareBuffer* buffer);
+  int SendHandleToUnixSocket(const AHardwareBuffer* buffer, int socketFd);
+  int Unlock(AHardwareBuffer* buffer, int32_t* fence);
+ private:
+  PFAHardwareBuffer_allocate allocate_; // no default initializers: all are assigned in the constructor
+  PFAHardwareBuffer_acquire acquire_;
+  PFAHardwareBuffer_describe describe_;
+  PFAHardwareBuffer_lock lock_;
+  PFAHardwareBuffer_recvHandleFromUnixSocket recv_handle_;
+  PFAHardwareBuffer_release release_;
+  PFAHardwareBuffer_sendHandleToUnixSocket send_handle_;
+  PFAHardwareBuffer_unlock unlock_;
+};
+#define DCHECK(x) MNN_ASSERT(x)
+AndroidHardwareBufferCompat::AndroidHardwareBufferCompat() {
+  // Look up every AHardwareBuffer_* entry point at runtime with dlsym instead of
+  // calling the NDK functions directly (approach taken from the Chromium compat
+  // header referenced above, which notes direct calls are incompatible with
+  // pre-Oreo devices). dlopen(nullptr, ...) returns a handle for the main program.
+  // DCHECK is defined as MNN_ASSERT in this file; NOTE(review): if MNN_ASSERT is
+  // compiled out, a symbol that fails to resolve leaves a null pointer - confirm.
+  void* main_dl_handle = dlopen(nullptr, RTLD_NOW);
+  *reinterpret_cast<void**>(&allocate_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_allocate");
+  DCHECK(allocate_);
+  *reinterpret_cast<void**>(&acquire_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_acquire");
+  DCHECK(acquire_);
+  *reinterpret_cast<void**>(&describe_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_describe");
+  DCHECK(describe_);
+  *reinterpret_cast<void**>(&lock_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_lock");
+  DCHECK(lock_);
+  *reinterpret_cast<void**>(&recv_handle_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_recvHandleFromUnixSocket");
+  DCHECK(recv_handle_);
+  *reinterpret_cast<void**>(&release_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_release");
+  DCHECK(release_);
+  *reinterpret_cast<void**>(&send_handle_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_sendHandleToUnixSocket");
+  DCHECK(send_handle_);
+  *reinterpret_cast<void**>(&unlock_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_unlock");
+  DCHECK(unlock_);
+}
+
+int AndroidHardwareBufferCompat::Allocate(const AHardwareBuffer_Desc* desc,
+                                           AHardwareBuffer** out_buffer) {
+  DCHECK(IsSupportAvailable());
+  return allocate_(desc, out_buffer); // forwards to the pointer resolved for "AHardwareBuffer_allocate"
+}
+void AndroidHardwareBufferCompat::Acquire(AHardwareBuffer* buffer) {
+  DCHECK(IsSupportAvailable());
+  acquire_(buffer); // forwards to "AHardwareBuffer_acquire"
+}
+void AndroidHardwareBufferCompat::Describe(const AHardwareBuffer* buffer,
+                                           AHardwareBuffer_Desc* out_desc) {
+  DCHECK(IsSupportAvailable());
+  describe_(buffer, out_desc); // forwards to "AHardwareBuffer_describe"
+}
+int AndroidHardwareBufferCompat::Lock(AHardwareBuffer* buffer,
+                                      uint64_t usage,
+                                      int32_t fence,
+                                      const ARect* rect,
+                                      void** out_virtual_address) {
+  DCHECK(IsSupportAvailable());
+  return lock_(buffer, usage, fence, rect, out_virtual_address); // forwards to "AHardwareBuffer_lock"
+}
+int AndroidHardwareBufferCompat::RecvHandleFromUnixSocket(
+    int socket_fd,
+    AHardwareBuffer** out_buffer) {
+  DCHECK(IsSupportAvailable());
+  return recv_handle_(socket_fd, out_buffer); // forwards to "AHardwareBuffer_recvHandleFromUnixSocket"
+}
+void AndroidHardwareBufferCompat::Release(AHardwareBuffer* buffer) {
+  DCHECK(IsSupportAvailable());
+  release_(buffer); // forwards to "AHardwareBuffer_release"
+}
+int AndroidHardwareBufferCompat::SendHandleToUnixSocket(
+    const AHardwareBuffer* buffer,
+    int socket_fd) {
+  DCHECK(IsSupportAvailable());
+  return send_handle_(buffer, socket_fd); // forwards to "AHardwareBuffer_sendHandleToUnixSocket"
+}
+int AndroidHardwareBufferCompat::Unlock(AHardwareBuffer* buffer,
+                                        int32_t* fence) {
+  DCHECK(IsSupportAvailable());
+  return unlock_(buffer, fence); // forwards to "AHardwareBuffer_unlock"
+}
+
+static std::shared_ptr<AndroidHardwareBufferCompat> gFunction;
+
+// Allocate a width x height R8G8B8A8 buffer and, when data is non-null, fill it from data; nullptr if allocation fails.
+static AHardwareBuffer* creatAHardwareBuffer(int width, int height, void *data){
+    AHardwareBuffer_Desc bufferDesc = {};
+    bufferDesc.width = width;
+    bufferDesc.height = height;
+    bufferDesc.layers = 1;
+    bufferDesc.format = AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM;
+    bufferDesc.usage = AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN | AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
+    AHardwareBuffer* buffer = nullptr;
+    int result = gFunction->Allocate(&bufferDesc, &buffer);
+    if (result != 0 || nullptr == buffer) {
+        MNN_ERROR("alloc AHardwareBuffer failed   %d\n", result);
+        return nullptr; // never hand a null buffer to Lock/Unlock below
+    }
+    if (nullptr == data) {
+        return buffer; // caller only wants an empty buffer
+    }
+    void* map = nullptr;
+    ARect rect = { 0, 0, width, height };  // Define the region to lock
+    result = gFunction->Lock(buffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1, &rect, &map);
+    if (result != 0) {
+        MNN_ERROR("Handle lock failed\n");
+        return buffer; // nothing was locked, so there is nothing to copy or unlock
+    }
+    if (map) {
+        memcpy(map, data, width * height * 4); // 4 bytes per pixel; assumes row stride == width (not checked via Describe)
+    }
+    gFunction->Unlock(buffer, nullptr);
+    return buffer;
+}
+
+static void ReleaseAHardWareBuffer(AHardwareBuffer* buffer){
+    gFunction->Release(buffer);
+}
+
+// Copy width*height*4 bytes out of buffer into data; does nothing when either pointer is null or the lock fails.
+static void copyDataFromAHardWareBuffer(AHardwareBuffer* buffer, int width, int height, void *data){
+    if (nullptr == buffer || nullptr == data) {
+        return;
+    }
+    void* map = nullptr;
+    ARect rect = { 0, 0, width, height };  // Define the region to lock
+    int result = gFunction->Lock(buffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1, &rect, &map);
+    if (result != 0) {
+        MNN_ERROR("Handle lock failed\n");
+        return; // nothing was locked, so there is nothing to copy or unlock
+    }
+    if (map) {
+        memcpy(data, map, width * height * 4); // 4 bytes per pixel; assumes row stride == width
+    }
+    gFunction->Unlock(buffer, nullptr);
+}
+
+static bool checkvalue(const float* ref, const unsigned char* out, int size){ // exact element-wise match of out[i] against ref[i]
+    for (int idx = 0; idx < size; ++idx) {
+        const float got = (float)out[idx];
+        if (ref[idx] == got) continue;
+        MNN_ERROR("%d:  ref %f != out %f\n", idx, ref[idx], got);
+        return false; // stop at the first mismatch
+    }
+    return true;
+}
+
+const int width = 1280;
+const int height = 720;
+const int channel = 3;
+static std::shared_ptr<Module> _createModel() { // builds a one-op Transpose graph and loads it back as a Module
+    auto x = _Input({1, channel, height, width}, NCHW, halide_type_of<float>());
+    x->setName("Input");
+    auto y = _Transpose(x, {0, 1, 3, 2}); // swaps the last two axes
+    y->setName("Transpose");
+    std::unique_ptr<NetT> net(new NetT);
+    Variable::save({y}, net.get()); // serialize the expression graph into the NetT object
+    flatbuffers::FlatBufferBuilder builder;
+    auto len = MNN::Net::Pack(builder, net.get());
+    builder.Finish(len);
+    return std::shared_ptr<Module>(Module::load({"Input"}, {"Transpose"}, builder.GetBufferPointer(), builder.GetSize())); // names match setName above
+}
+// Verifies AHardwareBuffer-backed input/output (OpenCL backend only) against the host-memory result, then times both paths.
+class AhardWareBufferTest : public MNNTestCase {
+public:
+    virtual bool run(int precision) {
+        if (MNN_FORWARD_OPENCL != getCurrentType()) {
+            MNN_ERROR("Currently forwardtype[%d] run sharedmem/AhardWareBuffer has error, skip it\n", getCurrentType());
+            return true;
+        }
+        if (nullptr == gFunction) gFunction.reset(new AndroidHardwareBufferCompat); // resolve symbols only when the test really runs
+        auto net = _createModel();
+        auto x = _Input({1, channel, height, width}, NCHW, halide_type_of<float>());
+        // Heap storage: two 4*height*width byte arrays (~3.5 MB each) are too large to keep on the stack.
+        std::vector<unsigned char> inputData(4 * height * width);
+        std::vector<unsigned char> outputData(4 * height * width);
+        for(int i = 0; i < 4 * height * width; ++i){
+            inputData[i] = (unsigned char)i;
+        }
+        // ahardwarebuffer default format is nc4hw4
+        {
+            auto xPtr = x->writeMap<float>();
+            for (int i = 0; i < channel; ++i){
+                for (int j = 0; j < height * width; ++j) {
+                    xPtr[i * height * width + j] = (float)inputData[j * 4 + i];
+                }
+            }
+            x->unMap();
+        }
+        auto outputs = net->onForward({x});
+        outputs[0] = _Convert(outputs[0], NC4HW4);
+        auto refPtr = outputs[0]->readMap<float>();
+        auto size = outputs[0]->getInfo()->size;
+
+        auto xShared = _Input({1, channel, height, width}, NCHW, halide_type_of<float>());
+        auto inputAhardwareBuffer = creatAHardwareBuffer(width, height, inputData.data());
+        auto outputAhardwareBuffer = creatAHardwareBuffer(width, height, nullptr);
+        auto releaseBuffers = [&]() { // used on every exit path so the buffers are not leaked
+            if (inputAhardwareBuffer) ReleaseAHardWareBuffer(inputAhardwareBuffer);
+            if (outputAhardwareBuffer) ReleaseAHardWareBuffer(outputAhardwareBuffer);
+        };
+        if (nullptr == inputAhardwareBuffer || nullptr == outputAhardwareBuffer) {
+            releaseBuffers();
+            return false;
+        }
+        volatile uint64_t inputValue = (uint64_t)inputAhardwareBuffer;
+        xShared->setDevicePtr((void*)inputValue, MNN_MEMORY_AHARDWAREBUFFER);
+        auto outputsShared = net->onForward({xShared});
+        volatile uint64_t outputValue = (uint64_t)outputAhardwareBuffer; // the dedicated output buffer, not the input one
+        outputsShared[0]->copyToDevicePtr((void*)outputValue, MNN_MEMORY_AHARDWAREBUFFER);
+        copyDataFromAHardWareBuffer(outputAhardwareBuffer, width, height, outputData.data());
+        if(checkvalue(refPtr, outputData.data(), size) == false){
+            MNN_ERROR("sharedmem/AhardWareBuffer test failed!\n");
+            releaseBuffers();
+            return false;
+        }
+        const auto time = 100; // iterations per speed measurement
+        {
+            MNN::Timer _t;
+            for (int t = 0; t < time; ++t) {
+                x->writeMap<float>();
+                auto outputs = net->onForward({x});
+                outputs[0]->readMap<float>();
+            }
+            float timeCost = _t.durationInUs() / 1000.0f / (float)time;
+            MNN_PRINT("cpu copy [%d, %d, %d], Avg time: %f ms\n", channel, height, width, timeCost);
+        }
+        {
+            MNN::Timer _t;
+            for (int t = 0; t < time; ++t) {
+                xShared->setDevicePtr((void*)inputValue, MNN_MEMORY_AHARDWAREBUFFER);
+                auto outputs = net->onForward({xShared});
+                outputs[0]->copyToDevicePtr((void*)outputValue, MNN_MEMORY_AHARDWAREBUFFER);
+            }
+            float timeCost = _t.durationInUs() / 1000.0f / (float)time;
+            MNN_PRINT("shared memory copy [%d, %d, %d], Avg time: %f ms\n", channel, height, width, timeCost);
+        }
+        releaseBuffers();
+        return true;
+    }
+};
+
+MNNTestSuiteRegister(AhardWareBufferTest, "sharedmem/AhardWareBuffer");
+#endif
diff --git a/test/speed/StftSpeed.cpp b/test/speed/StftSpeed.cpp
new file mode 100644
index 000000000..4d40d3112
--- /dev/null
+++ b/test/speed/StftSpeed.cpp
@@ -0,0 +1,40 @@
+//
+//  StftSpeed.cpp
+//  MNNTests
+//
+//  Created by MNN on 2024/11/27.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+#ifdef MNN_BUILD_AUDIO
+
+#include <math.h>
+#include <MNN/expr/Expr.hpp>
+#include <MNN/expr/ExprCreator.hpp>
+#include <random>
+#define MNN_OPEN_TIME_TRACE
+#include <MNN/AutoTime.hpp>
+#include "MNNTestSuite.h"
+using namespace MNN::Express;
+#define SAMPLE 10240
+#define NFFT 256
+#define HOP 128
+#define TIME 100
+class StftSpeed : public MNNTestCase { // timing-only case: always returns true, no numerical check
+public:
+    virtual bool run(int precision) {
+        auto x = _Input({SAMPLE}, NHWC);
+        auto w = _Input({NFFT}, NHWC);
+        auto y = _Stft(x, w, NFFT, HOP);
+        {
+            AUTOTIME; // reports the elapsed time of this scope (all TIME iterations together)
+            for (int i = 0; i < TIME; ++i) {
+                x->writeMap<float>(); // input contents are never filled in; NOTE(review): presumably only marks the input dirty - confirm
+                w->writeMap<float>();
+                y->readMap<float>(); // forces evaluation of the STFT expression
+            }
+        }
+        return true;
+    }
+};
+MNNTestSuiteRegister(StftSpeed, "speed/stft");
+#endif // MNN_BUILD_AUDIO
\ No newline at end of file
diff --git a/tools/audio/CMakeLists.txt b/tools/audio/CMakeLists.txt
new file mode 100644
index 000000000..e72aa450d
--- /dev/null
+++ b/tools/audio/CMakeLists.txt
@@ -0,0 +1,44 @@
+IF(MNN_BUILD_AUDIO)
+  # audio submodule start
+  option(MNN_AUDIO_TEST "Enable audio test" OFF)
+
+  SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../)
+  include_directories(${CMAKE_CURRENT_LIST_DIR}/include)
+  include_directories(${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/imageHelper/)
+
+  # include(${CMAKE_CURRENT_LIST_DIR}/test/CMakeLists.txt)
+  if(${MNN_AUDIO_TEST})
+    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/test)
+  endif()
+
+  # include dir
+  include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
+
+  # source files
+  FILE(GLOB AUDIO_SRCS ${CMAKE_CURRENT_LIST_DIR}/source/*.cpp)
+
+  IF(MNN_SEP_BUILD)
+    IF(MNN_BUILD_SHARED_LIBS)
+      add_library(MNNAudio SHARED ${AUDIO_SRCS})
+      target_link_libraries(MNNAudio MNN MNN_Express)
+    ELSE()
+      add_library(MNNAudio STATIC ${AUDIO_SRCS})
+    ENDIF()
+  ELSE()
+    add_library(MNNAudio OBJECT ${AUDIO_SRCS})
+  ENDIF()
+  # copy header files
+  IF(CMAKE_SYSTEM_NAME MATCHES "^Android" AND NOT MNN_BUILD_FOR_ANDROID_COMMAND)
+    IF(NOT NATIVE_INCLUDE_OUTPUT)
+      set(NATIVE_INCLUDE_OUTPUT ".")
+    ENDIF()
+    add_custom_command(
+      TARGET MNNAudio
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND}
+      ARGS -E copy_directory ${CMAKE_CURRENT_LIST_DIR}/include ${NATIVE_INCLUDE_OUTPUT}
+    )
+  ELSE()
+    INSTALL(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/include/audio DESTINATION include FILES_MATCHING PATTERN *.hpp)
+  ENDIF()
+ENDIF()
\ No newline at end of file
diff --git a/tools/audio/README.md b/tools/audio/README.md
new file mode 100644
index 000000000..f250212a7
--- /dev/null
+++ b/tools/audio/README.md
@@ -0,0 +1,9 @@
+# MNN audio
+
+MNN audio is a collection of utility functions for audio processing.
+
+## Usage
+Compile MNN with audio, using below command:
+```bash
+cmake -DMNN_BUILD_AUDIO=ON .. && make -j8
+```
diff --git a/tools/audio/include/audio/audio.hpp b/tools/audio/include/audio/audio.hpp
new file mode 100644
index 000000000..3e14912cb
--- /dev/null
+++ b/tools/audio/include/audio/audio.hpp
@@ -0,0 +1,169 @@
+//
+//  audio.hpp
+//  MNN
+//
+//  Created by MNN on 2024/11/15.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef MNN_AUDIO_HPP
+#define MNN_AUDIO_HPP
+
+#include <MNN/MNNDefine.h>
+#include <MNN/expr/Expr.hpp>
+#include <MNN/expr/NeuralNetWorkOp.hpp>
+
+namespace MNN {
+namespace AUDIO {
+
+using namespace Express;
+
+enum WINDOW_TYPE { HAMMING = 0, HANNING = 1, POVEY = 2, RECTANGULAR = 3, BLACKMAN = 4 };
+
+/**
+ * Parameters consumed by `melscale_fbanks` (and by `mel_spectrogram` via `mel_params`).
+ */
+struct MelscaleParams {
+    /** Number of mel filterbanks, default is 128. */
+    int n_mels = 128;
+    /** Number of FFT bins, default is 400. */
+    int n_fft = 400;
+    /** Sample rate, default is 16000. */
+    int sample_rate = 16000;
+    /** Mel scale selector: true uses the `htk` scale, false uses `slaney`; default is true. */
+    bool htk = true;
+    /** Divide the triangular mel weights by the width of the mel band, default is false. */
+    bool norm = false;
+    /** Minimum frequency, default is 0. */
+    float f_min = 0.0;
+    /** Maximum frequency; the default 0 means `sample_rate / 2`. */
+    float f_max = 0.0;
+};
+
+/**
+ * Parameters consumed by `spectrogram` (and by `mel_spectrogram` via `spec_params`).
+ */
+struct SpectrogramParams {
+    /** Size of the FFT window, default is 400. */
+    int n_fft = 400;
+
+    /** Hop length between frames; the default 0 means `n_fft / 2`. */
+    int hop_length = 0;
+
+    /** Window length; the default 0 means `n_fft`. */
+    int win_length = 0;
+
+    /** Type of window function (a WINDOW_TYPE value), default is Hann window (HANNING). */
+    int window_type = HANNING;
+
+    /** Constant padding value on the left side of the input audio, default is 0. */
+    int pad_left = 0;
+
+    /** Constant padding value on the right side of the input audio, default is 0. */
+    int pad_right = 0;
+
+    /** Whether to apply center padding to the STFT input, default is false. */
+    bool center = false;
+
+    /** Whether to normalize the output, default is false. */
+    bool normalized = false;
+
+    /** Padding mode used when `center` is true, default is reflect (REFLECT). */
+    int pad_mode = REFLECT;
+
+    /** Power scaling factor, default is 2.0. */
+    float power = 2.0;
+};
+
+/** @brief load audio from file
+ * @param filename audio file path
+ * @param sr NOTE(review): undocumented originally; presumably the requested sample rate, with 0 keeping the file's own rate - confirm
+ * @param frame_offset start frame
+ * @param num_frames number of frames
+ * @return pair<audio tensor, sample rate>
+ */
+MNN_PUBLIC std::pair<VARP, int> load(const std::string& filename, int sr = 0, int frame_offset = 0,
+                                     int num_frames = -1);
+
+/**
+ * @brief save audio to file
+ * @param filename audio file path
+ * @param audio audio tensor
+ * @param sample_rate sample rate
+ */
+MNN_PUBLIC bool save(const std::string& filename, VARP audio, int sample_rate);
+
+/**
+ * @brief compute hamming window
+ * @param window_size window size
+ * @param periodic periodic
+ * @param alpha alpha
+ * @param beta beta
+ * @return hamming window tensor
+ */
+MNN_PUBLIC VARP hamming_window(int window_size, bool periodic = false, float alpha = 0.54, float beta = 0.46);
+
+/**
+ * @brief compute hann window
+ * @param window_size window size
+ * @param periodic periodic
+ * @return hann window tensor
+ */
+MNN_PUBLIC VARP hann_window(int window_size, bool periodic = false);
+
+/**
+ * @brief compute melscale fbanks
+ * @param params melscale fbanks params
+ * @return melscale fbanks var
+ */
+MNN_PUBLIC VARP melscale_fbanks(const MelscaleParams* params = nullptr);
+
+/**
+ * @brief compute spectrogram from audio
+ * @param waveform waveform tensor
+ * @param params spectrogram params
+ * @return spectrogram tensor
+ */
+MNN_PUBLIC VARP spectrogram(VARP waveform, const SpectrogramParams* params = nullptr);
+
+/**
+ * @brief compute mel spectrogram from audio
+ * @param waveform waveform of audio signal.
+ * @param mel_params melscale fbanks params
+ * @param spec_params spectrogram params
+ * @return mel spectrogram tensor
+ */
+MNN_PUBLIC VARP mel_spectrogram(VARP waveform, const MelscaleParams* mel_params = nullptr,
+                                const SpectrogramParams* spec_params = nullptr);
+
+/**
+ * @brief compute fbank from audio
+ * @param waveform waveform tensor
+ * @param sampling_rate sampling rate
+ * @param n_mels number of mel bins
+ * @param n_fft number of fft bins
+ * @param hop_length hop length
+ * @param dither dither amount, default is 0
+ * @param preemphasis preemphasis coefficient, default is 0.97
+ * @return fbank tensor
+ */
+MNN_PUBLIC VARP fbank(VARP waveform, int sampling_rate = 16000, int n_mels = 80, int n_fft = 400,
+                      int hop_length = 160, float dither = 0.f, float preemphasis = 0.97);
+
+/**
+ * @brief compute whisper fbank from audio
+ * @param waveform waveform tensor
+ * @param sample_rate sample rate
+ * @param n_mels number of mel bins
+ * @param n_fft number of fft bins
+ * @param hop_length hop length
+ * @param chunk_len chunk length
+ * @return fbank tensor
+ */
+MNN_PUBLIC VARP whisper_fbank(VARP waveform, int sample_rate = 16000, int n_mels = 128, int n_fft = 400,
+                              int hop_length = 160, int chunk_len = 0);
+
+} // namespace AUDIO
+} // namespace MNN
+
+#endif // MNN_AUDIO_HPP
\ No newline at end of file
diff --git a/tools/audio/source/audio.cpp b/tools/audio/source/audio.cpp
new file mode 100644
index 000000000..c91748723
--- /dev/null
+++ b/tools/audio/source/audio.cpp
@@ -0,0 +1,535 @@
+//
+//  audio.cpp
+//  MNN
+//
+//  Created by MNN on 2024/11/15.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "audio/audio.hpp"
+#include <MNN/expr/MathOp.hpp>
+#include <MNN/expr/NeuralNetWorkOp.hpp>
+#include <cmath>
+#include <algorithm>
+#include <complex>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#ifndef M_PI
+#define M_PI 3.141592654
+#endif
+#ifdef _MSC_VER
+#define NOMINMAX
+#include <intrin.h>
+#include <windows.h>
+#endif
+
+namespace MNN {
+namespace AUDIO {
+#ifdef _MSC_VER
+inline uint32_t mnn_clz( uint32_t value ) { // count leading zero bits of a 32-bit value (MSVC intrinsic path)
+    DWORD leading_zero = 0;
+    if (_BitScanReverse(&leading_zero, value)) {
+        return 31 - leading_zero; // _BitScanReverse reports the index of the highest set bit
+    }else {
+         // value == 0: no bit is set, so all 32 bits count as leading zeros
+         return 32;
+    }
+}
+#else
+inline uint32_t mnn_clz( uint32_t value ) { // count leading zero bits of a 32-bit value (GCC/Clang builtin path)
+    return value ? __builtin_clz(value) : 32; // __builtin_clz(0) is undefined; return 32 like the MSVC branch
+}
+#endif
+struct WaveHeader { // in-memory image of a RIFF/WAVE header; load() fills it field by field, save() writes it verbatim
+    void SeekToDataChunk(std::istream &is) { // skip chunks until subchunk2_id/size describe the "data" chunk
+        // 0x61746164 is the tag "data" read as a little-endian int32 (bytes 'd','a','t','a')
+        while (is && subchunk2_id != 0x61746164) {
+            is.seekg(subchunk2_size, std::istream::cur); // jump over the payload of the current chunk
+            is.read(reinterpret_cast<char *>(&subchunk2_id), sizeof(int32_t)); // id of the next chunk
+            is.read(reinterpret_cast<char *>(&subchunk2_size), sizeof(int32_t)); // payload size of the next chunk
+        }
+    }
+    int32_t chunk_id = 0x46464952; // "RIFF"
+    int32_t chunk_size; // set by save() to 36 + subchunk2_size
+    int32_t format         = 0x45564157; // "WAVE"
+    int32_t subchunk1_id   = 0x20746d66; // "fmt "
+    int32_t subchunk1_size = 16;         // size of the fmt payload for plain PCM
+    int16_t audio_format   = 1;          // PCM = 1 (load() also accepts 3 = IEEE float)
+    int16_t num_channels   = 1;          // Mono
+    int32_t sample_rate; // frames per second
+    int32_t byte_rate; // sample_rate * num_channels * bits_per_sample / 8
+    int16_t block_align; // bytes per frame: num_channels * bits_per_sample / 8
+    int16_t bits_per_sample = 16;
+    int32_t subchunk2_id    = 0x61746164; // "data"
+    int32_t subchunk2_size; // number of bytes of sample data
+};
+
+// Load a RIFF/WAVE file and return {samples, sample_rate}; returns {nullptr, 0} on any error.
+// Only the first channel is kept. 8/16/32-bit PCM is scaled to about [-1, 1]; 32-bit float is copied.
+// @param filename     path of the wav file
+// @param sr           target sample rate; when > 0 and different from the file's rate, the
+//                     samples are linearly interpolated to `sr`
+// @param frame_offset index of the first frame to read
+// @param num_frames   number of frames to read; <= 0 or too large means "until the end"
+std::pair<VARP, int> load(const std::string &filename, int sr, int frame_offset, int num_frames) {
+    std::ifstream is(filename, std::ifstream::binary);
+    auto ret = std::make_pair<VARP, int>(nullptr, 0); // never modified: every error path returns it as is
+    if (!is) {
+        MNN_ERROR("Failed to open file: %s\n", filename.c_str());
+        return ret;
+    }
+    WaveHeader header{};
+    is.read(reinterpret_cast<char *>(&header.chunk_id), sizeof(header.chunk_id));
+    if (header.chunk_id != 0x46464952) { // "RIFF"
+        MNN_ERROR("Expected chunk_id RIFF. Given: 0x%08x\n", header.chunk_id);
+        return ret;
+    }
+
+    is.read(reinterpret_cast<char *>(&header.chunk_size), sizeof(header.chunk_size));
+    is.read(reinterpret_cast<char *>(&header.format), sizeof(header.format));
+    if (header.format != 0x45564157) { // "WAVE"
+        MNN_ERROR("Expected format WAVE. Given: 0x%08x\n", header.format);
+        return ret;
+    }
+
+    is.read(reinterpret_cast<char *>(&header.subchunk1_id), sizeof(header.subchunk1_id));
+    is.read(reinterpret_cast<char *>(&header.subchunk1_size), sizeof(header.subchunk1_size));
+    if (header.subchunk1_id == 0x4b4e554a) { // "JUNK": skip one filler chunk placed before "fmt "
+        is.seekg(header.subchunk1_size, std::istream::cur);
+        is.read(reinterpret_cast<char *>(&header.subchunk1_id), sizeof(header.subchunk1_id));
+        is.read(reinterpret_cast<char *>(&header.subchunk1_size), sizeof(header.subchunk1_size));
+    }
+
+    if (header.subchunk1_id != 0x20746d66) { // "fmt "
+        MNN_ERROR("Expected subchunk1_id 'fmt '. Given: 0x%08x\n", header.subchunk1_id);
+        return ret;
+    }
+    if (header.subchunk1_size != 16 && header.subchunk1_size != 18) {
+        MNN_ERROR("Expected subchunk1_size 16 or 18. Given: %d\n", header.subchunk1_size);
+        return ret;
+    }
+    is.read(reinterpret_cast<char *>(&header.audio_format), sizeof(header.audio_format));
+    if (header.audio_format != 1 && header.audio_format != 3) {
+        MNN_ERROR("Unsupported audio_format: %d. Only PCM(1) and IEEE Float(3) supported.\n", header.audio_format);
+        return ret;
+    }
+
+    is.read(reinterpret_cast<char *>(&header.num_channels), sizeof(header.num_channels));
+    if (header.num_channels != 1) {
+        MNN_ERROR("Warning: %d channels found. Only the first channel will be used.\n", header.num_channels);
+    }
+
+    is.read(reinterpret_cast<char *>(&header.sample_rate), sizeof(header.sample_rate));
+    is.read(reinterpret_cast<char *>(&header.byte_rate), sizeof(header.byte_rate));
+    is.read(reinterpret_cast<char *>(&header.block_align), sizeof(header.block_align));
+    is.read(reinterpret_cast<char *>(&header.bits_per_sample), sizeof(header.bits_per_sample));
+
+    if (header.byte_rate != (header.sample_rate * header.num_channels * header.bits_per_sample / 8)) {
+        MNN_ERROR("Incorrect byte rate: %d. Expected: %d\n", header.byte_rate,
+                  header.sample_rate * header.num_channels * header.bits_per_sample / 8);
+        return ret;
+    }
+    if (header.block_align != (header.num_channels * header.bits_per_sample / 8)) {
+        MNN_ERROR("Incorrect block align: %d. Expected: %d\n", header.block_align,
+                  header.num_channels * header.bits_per_sample / 8);
+        return ret;
+    }
+    if (header.bits_per_sample != 8 && header.bits_per_sample != 16 && header.bits_per_sample != 32) {
+        MNN_ERROR("Unsupported bits_per_sample: %d. Only 8, 16, or 32 bits per sample supported.\n",
+                  header.bits_per_sample);
+        return ret;
+    }
+
+    if (header.subchunk1_size == 18) {
+        int16_t extra_size;
+        is.read(reinterpret_cast<char *>(&extra_size), sizeof(int16_t));
+        if (extra_size != 0) {
+            MNN_ERROR("Unexpected extra size: %d. Expected 0.\n", extra_size);
+            return ret;
+        }
+    }
+
+    is.read(reinterpret_cast<char *>(&header.subchunk2_id), sizeof(header.subchunk2_id));
+    is.read(reinterpret_cast<char *>(&header.subchunk2_size), sizeof(header.subchunk2_size));
+    header.SeekToDataChunk(is);
+
+    if (!is) {
+        MNN_ERROR("Could not locate data chunk.\n");
+        return ret;
+    }
+    // A header with 0 (or negative) channels passes the checks above with block_align <= 0: treat it as empty.
+    int total_frames = header.block_align > 0 ? header.subchunk2_size / header.block_align : 0;
+    if (frame_offset < 0 || frame_offset >= total_frames) {
+        MNN_ERROR("Frame offset out of range.\n");
+        return ret;
+    }
+
+    if (num_frames <= 0 || num_frames > total_frames - frame_offset) { // written so the sum cannot overflow
+        num_frames = total_frames - frame_offset;
+    }
+    is.seekg(frame_offset * header.block_align, std::istream::cur);
+    auto audio_var = _Input({num_frames}, NHWC); // kept local so a failed read never leaks a half-filled tensor
+    int out_rate   = header.sample_rate;
+    auto ans_ptr   = audio_var->writeMap<float>();
+    if (header.bits_per_sample == 16 && header.audio_format == 1) {
+        std::vector<int16_t> samples(num_frames * header.num_channels);
+        is.read(reinterpret_cast<char *>(samples.data()), num_frames * header.block_align);
+        if (!is) {
+            MNN_ERROR("Failed to read audio data.\n");
+            return ret;
+        }
+        for (int i = 0; i < num_frames; ++i) {
+            ans_ptr[i] = samples[i * header.num_channels] / 32768.f;
+        }
+    } else if (header.bits_per_sample == 8 && header.audio_format == 1) {
+        std::vector<uint8_t> samples(num_frames * header.num_channels);
+        is.read(reinterpret_cast<char *>(samples.data()), num_frames * header.block_align);
+        if (!is) {
+            MNN_ERROR("Failed to read audio data.\n");
+            return ret;
+        }
+        for (int i = 0; i < num_frames; ++i) {
+            ans_ptr[i] = static_cast<float>(samples[i * header.num_channels]) / 128.f - 1.f; // unsigned 8-bit, centered on 128
+        }
+    } else if (header.bits_per_sample == 32 && header.audio_format == 1) {
+        std::vector<int32_t> samples(num_frames * header.num_channels);
+        is.read(reinterpret_cast<char *>(samples.data()), num_frames * header.block_align);
+        if (!is) {
+            MNN_ERROR("Failed to read audio data.\n");
+            return ret;
+        }
+        for (int i = 0; i < num_frames; ++i) {
+            ans_ptr[i] = static_cast<float>(samples[i * header.num_channels]) / static_cast<float>(INT32_MAX);
+        }
+    } else if (header.bits_per_sample == 32 && header.audio_format == 3) {
+        std::vector<float> samples(num_frames * header.num_channels);
+        is.read(reinterpret_cast<char *>(samples.data()), num_frames * header.block_align);
+        if (!is) {
+            MNN_ERROR("Failed to read audio data.\n");
+            return ret;
+        }
+        for (int i = 0; i < num_frames; ++i) {
+            ans_ptr[i] = samples[i * header.num_channels];
+        }
+    } else {
+        MNN_ERROR("Unsupported bits per sample: %d or audio format: %d.\n", header.bits_per_sample,
+                  header.audio_format);
+        return ret;
+    }
+
+    if (sr > 0 && sr != out_rate) {
+        // resample by linear interpolation between the two nearest source samples
+        float resample_ratio    = static_cast<float>(sr) / header.sample_rate;
+        int resample_num_frames = static_cast<int>(num_frames * resample_ratio);
+        auto resampled_data     = _Input({resample_num_frames}, NHWC);
+        auto src                = audio_var->readMap<float>();
+        auto dst                = resampled_data->writeMap<float>();
+        for (int i = 0; i < resample_num_frames; ++i) {
+            float interp_index = i / resample_ratio;
+            int low_index      = std::min(static_cast<int>(interp_index), num_frames - 1); // clamp against float rounding
+            int high_index     = std::min(low_index + 1, num_frames - 1);
+            float frac         = interp_index - low_index;
+            dst[i]             = (1 - frac) * src[low_index] + frac * src[high_index];
+        }
+        audio_var = resampled_data;
+        out_rate  = sr;
+    }
+    return std::make_pair(audio_var, out_rate);
+}
+
+// Write a mono float signal (expected range [-1, 1], clamped otherwise) as a 16-bit PCM WAV file.
+// Returns true on success; false if the input is invalid or the file cannot be opened/written.
+bool save(const std::string &filename, VARP audio, int sample_rate) {
+    auto audio_ptr = (audio.get() && audio->getInfo()) ? audio->readMap<float>() : nullptr;
+    if (audio_ptr == nullptr) { // validate before creating the file so a bad call leaves nothing behind
+        MNN_ERROR("Invalid audio data for saving.\n");
+        return false;
+    }
+    std::ofstream os(filename, std::ios::binary);
+    if (!os) {
+        MNN_ERROR("Failed to open file for writing: %s\n", filename.c_str());
+        return false;
+    }
+    auto audio_size = audio->getInfo()->size;
+    WaveHeader header; // member defaults describe mono 16-bit PCM; only size/rate fields are filled here
+    header.num_channels   = 1;
+    header.sample_rate    = sample_rate;
+    header.byte_rate      = sample_rate * header.num_channels * (header.bits_per_sample / 8);
+    header.block_align    = header.num_channels * (header.bits_per_sample / 8);
+    header.subchunk2_size = audio_size * (header.bits_per_sample / 8);
+    header.chunk_size     = 36 + header.subchunk2_size; // header bytes after the chunk_size field + data bytes
+    os.write(reinterpret_cast<const char *>(&header), sizeof(WaveHeader));
+    std::vector<int16_t> pcm(audio_size); // convert float samples to int16, then write them in one call
+    for (int i = 0; i < audio_size; i++) {
+        pcm[i] = static_cast<int16_t>(std::max(-1.0f, std::min(1.0f, audio_ptr[i])) * 32767);
+    }
+    os.write(reinterpret_cast<const char *>(pcm.data()), pcm.size() * sizeof(int16_t));
+    os.close(); // close (and flush) first so buffered write errors are reported by the check below
+    if (!os) {
+        MNN_ERROR("Failed to write audio data to file.\n");
+        return false;
+    }
+    return true;
+}
+
+template <typename T>
+static inline VARP _var(std::vector<T> vec, const std::vector<int> &dims) { // file-local helper: turn host values into a var
+    return _Const(vec.data(), dims, NHWC, halide_type_of<T>()); // shape = dims, element type derived from T
+}
+
+// Smallest power of two that is >= x (0 maps to 1); callers are expected to pass x <= 2^31.
+unsigned int next_power_of_2(unsigned int x) {
+    const bool is_pow2_or_zero = (x & (x - 1)) == 0;
+    if (is_pow2_or_zero)
+        return x == 0 ? 1U : x;
+    return 1U << (32 - mnn_clz(x)); // one bit above the highest set bit of x
+}
+
+// Generalized Hamming window of length n_fft: w[n] = alpha - beta * cos(2 * pi * n / N).
+VARP hamming_window(int n_fft, bool periodic, float alpha, float beta) {
+    const int denom = periodic ? n_fft : n_fft - 1; // N is n_fft for a periodic window, n_fft - 1 otherwise
+    auto result     = _Input({n_fft}, NHWC);
+    auto out        = result->writeMap<float>();
+    for (int idx = 0; idx < n_fft; ++idx)
+        out[idx] = alpha - beta * std::cos(2.0 * M_PI * idx / denom);
+    return result;
+}
+
+// Hann window of length n_fft: w[n] = 0.5 * (1 - cos(2 * pi * n / N)).
+VARP hann_window(int n_fft, bool periodic) {
+    const int denom = periodic ? n_fft : n_fft - 1; // N is n_fft for a periodic window, n_fft - 1 otherwise
+    auto result     = _Input({n_fft}, NHWC);
+    auto out        = result->writeMap<float>();
+    for (int idx = 0; idx < n_fft; ++idx)
+        out[idx] = 0.5 * (1 - std::cos(2 * M_PI * idx / denom));
+    return result;
+}
+
+// Convert a frequency in Hz to mels: the HTK formula when htk is true, otherwise a
+// piecewise scale that is linear below 1000 Hz and logarithmic from 1000 Hz upwards.
+float hz_to_mel(float freq, bool htk) {
+    if (htk) {
+        return 2595 * std::log10(1 + freq / 700);
+    }
+    constexpr float kFMin = 0.0, kFSp = 200.0 / 3.0, kMinLogHz = 1000.0;
+    constexpr float kLogStep   = 0.06875177742094912; // numerically ln(6.4) / 27
+    constexpr float kMinLogMel = (kMinLogHz - kFMin) / kFSp;
+    if (freq >= kMinLogHz) {
+        return kMinLogMel + std::log(freq / kMinLogHz) / kLogStep;
+    }
+    return (freq - kFMin) / kFSp;
+}
+
+// Convert mels to a frequency in Hz: the HTK formula when htk is true, otherwise a
+// piecewise scale that is linear up to 1000 Hz and exponential above it.
+float mel_to_hz(float mel, bool htk) {
+    if (htk) {
+        return 700 * (std::pow(10, mel / 2595.0) - 1);
+    }
+    constexpr float kFMin = 0.0f, kFSp = 200.0f / 3, kMinLogHz = 1000.0f;
+    constexpr float kLogStep   = 0.06875177742094912; // numerically ln(6.4) / 27
+    constexpr float kMinLogMel = (kMinLogHz - kFMin) / kFSp;
+    if (mel >= kMinLogMel) {
+        return kMinLogHz * std::exp(kLogStep * (mel - kMinLogMel));
+    }
+    return kFMin + kFSp * mel;
+}
+
+VARP melscale_fbanks(const MelscaleParams *params) { // build the [n_mels, n_freqs] bank of triangular mel filters
+    int n_mels = 128, n_fft = 400, sample_rate = 16000; // values used when params is nullptr
+    bool htk = true, norm = false;
+    float f_min = 0.0, f_max = 0.0;
+    if (params != nullptr) {
+        n_mels      = params->n_mels;
+        n_fft       = params->n_fft;
+        sample_rate = params->sample_rate;
+        htk         = params->htk;
+        norm        = params->norm;
+        f_min       = params->f_min;
+        f_max       = params->f_max;
+    }
+    int n_freqs   = n_fft / 2 + 1; // number of frequency bins per filter row
+    float nyquist = 0.5 * sample_rate;
+    std::vector<float> all_freqs(n_freqs);
+    for (int i = 0; i < n_freqs; ++i) {
+        all_freqs[i] = i * nyquist / (n_freqs - 1); // bin frequencies, evenly spaced over [0, nyquist]
+    }
+    f_max         = f_max <= 0.0 ? nyquist : f_max; // a non-positive f_max means "up to nyquist"
+    float m_min   = hz_to_mel(f_min, htk);
+    float m_max   = hz_to_mel(f_max, htk);
+    float m_delta = (m_max - m_min) / (n_mels + 1); // spacing of n_mels + 2 equidistant points on the mel axis
+    // Filter n is a triangle over [left, right] in Hz that peaks at curr; consecutive filters overlap by half.
+    auto bins     = _Input({n_mels, n_freqs}, NHWC);
+    auto bins_ptr = bins->writeMap<float>();
+    for (int n = 0; n < n_mels; ++n) {
+        float left  = mel_to_hz(m_min + m_delta * (n + 0), htk);
+        float curr  = mel_to_hz(m_min + m_delta * (n + 1), htk);
+        float right = mel_to_hz(m_min + m_delta * (n + 2), htk);
+        float enorm = (htk && norm) ? 1.0 : 2.0 / (right - left); // NOTE(review): the 2 / width scaling is skipped only when htk && norm; confirm this is the intended condition
+        for (int k = 0; k < n_freqs; ++k) {
+            float val = 0.f, f_k = all_freqs[k];
+            if (f_k >= left && f_k <= curr) {
+                val = (f_k - left) / (curr - left); // rising edge
+            } else if (f_k > curr && f_k <= right) {
+                val = (right - f_k) / (right - curr); // falling edge
+            }
+            bins_ptr[n * n_freqs + k] = val * enorm; // row-major [n_mels, n_freqs]
+        }
+    }
+    return bins;
+}
+
+// Compute a spectrogram of `waveform` with _Stft, after optional padding, and raise it to `power`.
+// A null `params` selects the defaults declared below (n_fft 400, Hann window, power 2, no padding).
+VARP spectrogram(VARP waveform, const SpectrogramParams *params) {
+    int pad_left = 0, pad_right = 0, pad_mode = REFLECT;
+    int n_fft = 400, hop_length = 0, win_length = 0, window_type = HANNING;
+    bool center = false, normalized = false;
+    float power = 2.0;
+    if (params) {
+        pad_left    = params->pad_left;
+        pad_right   = params->pad_right;
+        center      = params->center;
+        pad_mode    = params->pad_mode;
+        n_fft       = params->n_fft;
+        hop_length  = params->hop_length;
+        win_length  = params->win_length;
+        window_type = params->window_type;
+        normalized  = params->normalized;
+        power       = params->power;
+    }
+    if (pad_left > 0 || pad_right > 0) { // explicit constant padding; a pad of exactly one sample must not be skipped
+        waveform = _Pad(waveform, _var<int>({pad_left, pad_right}, {2}), CONSTANT);
+    }
+    if (center) { // additionally pad n_fft / 2 samples on both sides using pad_mode
+        waveform = _Pad(waveform, _var<int>({n_fft / 2, n_fft / 2}, {2}), static_cast<PadValueMode>(pad_mode));
+    }
+    hop_length = hop_length ? hop_length : n_fft / 2; // 0 selects the default hop of n_fft / 2
+    win_length = win_length ? win_length : n_fft;     // 0 selects a window as long as the FFT
+    VARP window;
+    switch (window_type) {
+        case HAMMING:
+            window = hamming_window(win_length);
+            break;
+        case HANNING:
+        default: // unknown window types fall back to Hann
+            window = hann_window(win_length);
+            break;
+    }
+    auto specgram = _Stft(waveform, window, n_fft, hop_length);
+    if (normalized) { // divide by the L2 norm of the window
+        float window_norm = std::sqrt(_ReduceSum(_Square(window))->readMap<float>()[0]);
+        specgram          = specgram / _Scalar<float>(window_norm);
+    }
+    if (power == 2.0) {
+        specgram = _Square(specgram);
+    } else if (power > 0.0 && power != 1.0) { // power 1 keeps the _Stft output; other positive exponents are applied
+        specgram = _Pow(specgram, _Scalar<float>(power));
+    }
+    return specgram;
+}
+
+// Mel spectrogram: the spectrogram of `waveform` multiplied by the transposed mel filter bank.
+VARP mel_spectrogram(VARP waveform, const MelscaleParams *mel_params, const SpectrogramParams *spec_params) {
+    auto mel_filters = melscale_fbanks(mel_params);
+    auto spec        = spectrogram(waveform, spec_params);
+    return _MatMul(spec, mel_filters, false, true); // second operand is used transposed
+}
+
+// Compute log mel filter bank features: frame the waveform, optionally dither, remove each frame's
+// mean, pre-emphasize, then take the log mel spectrogram. Returns nullptr on invalid/too-short input.
+VARP fbank(VARP waveform, int sampling_rate, int n_mels, int n_fft, int hop_length, float dither, float preemphasis) {
+    auto info   = waveform.get() != nullptr ? waveform->getInfo() : nullptr;
+    int wav_len = info != nullptr ? static_cast<int>(info->size) : 0;
+    if (info == nullptr || n_fft <= 0 || hop_length <= 0 || wav_len < n_fft) {
+        return nullptr; // no complete frame available; also guards the division by hop_length below
+    }
+    // get_strided: sizes: [m, n_fft], strides: [windows_shift, 1]
+    int m                           = 1 + (wav_len - n_fft) / hop_length;
+    std::vector<int> strided_region = {
+        0, // src offset
+        wav_len,
+        hop_length,
+        1, // src strides
+        0, // dst offset
+        m * n_fft,
+        n_fft,
+        1, // dst strides
+        1, m, n_fft // dst sizes
+    };
+    auto strided_wav = _Raster({waveform}, strided_region, {m, n_fft});
+    auto wav_dim     = strided_wav->getInfo()->dim;
+    // add_dither: uniform noise in [-dither, dither] with the same shape as the frames
+    if (dither > 0.f) {
+        auto rand_dither = _RandomUnifom(_var<int>(wav_dim, {static_cast<int>(wav_dim.size())}),
+                                         halide_type_of<float>(), -dither, dither);
+        strided_wav      = strided_wav + rand_dither;
+    }
+    // subtract each row/frame by its mean
+    {
+        auto row_means   = _ReduceMean(strided_wav, {-1}, true);
+        strided_wav      = strided_wav - row_means;
+    }
+    if (preemphasis != 0.f) { // y[t] = x[t] - preemphasis * x[t - 1], with x[-1] taken as x[0] in every frame
+        std::vector<int> offset_region          = {
+            // region 0: column 0 is copied onto itself
+            0,                               // src offset
+            m * n_fft, n_fft, 1, // src strides
+            0,                               // dst offset
+            m * n_fft, n_fft, 1, // dst strides
+            1, m, 1,                         // dst sizes
+            // region 1: columns 0 .. n_fft - 2 are copied to columns 1 .. n_fft - 1
+            0,                               // src offset
+            m * n_fft, n_fft, 1, // src strides
+            1,                               // dst offset
+            m * n_fft, n_fft, 1, // dst strides
+            1, m, n_fft - 1            // dst sizes
+        };
+        auto offset_strided_wav = _Raster({strided_wav, strided_wav}, offset_region, {m, n_fft});
+        strided_wav             = strided_wav - _Scalar<float>(preemphasis) * offset_strided_wav;
+    }
+    int padded_n_fft = next_power_of_2(n_fft); // FFT size is rounded up to a power of two
+    MelscaleParams mel_params;
+    mel_params.n_mels      = n_mels;
+    mel_params.n_fft       = padded_n_fft;
+    mel_params.sample_rate = sampling_rate;
+    mel_params.f_min       = 20.0;
+    SpectrogramParams spec_params;
+    spec_params.n_fft      = padded_n_fft;
+    spec_params.hop_length = n_fft; // NOTE(review): hop is n_fft but the FFT/window length is padded_n_fft; confirm how _Stft treats the [m, n_fft] input
+    auto mel_energies      = mel_spectrogram(strided_wav, &mel_params, &spec_params);
+    mel_energies           = _Log(mel_energies);
+    return mel_energies;
+}
+
+VARP whisper_fbank(VARP waveform, int sample_rate, int n_mels, int n_fft, int hop_length, int chunk_len) { // log-mel features, returned in NCHW order
+    int n_samples = chunk_len * sample_rate; // target length in samples for a chunk of chunk_len seconds
+    int pad_right = n_samples - waveform->getInfo()->size; // samples missing to reach the target length
+    pad_right     = pad_right > 0 ? pad_right : 0; // never negative: longer inputs are not truncated here
+    MelscaleParams mel_params;
+    mel_params.n_mels      = n_mels;
+    mel_params.n_fft       = n_fft;
+    mel_params.sample_rate = sample_rate;
+    mel_params.htk         = false;
+    mel_params.norm        = true;
+    SpectrogramParams spec_params;
+    spec_params.pad_right  = pad_right;
+    spec_params.n_fft      = n_fft;
+    spec_params.hop_length = hop_length;
+    spec_params.center     = true;
+    auto mel_specgram      = mel_spectrogram(waveform, &mel_params, &spec_params);
+    mel_specgram = // keep all but the last row along dim 0; presumably -1 means "whole axis" for _Slice (TODO confirm)
+        _Slice(mel_specgram, _var<int>({0, 0}, {2}), _var<int>({mel_specgram->getInfo()->dim[0] - 1, -1}, {2}));
+    auto log_specgram = _Log(mel_specgram) / _Log(_Scalar<float>(10.0)); // log10(x) computed as ln(x) / ln(10)
+    log_specgram      = _Maximum(log_specgram, _ReduceMax(log_specgram) - _Scalar<float>(8.0)); // floor at (global max - 8)
+    log_specgram      = (log_specgram + _Scalar<float>(4.0)) / _Scalar<float>(4.0); // affine rescale: (x + 4) / 4
+    // NHWC -> NCHW
+    log_specgram = _Unsqueeze(log_specgram, {0, 1}); // add two leading axes
+    log_specgram = _Convert(log_specgram, NCHW);
+    log_specgram = _Squeeze(log_specgram, {2}); // drop axis 2 again after the layout conversion
+    return log_specgram;
+}
+
+} // namespace AUDIO
+} // namespace MNN
diff --git a/tools/audio/test/CMakeLists.txt b/tools/audio/test/CMakeLists.txt
new file mode 100644
index 000000000..67c152583
--- /dev/null
+++ b/tools/audio/test/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Fetch GoogleTest (pinned to a fixed commit archive) via FetchContent for the audio unit tests
+INCLUDE(FetchContent)
+FetchContent_Declare(
+  googletest
+  URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
+)
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
+# FILE(COPY ../imgs DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/)
+
+include_directories(${CMAKE_CURRENT_LIST_DIR}/)
+
+enable_testing()
+
+add_executable(audio_test audio_test.cpp)
+target_link_libraries(audio_test MNNAudio gtest_main)
+
+include(GoogleTest)
+gtest_discover_tests(audio_test)
diff --git a/tools/audio/test/audio_test.cpp b/tools/audio/test/audio_test.cpp
new file mode 100644
index 000000000..6b76bc2ca
--- /dev/null
+++ b/tools/audio/test/audio_test.cpp
@@ -0,0 +1,228 @@
+//
+//  audio_test.cpp
+//  MNN
+//
+//  Created by MNN on 2021/08/18.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "gtest/gtest.h"
+#include "audio/audio.hpp"
+
+#include <fstream>
+#include <numeric>
+#include <iterator>
+#include <algorithm>
+#include <functional>
+
+#include <MNN/expr/NeuralNetWorkOp.hpp>
+#include <MNN/expr/MathOp.hpp>
+
+using namespace MNN;
+using namespace Express;
+using namespace AUDIO;
+
+static bool nearly(float x, float y, float eps = 1e-3) { // true when |x - y| <= eps
+    return (x - y) <= eps && (y - x) <= eps; // no abs(): the unqualified call can bind to int abs(int) and truncate
+}
+
+template <typename T>
+static inline VARP _var(std::vector<T> vec, const std::vector<int>& dims) { // test helper: turn host values into a var
+    return _Const(vec.data(), dims, NHWC, halide_type_of<T>()); // shape = dims, element type derived from T
+}
+static inline VARP _zeros(const std::vector<int>& dims) { // test helper: float var of shape dims filled with zeros
+    std::vector<float> data(std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<int>()), 0); // element count = product of dims
+    return _Const(data.data(), dims, NCHW, halide_type_of<float>());
+}
+
+// Debug helper: print a short preview of a float buffer to stdout.
+// @param signal pointer to `size` contiguous floats
+// @param size   total number of elements
+// @param row    when > 0 the buffer is shown as a row-major [row, size / row] matrix (first and
+//               last 3 rows, first and last 3 columns of each); otherwise as a flat list
+// Buffers with fewer than 3 rows/columns/elements are printed without reading out of bounds.
+static void dump_impl(const float *signal, size_t size, int row = 0) {
+    // Prints up to 3 leading and up to 3 trailing values of one contiguous run, never the same value twice.
+    auto dump_run = [](const float *data, int count) {
+        const int head = count < 3 ? count : 3;
+        for (int i = 0; i < head; i++) {
+            printf("%f, ", data[i]);
+        }
+        printf("..., ");
+        for (int i = (count - 3 > head ? count - 3 : head); i < count; i++) {
+            printf("%f, ", data[i]);
+        }
+    };
+    if (row > 0) {
+        const int col = static_cast<int>(size / row);
+        printf("# %d, %d: [\n", row, col);
+        const int head_rows = row < 3 ? row : 3;
+        for (int i = 0; i < head_rows; i++) {
+            dump_run(signal + i * col, col);
+            printf("\n");
+        }
+        printf("..., \n");
+        for (int i = (row - 3 > head_rows ? row - 3 : head_rows); i < row; i++) {
+            dump_run(signal + i * col, col);
+            printf("\n");
+        }
+        printf("]\n");
+    } else {
+        printf("# %zu: [", size); // %zu is the portable conversion for size_t
+        dump_run(signal, static_cast<int>(size));
+        printf("]\n");
+    }
+}
+
+void dump_var(VARP var) { // debug helper: print dtype, format, dims and a preview of the data of var
+auto dims    = var->getInfo()->dim;
+bool isfloat = true; // stays true for types other than float/int, so those are printed as float
+printf("{\ndtype = ");
+if (var->getInfo()->type == halide_type_of<float>()) {
+printf("float");
+isfloat = true;
+} else if (var->getInfo()->type == halide_type_of<int>()) {
+printf("int");
+isfloat = false;
+}
+printf("\nformat = %d\n", var->getInfo()->order);
+printf("\ndims = [");
+for (int i = 0; i < dims.size(); i++) {
+printf("%d ", dims[i]);
+}
+printf("]\n");
+
+if (isfloat) {
+if ((dims.size() > 2 && dims[1] > 1 && dims[2] > 1) || (dims.size() == 2 && dims[0] > 1 && dims[1] > 1)) { // matrix-like data
+int row = dims[dims.size() - 2]; // second-to-last dim is used as the row count
+dump_impl(var->readMap<float>(), var->getInfo()->size, row);
+} else {
+printf("data = [");
+auto total = var->getInfo()->size;
+if (total > 32) { // long vectors: only the first and last 5 values
+for (int i = 0; i < 5; i++) {
+printf("%f ", var->readMap<float>()[i]);
+}
+printf("..., ");
+for (int i = total - 5; i < total; i++) {
+printf("%f ", var->readMap<float>()[i]);
+}
+} else { // short vectors are printed in full
+for (int i = 0; i < total; i++) {
+printf("%f ", var->readMap<float>()[i]);
+}
+}
+printf("]\n}\n");
+}
+} else {
+printf("data = [");
+int size = var->getInfo()->size > 10 ? 10 : var->getInfo()->size; // at most the first 10 ints
+for (int i = 0; i < size; i++) {
+printf("%d ", var->readMap<int>()[i]);
+}
+printf("]\n}\n");
+}
+}
+
+TEST(load, wav) { // load() on audio.wav: checks sample count, sample rate and mean of the samples
+    auto audio_data = load("audio.wav"); // relative path: the fixture must be reachable from the working directory
+    auto sample = audio_data.first;
+    int sample_rate = audio_data.second;
+    auto size = sample->getInfo()->size;
+    auto mean = _ReduceMean(sample)->readMap<float>()[0];
+    bool res = size == 88747 && sample_rate == 16000 && nearly(mean, -0.000021);
+    EXPECT_TRUE(res);
+}
+
+TEST(save, wav) { // round trip: load audio.wav and write it back as audio_save.wav
+    auto audio_data = load("audio.wav");
+    auto sample = audio_data.first;
+    int sample_rate = audio_data.second;
+    bool res = save("audio_save.wav", sample, sample_rate); // only the return value is checked, not the file contents
+    EXPECT_TRUE(res);
+}
+
+TEST(hamming_window, 256) { // hamming_window(256) with default arguments: checks shape and mean
+    auto window = hamming_window(256);
+    auto mean = _ReduceMean(window)->readMap<float>()[0];
+    bool res = std::vector<int>({256}) == window->getInfo()->dim && nearly(mean, 0.538203);
+    EXPECT_TRUE(res);
+}
+
+TEST(hann_window, 256) { // hann_window(256) with default arguments: checks shape and mean
+    auto window = hann_window(256);
+    auto mean = _ReduceMean(window)->readMap<float>()[0];
+    bool res = std::vector<int>({256}) == window->getInfo()->dim && nearly(mean, 0.498047);
+    EXPECT_TRUE(res);
+}
+
+TEST(melscale_fbanks, 80_400) { // 80 mel filters for n_fft = 400 at 16 kHz: checks shape and mean
+    MelscaleParams mel_params;
+    mel_params.n_mels = 80;
+    mel_params.n_fft = 400;
+    mel_params.sample_rate = 16000;
+    auto mel = melscale_fbanks(&mel_params);
+    auto mean = _ReduceMean(mel)->readMap<float>()[0];
+    bool res = std::vector<int>({80, 201}) == mel->getInfo()->dim && nearly(mean, 0.000124); // 201 = 400 / 2 + 1 frequency bins
+    EXPECT_TRUE(res);
+}
+
+TEST(spectrogram, 512) { // spectrogram of audio.wav with n_fft = 512 and a Hann window: checks shape and mean
+    auto audio_data = load("audio.wav");
+    auto sample = audio_data.first;
+    int sample_rate = audio_data.second; // not used by this test
+    SpectrogramParams spec_params;
+    spec_params.n_fft = 512;
+    spec_params.window_type = HANNING;
+    auto specgram = spectrogram(sample, &spec_params);
+    auto mean = _ReduceMean(specgram)->readMap<float>()[0];
+    bool res = std::vector<int>({345, 257}) == specgram->getInfo()->dim && nearly(mean, 2.862101); // 257 = 512 / 2 + 1 frequency bins
+    EXPECT_TRUE(res);
+}
+
+TEST(mel_spectrogram, 400) { // 80-bin mel spectrogram of audio.wav (n_fft 400, hop 160, centered): checks shape and mean
+    auto audio_data = load("audio.wav");
+    auto sample = audio_data.first;
+    int sample_rate = audio_data.second;
+    MelscaleParams mel_params;
+    mel_params.n_mels = 80;
+    mel_params.n_fft = 400;
+    mel_params.sample_rate = sample_rate;
+    SpectrogramParams spec_params;
+    spec_params.n_fft = 400;
+    spec_params.hop_length = 160;
+    spec_params.center = true;
+    auto mel = mel_spectrogram(sample, &mel_params, &spec_params);
+    auto mean = _ReduceMean(mel)->readMap<float>()[0];
+    bool res = std::vector<int>({555, 80}) == mel->getInfo()->dim && nearly(mean, 0.149213); // [frames, n_mels]
+    EXPECT_TRUE(res);
+}
+
+TEST(fbank, default) { // fbank() with default arguments on audio.wav starting at frame 9600: checks shape and mean
+    auto audio_data = load("audio.wav", 0, 9600); // sr = 0 keeps the file's sample rate; frame_offset = 9600
+    auto chunk = audio_data.first;
+    int sample_rate = audio_data.second; // not used by this test
+    auto feat = fbank(chunk);
+    auto mean = _ReduceMean(feat)->readMap<float>()[0];
+    bool res = std::vector<int>({492, 80}) == feat->getInfo()->dim && nearly(mean, -9.875551); // [frames, n_mels]
+    EXPECT_TRUE(res);
+}
+
+TEST(whisper_fbank, default) { // whisper_fbank() with default arguments on audio.wav: checks shape and mean
+    auto audio_data = load("audio.wav");
+    auto sample = audio_data.first;
+    int sample_rate = audio_data.second; // not used by this test
+    auto feat = whisper_fbank(sample);
+    auto mean = _ReduceMean(feat)->readMap<float>()[0];
+    bool res = std::vector<int>({1, 128, 3000}) == feat->getInfo()->dim && nearly(mean, -0.451097); // NOTE(review): 3000 frames equals 30 s at 16 kHz / hop 160; confirm the default chunk_len produces this padding
+    EXPECT_TRUE(res);
+}
+
+int main(int argc, char** argv) { // runs all registered tests, then prints a one-line summary of the result counts
+    testing::InitGoogleTest(&argc, argv);
+    auto res = RUN_ALL_TESTS();
+    auto instance = testing::UnitTest::GetInstance();
+    printf("\nTEST_NAME_AUDIO_UNIT: Audio单元测试\nTEST_CASE_AMOUNT_AUDIO_UNIT: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":%d}\n", // fixed-format summary; "blocked" is always reported as 0
+           instance->failed_test_count(), instance->successful_test_count(), instance->skipped_test_count());
+    return res; // RUN_ALL_TESTS() result is the process exit code
+}
\ No newline at end of file
diff --git a/tools/converter/source/common/FullQuantAndCoding.cpp b/tools/converter/source/common/FullQuantAndCoding.cpp
index a9e8b61ce..5db4b2ec4 100644
--- a/tools/converter/source/common/FullQuantAndCoding.cpp
+++ b/tools/converter/source/common/FullQuantAndCoding.cpp
@@ -114,7 +114,6 @@ void FullQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, std::unique_ptr<MNN::O
     auto quanWeight = _Cast<int8_t>(quanWeightClamp);
     auto convScale  = _Reshape(_Reciprocal(outputScaleVar), {-1, 1, 1, 1}) * weightScale * inputScaleVar;
 
-    std::vector<float> quantWeightFloat;
     std::vector<int8_t> quantWeights;
     std::vector<float> biasData;
     std::vector<float> scale;
@@ -122,10 +121,8 @@ void FullQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, std::unique_ptr<MNN::O
     {
         auto info = quanWeight->getInfo();
         quantWeights.resize(info->size);
-        quantWeightFloat.resize(info->size);
         auto ptr = quanWeight->readMap<int8_t>();
-        for (int i = 0; i < quantWeightFloat.size(); i++) {
-            quantWeightFloat[i] = ptr[i];
+        for (int i = 0; i < quantWeights.size(); i++) {
             quantWeights[i] = ptr[i];
         }
     }
@@ -144,7 +141,7 @@ void FullQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, std::unique_ptr<MNN::O
 
     bool asymmetricQuantFlag = false;
     std::vector<float> fakeScales(kernelNum, 1.0f);
-    convParams->quanParameter = IDSTEncoder::encode(quantWeightFloat.data(), fakeScales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), wClampMin);
+    convParams->quanParameter = IDSTEncoder::encode(nullptr, fakeScales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), wClampMin);
     convParams->weight.clear();
     convParams->quanParameter->alpha = std::move(scale);
     convParams->quanParameter->scaleIn = inputParams.scales(0);
diff --git a/tools/converter/source/common/cli.cpp b/tools/converter/source/common/cli.cpp
index 415fd992b..e7fe1d680 100644
--- a/tools/converter/source/common/cli.cpp
+++ b/tools/converter/source/common/cli.cpp
@@ -1412,14 +1412,17 @@ bool CommonKit::json2protobuf(const char* jsonFile, const char* protoFile, MNN::
     auto algos = pipelineInfo["algo"].GetArray();
     for (auto iter = algos.begin(); iter != algos.end(); ++iter) {
         auto algoInfo = iter->GetObject();
+        MNN_ASSERT(algoInfo["type"].GetInt() == 0);
         auto compressionType = (MNN::Compression::CompressionAlgo_CompressionType)algoInfo["type"].GetInt();
-        std::unique_ptr<MNN::Compression::QuantizeParams> quant_params(new MNN::Compression::QuantizeParams());
         auto quantParamsInfo = algoInfo["quant_params"].GetObject();
         auto round_mode = quantParamsInfo["round_mode"].GetInt();
+        MNN::Compression::CompressionAlgo* algo = pipeline->add_algo();
+        algo->set_type(compressionType);
+        auto quant_params = algo->mutable_quant_params();
         quant_params->set_round_mode((MNN::Compression::QuantizeParams_RoundMode)round_mode);
 
-        auto layer = quantParamsInfo["layer"].GetArray();
-        for (auto ly = layer.begin(); ly != layer.end(); ++ly) {
+        auto layers = quantParamsInfo["layer"].GetArray();
+        for (auto ly = layers.begin(); ly != layers.end(); ++ly) {
             auto layerInfo = ly->GetObject();
             auto newLayer = quant_params->add_layer();
             if (layerInfo.HasMember("method")) {
@@ -1450,7 +1453,6 @@ bool CommonKit::json2protobuf(const char* jsonFile, const char* protoFile, MNN::
             // Input.
             auto inputs_ = layerInfo["input"].GetArray();
             for (auto w = inputs_.begin(); w != inputs_.end(); ++w) {
-                // Get weight info.
                 int bits = w->GetObject()["bits"].GetInt();
                 auto name = w->GetObject()["name"].GetString();
                 auto scale = w->GetObject()["scales"].GetArray();
@@ -1471,7 +1473,6 @@ bool CommonKit::json2protobuf(const char* jsonFile, const char* protoFile, MNN::
             // Output.
             auto outputs_ = layerInfo["output"].GetArray();
             for (auto w = outputs_.begin(); w != outputs_.end(); ++w) {
-                // Get weight info.
                 int bits = w->GetObject()["bits"].GetInt();
                 auto name = w->GetObject()["name"].GetString();
                 auto scale = w->GetObject()["scales"].GetArray();
@@ -1489,10 +1490,6 @@ bool CommonKit::json2protobuf(const char* jsonFile, const char* protoFile, MNN::
                 }
             }
         }
-        MNN::Compression::CompressionAlgo* algo = pipeline->add_algo();
-        algo->set_type(compressionType);
-        auto params = algo->quant_params();
-        params.CopyFrom(*quant_params.get());
     }
     // Write protobuf.bin
     if (protoFile) {
diff --git a/tools/cpp/CMakeLists.txt b/tools/cpp/CMakeLists.txt
index 2bb120f81..624d2abc0 100644
--- a/tools/cpp/CMakeLists.txt
+++ b/tools/cpp/CMakeLists.txt
@@ -8,11 +8,9 @@ add_executable(GetMNNInfo ${CMAKE_CURRENT_LIST_DIR}/GetMNNInfo.cpp)
 list(APPEND MNN_CPP_TOOLS GetMNNInfo)
 add_executable(ModuleBasic.out ${CMAKE_CURRENT_LIST_DIR}/ModuleBasic.cpp)
 list(APPEND MNN_CPP_TOOLS ModuleBasic.out)
-IF(MNN_OPENGL AND MNN_OPENCL)
-    add_definitions(-DMNN_USE_LIB_WRAPPER)
-    add_library(MNN_CL_WRAP OBJECT ${CMAKE_CURRENT_LIST_DIR}/../../source/backend/opencl/core/runtime/OpenCLWrapper.cpp)
+IF(CMAKE_SYSTEM_NAME MATCHES "^Android")
     add_executable(GpuInterTest.out ${CMAKE_CURRENT_LIST_DIR}/GpuInterTest.cpp )
-    target_link_libraries(GpuInterTest.out MNN_CL_WRAP)
+    target_link_libraries(GpuInterTest.out android)
     list(APPEND MNN_CPP_TOOLS GpuInterTest.out)
 ENDIF()
 add_executable(SequenceModuleTest.out ${CMAKE_CURRENT_LIST_DIR}/SequenceModuleTest.cpp)
diff --git a/tools/cpp/GpuInterTest.cpp b/tools/cpp/GpuInterTest.cpp
index c92b698ce..991ba8c3f 100644
--- a/tools/cpp/GpuInterTest.cpp
+++ b/tools/cpp/GpuInterTest.cpp
@@ -18,124 +18,201 @@
 #include <sstream>
 #include <numeric>
 #include "ExprDebug.hpp"
-#define MNN_USE_LIB_WRAPPER
-#define MNN_USER_SET_DEVICE
-#define MNN_OPENCL_SVM_ENABLE
 #include "MNN/MNNSharedContext.h"
 using namespace MNN::Express;
 using namespace MNN;
 
 #ifdef __ANDROID__
-#include <GLES2/gl2.h>
-#include <GLES2/gl2ext.h>
-#include <GLES3/gl31.h>
-#include <EGL/egl.h>
-class UserGLDeviceBuffer{
-public:
-    UserGLDeviceBuffer(){
-        EGLDisplay mDisplay = eglGetDisplay(EGL_DEFAULT_DISPLAY);
-        int majorVersion;
-        int minorVersion;
-        eglInitialize(mDisplay, &majorVersion, &minorVersion);
-        EGLint numConfigs;
-        static const EGLint configAttribs[] = {EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
-                                            EGL_RED_SIZE, 8,
-                                            EGL_GREEN_SIZE, 8,
-                                            EGL_BLUE_SIZE, 8,
-                                            EGL_ALPHA_SIZE, 8,
-                                            EGL_NONE};
+#include <dlfcn.h>
+#include <android/hardware_buffer.h>
 
-        EGLConfig surfaceConfig;
-        eglChooseConfig(mDisplay, configAttribs, &surfaceConfig, 1, &numConfigs);
-        static const EGLint contextAttribs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE};
-        mContext = eglCreateContext(mDisplay, surfaceConfig, NULL, contextAttribs);
-        static const EGLint surfaceAttribs[] = {EGL_WIDTH, 1, EGL_HEIGHT, 1, EGL_NONE};
-        mSurface = eglCreatePbufferSurface(mDisplay, surfaceConfig, surfaceAttribs);
-        eglMakeCurrent(mDisplay, mSurface, mSurface, mContext);
-        eglBindAPI(EGL_OPENGL_ES_API);
-        int major;
-        glGetIntegerv(GL_MAJOR_VERSION, &major);
-    }
-    ~UserGLDeviceBuffer(){
-        if (mDisplay != EGL_NO_DISPLAY) {
-            if (mContext != EGL_NO_CONTEXT) {
-                    eglDestroyContext(mDisplay, mContext);
-                    mContext = EGL_NO_CONTEXT;
-                }
-                if (mSurface != EGL_NO_SURFACE) {
-                    eglDestroySurface(mDisplay, mSurface);
-                    mSurface = EGL_NO_SURFACE;
-                }
-                eglMakeCurrent(mDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
-                eglTerminate(mDisplay);
-                mDisplay = EGL_NO_DISPLAY;
-            }
-        eglReleaseThread();
-    }
-    GLuint CreateTexture(int width, int height, void* data) {
-        GLuint textureID;
-        glGenTextures(1, &textureID);
+/*
+Ref from 
+https://android.googlesource.com/platform/external/libchrome/+/refs/tags/aml_res_331314010/base/android/android_hardware_buffer_compat.h
+*/
+using PFAHardwareBuffer_allocate = int (*)(const AHardwareBuffer_Desc* desc,
+                                            AHardwareBuffer** outBuffer);
+using PFAHardwareBuffer_acquire = void (*)(AHardwareBuffer* buffer);
+using PFAHardwareBuffer_describe = void (*)(const AHardwareBuffer* buffer,
+                                            AHardwareBuffer_Desc* outDesc);
+using PFAHardwareBuffer_lock = int (*)(AHardwareBuffer* buffer,
+                                       uint64_t usage,
+                                       int32_t fence,
+                                       const ARect* rect,
+                                       void** outVirtualAddress);
+using PFAHardwareBuffer_recvHandleFromUnixSocket =
+    int (*)(int socketFd, AHardwareBuffer** outBuffer);
+using PFAHardwareBuffer_release = void (*)(AHardwareBuffer* buffer);
+using PFAHardwareBuffer_sendHandleToUnixSocket =
+    int (*)(const AHardwareBuffer* buffer, int socketFd);
+using PFAHardwareBuffer_unlock = int (*)(AHardwareBuffer* buffer,
+                                         int32_t* fence);
 
-        glBindTexture(GL_TEXTURE_2D, textureID);
+class AndroidHardwareBufferCompat {
+ public:
+  bool IsSupportAvailable() const {
+    return true;
+  }
+  AndroidHardwareBufferCompat();
+  int Allocate(const AHardwareBuffer_Desc* desc, AHardwareBuffer** outBuffer);
+  void Acquire(AHardwareBuffer* buffer);
+  void Describe(const AHardwareBuffer* buffer, AHardwareBuffer_Desc* outDesc);
+  int Lock(AHardwareBuffer* buffer,
+           uint64_t usage,
+           int32_t fence,
+           const ARect* rect,
+           void** out_virtual_address);
+  int RecvHandleFromUnixSocket(int socketFd, AHardwareBuffer** outBuffer);
+  void Release(AHardwareBuffer* buffer);
+  int SendHandleToUnixSocket(const AHardwareBuffer* buffer, int socketFd);
+  int Unlock(AHardwareBuffer* buffer, int32_t* fence);
+ private:
+  PFAHardwareBuffer_allocate allocate_;
+  PFAHardwareBuffer_acquire acquire_;
+  PFAHardwareBuffer_describe describe_;
+  PFAHardwareBuffer_lock lock_;
+  PFAHardwareBuffer_recvHandleFromUnixSocket recv_handle_;
+  PFAHardwareBuffer_release release_;
+  PFAHardwareBuffer_sendHandleToUnixSocket send_handle_;
+  PFAHardwareBuffer_unlock unlock_;
+};
+#define DCHECK(x) MNN_ASSERT(x)
+AndroidHardwareBufferCompat::AndroidHardwareBufferCompat() {
+  // The AHardwareBuffer_* functions require __ANDROID_API__ >= 26 (Android
+  // Oreo). Calling the global functions directly would be incompatible with
+  // pre-Oreo devices, so every entry point is unconditionally resolved at
+  // runtime through dlsym() instead. This loader is adapted from Chromium's
+  // android_hardware_buffer_compat (original note: TODO(klausw));
+  // cf. Chromium base/android/linker/modern_linker_jni.cc
+  void* main_dl_handle = dlopen(nullptr, RTLD_NOW);
+  *reinterpret_cast<void**>(&allocate_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_allocate");
+  DCHECK(allocate_);
+  *reinterpret_cast<void**>(&acquire_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_acquire");
+  DCHECK(acquire_);
+  *reinterpret_cast<void**>(&describe_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_describe");
+  DCHECK(describe_);
+  *reinterpret_cast<void**>(&lock_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_lock");
+  DCHECK(lock_);
+  *reinterpret_cast<void**>(&recv_handle_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_recvHandleFromUnixSocket");
+  DCHECK(recv_handle_);
+  *reinterpret_cast<void**>(&release_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_release");
+  DCHECK(release_);
+  *reinterpret_cast<void**>(&send_handle_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_sendHandleToUnixSocket");
+  DCHECK(send_handle_);
+  *reinterpret_cast<void**>(&unlock_) =
+      dlsym(main_dl_handle, "AHardwareBuffer_unlock");
+  DCHECK(unlock_);
+}
 
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+int AndroidHardwareBufferCompat::Allocate(const AHardwareBuffer_Desc* desc,
+                                           AHardwareBuffer** out_buffer) {
+  DCHECK(IsSupportAvailable());
+  return allocate_(desc, out_buffer);
+}
+void AndroidHardwareBufferCompat::Acquire(AHardwareBuffer* buffer) {
+  DCHECK(IsSupportAvailable());
+  acquire_(buffer);
+}
+void AndroidHardwareBufferCompat::Describe(const AHardwareBuffer* buffer,
+                                           AHardwareBuffer_Desc* out_desc) {
+  DCHECK(IsSupportAvailable());
+  describe_(buffer, out_desc);
+}
+int AndroidHardwareBufferCompat::Lock(AHardwareBuffer* buffer,
+                                      uint64_t usage,
+                                      int32_t fence,
+                                      const ARect* rect,
+                                      void** out_virtual_address) {
+  DCHECK(IsSupportAvailable());
+  return lock_(buffer, usage, fence, rect, out_virtual_address);
+}
+int AndroidHardwareBufferCompat::RecvHandleFromUnixSocket(
+    int socket_fd,
+    AHardwareBuffer** out_buffer) {
+  DCHECK(IsSupportAvailable());
+  return recv_handle_(socket_fd, out_buffer);
+}
+void AndroidHardwareBufferCompat::Release(AHardwareBuffer* buffer) {
+  DCHECK(IsSupportAvailable());
+  release_(buffer);
+}
+int AndroidHardwareBufferCompat::SendHandleToUnixSocket(
+    const AHardwareBuffer* buffer,
+    int socket_fd) {
+  DCHECK(IsSupportAvailable());
+  return send_handle_(buffer, socket_fd);
+}
+int AndroidHardwareBufferCompat::Unlock(AHardwareBuffer* buffer,
+                                        int32_t* fence) {
+  DCHECK(IsSupportAvailable());
+  return unlock_(buffer, fence);
+}
 
-        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_FLOAT, data);
+static std::shared_ptr<AndroidHardwareBufferCompat> gFunction;
 
-        glBindTexture(GL_TEXTURE_2D, 0);
+// Allocates a width x height RGBA8888 AHardwareBuffer and, if data is non-null, fills it from data. Returns nullptr when allocation fails.
+static AHardwareBuffer* creatAHardwareBuffer(int width, int height, void *data){
+    // Describe the buffer: a single layer that the CPU writes and the GPU samples.
+    AHardwareBuffer_Desc bufferDesc = {};
+    bufferDesc.width = width;
+    bufferDesc.height = height;
+    bufferDesc.layers = 1;
+    bufferDesc.format = AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM;
+    bufferDesc.usage = AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN | AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
 
-        return textureID;
-    }
-    void ReleaseTexture(GLuint textureID){
-        glDeleteTextures(1, &textureID);
-    }
-private:
-    EGLContext mContext;
-    EGLDisplay mDisplay;
-    EGLSurface mSurface;
-};
-#endif
-#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
-class UserCLDeviceBuffer{
-public:
-    UserCLDeviceBuffer(){
-        OpenCLSymbolsOperator::createOpenCLSymbolsOperatorSingleInstance();
-        std::vector<cl::Platform> platforms;
-        cl_int res = cl::Platform::get(&platforms, 0);
-        cl::Platform::setDefault(platforms[0]);
-        std::vector<cl::Device> gpuDevices;
-        res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
-        mFirstGPUDevicePtr = std::make_shared<cl::Device>(gpuDevices[0]);
-        mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), nullptr, nullptr, nullptr, &res));
-        mCommandQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, 0, &res);
-    }
-    std::shared_ptr<cl::Context> getContext(){
-        return mContext;
-    }
-    cl::Buffer *createBuffer(size_t size){
-        cl_int res;
-        return new cl::Buffer(*mContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size * sizeof(float), NULL, &res);
+    AHardwareBuffer* buffer = nullptr;
+    int result = gFunction->Allocate(&bufferDesc, &buffer);
+    if(result != 0) {
+        MNN_PRINT("alloc AHardwareBuffer failed   %d\n", result);
+        return nullptr; // no buffer exists, so the fill step below must not lock/unlock it
     }
-    void copyToBuffer(cl::Buffer *buffer, int size, float* ptr){
-        auto gpuptr = mCommandQueuePtr.get()->enqueueMapBuffer(*buffer, CL_TRUE, CL_MAP_WRITE, 0, size * sizeof(float));
-        memcpy(gpuptr, ptr, size);
-        mCommandQueuePtr.get()->enqueueUnmapMemObject(*buffer, gpuptr);
+    if(nullptr != data){
+        void* map = nullptr;
+        ARect rect = { 0, 0, width, height };  // Lock the whole image for CPU writes
+        result = gFunction->Lock(buffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1, &rect, &map);
+        if (result != 0) {
+            // Lock failed: nothing was mapped, so skip the copy and the matching Unlock.
+            MNN_PRINT("Handle lock failed\n");
+            return buffer;
+        }
+        if (map) {
+            // Copy the caller's pixels into the mapped buffer (4 bytes per RGBA8888 pixel).
+            // NOTE(review): assumes the buffer's row stride equals width -- confirm via Describe().
+            memcpy(map, data, width * height * 4);
+        }
+        gFunction->Unlock(buffer, nullptr);
     }
-    float *mapDevicePtr(cl::Buffer *buffer, int size){
-        auto gpuptr = mCommandQueuePtr.get()->enqueueMapBuffer(*buffer, CL_TRUE, CL_MAP_WRITE, 0, size * sizeof(float));
-        return (float*) gpuptr;
+    return buffer;
+}
+// Copies width * height RGBA8888 pixels (4 bytes each) out of buffer into data; does nothing when either pointer is null.
+static void copyDataFromAHardWareBuffer(AHardwareBuffer* buffer, int width, int height, void *data){
+    if(nullptr != data && nullptr != buffer){
+        void* map = nullptr;
+        ARect rect = { 0, 0, width, height };  // Lock the whole image for CPU reads
+        int result = gFunction->Lock(buffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1, &rect, &map);
+        if (result != 0) {
+            MNN_PRINT("Handle lock failed\n");
+            return; // nothing was mapped, so there is no copy to do and no lock to release
+        }
+        if (map) {
+            memcpy(data, map, width * height * 4); // NOTE(review): assumes row stride == width -- confirm via Describe()
+        }
+        gFunction->Unlock(buffer, nullptr);
     }
-    void *umapDevicePtr(cl::Buffer *buffer, void* ptr){
-        mCommandQueuePtr.get()->enqueueUnmapMemObject(*buffer, ptr);
+}
+static void ReleaseAHardWareBuffer(AHardwareBuffer* buffer){
+    if(buffer != nullptr){
+        gFunction->Release(buffer);
     }
-private:
-    std::shared_ptr<::cl::Context> mContext;
-    std::shared_ptr<::cl::Device> mFirstGPUDevicePtr;
-    std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr;
-};
+}
+#endif
 
 int main(int argc, char *argv[]) {
     if (argc < 3) {
@@ -203,10 +280,10 @@ int main(int argc, char *argv[]) {
         }
     }
     int testMode = 0;
-    //testMode = 0 OpenCL, testMode = 1 OpenGL
+    //testMode = 0 AhardwareBuffer
     if(argc > 3){
         testMode = atoi(argv[3]);
-        MNN_PRINT("Use extra forward type: %d(0:OpenCL 1:OpenGL)\n", testMode);
+        MNN_PRINT("Use extra forward type: %d(0:AhardwareBuffer)\n", testMode);
     }
 
     auto type = MNN_FORWARD_CPU;
@@ -246,27 +323,11 @@ int main(int argc, char *argv[]) {
 
     MNN::Express::Module::Config mConfig;
     mConfig.shapeMutable = shapeMutable;
-    MNNDeviceContext DeviceContext;
-    // set shared context for OpenCL, or context and display for OpenGL
 #ifdef __ANDROID__
-    std::vector<GLuint> GLdeviceInputPtrVec;
-    std::vector<GLuint> GLdeviceOutputPtrVec;
-    std::shared_ptr<UserGLDeviceBuffer> GLDeviceBuffer;
-    if(testMode == 1){
-        GLDeviceBuffer = std::shared_ptr<UserGLDeviceBuffer>(new UserGLDeviceBuffer);
-        DeviceContext.contextPtr = eglGetCurrentContext();
-        DeviceContext.glShared = eglGetCurrentDisplay();
-    }
+    gFunction.reset(new AndroidHardwareBufferCompat);
+    std::vector<AHardwareBuffer*> AHardwarePtrInputVec;
+    std::vector<AHardwareBuffer*> AHardwarePtrOutputVec;
 #endif
-    std::vector<cl::Buffer*> CLdeviceInputPtrVec;
-    std::vector<cl::Buffer*> CLdeviceOutputPtrVec;
-    std::shared_ptr<UserCLDeviceBuffer> CLDeviceBuffer;
-    if(testMode == 0){
-        CLDeviceBuffer = std::shared_ptr<UserCLDeviceBuffer>(new UserCLDeviceBuffer);
-        DeviceContext.contextPtr = CLDeviceBuffer.get()->getContext().get();
-    }
-
-    backendConfig.sharedContext = &DeviceContext;
 
     std::shared_ptr<Executor::RuntimeManager> rtmgr(Executor::RuntimeManager::createRuntimeManager(config));
     rtmgr->setCache(cacheFileName);
@@ -281,11 +342,9 @@ int main(int argc, char *argv[]) {
     }
     auto mInfo = net->getInfo();
 #ifdef __ANDROID__
-    GLdeviceInputPtrVec.resize(mInfo->inputs.size());
-    GLdeviceOutputPtrVec.resize(outputNames.size());
+    AHardwarePtrInputVec.resize(mInfo->inputs.size());
+    AHardwarePtrOutputVec.resize(outputNames.size());
 #endif
-    CLdeviceInputPtrVec.resize(mInfo->inputs.size());
-    CLdeviceOutputPtrVec.resize(outputNames.size());
     if (inputs.empty()) {
         inputs.resize(mInfo->inputs.size());
         for (int i=0; i<inputs.size(); ++i) {
@@ -299,7 +358,7 @@ int main(int argc, char *argv[]) {
             auto shapeIter = inputShape.find(inputName);
             if (shapeIter != inputShape.end()) {
                 auto s = shapeIter->second;
-                inputs[i] = _Input(s, mInfo->defaultFormat, mInfo->inputs[i].type);
+                inputs[i] = _Input(s, mInfo->inputs[i].order, mInfo->inputs[i].type);
                 width = s[3];
                 height = s[2];
                 channel = s[1];
@@ -307,16 +366,13 @@ int main(int argc, char *argv[]) {
             // set input device ptr
 #ifdef __ANDROID__
             // OpenGL Texture defaultFormat NC4HW4
-            if(testMode == 1){
+            if(testMode == 0){
                 width = width * ((channel + 3) / 4);
-                GLdeviceInputPtrVec[i] = (GLDeviceBuffer.get()->CreateTexture(width,height,nullptr));
-                inputs[i]->setDevicePtr((void*)GLdeviceInputPtrVec[i], MNN_FORWARD_OPENGL);
+                AHardwarePtrInputVec[i] = creatAHardwareBuffer(width,height,nullptr);
+                volatile uint64_t value = (uint64_t)AHardwarePtrInputVec[i];
+                inputs[i]->setDevicePtr((void*)value, MNN_MEMORY_AHARDWAREBUFFER);
             }
 #endif
-            if(testMode == 0){
-                CLdeviceInputPtrVec[i] = CLDeviceBuffer.get()->createBuffer(info->size);
-                inputs[i]->setDevicePtr(CLdeviceInputPtrVec[i], MNN_FORWARD_OPENCL);
-            }
         }
     }
 
@@ -329,19 +385,16 @@ int main(int argc, char *argv[]) {
             return 0;
         }
         for (int i=0; i<outputNames.size(); ++i) {
-            auto info = inputs[i]->getInfo();
+            auto info = outputs[i]->getInfo();
             int width = info->dim[3], height = info->dim[2], channel = info->dim[1];
             // copy output to device ptr
 #ifdef __ANDROID__
-            if(testMode == 1){
-                GLdeviceOutputPtrVec[i] = GLDeviceBuffer.get()->CreateTexture(width,height,nullptr);
-                outputs[i]->copyToDevicePtr((void*)GLdeviceOutputPtrVec[i], MNN_FORWARD_OPENGL);
-            }
-#endif
             if(testMode == 0){
-                CLdeviceOutputPtrVec[i] = CLDeviceBuffer.get()->createBuffer(info->size);
-                outputs[i]->copyToDevicePtr(CLdeviceOutputPtrVec[i], MNN_FORWARD_OPENCL);
+                AHardwarePtrOutputVec[i] = creatAHardwareBuffer(width,height,nullptr);
+                volatile uint64_t value = (uint64_t)AHardwarePtrOutputVec[i];
+                outputs[i]->copyToDevicePtr((void*)value, MNN_MEMORY_AHARDWAREBUFFER);
             }
+#endif
         }
 
         // Print module's memory
@@ -351,11 +404,11 @@ int main(int argc, char *argv[]) {
     }
 #ifdef __ANDROID__
-    if(testMode == 1){
+    if(testMode == 0){ // AHardwareBuffers are only created in testMode 0, so they must be released in that mode
-        for(int i = 0; i < GLdeviceInputPtrVec.size(); ++i){
-            GLDeviceBuffer.get()->ReleaseTexture(GLdeviceInputPtrVec[i]);
+        for(int i = 0; i < AHardwarePtrInputVec.size(); ++i){
+            ReleaseAHardWareBuffer(AHardwarePtrInputVec[i]);
         }
-        for(int i = 0; i < GLdeviceOutputPtrVec.size(); ++i){
-            GLDeviceBuffer.get()->ReleaseTexture(GLdeviceOutputPtrVec[i]);
+        for(int i = 0; i < AHardwarePtrOutputVec.size(); ++i){
+            ReleaseAHardWareBuffer(AHardwarePtrOutputVec[i]);
         }
     }
 #endif
diff --git a/tools/quantization/Helper.cpp b/tools/quantization/Helper.cpp
index 1ad17aa87..7bbdf9857 100644
--- a/tools/quantization/Helper.cpp
+++ b/tools/quantization/Helper.cpp
@@ -26,12 +26,6 @@
 
 std::set<std::string> Helper::gNotNeedFeatureOp = { "Raster", "Pooling", "ReLU", "ReLU6", "Interp", "CropAndResize", "ROIPooling", "Gather", "GatherV2", "GatherND", "ScatterNd" };
 
-std::set<MNN::OpType> Helper::INT8SUPPORTED_OPS = {
-    MNN::OpType_ConvInt8, MNN::OpType_DepthwiseConvInt8, MNN::OpType_PoolInt8, MNN::OpType_EltwiseInt8
-    // MNN::OpType_Int8ToFloat,
-    // MNN::OpType_FloatToInt8,
-};
-
 std::set<std::string> Helper::featureQuantizeMethod = {"EMA", "KL", "ADMM"};
 std::set<std::string> Helper::weightQuantizeMethod  = {"MAX_ABS", "ADMM"};
 
diff --git a/tools/quantization/Helper.hpp b/tools/quantization/Helper.hpp
index 7d9dcbe5e..a428cfbb9 100644
--- a/tools/quantization/Helper.hpp
+++ b/tools/quantization/Helper.hpp
@@ -31,8 +31,6 @@ class Helper {
 
     static std::set<std::string> gNotNeedFeatureOp;
 
-    static std::set<MNN::OpType> INT8SUPPORTED_OPS;
-
     static std::set<std::string> featureQuantizeMethod;
     static std::set<std::string> weightQuantizeMethod;
 
diff --git a/tools/script/register.py b/tools/script/register.py
index 7ab46a500..7271f8841 100644
--- a/tools/script/register.py
+++ b/tools/script/register.py
@@ -6,6 +6,7 @@ def generateShape(rootDir):
     shapeLists = []
     renderShape = []
     transformerFuseShape = []
+    audioShape = []
     def collectFile(f):
         if os.path.isdir(f):
             return
@@ -50,6 +51,13 @@ def collectFile(f):
                     l = l.split(',')
                     func = '___' + l[0] + '__'+l[1]+"__"
                     transformerFuseShape.append(func)
+                elif l.find('REGISTER_SHAPE_AUDIO') >= 0:
+                    l = l.replace("REGISTER_SHAPE_AUDIO(", "")
+                    l = l.split(')')[0]
+                    l = l.replace(' ', "")
+                    l = l.split(',')
+                    func = '___' + l[0] + '__'+l[1]+"__"
+                    audioShape.append(func)
     shapeRegFile = os.path.join(shapeDir, "ShapeRegister.cpp")
     print(shapeRegFile)
     for fi in os.listdir(shapeDir):
@@ -76,6 +84,10 @@ def collectFile(f):
         for l in transformerFuseShape:
             f.write("extern void " + l + '();\n')
         f.write('#endif\n')
+        f.write('#ifdef ' + 'MNN_BUILD_AUDIO' + '\n')
+        for l in audioShape:
+            f.write("extern void " + l + '();\n')
+        f.write('#endif\n')
         f.write('void registerShapeOps() {\n')
         for l in shapeLists:
             f.write(l+'();\n')
@@ -87,6 +99,10 @@ def collectFile(f):
         for l in transformerFuseShape:
             f.write(l+'();\n')
         f.write('#endif\n')
+        f.write('#ifdef ' + 'MNN_BUILD_AUDIO' + '\n')
+        for l in audioShape:
+            f.write(l+'();\n')
+        f.write('#endif\n')
         f.write("}\n}\n")
     return
 
@@ -97,6 +113,7 @@ def generateCPUFile(rootDir):
     funcNames = []
     renderNames = []
     transformerNamse = []
+    audioNames = []
     def collectFile(fileNames, dirname):
         for fi in fileNames:
             f = os.path.join(dirname, fi)
@@ -116,6 +133,8 @@ def collectFile(fileNames, dirname):
                         renderNames.append(funcName)
                     elif lo.find('REGISTER_CPU_OP_CREATOR_TRANSFORMER') >= 0:
                         transformerNamse.append(funcName)
+                    elif lo.find('REGISTER_CPU_OP_CREATOR_AUDIO') >= 0:
+                        audioNames.append(funcName)
                     else:
                         funcNames.append(funcName)
     fileNames = os.listdir(cpuDir)
@@ -139,6 +158,10 @@ def collectFile(fileNames, dirname):
         for l in transformerNamse:
             f.write("extern void " + l + '();\n')
         f.write('#endif\n')
+        f.write('#ifdef ' + 'MNN_BUILD_AUDIO' + '\n')
+        for l in audioNames:
+            f.write("extern void " + l + '();\n')
+        f.write('#endif\n')
         f.write('void registerCPUOps() {\n')
         for l in funcNames:
             f.write(l+'();\n')
@@ -150,8 +173,12 @@ def collectFile(fileNames, dirname):
         for l in transformerNamse:
             f.write(l+'();\n')
         f.write('#endif\n')
+        f.write('#ifdef ' + 'MNN_BUILD_AUDIO' + '\n')
+        for l in audioNames:
+            f.write(l+'();\n')
+        f.write('#endif\n')
         f.write("}\n}\n")
-        
+
 def generateOPENCLFile(rootDir):
     openclDir = os.path.join(rootDir, "source", "backend", "opencl")
     openclBufferDir = os.path.join(rootDir, "source", "backend", "opencl", "execution", "buffer")
@@ -184,11 +211,11 @@ def collectFile(fileNames, dirname):
                         opNamesImage.append(funcName)
                     else:
                         opNamesBuffer.append(funcName)
-                        
+
     bufferFileNames = os.listdir(openclBufferDir)
     print(bufferFileNames)
     collectFile(bufferFileNames, openclBufferDir)
-    
+
     imageFileNames = os.listdir(openclImageDir)
     print(imageFileNames)
     collectFile(imageFileNames, openclImageDir)
diff --git a/transformers/llm/.gitignore b/transformers/llm/.gitignore
deleted file mode 100644
index 000bc68dc..000000000
--- a/transformers/llm/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-datasets/*
-!datasets/*.sh
-
-
-!datasets/visualization/
-datasets/visualization/data
-datasets/visualization/pic
\ No newline at end of file
diff --git a/transformers/llm/datasets/get-sharegpt.sh b/transformers/llm/datasets/get-sharegpt.sh
deleted file mode 100644
index fe4be4b5d..000000000
--- a/transformers/llm/datasets/get-sharegpt.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-git lfs install
-git clone https://huggingface.co/datasets/shareAI/ShareGPT-Chinese-English-90k
\ No newline at end of file
diff --git a/transformers/llm/datasets/get-wikitext-2-raw.sh b/transformers/llm/datasets/get-wikitext-2-raw.sh
deleted file mode 100644
index 33096304c..000000000
--- a/transformers/llm/datasets/get-wikitext-2-raw.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-unzip wikitext-2-raw-v1.zip
\ No newline at end of file
diff --git a/transformers/llm/datasets/visualization/stats.py b/transformers/llm/datasets/visualization/stats.py
deleted file mode 100644
index 761ef256d..000000000
--- a/transformers/llm/datasets/visualization/stats.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import matplotlib.pyplot as plt
-from matplotlib import colors
-from matplotlib.ticker import PercentFormatter
-from matplotlib import cbook
-from matplotlib.axes import Axes
-import pandas as pd
-import numpy as np
-import argparse
-import os
-
-vis_root = "pic"
-
-def remove_blanks(df: pd.DataFrame) -> pd.DataFrame:
-    # Removing unnamed columns using drop function
-    df.drop(df.columns[df.columns.str.contains(
-        'unnamed', case=False)], axis=1, inplace=True)
-    return df
-def add_turns(df: pd.DataFrame) -> pd.DataFrame:
-    df["turns"] = (1-df.isnull()).sum(axis=1) // 2
-    return df
-def get_max_turn(df: pd.DataFrame) -> int:
-    keys = list(df.keys())
-    return max([int(key.replace("decode", "")) for key in keys if "decode" in key]) + 1
-def add_pd_ratio(df: pd.DataFrame) -> pd.DataFrame:
-    max_turns = get_max_turn(df)
-    for i in range(max_turns):
-        df["pd_ratio{}".format(i)] = df["prefill{}".format(i)] / df["decode{}".format(i)]
-    return df 
-def preprocess(file_path: str) -> pd.DataFrame:
-    table = pd.read_csv(file_path)
-    table = remove_blanks(table)
-    table = add_turns(table)
-    table = add_pd_ratio(table)
-    print(table)
-    return table
-
-def draw_distribution(df: pd.DataFrame, file_path: str):
-    turns_bin = df.value_counts(subset=["turns"], sort=False)
-    print(turns_bin)
-    plt.close()
-    plt.rcParams['font.size'] = 10
-    _, ax = plt.subplots()
-    # N is the count in each bin, bins is the lower-limit of the bin
-    N, bins, patches = ax.hist(df["turns"], bins=get_max_turn(df), density=True, align="left", label=True)
-    # We'll color code by height, but you could use any scalar
-    fracs = N / N.max()
-    # we need to normalize the data to 0..1 for the full range of the colormap
-    norm = colors.Normalize(fracs.min(), fracs.max())
-    # Now, we'll loop through our objects and set the color of each accordingly
-    for thisfrac, thispatch in zip(fracs, patches):
-        color = plt.cm.viridis(norm(thisfrac))
-        thispatch.set_facecolor(color)
-    # Now we format the y-axis to display percentage
-    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
-    ax.set_xlim((0.5, get_max_turn(df)-0.5))
-    ax.set_xticks(np.arange(1,get_max_turn(df)+1),np.arange(1,get_max_turn(df)+1),rotation=60, fontsize=9)
-    ax.set_ylabel("frequency", fontsize=14)
-    ax.set_xlabel("num of turns", fontsize=14)
-    plt.savefig(file_path, dpi=600)
-    plt.close()
-
-def draw_prefill(df: pd.DataFrame, ax: Axes):
-    stats = [cbook.boxplot_stats(df[df["prefill{}".format(i)].notna()]["prefill{}".format(i)], labels=[i+1])[0]
-                 for i in range(get_max_turn(df))]
-    print(stats)
-    ax.bxp(stats, patch_artist=True, boxprops={'facecolor': 'bisque'}, flierprops=dict(marker='o', markersize=2))
-    ax.set_ylim(0,600)
-    ax.set_yticks(np.arange(0,700,100), np.arange(0,700,100), fontsize=9)
-    ax.set_ylabel("prefill", fontsize=12, rotation=90)
-    return
-def draw_decode(df: pd.DataFrame, ax: Axes):
-    stats = [cbook.boxplot_stats(df[df["decode{}".format(i)].notna()]["decode{}".format(i)], labels=[i+1])[0]
-                 for i in range(get_max_turn(df))]
-    print(stats)
-    ax.bxp(stats, patch_artist=True, boxprops={'facecolor': 'bisque'}, flierprops=dict(marker='o', markersize=2))
-    ax.set_ylim(0,600)
-    ax.set_yticks(np.arange(0,700,100), np.arange(0,700,100), fontsize=9)
-    ax.set_ylabel("decode", fontsize=12, rotation=90)
-    return
-def draw_pd_ratio(df: pd.DataFrame, ax: Axes):
-    stats = [cbook.boxplot_stats(df[df["pd_ratio{}".format(i)].notna()]["pd_ratio{}".format(i)], labels=[i+1])[0]
-                 for i in range(get_max_turn(df))]
-    print(stats)
-    ax.bxp(stats, patch_artist=True, boxprops={'facecolor': 'bisque'}, flierprops=dict(marker='o', markersize=2))
-    ax.plot(np.arange(0,get_max_turn(df)+2), np.ones_like(np.arange(0,get_max_turn(df)+2),dtype=float))
-    ax.set_xlim(0, get_max_turn(df)+1)
-    ax.set_ylim(0, 2.)
-    ax.set_xticks(np.arange(1,get_max_turn(df)), np.arange(1,get_max_turn(df)), rotation=60, fontsize=9)
-    ax.set_yticks([0,0.5,1,2], [0,0.5,1,2], fontsize=9)
-    ax.set_xlabel("round", fontsize=12)
-    ax.set_ylabel("prefill/decode", fontsize=12, rotation=90)
-    return
-def draw_reuse_kv(df: pd.DataFrame, file_path: str):
-    plt.close()
-    _, axs = plt.subplots(3,1,sharex="col")
-    draw_prefill(df, axs[0])
-    draw_decode(df, axs[1])
-    draw_pd_ratio(df, axs[2])
-    plt.savefig(file_path, dpi=1200)
-    plt.close()
-    return
-def draw_no_reuse_kv():
-    return
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--root", type=str, default="./data")
-    parser.add_argument("--name", type=str, default="shareGPT_dialog_stats_common_en.csv")
-    args = parser.parse_args()
-
-    file_path = os.path.join(args.root, args.name)
-    dist_path = os.path.join(vis_root, args.name.split('.')[0]+"_dist.png")
-    pd_dist_path = os.path.join(vis_root, args.name.split('.')[0]+"_pd_dist.png")
-    table = preprocess(file_path)
-    draw_distribution(table, dist_path)
-    draw_reuse_kv(table, pd_dist_path)
\ No newline at end of file
diff --git a/transformers/llm/datasets/visualization/time.py b/transformers/llm/datasets/visualization/time.py
deleted file mode 100644
index 27cc0069d..000000000
--- a/transformers/llm/datasets/visualization/time.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import matplotlib.pyplot as plt
-from matplotlib import colors
-from matplotlib.ticker import PercentFormatter
-from matplotlib import cbook
-from matplotlib.axes import Axes
-from typing import List, Dict, Tuple
-import pandas as pd
-import numpy as np
-import argparse
-import os
-import re
-from io import StringIO
-
-def split_by_turns(id: str, content: str) -> List[pd.DataFrame]:
-    pattern = "<{id}>\n(.*?)</{id}>\n".format(id=id)
-    return [pd.read_csv(StringIO(item)) for item in re.findall(pattern, content, flags=re.DOTALL)]
-def preprocess(file_path: str) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
-    content = open(file_path, "rt").read()
-    return split_by_turns("prefill", content), split_by_turns("decode", content)
-def get_max_turn(no_reuse_prefill_record):
-    return max(10, max([len(record) for record in no_reuse_prefill_record]))
-def draw_history_len(ax: Axes, no_reuse_prefill_record:  List[pd.DataFrame]):
-    max_round = get_max_turn(no_reuse_prefill_record)
-    history_len = [0 for _ in range(0, max_round)]
-    for turn in range(0, max_round):
-        history_len[turn] = np.median([record["input_token"][turn] - record["prompt_token"][turn]
-                                     for record in no_reuse_prefill_record if len(record)>=turn+1]).item()
-    plt.plot(np.arange(1, max_round+1), history_len, label="median history len", marker=".", markersize=8)
-    return
-def draw_prefill_bar_chat(ax: Axes, no_reuse, reuse):
-    offset = 0.2
-    max_round = len(no_reuse)
-    no_reuse_med = [np.median(turn) for turn in no_reuse]
-    rects = ax.bar(np.arange(1,max_round+1) + offset, no_reuse_med, offset*2, label="no reuse kv", color="tomato")
-    ax.bar_label(rects, fmt="{:.2f}", padding=4, fontsize=6)
-    reuse_med = [np.median(turn) for turn in reuse]
-    rects = ax.bar(np.arange(1,max_round+1) - offset, reuse_med, offset*2, label="reuse kv", color="springgreen")
-    ax.bar_label(rects, fmt="{:.2f}", padding=4, fontsize=6)
-    return
-def compare_prefill_reuse_kv(no_reuse_prefill_record: List[pd.DataFrame],
-                             reuse_prefill_record: List[pd.DataFrame]):
-    plt.close()
-    _,ax1 = plt.subplots()
-    ax2 = ax1.twinx()
-    # plot history_len
-    draw_history_len(ax2, no_reuse_prefill_record)
-    # calculate per turn 
-    max_round = get_max_turn(no_reuse_prefill_record)
-    no_reuse = [[] for _ in range(0, max_round)]
-    for turn in range(0, max_round):
-        no_reuse[turn] = [record["response_speed"][turn] for record in no_reuse_prefill_record if len(record)>=turn+1]
-    reuse = [[] for _ in range(0, max_round)]
-    for turn in range(0, max_round):
-        reuse[turn] = [record["response_speed"][turn] for record in reuse_prefill_record if len(record)>=turn+1]
-    # plot the bar chat (with error bar)
-    draw_prefill_bar_chat(ax1, no_reuse, reuse)
-    ax1.set_xticks(np.arange(1,max_round+1),np.arange(1,max_round+1),fontsize=9)
-    ax1.set_ylim(0,100)
-    ax2.set_ylim(0,1000)
-    ax1.legend(loc='upper left', title="prefill response speed")
-    ax2.legend(loc='upper right')
-    ax1.set_ylabel("prefill\nresponse\nspeed", rotation=0, labelpad=12)
-    ax2.set_ylabel("history\nlen", rotation=0, labelpad=8)
-    ax1.set_xlabel("round")
-    plt.title("KV cache reuse for multi-turn chat\neffects on ShareGPT")
-    plt.tight_layout() 
-    plt.savefig("./pic/fig.png",dpi=1200)
-    plt.close()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--root", type=str, default="./data")
-    parser.add_argument("--no_reuse", type=str, default="shareGPT_common_en_70k_noreuse.txt")
-    parser.add_argument("--reuse", type=str, default="shareGPT_common_en_70k_reuse.txt")
-    args = parser.parse_args()
-
-    no_reuse_file_path = os.path.join(args.root, args.no_reuse)
-    reuse_file_path = os.path.join(args.root, args.reuse)
-    no_reuse_prefill_record, no_reuse_decode_record = preprocess(no_reuse_file_path)
-    reuse_prefill_record, reuse_decode_record = preprocess(reuse_file_path)
-    # visualize prefill
-    compare_prefill_reuse_kv(no_reuse_prefill_record, reuse_prefill_record)
diff --git a/transformers/llm/engine/CMakeLists.txt b/transformers/llm/engine/CMakeLists.txt
index b7ccf1139..6fb3b9351 100644
--- a/transformers/llm/engine/CMakeLists.txt
+++ b/transformers/llm/engine/CMakeLists.txt
@@ -1,11 +1,17 @@
 option(LLM_SUPPORT_VISION "Llm model support vision input." OFF)
+option(LLM_SUPPORT_AUDIO "Llm model support audio input." OFF)
 
-if (LLM_SUPPORT_VISION)
-    add_definitions(-DLLM_SUPPORT_VISION)
+
+if (LLM_SUPPORT_VISION AND MNN_BUILD_OPENCV)
     list(APPEND MNN_DEPS MNNOpenCV)
     include_directories(${CMAKE_SOURCE_DIR}/tools/cv/include/)
 endif()
 
+if (LLM_SUPPORT_AUDIO AND MNN_BUILD_AUDIO)
+    list(APPEND MNN_DEPS MNNAudio)
+    include_directories(${CMAKE_SOURCE_DIR}/tools/audio/include/)
+endif()
+
 # include dir
 include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
 
@@ -25,15 +31,19 @@ else()
     add_library(llm OBJECT ${SRCS})
 endif()
 
-add_executable(llm_demo ${CMAKE_CURRENT_LIST_DIR}/app/llm_demo.cpp)
-add_executable(ppl_demo ${CMAKE_CURRENT_LIST_DIR}/app/ppl_demo.cpp)
-add_executable(embedding_demo ${CMAKE_CURRENT_LIST_DIR}/app/embedding_demo.cpp)
+if (LLM_SUPPORT_VISION AND MNN_BUILD_OPENCV)
+    target_compile_definitions(llm PRIVATE LLM_SUPPORT_VISION)
+endif()
+if (LLM_SUPPORT_AUDIO AND MNN_BUILD_AUDIO)
+    target_compile_definitions(llm PRIVATE LLM_SUPPORT_AUDIO)
+endif()
+
+add_executable(llm_demo ${CMAKE_CURRENT_LIST_DIR}/llm_demo.cpp)
+add_executable(embedding_demo ${CMAKE_CURRENT_LIST_DIR}/embedding_demo.cpp)
 IF (NOT MNN_SEP_BUILD)
     target_link_libraries(llm_demo ${MNN_DEPS})
-    target_link_libraries(ppl_demo ${MNN_DEPS})
     target_link_libraries(embedding_demo ${MNN_DEPS})
 ELSE ()
     target_link_libraries(llm_demo ${MNN_DEPS} llm)
-    target_link_libraries(ppl_demo ${MNN_DEPS} llm)
     target_link_libraries(embedding_demo ${MNN_DEPS} llm)
 ENDIF ()
diff --git a/transformers/llm/engine/app/ppl_demo.cpp b/transformers/llm/engine/app/ppl_demo.cpp
deleted file mode 100644
index 393b5b86d..000000000
--- a/transformers/llm/engine/app/ppl_demo.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//
-//  ppl_demo.cpp
-//
-//  Created by MNN on 2023/03/24.
-//  ZhaodeWang
-//
-
-#include "llm/llm.hpp"
-#define MNN_OPEN_TIME_TRACE
-#include <MNN/AutoTime.hpp>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <stdlib.h>
-#include <MNN/expr/Executor.hpp>
-#include <MNN/expr/ExecutorScope.hpp>
-using namespace MNN::Transformer;
-static void trace_prepare(Llm* llm) {
-    MNN_PRINT("Prepare for resize opt Begin\n");
-    llm->trace(true);
-    std::ostringstream cacheOs;
-    llm->response("Hello", &cacheOs);
-    MNN_PRINT("Prepare for resize opt End\n");
-    llm->trace(false);
-    llm->reset();
-}
-
-// parse json
-
-static int ppl_eval(Llm* llm, std::string prompt_file, std::ofstream* perfOS) {
-    std::cout << "prompt file is " << prompt_file << std::endl;
-    // ppl evaluation
-    std::vector<float> ppls = llm->perplexity(prompt_file, perfOS);
-    float mean_ppl = 0.f;
-    for (int j = 0; j < ppls.size(); ++j) mean_ppl += ppls[j];
-    mean_ppl /= ppls.size();
-    std::cout << mean_ppl << std::endl;
-    return 0;
-}
-
-int main(int argc, const char* argv[]) {
-    if (argc < 3) {
-        std::cout << "Usage: " << argv[0] << " config.json ppl-prompt.txt [perf.txt]" << std::endl;
-        return 0;
-    }
-    std::string config_path = argv[1];
-    std::cout << "config path is " << config_path << std::endl;
-    std::unique_ptr<Llm> llm(Llm::createLLM(config_path));
-    {
-        AUTOTIME;
-        llm->load();
-    }
-    {
-        AUTOTIME;
-        trace_prepare(llm.get());
-    }
-    std::string prompt_file = argv[2];
-    std::unique_ptr<std::ofstream> perfOS(nullptr);
-    if (argc == 4) { perfOS.reset(new std::ofstream(argv[3])); }
-    return ppl_eval(llm.get(), prompt_file, perfOS.get());
-}
diff --git a/transformers/llm/engine/app/embedding_demo.cpp b/transformers/llm/engine/embedding_demo.cpp
similarity index 100%
rename from transformers/llm/engine/app/embedding_demo.cpp
rename to transformers/llm/engine/embedding_demo.cpp
diff --git a/transformers/llm/engine/include/evaluation/MemMonitor.hpp b/transformers/llm/engine/include/evaluation/MemMonitor.hpp
deleted file mode 100644
index 7750eb56c..000000000
--- a/transformers/llm/engine/include/evaluation/MemMonitor.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef MEMMONITOR_hpp
-#define MEMMONITOR_hpp
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <time.h>
-
-#define BUFFER_SIZE 256
-
-struct MemoryInfo {
-    // in MB
-    float total_phys_mem;
-    float free_phys_mem;
-    float total_swap;
-    float free_swap;
-    float process_resident_set_size;
-    float process_swap;
-    float process_virtual_mem_total;
-    float process_virtual_mem_used;
-};
-
-
-#if defined(__ANDROID__) || defined(linux) || defined(__APPLE__) || defined(__MACOSX)
-#define SELF_FILE "/proc/self/status"
-#define MEMINFO_FILE "/proc/meminfo"
-#endif // linux
-
-int readMemInfo(MemoryInfo *mem_info);
-
-int readProcStatus(MemoryInfo *mem_info);
-
-void printMemoryInfo(const MemoryInfo *mem_info);
-
-float getSysMemInc(MemoryInfo* prev, MemoryInfo* now);
-
-float getProcMem(MemoryInfo* info);
-
-#endif
\ No newline at end of file
diff --git a/transformers/llm/engine/include/evaluation/dataset.hpp b/transformers/llm/engine/include/evaluation/dataset.hpp
deleted file mode 100644
index b9585fe71..000000000
--- a/transformers/llm/engine/include/evaluation/dataset.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef LLM_DATASET_hpp
-#define LLM_DATASET_hpp
-
-#include <vector>
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <rapidjson/document.h>
-#include <rapidjson/writer.h>
-#include <rapidjson/stringbuffer.h>
-#include "llm/llm.hpp"
-
-#include <MNN/MNNDefine.h>
-
-namespace MNN {
-namespace Transformer {
-
-
-// parse csv
-MNN_PUBLIC std::vector<std::vector<std::string>> parse_csv(const std::vector<std::string>& lines);
-void parse_jsonl(std::string prompt_file, std::vector<std::vector<std::vector<PromptItem>>>& dialogs);
-
-std::string getPPLType(std::string dataset_name);
-std::vector<std::string> rowsplit(std::string prompt_file);
-std::vector<std::string> plaintext(std::string prompt_file);
-std::vector<std::string> wikitext(std::string prompt_file);
-std::vector<std::vector<std::vector<PromptItem>>> shareGPT(std::string prompt_file, int sample_size=-1); // -1: no sampling
-
-} // Transformer
-} // MNN
-
-#endif // LLM_DATASET_hpp
\ No newline at end of file
diff --git a/transformers/llm/engine/include/evaluation/evaluation.hpp b/transformers/llm/engine/include/evaluation/evaluation.hpp
deleted file mode 100644
index 9ecf0c335..000000000
--- a/transformers/llm/engine/include/evaluation/evaluation.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-
-
-#ifndef TRANSFORMER_EVALUATION_hpp
-#define TRANSFORMER_EVALUATION_hpp
-
-#include <vector>
-#include <cstddef>
-#include "MemMonitor.hpp"
-
-namespace MNN {
-namespace Transformer {
-
-#define MICRO_TO_MILLI 1e-3f
-#define MILLI_TO_MICRO 1000
-#define MICRO_TO_SEC 1e-6f
-#define SEC_TO_MICRO 1000000
-
-#define MEGA_TO_GIGA (1/1024.f)
-#define GIGA_TO_MEGA 1024.f
-#define KILLO_TO_GIGA (1/1024.f/1024.f)
-#define GIGA_TO_KILLO (1024.f*1024.f)
-#define KILLO_TO_MEGA (1/1024.f)
-#define MEGA_TO_KILLO 1024.f
-#define BYTE_TO_MEGA (1/1024.f/1024.f)
-#define MEGA_TO_BYTE (1024.f*1024.f)
-
-struct PrefillTimePerformance {
-    size_t prefill_prev_token_ = 0;
-    size_t prefill_token_ = 0;
-    size_t prefill_us_ = 0;
-};
-
-struct DecodeTimePerformance {
-    size_t decode_prev_token_ = 0;
-    size_t decode_us_ = 0;
-};
-
-struct TimePerformance {
-    std::vector<PrefillTimePerformance> prefill_record_;
-    std::vector<DecodeTimePerformance> decode_record_;
-    std::vector<int> prompt_record_;
-};
-
-void appendNewPromptRecord(struct TimePerformance* perf, int input_len, bool reuse_kv);
-
-struct PrefillMemPerformance {
-    size_t prefill_prev_token_ = 0;
-    size_t prefill_token_ = 0;
-    float prefill_MB_ = 0;
-};
-
-struct DecodeMemPerformance {
-    size_t decode_prev_token_ = 0;
-    float decode_MB_ = 0;
-};
-
-struct MemPerformance {
-    std::vector<PrefillMemPerformance> prefill_record_;
-    std::vector<DecodeMemPerformance> decode_record_;
-};
-
-void mergePerformance(struct TimePerformance* dst, struct TimePerformance* src);
-void mergePerformance(struct MemPerformance* dst, struct MemPerformance* src);
-void clearPerformance(struct TimePerformance* perf);
-void clearPerformance(struct MemPerformance* perf);
-} // namespace Transformer
-} // namespace MNN
-#endif // TRANSFORMER_EVALUATION_hpp
\ No newline at end of file
diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp
index b0d970aa0..f8b45788a 100644
--- a/transformers/llm/engine/include/llm/llm.hpp
+++ b/transformers/llm/engine/include/llm/llm.hpp
@@ -18,7 +18,6 @@
 #include <functional>
 #include <unordered_map>
 
-#include "evaluation/evaluation.hpp"
 #include <MNN/expr/Expr.hpp>
 #include <MNN/expr/Module.hpp>
 #include <MNN/expr/MathOp.hpp>
@@ -29,41 +28,6 @@ namespace Transformer {
 class Tokenizer;
 class Pipeline;
 class LlmConfig;
-class Sampler;
-class PromptLib;
-struct TimePerformance;
-
-
-// <role, content>
-#define PromptItem std::pair<std::string, std::string>
-
-class MNN_PUBLIC LlmSessionInfo {
-public:
-    // Llm::forward needs, for mask and embedding.
-    int all_seq_len_=0, gen_seq_len_=0;
-    // Sampler needs
-    std::vector<int> tokens;
-    // PromptLib needs
-    std::vector<PromptItem> mHistory;
-    std::vector<PromptItem> mInputs;
-    // Performance needs
-    struct TimePerformance mTimePerformance;
-public:
-    LlmSessionInfo():all_seq_len_(0),gen_seq_len_(0){}
-    void resetSamplerFields();
-    void resetPromptFields();
-    void resetPerformanceFields();
-    void print_speed(std::ostream* os);
-    float average_total_speed();
-    float average_prefill_speed();
-    float average_decode_speed();
-    float getTotalPrefillTime();
-    float getTotalDecodeTime();
-    int getTotalPromptLen();
-    int getTotalDecodeLen();
-};
-
-
 class DiskEmbedding;
 
 enum TuneType {
@@ -72,30 +36,26 @@ enum TuneType {
 };
 
 class MNN_PUBLIC Llm {
-public:
-    std::shared_ptr<Sampler> mSampler;
-    std::shared_ptr<PromptLib> mPromptLib;
-    std::vector<LlmSessionInfo> mLlmSessionInfos; // Llm conversation session information. Currently, only mLlmSessionInfos[0] is allowed!
+    using PromptItem = std::pair<std::string, std::string>; // <role, content>
 public:
     Llm(std::shared_ptr<LlmConfig> config) : config_(config) {}
     virtual ~Llm();
     static Llm* createLLM(const std::string& config_path);
-    void chat(bool session_by_line = false, bool from_file = false, 
-              std::istream* is = &std::cin, std::ostream* os = &std::cout, 
-              const char* end_with = "\n", std::string exit_prompt = "/exit", std::string reset_token = "/reset");
+    void chat();
     void reset();
     void trace(bool start);
     void tuning(TuneType type, std::vector<int> candidates);
     virtual void load();
-    MNN::Express::VARP forward(const std::vector<int>& input_ids, bool is_prefill=true);
+    MNN::Express::VARP forward(const std::vector<int>& input_ids);
+    int sample(MNN::Express::VARP logits, const std::vector<int>& pre_ids);
+    std::string apply_prompt_template(const std::string& user_content) const;
+    std::string apply_chat_template(const std::vector<PromptItem>& chat_prompts) const;
     std::string response(const std::string& user_content, std::ostream* os = &std::cout, const char* end_with = nullptr);
-    std::string generate(const std::string& prompt, std::ostream* os = &std::cout, const char* end_with = "\n");
-    std::string generate(const std::vector<int>& input_ids, std::ostream* os = &std::cout, const char* end_with = "\n");
+    std::string response(const std::vector<PromptItem>& chat_prompts, std::ostream* os = &std::cout, const char* end_with = nullptr);
     void generate_init();
-    std::string generateTrace(const std::vector<int>& input_ids, std::ostream* os, const char* end_with);
+    std::string generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with);
+    std::vector<int> generate(const std::vector<int>& input_ids, int max_new_tokens = -1);
     void print_speed();
-    void print_speed(std::ostream* os);
-    std::vector<float> perplexity(std::string prompt_file, std::ostream* statsOS = nullptr);
     // config function
     std::string dump_config();
     bool set_config(const std::string& content);
@@ -110,18 +70,18 @@ class MNN_PUBLIC Llm {
     virtual std::vector<int> tokenizer_encode(const std::string& query, bool use_template = true);
     friend class Pipeline;
 public:
+    // forward info
+    int prompt_len_ = 0;
+    int gen_seq_len_ = 0;
+    int all_seq_len_ = 0;
+    std::vector<int> history_ids_;
+    // time
+    int64_t vision_us_ = 0;
+    int64_t audio_us_ = 0;
+    int64_t prefill_us_ = 0;
+    int64_t decode_us_ = 0;
     bool is_single_ = true;
     bool attention_fused_ = true;
-    bool reuse_kv() const;
-public:
-    // time profile
-    float average_total_speed();
-    float average_prefill_speed();
-    float average_decode_speed();
-    float getTotalPrefillTime();
-    float getTotalDecodeTime();
-    int getTotalPromptLen();
-    int getTotalDecodeLen();
 protected:
     std::shared_ptr<LlmConfig> config_;
     std::shared_ptr<Tokenizer> tokenizer_;
@@ -130,6 +90,7 @@ class MNN_PUBLIC Llm {
     std::vector<MNN::Express::VARP> past_key_values_;
     MNN::Express::VARP inputs_embeds_, attention_mask_, position_ids_;
     std::shared_ptr<MNN::Express::Executor::RuntimeManager> runtime_manager_;
+    std::shared_ptr<MNN::Express::Executor::RuntimeManager> mllm_runtime_manager_;
     std::vector<std::shared_ptr<MNN::Express::Module>> modules_;
     std::vector<std::shared_ptr<MNN::Express::Module>> prefill_modules_, decode_modules_, current_modules_;
     const MNN::Express::Module* base_module_ = nullptr;
@@ -138,10 +99,6 @@ class MNN_PUBLIC Llm {
     virtual MNN::Express::VARP gen_attention_mask(int seq_len);
     virtual MNN::Express::VARP gen_position_ids(int seq_len);
     bool mTracing = false;
-protected:
-    bool getUserPrompt(bool from_file, std::istream* is, std::string& user_str);
-    void chat_init();
-    void chat_reset();
 };
 
 // Embedding start
diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
index 5db6a9c81..4561338f8 100644
--- a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
+++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
@@ -89,16 +89,26 @@ - (void)processInput:(NSString *)input withStreamHandler:(StreamOutputHandler)ha
             }
             prompts.push_back(prompt);
         }
+        int prompt_len = 0;
+        int decode_len = 0;
+        int64_t prefill_time = 0;
+        int64_t decode_time = 0;
         for (int i = 0; i < prompts.size(); i++) {
             llm->response(prompts[i], &os, "\n");
+            prompt_len += llm->prompt_len_;
+            decode_len += llm->gen_seq_len_;
+            prefill_time += llm->prefill_us_;
+            decode_time += llm->decode_us_;
         }
+        float prefill_s = prefill_time / 1e6;
+        float decode_s = decode_time / 1e6;
         os << "\n#################################\n"
-           << "prompt tokens num  = " << llm->getTotalPromptLen() << "\n"
-           << "decode tokens num  = " << llm->getTotalDecodeLen() << "\n"
-           << "prefill time = " << std::fixed << std::setprecision(2) << llm->getTotalPrefillTime() << " s\n"
-           << " decode time = " << std::fixed << std::setprecision(2) << llm->getTotalDecodeTime() << " s\n"
-           << "prefill speed = " << std::fixed << std::setprecision(2) << llm->average_prefill_speed() << " tok/s\n"
-           << " decode speed = " << std::fixed << std::setprecision(2) << llm->average_decode_speed() << " tok/s\n"
+           << "prompt tokens num  = " << prompt_len << "\n"
+           << "decode tokens num  = " << decode_len << "\n"
+           << "prefill time = " << std::fixed << std::setprecision(2) << prefill_s << " s\n"
+           << " decode time = " << std::fixed << std::setprecision(2) << decode_s << " s\n"
+           << "prefill speed = " << std::fixed << std::setprecision(2) << prompt_len / prefill_s << " tok/s\n"
+           << " decode speed = " << std::fixed << std::setprecision(2) << decode_len / decode_s << " tok/s\n"
            << "##################################\n";
         os << "<eop>";
     } else {
diff --git a/transformers/llm/engine/app/llm_demo.cpp b/transformers/llm/engine/llm_demo.cpp
similarity index 62%
rename from transformers/llm/engine/app/llm_demo.cpp
rename to transformers/llm/engine/llm_demo.cpp
index 55a1b09df..8083cf881 100644
--- a/transformers/llm/engine/app/llm_demo.cpp
+++ b/transformers/llm/engine/llm_demo.cpp
@@ -6,24 +6,21 @@
 //
 
 #include "llm/llm.hpp"
-#include "evaluation/dataset.hpp"
 #define MNN_OPEN_TIME_TRACE
 #include <MNN/AutoTime.hpp>
 #include <MNN/expr/ExecutorScope.hpp>
 #include <fstream>
 #include <sstream>
 #include <stdlib.h>
-#include <initializer_list>
 using namespace MNN::Transformer;
 
 static void trace_prepare(Llm* llm) {
     MNN_PRINT("Prepare for resize opt Begin\n");
     llm->trace(true);
     std::ostringstream cacheOs;
-    llm->generate(std::initializer_list<int>{200, 200}, &cacheOs, "");
+    llm->generate({200, 200}, &cacheOs, "");
     MNN_PRINT("Prepare for resize opt End\n");
     llm->trace(false);
-    llm->reset();
 }
 
 static void tuning_prepare(Llm* llm) {
@@ -32,7 +29,57 @@ static void tuning_prepare(Llm* llm) {
     MNN_PRINT("Prepare for tuning opt End\n");
 }
 
+// Parse CSV text, supplied as physical lines, into rows of cells.
+// Returns one vector of cells per record, in input order.
+// Commas and newlines inside double quotes are kept as cell data, so a quoted
+// cell may span several entries of `lines`; a doubled quote ("") inside a
+// quoted cell yields one literal quote character.
+// A trailing record whose opening quote is never closed is not emitted.
+std::vector<std::vector<std::string>> parse_csv(const std::vector<std::string>& lines) {
+    std::vector<std::vector<std::string>> csv_data;
+    std::vector<std::string> row;
+    std::string cell;
+    bool insideQuotes = false;
+
+    // Join the lines into one buffer; every line is terminated with '\n'.
+    std::string content;
+    for (const auto& line : lines) {
+        content += line;
+        content += '\n';
+    }
+
+    for (size_t i = 0; i < content.size(); ++i) {
+        const char c = content[i];
+        if (c == '"') {
+            if (insideQuotes && i + 1 < content.size() && content[i + 1] == '"') {
+                cell += '"'; // escaped quote
+                ++i;         // skip the second quote of the pair
+            } else {
+                insideQuotes = !insideQuotes; // open or close a quoted section
+            }
+        } else if (c == ',' && !insideQuotes) { // end of cell
+            row.push_back(cell);
+            cell.clear();
+        } else if (c == '\n' && !insideQuotes) { // end of record
+            row.push_back(cell);
+            csv_data.push_back(row);
+            cell.clear();
+            row.clear();
+        } else {
+            cell += c; // ordinary character, or ',' / '\n' inside quotes
+        }
+    }
+    return csv_data;
+}
+
 static int benchmark(Llm* llm, const std::vector<std::string>& prompts) {
+    int prompt_len = 0;
+    int decode_len = 0;
+    int64_t vision_time = 0;
+    int64_t audio_time = 0;
+    int64_t prefill_time = 0;
+    int64_t decode_time = 0;
+    // llm->warmup();
     for (int i = 0; i < prompts.size(); i++) {
         const auto& prompt = prompts[i];
         // prompt start with '#' will be ignored
@@ -40,14 +87,26 @@ static int benchmark(Llm* llm, const std::vector<std::string>& prompts) {
             continue;
         }
         llm->response(prompt);
+        prompt_len += llm->prompt_len_;
+        decode_len += llm->gen_seq_len_;
+        vision_time += llm->vision_us_;
+        audio_time += llm->audio_us_;
+        prefill_time += llm->prefill_us_;
+        decode_time += llm->decode_us_;
     }
+    float vision_s = vision_time / 1e6;
+    float audio_s = audio_time / 1e6;
+    float prefill_s = prefill_time / 1e6;
+    float decode_s = decode_time / 1e6;
     printf("\n#################################\n");
-    printf("prompt tokens num  = %d\n", llm->getTotalPromptLen());
-    printf("decode tokens num  = %d\n", llm->getTotalDecodeLen());
-    printf("prefill time = %.2f s\n", llm->getTotalPrefillTime());
-    printf(" decode time = %.2f s\n", llm->getTotalDecodeTime());
-    printf("prefill speed = %.2f tok/s\n", llm->average_prefill_speed());
-    printf(" decode speed = %.2f tok/s\n", llm->average_decode_speed());
+    printf("prompt tokens num = %d\n", prompt_len);
+    printf("decode tokens num = %d\n", decode_len);
+    printf(" vision time = %.2f s\n", vision_s);
+    printf("  audio time = %.2f s\n", audio_s);
+    printf("prefill time = %.2f s\n", prefill_s);
+    printf(" decode time = %.2f s\n", decode_s);
+    printf("prefill speed = %.2f tok/s\n", prompt_len / prefill_s);
+    printf(" decode speed = %.2f tok/s\n", decode_len / decode_s);
     printf("##################################\n");
     return 0;
 }
diff --git a/transformers/llm/engine/test/bench_cn.txt b/transformers/llm/engine/model/bench.txt
similarity index 100%
rename from transformers/llm/engine/test/bench_cn.txt
rename to transformers/llm/engine/model/bench.txt
diff --git a/transformers/llm/engine/src/LlmSessionInfo.cpp b/transformers/llm/engine/src/LlmSessionInfo.cpp
deleted file mode 100644
index a46dc5d8c..000000000
--- a/transformers/llm/engine/src/LlmSessionInfo.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-
-#include "llm/llm.hpp"
-
-namespace MNN {
-namespace Transformer {
-
-// LlmSessionInfo starts
-void LlmSessionInfo::resetSamplerFields() {
-    all_seq_len_ = 0;
-    gen_seq_len_ = 0;
-    tokens.clear();
-}
-void LlmSessionInfo::resetPromptFields() {
-    mHistory.clear();
-    mInputs.clear();
-}
-void LlmSessionInfo::resetPerformanceFields() {
-    clearPerformance(&mTimePerformance);
-}
-float LlmSessionInfo::average_total_speed() {
-    return (getTotalPromptLen()+getTotalDecodeLen())/(getTotalPrefillTime()+getTotalDecodeTime());
-}
-float LlmSessionInfo::average_prefill_speed() {
-    // prefill response rate
-    return getTotalPromptLen()/getTotalPrefillTime();
-}
-float LlmSessionInfo::average_decode_speed() {
-    return getTotalDecodeLen()/getTotalDecodeTime();
-}
-float LlmSessionInfo::getTotalPrefillTime() {
-    float sum = 0.f;
-    for (auto record : mTimePerformance.prefill_record_) {
-        sum += ((float)record.prefill_us_)*MICRO_TO_SEC;
-    }
-    return sum;
-}
-float LlmSessionInfo::getTotalDecodeTime() {
-    float sum = 0.0f;
-    for (auto record : mTimePerformance.decode_record_) {
-        sum += ((float)record.decode_us_)*MICRO_TO_SEC;
-    }
-    return sum;
-}
-int LlmSessionInfo::getTotalPromptLen() {
-    int prompt_len = 0;
-    if (mTimePerformance.prefill_record_.size() != mTimePerformance.prompt_record_.size()) {
-        for (auto record : mTimePerformance.prefill_record_) {
-            prompt_len += record.prefill_token_;
-        }
-    } else {
-        for (int r=0; r < mTimePerformance.prompt_record_.size(); ++r) {
-            prompt_len += mTimePerformance.prompt_record_[r];
-        }
-    } 
-    return prompt_len;
-}
-int LlmSessionInfo::getTotalDecodeLen() {
-    return mTimePerformance.decode_record_.size();
-}
-void LlmSessionInfo::print_speed(std::ostream* os) {
-    // prefill statistics
-    (*os) << "<prefill>" << std::endl;
-    if (mTimePerformance.prefill_record_.size() != mTimePerformance.prompt_record_.size()) {
-        (*os) << "prev_token,input_token,response_speed" << std::endl;
-        for (auto record : mTimePerformance.prefill_record_) {
-            (*os) << record.prefill_prev_token_ << "," << record.prefill_token_ << "," << record.prefill_token_/(((float)record.prefill_us_)*MICRO_TO_SEC) << std::endl;
-        }
-    } else {
-        (*os) << "prev_token,input_token,prompt_token,response_speed" << std::endl;
-        for (int r=0; r < mTimePerformance.prompt_record_.size(); ++r) {
-            auto record = mTimePerformance.prefill_record_[r];
-            auto prompt_len = mTimePerformance.prompt_record_[r];
-            (*os) << record.prefill_prev_token_ << "," << record.prefill_token_ << "," << prompt_len << "," << prompt_len/(((float)record.prefill_us_)*MICRO_TO_SEC) << std::endl;
-        }
-    }
-    (*os) << "</prefill>" << std::endl;
-    // decode statistics
-    (*os) << "<decode>" << std::endl;
-    (*os) << "prev_token,response_speed" << std::endl;
-    for (auto record : mTimePerformance.decode_record_) {
-        (*os) << record.decode_prev_token_ << "," << 1./(((float)record.decode_us_)*MICRO_TO_SEC) << std::endl;
-    }
-    (*os) << "</decode>" << std::endl;
-}
-
-} // Transformer
-} // MNN
\ No newline at end of file
diff --git a/transformers/llm/engine/src/dataset.cpp b/transformers/llm/engine/src/dataset.cpp
deleted file mode 100644
index c4d0db45a..000000000
--- a/transformers/llm/engine/src/dataset.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-#include <algorithm>
-#include <vector>
-#include <cmath>
-#include <llm/llm.hpp>
-#include <iostream>
-#include <fstream>
-#include <iomanip>
-#include <string>
-#include <iterator>
-#include <random>
-#include "evaluation/dataset.hpp"
-#include <rapidjson/document.h>
-#include <rapidjson/writer.h>
-#include <rapidjson/stringbuffer.h>
-
-namespace MNN {
-namespace Transformer {
-
-
-// parse file
-// csv json
-
-// parse csv
-std::vector<std::vector<std::string>> parse_csv(const std::vector<std::string>& lines) {
-    std::vector<std::vector<std::string>> csv_data;
-    std::string line;
-    std::vector<std::string> row;
-    std::string cell;
-    bool insideQuotes = false;
-    bool startCollecting = false;
-
-    // content to stream
-    std::string content = "";
-    for (auto line : lines) {
-        content = content + line + "\n";
-    }
-    std::istringstream stream(content);
-
-    while (stream.peek() != EOF) {
-        char c = stream.get();
-        if (c == '"') {
-            if (insideQuotes && stream.peek() == '"') { // quote
-                cell += '"';
-                stream.get(); // skip quote
-            } else {
-                insideQuotes = !insideQuotes; // start or end text in quote
-            }
-            startCollecting = true;
-        } else if (c == ',' && !insideQuotes) { // end element, start new element
-            row.push_back(cell);
-            cell.clear();
-            startCollecting = false;
-        } else if ((c == '\n' || stream.peek() == EOF) && !insideQuotes) { // end line
-            row.push_back(cell);
-            csv_data.push_back(row);
-            cell.clear();
-            row.clear();
-            startCollecting = false;
-        } else {
-            cell += c;
-            startCollecting = true;
-        }
-    }
-    return csv_data;
-}
-
-// dialog, turn, 
-void parse_jsonl(std::string prompt_file, std::vector<std::vector<std::vector<PromptItem>>>& dialogs) {
-    std::ifstream prompt_fs(prompt_file);
-    std::string prompt;
-    while(std::getline(prompt_fs, prompt)) {
-        rapidjson::Document document;
-        document.Parse(prompt.c_str());
-        std::vector<std::vector<PromptItem>> cnv; 
-        if(document.HasMember("conversation")) {
-            auto& value = document["conversation"];
-            if (value.IsArray()) {
-                for (auto& v : value.GetArray()) {
-                    if (v.IsObject()) {
-                        std::vector<PromptItem> result;
-                        for (auto itr = v.MemberBegin(); itr != v.MemberEnd(); ++itr) {
-                            // {"human"/"user": , "assistant": }
-                            result.push_back(std::make_pair(itr->name.GetString(), itr->value.GetString()));
-                        }
-                        cnv.push_back(result);
-                    }
-                }
-            }
-        }
-        dialogs.push_back(cnv);
-    }
-}
-
-void write_jsonl(std::string prompt_file, const std::vector<std::vector<std::vector<PromptItem>>>& dialogs) {
-    std::ofstream prompt_fs(prompt_file);
-    for(auto& dialog : dialogs) {
-        rapidjson::Document document;
-        document.SetObject();
-        rapidjson::Value conversation(rapidjson::kArrayType);
-        conversation.SetArray();
-        for (auto& turn : dialog) {
-            rapidjson::Value sentence(rapidjson::kObjectType);
-            sentence.SetObject();
-            for (auto& role : turn) {
-                sentence.AddMember(rapidjson::Value(role.first.c_str(), document.GetAllocator()),
-                                     rapidjson::Value(role.second.c_str(), document.GetAllocator()), document.GetAllocator());
-            }
-            conversation.PushBack(sentence, document.GetAllocator());
-        }
-        document.AddMember("conversation", conversation, document.GetAllocator());
-        // write to file
-        rapidjson::StringBuffer buffer;
-        rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
-        document.Accept(writer);
-        prompt_fs << buffer.GetString() << std::endl;
-    }
-}
-
-
-// dataset
-// wikitext, ShareGPT
-
-std::string getPPLType(std::string dataset_name) {
-    if (dataset_name == "wikitext" 
-        || dataset_name == "plaintext"
-        || dataset_name == "rowsplit") {
-        return "text";
-    } else if (dataset_name == "shareGPT") {
-        return "chat";
-    } else {
-        // default chat
-        return "chat";
-    }
-}
-
-std::vector<std::string> plaintext(std::string prompt_file) {
-    // split by line
-    std::ifstream prompt_fs(prompt_file);
-    std::vector<std::string> prompts;
-    std::string prompt;
-    prompts.push_back("");
-    while (std::getline(prompt_fs, prompt)) {
-        if (prompt.back() == '\r' || prompt.back() == '\n') {
-            prompt.pop_back();
-        }
-        // concatenate.
-        prompts.back() += prompt + "\n";
-    }
-    return prompts;
-}
-
-std::vector<std::string> rowsplit(std::string prompt_file) {
-    // split by line
-    std::ifstream prompt_fs(prompt_file);
-    std::vector<std::string> prompts;
-    std::string prompt;
-    while (std::getline(prompt_fs, prompt)) {
-        if (prompt.back() == '\r' || prompt.back() == '\n') {
-            prompt.pop_back();
-        }
-        prompts.push_back(prompt);
-    }
-    return prompts;
-}
-
-// wikitext
-void removeSubstrs(std::string& s, std::string p) { 
-    std::string::size_type n = p.length();
-    for (std::string::size_type i = s.find(p); i != std::string::npos; i = s.find(p))
-        s.erase(i, n);
-}
-std::vector<std::string> wikitext(std::string prompt_file) {
-    // split wiki text into " = " first-level column.
-    std::ifstream prompt_fs(prompt_file);
-    std::vector<std::string> prompts;
-    std::string prompt;
-    while (std::getline(prompt_fs, prompt)) {
-        if (prompt.back() == '\r' || prompt.back() == '\n') {
-            prompt.pop_back();
-        }
-        if (prompt.size() < 4) continue;
-        removeSubstrs(prompt, "@-@");
-        if ((prompts.size() == 0) \
-             || (prompt.size() >= 4 \
-                 && prompt.at(0) == ' ' \
-                 && prompt.at(1) == '=' \
-                 && prompt.at(2) == ' ' \
-                 && prompt.at(3) != '=')) {
-            // first-level column.
-            prompts.push_back(prompt);
-        } else {
-            // concatenate.
-            prompts.back() += "\n" + prompt;
-        }
-    }
-    return prompts;
-}
-
-std::string genSampleName(std::string oriName, int sample_size) {
-    const size_t last_slash_idx = oriName.rfind('.');
-    auto stem = oriName.substr(0, last_slash_idx);
-    return stem + "_sample" + std::to_string(sample_size) + ".jsonl";
-}
-
-std::vector<std::vector<std::vector<PromptItem>>> shareGPT(std::string prompt_file, int sample_size) {
-    std::vector<std::vector<std::vector<PromptItem>>> dialogs, dataset;
-    parse_jsonl(prompt_file, dialogs);
-    // randomly sample a subset
-    if (sample_size > 0 && sample_size < dialogs.size()){
-        std::random_device rd;
-        std::mt19937 g(rd());
-        std::shuffle(dialogs.begin(), dialogs.end(), g);
-        dataset.insert(dataset.end(), dialogs.begin(), dialogs.begin() + sample_size);
-        dialogs = dataset;
-        // store dialogs to file
-        write_jsonl(genSampleName(prompt_file, sample_size), dialogs);
-    }
-    return dialogs;
-}
-
-
-} // Transformer
-} // MNN
diff --git a/transformers/llm/engine/src/evaluation.cpp b/transformers/llm/engine/src/evaluation.cpp
deleted file mode 100644
index cd2524a4e..000000000
--- a/transformers/llm/engine/src/evaluation.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-
-
-#include <vector>
-#include <cstddef>
-#include "evaluation/evaluation.hpp"
-
-namespace MNN {
-namespace Transformer {
-
-void clearPerformance(struct TimePerformance* perf) {
-    perf->prefill_record_.clear();
-    perf->decode_record_.clear();
-    perf->prompt_record_.clear();
-}
-void appendNewPromptRecord(struct TimePerformance* perf, int input_len, bool reuse_kv) {
-    if (reuse_kv) {
-        perf->prompt_record_.push_back(input_len);
-    } else {
-        // not reuse kv
-        if (!perf->decode_record_.empty()) {
-            perf->prompt_record_.push_back(input_len - (perf->decode_record_.back().decode_prev_token_+1));
-        } else {
-            // first prefill
-            perf->prompt_record_.push_back(input_len);
-        }
-    }
-}
-
-} // Transformer
-} // MNN
\ No newline at end of file
diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp
index 6a1d7fdf8..e7f1e3474 100644
--- a/transformers/llm/engine/src/llm.cpp
+++ b/transformers/llm/engine/src/llm.cpp
@@ -6,28 +6,28 @@
 //
 // #define MNN_OPEN_TIME_TRACE 1
 
-#include <iostream>
 #include <fstream>
+#include <iostream>
+#include <regex>
 #include <sstream>
 #include <unordered_set>
-#include <regex>
 
-#include <MNN/expr/ExecutorScope.hpp>
 #include <MNN/AutoTime.hpp>
+#include <MNN/expr/ExecutorScope.hpp>
 #include "cpp/ExprDebug.hpp"
 #include "llm/llm.hpp"
-#include "evaluation/evaluation.hpp"
-#include "sampler.hpp"
-#include "prompt.hpp"
-#include "tokenizer.hpp"
 #include "llmconfig.hpp"
+#include "tokenizer.hpp"
 // 0: no debug, 1: test op time, 2: print tensor info
 #define DEBUG_MODE 0
 
-#ifdef LLM_SUPPORT_VISION
 #include "httplib.h"
+#ifdef LLM_SUPPORT_VISION
 #include <cv/cv.hpp>
 #endif
+#ifdef LLM_SUPPORT_AUDIO
+#include <audio/audio.hpp>
+#endif
 
 using namespace MNN::Express;
 namespace MNN {
@@ -37,12 +37,12 @@ typedef void (*DequantFunction)(const uint8_t*, float*, float, float, int);
 
 static void q41_dequant_ref(const uint8_t* src, float* dst, float scale, float zero, int size) {
     for (int i = 0; i < size / 2; i++) {
-        int x = src[i];
-        int x1 = x / 16 - 8;
-        int x2= x % 16 - 8;
-        float w1 = x1 * scale + zero;
-        float w2 = x2 * scale + zero;
-        dst[2 * i] = w1;
+        int x          = src[i];
+        int x1         = x / 16 - 8;
+        int x2         = x % 16 - 8;
+        float w1       = x1 * scale + zero;
+        float w2       = x2 * scale + zero;
+        dst[2 * i]     = w1;
         dst[2 * i + 1] = w2;
     }
 }
@@ -56,11 +56,13 @@ static void q81_dequant_ref(const uint8_t* src, float* dst, float scale, float z
 class DiskEmbedding {
 public:
     explicit DiskEmbedding(const std::shared_ptr<LlmConfig>& config);
-    ~DiskEmbedding() {}
+    ~DiskEmbedding() {
+    }
     void embedding(const std::vector<int>& input_ids, float* ptr);
+
 private:
     void seek_read(uint8_t* dst, size_t size, size_t offset);
-    std::unique_ptr<uint8_t[]> alpha_ = nullptr;
+    std::unique_ptr<uint8_t[]> alpha_  = nullptr;
     std::unique_ptr<uint8_t[]> weight_ = nullptr;
     std::unique_ptr<FILE, decltype(&fclose)> fp_;
     DequantFunction dequant_;
@@ -76,18 +78,18 @@ void DiskEmbedding::seek_read(uint8_t* dst, size_t size, size_t offset) {
 
 DiskEmbedding::DiskEmbedding(const std::shared_ptr<LlmConfig>& config) : fp_(nullptr, &fclose) {
     auto tie_embeddings = config->tie_embeddings();
-    hidden_size_ = config->hidden_size();
+    hidden_size_        = config->hidden_size();
     if (tie_embeddings.size() == 5) {
-        w_offset_    = tie_embeddings[0];
-        quant_bit_   = tie_embeddings[3];
-        quant_block_ = tie_embeddings[4];
-        block_num_ = hidden_size_ / quant_block_;
+        w_offset_          = tie_embeddings[0];
+        quant_bit_         = tie_embeddings[3];
+        quant_block_       = tie_embeddings[4];
+        block_num_         = hidden_size_ / quant_block_;
         weight_token_size_ = hidden_size_ * quant_bit_ / 8;
         fp_.reset(fopen(config->llm_weight().c_str(), "rb"));
         // TODO: optimize dequant function
-        dequant_ = quant_bit_ == 8 ? q81_dequant_ref : q41_dequant_ref;
-        auto a_offset    = tie_embeddings[1];
-        auto alpha_size  = tie_embeddings[2];
+        dequant_        = quant_bit_ == 8 ? q81_dequant_ref : q41_dequant_ref;
+        auto a_offset   = tie_embeddings[1];
+        auto alpha_size = tie_embeddings[2];
         alpha_.reset(new uint8_t[alpha_size]);
         seek_read(alpha_.get(), alpha_size, a_offset);
     } else {
@@ -103,13 +105,13 @@ void DiskEmbedding::embedding(const std::vector<int>& input_ids, float* dst) {
         for (size_t i = 0; i < input_ids.size(); i++) {
             int token = input_ids[i];
             seek_read(weight_.get(), weight_token_size_, w_offset_ + token * weight_token_size_);
-            auto dptr = dst + i * hidden_size_;
+            auto dptr      = dst + i * hidden_size_;
             auto alpha_ptr = reinterpret_cast<float*>(alpha_.get()) + token * block_num_ * 2;
             for (int n = 0; n < block_num_; n++) {
-                auto dst_ptr = dptr + n * quant_block_;
+                auto dst_ptr     = dptr + n * quant_block_;
                 uint8_t* src_ptr = weight_.get() + n * (quant_block_ * quant_bit_ / 8);
-                float zero = (alpha_ptr + n * 2)[0];
-                float scale = (alpha_ptr + n * 2)[1];
+                float zero       = (alpha_ptr + n * 2)[0];
+                float scale      = (alpha_ptr + n * 2)[1];
                 dequant_(src_ptr, dst_ptr, scale, zero, quant_block_);
             }
         }
@@ -119,43 +121,55 @@ void DiskEmbedding::embedding(const std::vector<int>& input_ids, float* dst) {
             seek_read(weight_.get(), weight_token_size_, input_ids[i] * weight_token_size_);
             int16_t* dst_ptr = reinterpret_cast<int16_t*>(dst + i * hidden_size_);
             for (int j = 0; j < hidden_size_; j++) {
-                dst_ptr[j * 2] = 0;
+                dst_ptr[j * 2]     = 0;
                 dst_ptr[j * 2 + 1] = reinterpret_cast<int16_t*>(weight_.get())[j];
             }
         }
     }
 }
 
-class Lvlm : public Llm {
+class Mllm : public Llm {
 public:
-    Lvlm(std::shared_ptr<LlmConfig> config) : Llm(config) {
-        image_size_ = config->llm_config_.value("image_size", image_size_);
-        image_pad_ = config->llm_config_.value("image_pad", image_pad_);
-        vision_start_ = config->llm_config_.value("vision_start", vision_start_);
-        vision_end_ = config->llm_config_.value("vision_end", vision_end_);
-        image_mean_ = config->llm_config_.value("image_mean", image_mean_);
-        image_norm_ = config->llm_config_.value("image_norm", image_norm_);
-    }
-    ~Lvlm() { visual_module_.reset(); }
+    Mllm(std::shared_ptr<LlmConfig> config) : Llm(config) {
+        if (config->is_visual()) {
+            image_height_  = config->llm_config_.value("image_size", image_height_);
+            image_width_   = image_height_;
+            img_pad_       = config->llm_config_.value("image_pad", img_pad_);
+            vision_start_  = config->llm_config_.value("vision_start", vision_start_);
+            vision_end_    = config->llm_config_.value("vision_end", vision_end_);
+            image_mean_    = config->llm_config_.value("image_mean", image_mean_);
+            image_norm_    = config->llm_config_.value("image_norm", image_norm_);
+        }
+        if (config->is_audio()) {
+        }
+    }
+    ~Mllm() {
+        mul_module_.reset();
+    }
     virtual void load() override;
-
     virtual std::vector<int> tokenizer_encode(const std::string& query, bool use_template = true) override;
     virtual MNN::Express::VARP embedding(const std::vector<int>& input_ids) override;
+
 private:
-    int image_size_ = 448, vision_start_ = 151857, vision_end_ = 151858, image_pad_ = 151859;
-    std::vector<float> image_mean_ {122.7709383 , 116.7460125 , 104.09373615};
-    std::vector<float> image_norm_ {0.01459843, 0.01500777, 0.01422007};
-    std::vector<int> image_process(const std::string& img_info);
-    std::shared_ptr<Module> visual_module_;
-    std::vector<VARP> image_embeddings_;
+    // vision config
+    int image_height_ = 448, image_width_ = 448, vision_start_ = 151857, vision_end_ = 151858, img_pad_ = 151859;
+    std::vector<float> image_mean_{122.7709383, 116.7460125, 104.09373615};
+    std::vector<float> image_norm_{0.01459843, 0.01500777, 0.01422007};
+    // audio config
+    int audio_pad_ = 151646;
+    std::vector<int> multimode_process(const std::string& mode, std::string info);
+    std::vector<int> vision_process(const std::string& file);
+    std::vector<int> audio_process(const std::string& file);
+    std::shared_ptr<Module> mul_module_;
+    std::vector<VARP> mul_embeddings_;
 };
 
 // Llm start
 Llm* Llm::createLLM(const std::string& config_path) {
     std::shared_ptr<LlmConfig> config(new LlmConfig(config_path));
     Llm* llm = nullptr;
-    if (config->is_visual()) {
-        llm = new Lvlm(config);
+    if (config->is_visual() || config->is_audio()) {
+        llm = new Mllm(config);
     } else {
         llm = new Llm(config);
     }
@@ -163,13 +177,20 @@ Llm* Llm::createLLM(const std::string& config_path) {
 }
 
 static MNNForwardType backend_type_convert(const std::string& type_str) {
-    if (type_str == "cpu") return MNN_FORWARD_CPU;
-    if (type_str == "metal") return MNN_FORWARD_METAL;
-    if (type_str == "cuda") return MNN_FORWARD_CUDA;
-    if (type_str == "opencl") return MNN_FORWARD_OPENCL;
-    if (type_str == "opengl") return MNN_FORWARD_OPENGL;
-    if (type_str == "vulkan") return MNN_FORWARD_VULKAN;
-    if (type_str == "npu") return MNN_FORWARD_NN;
+    if (type_str == "cpu")
+        return MNN_FORWARD_CPU;
+    if (type_str == "metal")
+        return MNN_FORWARD_METAL;
+    if (type_str == "cuda")
+        return MNN_FORWARD_CUDA;
+    if (type_str == "opencl")
+        return MNN_FORWARD_OPENCL;
+    if (type_str == "opengl")
+        return MNN_FORWARD_OPENGL;
+    if (type_str == "vulkan")
+        return MNN_FORWARD_VULKAN;
+    if (type_str == "npu")
+        return MNN_FORWARD_NN;
     return MNN_FORWARD_AUTO;
 }
 
@@ -184,8 +205,9 @@ bool Llm::set_config(const std::string& content) {
 void Llm::init_runtime() {
     ScheduleConfig config;
     BackendConfig cpuBackendConfig;
-    config.type          = backend_type_convert(config_->backend_type());
-    config.numThread     = config_->thread_num();
+    config.type      = backend_type_convert(config_->backend_type());
+    config.numThread = config_->thread_num();
+    ExecutorScope::Current()->setGlobalExecutorConfig(config.type, cpuBackendConfig, config.numThread);
     if (config_->power() == "high") {
         cpuBackendConfig.power = BackendConfig::Power_High;
     } else if (config_->power() == "low") {
@@ -202,7 +224,6 @@ void Llm::init_runtime() {
         cpuBackendConfig.precision = BackendConfig::Precision_Low;
     }
     config.backendConfig = &cpuBackendConfig;
-    ExecutorScope::Current()->setGlobalExecutorConfig(config.type, cpuBackendConfig, config.numThread);
 
     runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
     runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0);
@@ -217,16 +238,17 @@ void Llm::init_runtime() {
         runtime_manager_->setExternalPath(tmpPath, MNN::Interpreter::EXTERNAL_WEIGHT_DIR);
     }
 
-#if DEBUG_MODE==1
+#if DEBUG_MODE == 1
     runtime_manager_->setMode(MNN::Interpreter::Session_Debug);
     _initTimeTrace();
 #endif
-#if DEBUG_MODE==2
+#if DEBUG_MODE == 2
     runtime_manager_->setMode(MNN::Interpreter::Session_Debug);
     _initTensorStatic();
 #endif
     {
-        runtime_manager_->setCache(".tempcache");
+        std::string cacheFilePath = tmpPath.length() != 0 ? tmpPath : ".";
+        runtime_manager_->setCache(cacheFilePath + "/mnn_cachefile.bin");
     }
 }
 
@@ -234,7 +256,7 @@ void Llm::load() {
     init_runtime();
     // init module status
     key_value_shape_ = config_->key_value_shape();
-    is_single_ = config_->is_single();
+    is_single_       = config_->is_single();
     attention_fused_ = config_->attention_fused();
     MNN_PRINT("### is_single_ = %d\n", is_single_);
     // 1. load vocab
@@ -245,7 +267,7 @@ void Llm::load() {
     // 3. load model
     Module::Config module_config;
     module_config.shapeMutable = true;
-    module_config.rearrange = true;
+    module_config.rearrange    = true;
     // using base module for lora module
     if (base_module_ != nullptr) {
         module_config.base = base_module_;
@@ -259,51 +281,40 @@ void Llm::load() {
         MNN_PRINT("load %s ... ", model_path.c_str());
         runtime_manager_->setExternalFile(config_->llm_weight());
         if (attention_fused_) {
-            modules_[0].reset(Module::load(
-                                           {"input_ids", "attention_mask", "position_ids"},
-                                           {"logits"}, model_path.c_str(), runtime_manager_, &module_config));
+            modules_[0].reset(Module::load({"input_ids", "attention_mask", "position_ids"}, {"logits"},
+                                           model_path.c_str(), runtime_manager_, &module_config));
         } else {
-            modules_[0].reset(Module::load(
-                                           {"input_ids", "attention_mask", "position_ids", "past_key_values"},
-                                           {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config));
+            modules_[0].reset(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"},
+                                           {"logits", "presents"}, model_path.c_str(), runtime_manager_,
+                                           &module_config));
         }
         MNN_PRINT("Load Module Done!\n");
     } else {
-        MNN_ERROR("Split version is deprecated\n");
+        MNN_ERROR("Split version is deprecated\n");
     }
     decode_modules_.resize(modules_.size());
-    for (int v=0; v<modules_.size(); ++v) {
+    for (int v = 0; v < modules_.size(); ++v) {
         decode_modules_[v].reset(Module::clone(modules_[v].get()));
     }
     MNN_PRINT("Clone Decode Module Done!\n");
 
     prefill_modules_ = modules_;
-
-    // workflow instruments
-    // 3. create Sampler
-    mSampler.reset(Sampler::createSampler(this, config_));
-    MNN_PRINT("Sampler initiated!\n");
-    // 4. create PromptLib
-    mPromptLib.reset(PromptLib::createPromptLib(this, config_));
-    MNN_PRINT("PromptLib initiated!\n");
-    // 5. reset
-    reset();
-    MNN_PRINT("Llm Session Init!\n");
 }
 
 size_t Llm::apply_lora(const std::string& lora_path) {
     std::string model_path = config_->base_dir_ + "/" + lora_path;
     Module::Config module_config;
     module_config.shapeMutable = true;
-    module_config.rearrange = true;
-    module_config.base = modules_.begin()->get();
-    size_t lora_index = modules_.size();
+    module_config.rearrange    = true;
+    module_config.base         = modules_.begin()->get();
+    size_t lora_index          = modules_.size();
     if (attention_fused_) {
-        modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids"},
-                                           {"logits"}, model_path.c_str(), runtime_manager_, &module_config));
+        modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids"}, {"logits"},
+                                           model_path.c_str(), runtime_manager_, &module_config));
     } else {
         modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"},
-                                           {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config));
+                                           {"logits", "presents"}, model_path.c_str(), runtime_manager_,
+                                           &module_config));
     }
     select_module(lora_index);
     return lora_index;
@@ -360,21 +371,21 @@ void Llm::trace(bool start) {
 }
 
 void Llm::tuning(TuneType type, std::vector<int> candidates) {
-    if(type != OP_ENCODER_NUMBER) {
+    if (type != OP_ENCODER_NUMBER) {
         MNN_ERROR("tuning type not supported\n");
         return;
     }
-    if(config_->backend_type() != "metal") {
+    if (config_->backend_type() != "metal") {
         return;
     }
 
-    current_modules_ = decode_modules_;
-    int64_t min_time = INT64_MAX;
+    current_modules_     = decode_modules_;
+    int64_t min_time     = INT64_MAX;
     int prefer_candidate = 10;
-    for(auto& candidate : candidates) {
+    for (auto& candidate : candidates) {
         runtime_manager_->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, candidate);
 
-        auto st = std::chrono::system_clock::now();
+        auto st     = std::chrono::system_clock::now();
         auto logits = forward({0});
         if (nullptr == logits.get()) {
             return;
@@ -382,24 +393,22 @@ void Llm::tuning(TuneType type, std::vector<int> candidates) {
         if (logits->getInfo()->size == 0) {
             return;
         }
-        // no need for sampling here, because metal OP does not affects much in sampling.
-        auto et = std::chrono::system_clock::now();
+        auto token   = sample(logits, {});
+        auto et      = std::chrono::system_clock::now();
         int64_t time = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
-        if(time < min_time) {
+        if (time < min_time) {
             prefer_candidate = candidate;
-            min_time = time;
-            //MNN_PRINT("op encode number:%d, decode time: %lld us\n", candidate, time);
+            min_time         = time;
+            // MNN_PRINT("op encode number:%d, decode time: %lld us\n", candidate, time);
         }
     }
     runtime_manager_->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, prefer_candidate);
 }
 
-VARP Llm::forward(const std::vector<int>& input_ids, bool is_prefill) {
-    if (is_prefill) current_modules_ = prefill_modules_;
-    else current_modules_ = decode_modules_;
-    int seq_len = input_ids.size();
+VARP Llm::forward(const std::vector<int>& input_ids) { // runs current_modules_; returns logits or nullptr on failure
+    int seq_len         = input_ids.size();
     auto attention_mask = gen_attention_mask(seq_len);
-    auto position_ids = gen_position_ids(seq_len);
+    auto position_ids   = gen_position_ids(seq_len);
     VARP logits;
     if (is_single_) {
         std::vector<MNN::Express::VARP> outputs;
@@ -407,7 +416,8 @@ VARP Llm::forward(const std::vector<int>& input_ids, bool is_prefill) {
         if (attention_fused_) {
             outputs = current_modules_.back()->onForward({hidden_states, attention_mask, position_ids});
         } else {
-            outputs = current_modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]});
+            outputs =
+                current_modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]});
         }
         if (outputs.empty()) {
             return nullptr;
@@ -417,142 +427,269 @@ VARP Llm::forward(const std::vector<int>& input_ids, bool is_prefill) {
             past_key_values_[0] = outputs[1];
         }
     } else {
         MNN_ERROR("Split models is deprecated\n");
         return nullptr;
     }
-    // sequence length is handled in forward.
-    mLlmSessionInfos[0].all_seq_len_ += seq_len;
-    mLlmSessionInfos[0].gen_seq_len_++;
+    all_seq_len_ += seq_len; // total tokens consumed so far (prompt + generated)
+    gen_seq_len_++;          // one more forward step counted for this generation
     return logits;
 }
 
-// < "app_type": "chat"
-bool Llm::getUserPrompt(bool from_file, std::istream* is, std::string& user_str) {
-    if (!from_file) std::cout << "\nQ: ";
-    return (bool)std::getline(*is, user_str);
-}
-
-void Llm::chat(bool session_by_line, bool from_file, 
-               std::istream* is, std::ostream* os, 
-               const char* end_with, std::string exit_token, std::string reset_token) {
-    // handle system prompt
-    reset();
-    std::string user_str;
-    while (getUserPrompt(from_file, is, user_str)) {
-        // whether to end
-        if (user_str == exit_token) {
-            reset();
-            break;
-        }
-        // whether to reset
-        if (session_by_line || user_str == reset_token) {
-            reset();
-            if (!from_file) std::cout << "\nreset done." << std::endl;
-            continue;
+int Llm::sample(VARP logits, const std::vector<int>& pre_ids) { // greedy pick after penalising ids in pre_ids
+    std::unordered_set<int> ids_set(pre_ids.begin(), pre_ids.end());
+    auto scores = (float*)(logits->readMap<float>());
+    auto size   = logits->getInfo()->size;
+    // repetition penalty: push the score of every already-seen id towards "less likely" (applied once per id)
+    const float repetition_penalty = 1.1f;
+    for (auto id : ids_set) {
+        if (id < 0 || id >= static_cast<int>(size)) continue; // skip ids that do not index into the logits
+        scores[id] = scores[id] < 0 ? scores[id] * repetition_penalty : scores[id] / repetition_penalty;
+    }
+    // argmax: start from the lowest float so that all-negative logits still select their true maximum
+    float max_score = std::numeric_limits<float>::lowest();
+    int token_id    = 0;
+    for (int i = 0; i < size; i++) {
+        float score = scores[i];
+        if (score > max_score) {
+            max_score = score;
+            token_id  = i;
         }
-        // get answer
-        (*os) << "\nA: " << std::flush;
-        response(user_str, os, end_with);
-        (*os) << std::endl;
     }
-    reset();
+    return token_id;
 }
 
-std::string Llm::response(const std::string& user_str, std::ostream* os, const char* end_with) {
-    mPromptLib->appendUserPrompt(user_str);
-    auto assistant_str = generate(mPromptLib->getLLMInput(), os, end_with);
-    mPromptLib->appendLLMOutput(assistant_str);
-    return assistant_str;
+static std::string apply_template(std::string prompt_template, const std::string& content,
+                                  const std::string& role = "") { // fills "%r" with role, then "%s" with content
+    if (prompt_template.empty())
+        return content; // no template configured: pass the content through untouched
+    if (!role.empty()) {
+        const std::string placeholder = "%r";
+        size_t start_pos              = prompt_template.find(placeholder);
+        if (start_pos == std::string::npos)
+            return content; // a role was given but the template has no "%r" slot: fall back to the raw content
+        prompt_template.replace(start_pos, placeholder.length(), role); // only the first "%r" is substituted
+    }
+    const std::string placeholder = "%s";
+    size_t start_pos              = prompt_template.find(placeholder); // searched after the role was inserted
+    if (start_pos == std::string::npos)
+        return content; // template has no "%s" slot: fall back to the raw content
+    prompt_template.replace(start_pos, placeholder.length(), content); // only the first "%s" is substituted
+    return prompt_template;
 }
-// "app_type": "chat" >
-
 
+std::string Llm::apply_prompt_template(const std::string& user_content) const { // wraps one user message
+    auto chat_prompt = config_->prompt_template();
+    return apply_template(chat_prompt, user_content); // no role passed, so only "%s" is substituted
+}
 
-void Llm::reset() {
-    // clear KV cache
-    // KV cache automatically cleared as long as seq_len reset!
-    mLlmSessionInfos.clear();
-    mLlmSessionInfos.emplace_back(LlmSessionInfo()); 
+std::string Llm::apply_chat_template(const std::vector<PromptItem>& chat_prompts) const { // (role, content) -> prompt
+    std::string prompt_result;
+    if (chat_prompts.empty()) {
+        return prompt_result; // nothing to format; also keeps end() - 1 below well defined
+    }
+    auto chat_template = config_->chat_template();
+    auto last          = chat_prompts.end() - 1;
+    for (auto iter = chat_prompts.begin(); iter != last; ++iter) { // every turn except the final one
+        prompt_result += apply_template(chat_template, iter->second, iter->first);
+    }
+    prompt_result += last->first == "user" ? apply_prompt_template(last->second) // final user turn: prompt template
+                                           : apply_template(chat_template, last->second, last->first);
+    return prompt_result;
 }
 
-bool Llm::reuse_kv() const {
-    return config_->reuse_kv();
+void Llm::chat() { // interactive loop on stdin/stdout; "/exit" quits and "/reset" drops the conversation
+    std::vector<PromptItem> history;
+    history.push_back(std::make_pair("system", "You are a helpful assistant."));
+    while (true) {
+        std::cout << "\nQ: ";
+        std::string user_str;
+        // Leave on "/exit" and also when stdin reaches EOF or fails; an unchecked read would fail forever and spin.
+        if (!(std::cin >> user_str) || user_str == "/exit") {
+            break;
+        }
+        if (user_str == "/reset") {
+            history.resize(1); // keep only the system prompt
+            std::cout << "\nA: reset done." << std::endl;
+            continue;
+        }
+        std::cout << "\nA: " << std::flush;
+        if (config_->reuse_kv()) {
+            response(user_str); // only the new message is sent; the local history vector is not updated here
+        } else {
+            history.emplace_back(std::make_pair("user", user_str));
+            auto assistant_str = response(history); // the whole conversation is re-sent on every turn
+            history.emplace_back(std::make_pair("assistant", assistant_str));
+        }
+        std::cout << std::endl;
+    }
 }
 
+void Llm::reset() { // forgets the accumulated conversation state
+    history_ids_.clear(); // token history that generate() passes to sample()
+    all_seq_len_ = 0;     // sequence length restarts from zero
+}
 
-// < generate
 void Llm::generate_init() {
-    // handle past_key_values if not attention_fused_
+    // init status: zero the per-generation token counter and the timers reported by print_speed()
+    gen_seq_len_ = 0;
+    vision_us_   = 0;
+    audio_us_    = 0;
+    prefill_us_  = 0;
+    decode_us_   = 0;
     past_key_values_.clear();
-    if (!attention_fused_) {
-        if (is_single_) {
+    if (is_single_) {
+        past_key_values_.push_back(_Input(key_value_shape_, NCHW)); // single model: one key/value input
+    } else {
+        for (int i = 0; i < config_->layer_nums(); i++) { // otherwise one key/value input per layer
             past_key_values_.push_back(_Input(key_value_shape_, NCHW));
-        } else {
-            MNN_ERROR("Split version is deprecated\n");
         }
     }
-    if (!reuse_kv()) {
-        // only reset sampler. The history is handled by mPromptLib.
-        mLlmSessionInfos[0].resetSamplerFields();
+    if (!config_->reuse_kv()) { // without KV reuse each generation starts from an empty sequence and history
+        all_seq_len_ = 0;
+        history_ids_.clear();
     }
     current_modules_ = prefill_modules_;
 }
 
-std::string Llm::generate(const std::string& prompt, std::ostream* os, const char* end_with) {
-    if (prompt.empty()) { return ""; }
-    if (!end_with) { end_with = "\n"; }
-    // std::cout << "# prompt : " << prompt << std::endl;
-    auto input_ids = tokenizer_encode(prompt, false);
-    std::string out_str = generate(input_ids, os, end_with);
-    return out_str;
-}
-
-std::string Llm::generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
-    if (mTracing) return generateTrace(input_ids, os, end_with);
-    if (input_ids.empty()) { return ""; }
-    if (!end_with) { end_with = "\n"; }
+std::vector<int> Llm::generate(const std::vector<int>& input_ids, int max_new_tokens) { // ids in, generated ids out
     generate_init();
-    // printf("input_ids (%lu): ", input_ids.size()); for (auto id : input_ids) printf("%d, ", id); printf("\n");
-    std::string out_str = mSampler->sample(input_ids, os, end_with, &(mLlmSessionInfos[0].mTimePerformance));
-    return out_str;
+    std::vector<int> output_ids, all_ids = input_ids; // all_ids = prompt + generated ids, fed to sample()
+    prompt_len_ = static_cast<int>(input_ids.size());
+    if (max_new_tokens < 0) { // negative means "use the configured limit"
+        max_new_tokens = config_->max_new_tokens();
+    }
+    // prefill
+    current_modules_ = prefill_modules_;
+    auto logits      = forward(input_ids);
+    if (logits.get() == nullptr) {
+        return {};
+    }
+    int token = sample(logits, all_ids);
+    output_ids.push_back(token); // NOTE(review): the first token is kept without an is_stop() check
+    all_ids.push_back(token);
+    // decode
+    current_modules_ = decode_modules_;
+    while (gen_seq_len_ < max_new_tokens) { // gen_seq_len_ is incremented inside forward()
+        logits = nullptr;
+        logits = forward({token});
+        if (logits.get() == nullptr) {
+            return {}; // NOTE(review): tokens generated so far are discarded on failure
+        }
+        token = sample(logits, all_ids);
+        if (is_stop(token)) { // the stop token itself is not returned
+            break;
+        }
+        output_ids.push_back(token);
+        all_ids.push_back(token);
+    }
+    return output_ids;
 }
 
-
-std::string Llm::generateTrace(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
+std::string Llm::generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
+    if (input_ids.empty()) {
+        return ""; // nothing to run; also keeps input_ids[0] in the tracing path in bounds
+    }
     if (mTracing) {
         // Skip real forward
-        forward(input_ids, true);
-        forward({input_ids[0]}, false);
-        forward({input_ids[0]}, false);
+        current_modules_ = prefill_modules_;
+        forward(input_ids);
+        current_modules_ = decode_modules_;
+        forward({input_ids[0]});
+        forward({input_ids[0]});
         return "Test";
     }
-    return "Test";
+    prompt_len_ = static_cast<int>(input_ids.size());
+    history_ids_.insert(history_ids_.end(), input_ids.begin(), input_ids.end()); // push to history_ids_
+    auto st          = std::chrono::system_clock::now();
+    current_modules_ = prefill_modules_;
+    auto logits      = forward(input_ids);
+    if (nullptr == logits.get()) {
+        return "";
+    }
+    int token              = sample(logits, history_ids_);
+    auto et                = std::chrono::system_clock::now();
+    current_modules_       = decode_modules_;
+    std::string output_str = tokenizer_decode(token);
+    prefill_us_            = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
+    *os << output_str << std::flush; // stream the first token as soon as prefill finishes
+    while (gen_seq_len_ < config_->max_new_tokens()) { // gen_seq_len_ is incremented inside forward()
+        st = std::chrono::system_clock::now();
+        history_ids_.push_back(token);
+        logits = nullptr;
+        logits = forward({token});
+        if (nullptr == logits.get() || logits->getInfo()->size == 0) { // forward failed or produced no logits
+            return "";
+        }
+        token = sample(logits, history_ids_);
+        et    = std::chrono::system_clock::now();
+        decode_us_ += std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
+        if (is_stop(token)) {
+            *os << (end_with ? end_with : "\n") << std::flush; // streaming a null char* is invalid; default to "\n"
+            break;
+        }
+        auto word = tokenizer_decode(token);
+        *os << word << std::flush;
+        output_str += word;
+    }
+    ExecutorScope::Current()->gc(Executor::FULL);
+#ifdef DUMP_PROFILE_INFO
+    print_speed();
+#endif
+    return output_str;
 }
 
 std::vector<int> Llm::tokenizer_encode(const std::string& user_content, bool use_template) {
     if (!use_template) {
         return tokenizer_->encode(user_content);
     }
-    auto prompt = mPromptLib->applyTemplate(user_content);
+    auto prompt    = apply_prompt_template(user_content); // wrap the text in the configured prompt template first
     auto input_ids = tokenizer_->encode(prompt);
     return input_ids;
 }
 
-
-// < evaluation
-std::vector<float> Llm::perplexity(std::string prompt_file, std::ostream* perfOS) {
-    return mSampler->perplexity(prompt_file, perfOS);
+std::string Llm::response(const std::string& user_content, std::ostream* os, const char* end_with) {
+    generate_init(); // resets per-generation counters (and the history when KV reuse is off)
+    if (!end_with) {
+        end_with = "\n"; // default terminator written to os when a stop token is sampled
+    }
+    std::vector<int> input_ids;
+    if (config_->reuse_kv()) {
+        auto prompt = apply_prompt_template(user_content);
+        if (all_seq_len_ > 0) { // a previous turn exists: prepend the end-of-turn marker before the new prompt
+            prompt = "<|im_end|>\n" + prompt;
+        }
+        input_ids = tokenizer_->encode(prompt);
+    } else {
+        input_ids = tokenizer_encode(user_content); // NOTE(review): relies on use_template's default; confirm in header
+    }
+    return generate(input_ids, os, end_with); // streams to os and returns the generated text
 }
-// evaluation >
 
+std::string Llm::response(const std::vector<PromptItem>& chat_prompts, std::ostream* os, const char* end_with) {
+    if (chat_prompts.empty()) { // no messages: nothing to generate
+        return "";
+    }
+    generate_init(); // resets per-generation counters (and the history when KV reuse is off)
+    if (!end_with) {
+        end_with = "\n"; // default terminator written to os when a stop token is sampled
+    }
+    auto prompt = apply_chat_template(chat_prompts);
+    if (config_->reuse_kv() && all_seq_len_ > 0) { // continuing a cached conversation: prepend the end-of-turn marker
+        prompt = "<|im_end|>\n" + prompt;
+    }
+    // std::cout << "# prompt : " << prompt << std::endl;
+    auto input_ids = tokenizer_->encode(prompt);
+    // printf("input_ids (%lu): ", input_ids.size()); for (auto id : input_ids) printf("%d, ", id); printf("\n");
+    return generate(input_ids, os, end_with); // streams to os and returns the generated text
+}
 
 Llm::~Llm() {
-#if DEBUG_MODE==1
+#if DEBUG_MODE == 1
     if (nullptr != gTimeTraceInfo) {
-        float opSummer = 0.0f;
+        float opSummer       = 0.0f;
         float opFlopsSummber = 0.0f;
         for (auto& iter : gTimeTraceInfo->mTypes) {
-            float summer = 0.0f;
+            float summer      = 0.0f;
             float summerflops = 0.0f;
             for (auto& t : iter.second) {
                 for (auto& t0 : t.second) {
@@ -560,13 +697,15 @@ Llm::~Llm() {
                     summerflops += t0.second;
                 }
             }
-            summer = summer;
+            summer      = summer;
             summerflops = summerflops;
-            MNN_PRINT("%s : %.7f, FLOP: %.7f, Speed: %.7f GFlops\n", iter.first.c_str(), summer, summerflops, summerflops / summer);
+            MNN_PRINT("%s : %.7f, FLOP: %.7f, Speed: %.7f GFlops\n", iter.first.c_str(), summer, summerflops,
+                      summerflops / summer);
             opSummer += summer;
-            opFlopsSummber+= summerflops;
+            opFlopsSummber += summerflops;
         }
-        MNN_PRINT("OP Summer: %.7f, Flops: %.7f, Speed: %.7f GFlops\n", opSummer, opFlopsSummber, opFlopsSummber/opSummer);
+        MNN_PRINT("OP Summer: %.7f, Flops: %.7f, Speed: %.7f GFlops\n", opSummer, opFlopsSummber,
+                  opFlopsSummber / opSummer);
     }
 #endif
     current_modules_.clear();
@@ -576,64 +715,31 @@ Llm::~Llm() {
     runtime_manager_.reset();
 }
 
-// < speed
 void Llm::print_speed() {
+    auto vision_s  = vision_us_ * 1e-6; // timers are kept in microseconds; convert to seconds for printing
+    auto audio_s   = audio_us_ * 1e-6;
+    auto prefill_s = prefill_us_ * 1e-6;
+    auto decode_s  = decode_us_ * 1e-6;
+    auto total_s   = vision_s + audio_s + prefill_s + decode_s;
     printf("\n#################################\n");
-    printf("average   total speed = %.3f tok/s\n", average_total_speed());
-    printf("average prefill speed = %.3f tok/s\n", average_prefill_speed());
-    printf("average  decode speed = %.3f tok/s\n", average_decode_speed());
+    printf(" total tokens num  = %d\n", prompt_len_ + gen_seq_len_);
+    printf("prompt tokens num  = %d\n", prompt_len_);
+    printf("output tokens num  = %d\n", gen_seq_len_);
+    printf("  total time = %.2f s\n", total_s);
+    if (1 || vision_s) { // "1 ||" keeps the line printed even when no vision time was recorded
+        printf(" vision time = %.2f s\n", vision_s); // was printing audio_s under the vision label
+    }
+    if (1 || audio_us_) { // "1 ||" keeps the line printed even when no audio time was recorded
+        printf("  audio time = %.2f s\n", audio_s);
+    }
+    printf("prefill time = %.2f s\n", prefill_s);
+    printf(" decode time = %.2f s\n", decode_s);
+    printf("  total speed = %.2f tok/s\n", (prompt_len_ + gen_seq_len_) / total_s);
+    printf("prefill speed = %.2f tok/s\n", prompt_len_ / prefill_s);
+    printf(" decode speed = %.2f tok/s\n", gen_seq_len_ / decode_s);
+    printf("   chat speed = %.2f tok/s\n", gen_seq_len_ / total_s);
     printf("##################################\n");
-    #if DEBUG_MODE==1
-        if (nullptr != gTimeTraceInfo) {
-            float opSummer = 0.0f;
-            float opFlopsSummber = 0.0f;
-            for (auto& iter : gTimeTraceInfo->mTypes) {
-                float summer = 0.0f;
-                float summerflops = 0.0f;
-                for (auto& t : iter.second) {
-                    for (auto& t0 : t.second) {
-                        summer += t0.first;
-                        summerflops += t0.second;
-                    }
-                }
-                summer = summer;
-                summerflops = summerflops;
-                MNN_PRINT("%s : %.7f, FLOP: %.7f, Speed: %.7f GFlops\n", iter.first.c_str(), summer, summerflops, summerflops / summer);
-                opSummer += summer;
-                opFlopsSummber+= summerflops;
-            }
-            MNN_PRINT("OP Summer: %.7f, Flops: %.7f, Speed: %.7f GFlops\n", opSummer, opFlopsSummber, opFlopsSummber/opSummer);
-        }
-    #endif
-}
-
-void Llm::print_speed(std::ostream* os) {
-    mLlmSessionInfos[0].print_speed(os);
-}
-
-float Llm::average_total_speed() {
-    return mLlmSessionInfos[0].average_total_speed();
-}  
-float Llm::average_prefill_speed() {
-    // prefill response rate
-    return mLlmSessionInfos[0].average_prefill_speed();
-}
-float Llm::average_decode_speed() {
-    return mLlmSessionInfos[0].average_decode_speed();
-}
-float Llm::getTotalPrefillTime() {
-    return mLlmSessionInfos[0].getTotalPrefillTime();
 }
-float Llm::getTotalDecodeTime() {
-    return mLlmSessionInfos[0].getTotalDecodeTime();
-}
-int Llm::getTotalPromptLen() {
-    return mLlmSessionInfos[0].getTotalPromptLen();
-}
-int Llm::getTotalDecodeLen() {
-    return mLlmSessionInfos[0].getTotalDecodeLen();
-}
-// speed >
 
 static inline bool needNewVar(VARP var, int axis, int seq_len) {
     if (var == nullptr) {
@@ -648,7 +754,7 @@ static inline bool needNewVar(VARP var, int axis, int seq_len) {
 VARP Llm::embedding(const std::vector<int>& input_ids) {
     AUTOTIME;
     int hidden_size = config_->hidden_size();
-    int seq_len = static_cast<int>(input_ids.size());
+    int seq_len     = static_cast<int>(input_ids.size());
     if (needNewVar(inputs_embeds_, 0, seq_len)) {
         inputs_embeds_ = _Input({seq_len, 1, hidden_size}, NCHW);
     }
@@ -660,43 +766,42 @@ VARP Llm::embedding(const std::vector<int>& input_ids) {
 std::string Llm::tokenizer_decode(int id) {
     std::string word = tokenizer_->decode(id);
     // Fix utf-8 garbled characters
-    if (word.length() == 6 && word[0] == '<' && word[word.length()-1] == '>' && word[1] == '0' && word[2] == 'x') {
+    if (word.length() == 6 && word[0] == '<' && word[word.length() - 1] == '>' && word[1] == '0' && word[2] == 'x') {
         int num = std::stoi(word.substr(3, 2), nullptr, 16);
-        word = static_cast<char>(num);
+        word    = static_cast<char>(num); // a "<0xHH>" token becomes the single byte 0xHH
     }
     return word;
 }
 
 VARP Llm::gen_attention_mask(int seq_len) {
-    int kv_seq_len_=mLlmSessionInfos[0].all_seq_len_+seq_len, gen_seq_len_=mLlmSessionInfos[0].gen_seq_len_;
-    int prev_seq_len_ = kv_seq_len_ - seq_len;
+    int kv_seq_len = all_seq_len_ + seq_len;
     if (seq_len == 1) {
-        kv_seq_len_ = seq_len;
+        kv_seq_len = seq_len;
     }
     if (config_->attention_mask() == "float") {
         if (needNewVar(attention_mask_, 2, seq_len)) {
-            attention_mask_ = _Input({1, 1, seq_len, kv_seq_len_}, NCHW, halide_type_of<float>());
+            attention_mask_ = _Input({1, 1, seq_len, kv_seq_len}, NCHW, halide_type_of<float>());
         } else {
             return attention_mask_;
         }
         auto ptr = attention_mask_->writeMap<float>();
         for (int i = 0; i < seq_len; i++) {
-            for (int j = 0; j < kv_seq_len_; j++) {
-                int row = i + prev_seq_len_;
-                ptr[kv_seq_len_ * i + j] = (j > row) * std::numeric_limits<float>::lowest();
+            for (int j = 0; j < kv_seq_len; j++) {
+                int row                 = i + all_seq_len_;
+                ptr[kv_seq_len * i + j] = (j > row) * std::numeric_limits<float>::lowest();
             }
         }
         return attention_mask_;
     } else {
         if (needNewVar(attention_mask_, 2, seq_len)) {
-            attention_mask_ = _Input({1, 1, seq_len, kv_seq_len_}, NCHW, halide_type_of<int>());
+            attention_mask_ = _Input({1, 1, seq_len, kv_seq_len}, NCHW, halide_type_of<int>());
         } else {
             return attention_mask_;
         }
         auto ptr = attention_mask_->writeMap<int>();
         if (config_->attention_mask() == "glm") {
             // chatglm
-            for (int i = 0; i < seq_len * kv_seq_len_; i++) {
+            for (int i = 0; i < seq_len * kv_seq_len; i++) {
                 ptr[i] = 0;
             }
             if (seq_len > 1) {
@@ -707,8 +812,8 @@ VARP Llm::gen_attention_mask(int seq_len) {
         } else {
             bool is_glm2 = config_->attention_mask() == "glm2";
             for (int i = 0; i < seq_len; i++) {
-                for (int j = 0; j < kv_seq_len_; j++) {
-                    int row = i + prev_seq_len_;
+                for (int j = 0; j < kv_seq_len; j++) {
+                    int row              = i + all_seq_len_;
                     ptr[seq_len * i + j] = is_glm2 ? j > row : j <= row;
                 }
             }
@@ -718,8 +823,6 @@ VARP Llm::gen_attention_mask(int seq_len) {
 }
 
 VARP Llm::gen_position_ids(int seq_len) {
-    int kv_seq_len_=mLlmSessionInfos[0].all_seq_len_+seq_len, gen_seq_len_=mLlmSessionInfos[0].gen_seq_len_;
-    int prev_seq_len_ = kv_seq_len_ - seq_len;
     if (config_->attention_mask() == "glm") {
         // chatglm
         if (needNewVar(position_ids_, 2, seq_len)) {
@@ -727,14 +830,14 @@ VARP Llm::gen_position_ids(int seq_len) {
         }
         auto ptr = position_ids_->writeMap<int>();
         if (seq_len == 1) {
-            ptr[0] = prev_seq_len_ - gen_seq_len_ - 2;
+            ptr[0] = all_seq_len_ - gen_seq_len_ - 2;
             ptr[1] = gen_seq_len_ + 1;
         } else {
             for (int i = 0; i < seq_len - 1; i++) {
-                ptr[i] = i;
+                ptr[i]           = i;
                 ptr[seq_len + i] = 0;
             }
-            ptr[seq_len - 1] = seq_len - 2;
+            ptr[seq_len - 1]     = seq_len - 2;
             ptr[2 * seq_len - 1] = 1;
         }
         return position_ids_;
@@ -745,10 +848,10 @@ VARP Llm::gen_position_ids(int seq_len) {
         }
         auto ptr = position_ids_->writeMap<int>();
         if (seq_len == 1) {
-            ptr[0] = is_glm2 ? gen_seq_len_ : prev_seq_len_;
+            ptr[0] = is_glm2 ? gen_seq_len_ : all_seq_len_;
         } else {
             for (int i = 0; i < seq_len; i++) {
-                ptr[i] = i + prev_seq_len_;
+                ptr[i] = i + all_seq_len_;
             }
         }
         return position_ids_;
@@ -759,79 +862,332 @@ bool Llm::is_stop(int token_id) {
     return tokenizer_->is_stop(token_id);
 }
 
-void Lvlm::load() {
+void Mllm::load() { // loads the base LLM, then the visual and/or audio encoder module
     Llm::load();
+    if (config_->mllm_config_.empty()) {
+        mllm_runtime_manager_ = runtime_manager_; // no dedicated settings: share the LLM's runtime manager
+    } else {
+        ScheduleConfig config;
+        BackendConfig cpuBackendConfig;
+        config.type      = backend_type_convert(config_->backend_type(true));
+        config.numThread = config_->thread_num(true);
+        if (config_->power(true) == "high") {
+            cpuBackendConfig.power = BackendConfig::Power_High;
+        } else if (config_->power(true) == "low") {
+            cpuBackendConfig.power = BackendConfig::Power_Low;
+        }
+        if (config_->memory(true) == "high") {
+            cpuBackendConfig.memory = BackendConfig::Memory_High;
+        } else if (config_->memory(true) == "low") {
+            cpuBackendConfig.memory = BackendConfig::Memory_Low;
+        }
+        if (config_->precision(true) == "high") {
+            cpuBackendConfig.precision = BackendConfig::Precision_High;
+        } else if (config_->precision(true) == "low") {
+            cpuBackendConfig.precision = BackendConfig::Precision_Low;
+        }
+        config.backendConfig = &cpuBackendConfig; // points at a local that lives until the manager is created below
+        mllm_runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
+        mllm_runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0);
+        mllm_runtime_manager_->setHint(MNN::Interpreter::DYNAMIC_QUANT_OPTIONS, 1); // 1: per batch quant, 2: per tensor quant
+        mllm_runtime_manager_->setHint(MNN::Interpreter::QKV_QUANT_OPTIONS, config_->quant_qkv());
+        mllm_runtime_manager_->setHint(MNN::Interpreter::KVCACHE_SIZE_LIMIT, config_->kvcache_limit());
+        std::string tmpPath = config_->tmp_path();
+        if (config_->kvcache_mmap()) {
+            mllm_runtime_manager_->setExternalPath(tmpPath, MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR);
+        }
+        if (config_->use_mmap()) {
+            mllm_runtime_manager_->setExternalPath(tmpPath, MNN::Interpreter::EXTERNAL_WEIGHT_DIR);
+        }
+    }
     Module::Config module_config;
     module_config.shapeMutable = true;
-    module_config.rearrange = false;
-    runtime_manager_->setExternalFile(config_->visual_model() + ".weight");
-    visual_module_.reset(Module::load({}, {}, config_->visual_model().c_str(), runtime_manager_, &module_config));
+    module_config.rearrange    = true;
+    if (config_->is_visual()) {
+        mllm_runtime_manager_->setExternalFile(config_->visual_model() + ".weight");
+        mul_module_.reset(Module::load({}, {}, config_->visual_model().c_str(), mllm_runtime_manager_, &module_config));
+    }
+    if (config_->is_audio()) { // NOTE: shares mul_module_, so an audio model replaces a visual one loaded above
+        mllm_runtime_manager_->setExternalFile(config_->audio_model() + ".weight");
+        mul_module_.reset(Module::load({}, {}, config_->audio_model().c_str(), mllm_runtime_manager_, &module_config));
+    }
+}
+
+static void dump_impl(const float *signal, size_t size, int row = 0) { // debug print of the corners of a buffer
+    if (row) { // matrix view: first/last 3 rows, first/last 3 columns of each (assumes row >= 3 and col >= 3)
+        int col = size / row;
+        printf("# %d, %d: [\n", row, col);
+        for (int i = 0; i < 3; i++) {
+            for (int j = 0; j < 3; j++) {
+                printf("%f, ", signal[i * col + j]);
+            }
+            printf("..., ");
+            for (int j = col - 3; j < col; j++) {
+                printf("%f, ", signal[i * col + j]);
+            }
+            printf("\n");
+        }
+        printf("..., \n");
+        for (int i = row - 3; i < row; i++) {
+            for (int j = 0; j < 3; j++) {
+                printf("%f, ", signal[i * col + j]);
+            }
+            printf("..., ");
+            for (int j = col - 3; j < col; j++) {
+                printf("%f, ", signal[i * col + j]);
+            }
+            printf("\n");
+        }
+        printf("]\n");
+    } else { // flat view: first and last 3 values (assumes size >= 3)
+        printf("# %zu: [", size); // %zu is the portable conversion for size_t
+        for (int i = 0; i < 3; i++) {
+            printf("%f, ", signal[i]);
+        }
+        printf("..., ");
+        for (int i = size - 3; i < size; i++) {
+            printf("%f, ", signal[i]);
+        }
+        printf("]\n");
+    }
+}
+
+void dump_var(VARP var) { // debug helper: prints dtype, format, dims and a sample of the values
+    auto dims    = var->getInfo()->dim;
+    bool isfloat = true;
+    printf("{\ndtype = ");
+    if (var->getInfo()->type == halide_type_of<float>()) {
+        printf("float");
+        isfloat = true;
+    } else if (var->getInfo()->type == halide_type_of<int>()) {
+        printf("int");
+        isfloat = false;
+    }
+    printf("\nformat = %d\n", var->getInfo()->order);
+    printf("\ndims = [");
+    for (int i = 0; i < dims.size(); i++) {
+        printf("%d ", dims[i]);
+    }
+    printf("]\n");
+
+    if (isfloat) {
+        if ((dims.size() > 2 && dims[1] > 1 && dims[2] > 1) || (dims.size() == 2 && dims[0] > 1 && dims[1] > 1)) {
+            int row = dims[dims.size() - 2];
+            dump_impl(var->readMap<float>(), var->getInfo()->size, row);
+        } else {
+            printf("data = [");
+            auto total = var->getInfo()->size;
+            if (total > 32) {
+                for (int i = 0; i < 5; i++) {
+                    printf("%f ", var->readMap<float>()[i]);
+                }
+                printf("..., ");
+                for (int i = total - 5; i < total; i++) {
+                    printf("%f ", var->readMap<float>()[i]);
+                }
+            } else {
+                for (int i = 0; i < total; i++) {
+                    printf("%f ", var->readMap<float>()[i]);
+                }
+            }
+            printf("]\n}\n");
+        }
+    } else {
+        printf("data = [");
+        int size = var->getInfo()->size > 10 ? 10 : var->getInfo()->size;
+        for (int i = 0; i < size; i++) {
+            printf("%d ", var->readMap<int>()[i]);
+        }
+        printf("]\n}\n");
+    }
 }
 
-std::vector<int> Lvlm::image_process(const std::string& image_info) {
+std::vector<int> Mllm::vision_process(const std::string& file) { // image file -> embedding + placeholder token ids
 #ifdef LLM_SUPPORT_VISION
-    VARP image = nullptr;
-    if (image_info.substr(0, 4) == "http") {
+    VARP image = MNN::CV::imread(file);
+    if (nullptr == image.get()) return std::vector<int>(0); // unreadable image: emit no tokens instead of crashing
+    auto st    = std::chrono::system_clock::now();
+    VARP image_embedding;
+    if (mul_module_->getInfo()->inputNames[0] == "patches") {
+        // Qwen2-VL
+        image_height_ = round(image_height_ / 28.0) * 28;
+        image_width_ = round(image_width_ / 28.0) * 28;
+        image        = MNN::CV::resize(image, {image_height_, image_width_}, 0, 0,
+                                     MNN::CV::INTER_LINEAR, MNN::CV::COLOR_BGR2RGB,
+                                     image_mean_, image_norm_);
+        image        = MNN::Express::_Unsqueeze(image, {0});
+        image        = MNN::Express::_Convert(image, NCHW);
+        auto patches = MNN::Express::_Concat({image, image}, 0);
+        auto patches_dim = patches->getInfo()->dim;
+        int temporal = patches_dim[0];
+        int channel  = patches_dim[1];
+        int height   = patches_dim[2];
+        int width    = patches_dim[3];
+        constexpr int temporal_patch_size = 2;
+        constexpr int patch_size = 14;
+        constexpr int merge_size = 2;
+        int grid_t = temporal / temporal_patch_size;
+        int grid_h = height / patch_size;
+        int grid_w = width / patch_size;
+        // build patches
+        patches = MNN::Express::_Reshape(patches, {
+            grid_t, temporal_patch_size,
+            channel,
+            grid_h / merge_size, merge_size, patch_size,
+            grid_w / merge_size, merge_size, patch_size,
+        });
+        patches = MNN::Express::_Permute(patches, {0, 3, 6, 4, 7, 2, 1, 5, 8});
+        patches = MNN::Express::_Reshape(patches, {
+            grid_t * grid_h * grid_w,
+            channel * temporal_patch_size * patch_size * patch_size
+        });
+        const int seq_len = grid_t * grid_h * grid_w;
+        // build position_ids
+        const int wblock_size = merge_size * merge_size;
+        const int hblock_size = wblock_size * grid_w / merge_size;
+        VARP position_ids = MNN::Express::_Input({2, seq_len}, NCHW, halide_type_of<int>());
+        auto hpos_ptr = position_ids->writeMap<int>();
+        auto wpos_ptr = hpos_ptr + seq_len;
+        for (int i = 0; i < grid_h; i++) {
+            int h_idx = i / merge_size, h_off = i % merge_size;
+            for (int j = 0; j < grid_w; j++) {
+                int w_idx = j / merge_size, w_off = j % merge_size;
+                int index = h_idx * hblock_size + w_idx * wblock_size + h_off * merge_size + w_off;
+                hpos_ptr[index] = i;
+                wpos_ptr[index] = j;
+            }
+        }
+        // build attention_mask
+        VARP attention_mask = MNN::Express::_Input({1, seq_len, seq_len}, NCHW);
+        ::memset(attention_mask->writeMap<float>(), 0, seq_len * seq_len * sizeof(float));
+        image_embedding = mul_module_->onForward({patches, position_ids, attention_mask})[0];
+    } else {
+        image           = MNN::CV::resize(image, {image_height_, image_width_}, 0, 0,
+                                          MNN::CV::INTER_LINEAR, MNN::CV::COLOR_BGR2RGB,
+                                          image_mean_, image_norm_);
+        image           = MNN::Express::_Unsqueeze(image, {0});
+        image           = MNN::Express::_Convert(image, NC4HW4);
+        image_embedding = mul_module_->forward(image);
+    }
+    auto et    = std::chrono::system_clock::now();
+    vision_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
+    mul_embeddings_.push_back(image_embedding);
+    int visual_len = image_embedding->getInfo()->dim[0];
+    std::vector<int> img_ids(visual_len, img_pad_); // one pad id per embedding row, wrapped by start/end markers
+    img_ids.insert(img_ids.begin(), vision_start_);
+    img_ids.push_back(vision_end_);
+    return img_ids;
+#else
+    return std::vector<int>(0);
+#endif
+}
+
+template <typename T>
+static inline VARP _var(std::vector<T> vec, const std::vector<int> &dims) { // vector -> NHWC constant of shape dims
+    return _Const(vec.data(), dims, NHWC, halide_type_of<T>()); // presumably _Const copies the data; vec is a local copy
+}
+
+std::vector<int> Mllm::audio_process(const std::string& file) { // audio file -> embedding + placeholder token ids
+#ifdef LLM_SUPPORT_AUDIO
+    constexpr int sample_rate = 16000;
+    auto load_res        = MNN::AUDIO::load(file, sample_rate);
+    VARP waveform        = load_res.first;
+    if (nullptr == waveform.get()) {
+        return std::vector<int>(0); // audio could not be loaded: emit no tokens instead of dereferencing null
+    }
+    auto st              = std::chrono::system_clock::now();
+    auto input_features  = MNN::AUDIO::whisper_fbank(waveform);
+    auto audio_embedding = mul_module_->forward(input_features);
+    audio_embedding = _Permute(audio_embedding, {1, 0, 2});
+    auto et         = std::chrono::system_clock::now();
+    audio_us_       = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
+    mul_embeddings_.push_back(audio_embedding);
+    int embed_len = audio_embedding->getInfo()->dim[0];
+    std::vector<int> audio_ids(embed_len, audio_pad_); // one pad id per embedding row
+    return audio_ids;
+#else
+    return std::vector<int>(0);
+#endif
+}
+
+std::vector<int> Mllm::multimode_process(const std::string& mode, std::string info) {
+    auto file_info = info;
+    if (mode == "img") {
+        std::regex hw_regex(R"(<hw>(.*?)</hw>)");
+        std::sregex_iterator iter(info.begin(), info.end(), hw_regex);
+        std::sregex_iterator end;
+        file_info = "";
+
+        size_t currentPosition = 0;
+        if (iter != end) {
+            std::smatch match = *iter;
+            size_t matchPosition = match.position();
+            if (matchPosition > currentPosition) {
+                file_info.append(info.substr(currentPosition, matchPosition - currentPosition));
+            }
+
+            std::stringstream hw_ss(match.str(1));
+            char comma;
+            hw_ss >> image_height_ >> comma >> image_width_;
+            currentPosition = matchPosition + match.length();
+        }
+        if (currentPosition < info.length()) {
+            file_info.append(info.substr(currentPosition));
+        }
+        // std::cout << "hw: " << image_height_ << ", " << image_width_ << std::endl;
+        // std::cout << "file: " << file_info << std::endl;
+    }
+    if (file_info.substr(0, 4) == "http") {
         std::regex url_regex(R"(^https?://([^/]+)(/.*))");
         std::smatch url_match_result;
         std::string host, path;
-        if (std::regex_search(image_info, url_match_result, url_regex) && url_match_result.size() == 3) {
+        if (std::regex_search(file_info, url_match_result, url_regex) && url_match_result.size() == 3) {
             host = url_match_result[1].str();
             path = url_match_result[2].str();
         }
         // std::cout << host << "#" << path << std::endl;
         httplib::Client cli(host);
-        auto res = cli.Get(path);
-        std::string img_file = "downloaded_image.jpg";
+        auto res  = cli.Get(path);
+        file_info = "downloaded_file";
         if (res && res->status == 200) {
-            std::ofstream file(img_file, std::ios::binary);
+            std::ofstream file(file_info, std::ios::binary);
             if (file.is_open()) {
                 file.write(res->body.c_str(), res->body.size());
-                std::cout << "Image has been downloaded successfully." << std::endl;
+                std::cout << "File has been downloaded successfully." << std::endl;
                 file.close();
             } else {
-                std::cerr << "Unable to open file to write image." << std::endl;
+                std::cerr << "Unable to open file to write." << std::endl;
                 exit(0);
             }
         } else {
-            std::cerr << "Failed to download image. Status code: " << (res ? res->status : 0) << std::endl;
+            std::cerr << "Failed to download file. Status code: " << (res ? res->status : 0) << std::endl;
             exit(0);
         }
-        image = MNN::CV::imread(img_file);
-    } else {
-        image = MNN::CV::imread(image_info);
     }
-    image = MNN::CV::resize(image, {image_size_, image_size_}, 0, 0, MNN::CV::INTER_LINEAR, MNN::CV::COLOR_BGR2RGB, image_mean_, image_norm_);
-    image = MNN::Express::_Unsqueeze(image, {0});
-    image = MNN::Express::_Convert(image, NC4HW4);
-    auto image_embedding = visual_module_->forward(image);
-    image_embeddings_.push_back(image_embedding);
-    int visual_len = image_embedding->getInfo()->dim[0];
-    std::vector<int> img_ids(visual_len, image_pad_);
-    img_ids.insert(img_ids.begin(), vision_start_);
-    img_ids.push_back(vision_end_);
-    return img_ids;
-#else
+    if (mode == "img" && config_->is_visual()) {
+        return vision_process(file_info);
+    }
+    if (mode == "audio" && config_->is_audio()) {
+        return audio_process(file_info);
+    }
     return std::vector<int>(0);
-#endif
 }
 
-std::vector<int> Lvlm::tokenizer_encode(const std::string& query, bool use_template) {
-    auto prompt = query;
-    if (!use_template) { prompt = mPromptLib->applyTemplate(query); }
+std::vector<int> Mllm::tokenizer_encode(const std::string& query, bool use_template) {
+    auto prompt = apply_prompt_template(query);
     // split query
-    std::regex img_regex("<img>(.*?)</img>");
+    std::regex multimode_regex("<(img|audio)>(.*?)</\\1>");
     std::string::const_iterator searchStart(prompt.cbegin());
     std::smatch match;
     std::vector<std::string> img_infos;
-    std::vector<int> ids {};
+    std::vector<int> ids{};
 
-    while (std::regex_search(searchStart, prompt.cend(), match, img_regex)) {
+    while (std::regex_search(searchStart, prompt.cend(), match, multimode_regex)) {
         // std::cout << "img match: " << match[1].str() << std::endl;
         auto txt_ids = tokenizer_->encode(match.prefix().str());
         ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
-        auto img_ids = image_process(match[1].str());
-        ids.insert(ids.end(), img_ids.begin(), img_ids.end());
+        auto mul_ids = multimode_process(match[1].str(), match[2].str());
+        ids.insert(ids.end(), mul_ids.begin(), mul_ids.end());
         searchStart = match.suffix().first;
     }
     if (searchStart != prompt.cend()) {
@@ -842,29 +1198,47 @@ std::vector<int> Lvlm::tokenizer_encode(const std::string& query, bool use_templ
     return ids;
 }
 
-VARP Lvlm::embedding(const std::vector<int>& input_ids) {
+VARP Mllm::embedding(const std::vector<int>& input_ids) {
     if (input_ids.size() == 1) {
         return Llm::embedding(input_ids);
     }
     std::vector<VARP> embeddings;
-    int img_idx = 0;
+    int mul_idx = 0;
     std::vector<int> cur_txt_ids;
+    bool in_audio = false;
     for (int i = 0; i < input_ids.size(); i++) {
         int id = input_ids[i];
-        if (id == image_pad_) {
+        // audio
+        if (in_audio) {
+            if (id == audio_pad_) {
+                continue;
+            } else {
+                cur_txt_ids.clear();
+                in_audio = false;
+            }
+        } else if (id == audio_pad_) {
+            auto txt_embedding = Llm::embedding(cur_txt_ids);
+            auto mul_embedding = mul_embeddings_[mul_idx++];
+            embeddings.push_back(txt_embedding);
+            embeddings.push_back(mul_embedding);
+            in_audio = true;
+        }
+        // vision
+        if (id == img_pad_) {
             continue;
         }
         cur_txt_ids.push_back(id);
         if (id == vision_start_) {
             auto txt_embedding = Llm::embedding(cur_txt_ids);
-            auto img_embedding = image_embeddings_[img_idx++];
+            auto mul_embedding = mul_embeddings_[mul_idx++];
             embeddings.push_back(txt_embedding);
-            embeddings.push_back(img_embedding);
+            embeddings.push_back(mul_embedding);
         } else if (id == vision_end_) {
             cur_txt_ids.clear();
             cur_txt_ids.push_back(id);
         }
     }
+    mul_embeddings_.clear();
     if (!cur_txt_ids.empty()) {
         auto txt_embedding = Llm::embedding(cur_txt_ids);
         embeddings.push_back(txt_embedding);
@@ -877,7 +1251,7 @@ VARP Lvlm::embedding(const std::vector<int>& input_ids) {
 // Embedding start
 float Embedding::dist(VARP var0, VARP var1) {
     auto distVar = _Sqrt(_ReduceSum(_Square(var0 - var1)));
-    auto dist = distVar->readMap<float>()[0];
+    auto dist    = distVar->readMap<float>()[0];
     return dist;
 }
 
@@ -890,9 +1264,12 @@ Embedding* Embedding::createEmbedding(const std::string& config_path, bool load)
     return embedding;
 }
 
-Embedding::Embedding(std::shared_ptr<LlmConfig> config) : Llm(config) {}
+Embedding::Embedding(std::shared_ptr<LlmConfig> config) : Llm(config) {
+}
 
-int Embedding::dim() const { return config_->hidden_size(); }
+int Embedding::dim() const {
+    return config_->hidden_size();
+}
 
 void Embedding::load() {
     init_runtime();
@@ -905,33 +1282,32 @@ void Embedding::load() {
     // 2. load model
     Module::Config module_config;
     module_config.shapeMutable = true;
-    module_config.rearrange = true;
-    auto model_path = config_->llm_model();
+    module_config.rearrange    = true;
+    auto model_path            = config_->llm_model();
     MNN_PRINT("load %s ... ", model_path.c_str());
     modules_.resize(1);
-    modules_[0].reset(Module::load(
-                                   {"input_ids", "attention_mask", "position_ids"},
-                                   {"sentence_embeddings"}, model_path.c_str(), runtime_manager_, &module_config));
+    modules_[0].reset(Module::load({"input_ids", "attention_mask", "position_ids"}, {"sentence_embeddings"},
+                                   model_path.c_str(), runtime_manager_, &module_config));
     MNN_PRINT("Done!\n");
 }
 
 VARP Embedding::ids_embedding(const std::vector<int>& ids) {
-    int prompt_len = ids.size();
-    auto inputs_ids = embedding(ids);
-    auto attention_mask = gen_attention_mask(prompt_len);
-    auto position_ids = gen_position_ids(prompt_len);
-    auto outputs = modules_[0]->onForward({inputs_ids, attention_mask, position_ids});
+    int prompt_len           = ids.size();
+    auto inputs_ids          = embedding(ids);
+    auto attention_mask      = gen_attention_mask(prompt_len);
+    auto position_ids        = gen_position_ids(prompt_len);
+    auto outputs             = modules_[0]->onForward({inputs_ids, attention_mask, position_ids});
     auto sentence_embeddings = outputs[0];
     return sentence_embeddings;
 }
 
 VARP Embedding::txt_embedding(const std::string& txt) {
-    return ids_embedding(tokenizer_encode(txt, false));
+    return ids_embedding(tokenizer_encode(txt));
 }
 
 VARP Embedding::gen_attention_mask(int seq_len) {
     auto attention_mask = _Input({1, 1, 1, seq_len}, NCHW, halide_type_of<int>());
-    auto ptr = attention_mask->writeMap<int>();
+    auto ptr            = attention_mask->writeMap<int>();
     for (int i = 0; i < seq_len; i++) {
         ptr[i] = 1;
     }
@@ -940,12 +1316,12 @@ VARP Embedding::gen_attention_mask(int seq_len) {
 
 VARP Embedding::gen_position_ids(int seq_len) {
     auto position_ids = _Input({1, seq_len}, NCHW, halide_type_of<int>());
-    auto ptr = position_ids->writeMap<int>();
+    auto ptr          = position_ids->writeMap<int>();
     for (int i = 0; i < seq_len; i++) {
         ptr[i] = i;
     }
     return position_ids;
 }
 // Embedding end
-}
-}
+} // namespace Transformer
+} // namespace MNN
diff --git a/transformers/llm/engine/src/llmconfig.cpp b/transformers/llm/engine/src/llmconfig.cpp
deleted file mode 100644
index 458f3c4d1..000000000
--- a/transformers/llm/engine/src/llmconfig.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//
-//  llmconfig.hpp
-//
-//  Created by MNN on 2024/07/19.
-//  ZhaodeWang
-//
-
-
-
-#include "rapidjson/document.h"
-#include <rapidjson/writer.h>
-#include <rapidjson/stringbuffer.h>
-#include "llmconfig.hpp"
-
-namespace MNN {
-namespace Transformer {
-
-bool merge_json(rapidjson::Value& destination, const rapidjson::Value& source,
-                rapidjson::Document::AllocatorType& allocator) {
-    if (!source.IsObject() || !destination.IsObject()) {
-        return false;
-    }
-
-    for (auto it = source.MemberBegin(); it != source.MemberEnd(); ++it) {
-        const char* key = it->name.GetString();
-        if (destination.HasMember(key)) {
-            if (destination[key].IsObject() && it->value.IsObject()) {
-                // Recursively merge the two JSON objects
-                merge_json(destination[key], it->value, allocator);
-            } else {
-                // Overwrite the value in the destination
-                destination[key].CopyFrom(it->value, allocator);
-            }
-        } else {
-            // Add the value to the destination
-            rapidjson::Value newKey(key, allocator);
-            rapidjson::Value newValue;
-            newValue.CopyFrom(it->value, allocator);
-            destination.AddMember(newKey, newValue, allocator);
-        }
-    }
-    return true;
-}
-
-} // Transformer
-} // MNN
-
diff --git a/transformers/llm/engine/src/llmconfig.hpp b/transformers/llm/engine/src/llmconfig.hpp
index 301ce38ae..a3ea87bd4 100644
--- a/transformers/llm/engine/src/llmconfig.hpp
+++ b/transformers/llm/engine/src/llmconfig.hpp
@@ -5,18 +5,9 @@
 //  ZhaodeWang
 //
 
-#ifndef LLMCONFIG_Hpp
-#define LLMCONFIG_Hpp
-
-#include <vector>
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <rapidjson/document.h>
+#include "rapidjson/document.h"
 #include <rapidjson/writer.h>
 #include <rapidjson/stringbuffer.h>
-		
-
 
 namespace MNN {
 namespace Transformer {
@@ -45,7 +36,31 @@ static inline std::string file_name(const std::string& path) {
 }
 
 bool merge_json(rapidjson::Value& destination, const rapidjson::Value& source,
-                rapidjson::Document::AllocatorType& allocator);
+                rapidjson::Document::AllocatorType& allocator) {
+    if (!source.IsObject() || !destination.IsObject()) {
+        return false;
+    }
+
+    for (auto it = source.MemberBegin(); it != source.MemberEnd(); ++it) {
+        const char* key = it->name.GetString();
+        if (destination.HasMember(key)) {
+            if (destination[key].IsObject() && it->value.IsObject()) {
+                // Recursively merge the two JSON objects
+                merge_json(destination[key], it->value, allocator);
+            } else {
+                // Overwrite the value in the destination
+                destination[key].CopyFrom(it->value, allocator);
+            }
+        } else {
+            // Add the value to the destination
+            rapidjson::Value newKey(key, allocator);
+            rapidjson::Value newValue;
+            newValue.CopyFrom(it->value, allocator);
+            destination.AddMember(newKey, newValue, allocator);
+        }
+    }
+    return true;
+}
 
 class rapid_json_wrapper {
 public:
@@ -66,6 +81,7 @@ class rapid_json_wrapper {
         rapid_json_wrapper json_wrapper(std::move(document));
         return json_wrapper;
     }
+    bool empty() const { return document.IsNull(); }  // true when the wrapped document is a JSON null value
     bool merge(const char* str) {
         rapidjson::Document input_doc;
         input_doc.Parse(str);
@@ -83,12 +99,13 @@ class rapid_json_wrapper {
         return buffer.GetString();
     }
     // read value
-    float value(const char* key, const float& default_value) const {
-        if (document.HasMember(key)) {
-            const auto& value = document[key];
-            if (value.IsFloat()) return value.GetFloat();
+    rapid_json_wrapper value(const char* key) const {  // copy of the object under `key`, else a default wrapper
+        if (document.IsObject() && document.HasMember(key) && document[key].IsObject()) {  // HasMember needs an object
+            rapidjson::Document subDoc;
+            subDoc.CopyFrom(document[key], subDoc.GetAllocator());
+            return rapid_json_wrapper(std::move(subDoc));
         }
-        return default_value;
+        return rapid_json_wrapper();
     }
     int value(const char* key, const int& default_value) const {
         if (document.HasMember(key)) {
@@ -154,21 +171,6 @@ class rapid_json_wrapper {
         }
         return default_value;
     }
-    std::vector<std::string> value(const char* key, const std::vector<std::string>& default_value) const {
-        if (document.HasMember(key)) {
-            const auto& value = document[key];
-            if (value.IsArray()) {
-                std::vector<std::string> result;
-                for (auto& v : value.GetArray()) {
-                    if (v.IsString()) {
-                        result.push_back(v.GetString());
-                    }
-                }
-                return result;
-            }
-        }
-        return default_value;
-    }
     std::string value(const char key[], const char default_value[]) const {
         return value(key, std::string(default_value));
     }
@@ -177,7 +179,7 @@ class rapid_json_wrapper {
 class LlmConfig {
 public:
     std::string base_dir_;
-    rapid_json_wrapper config_, llm_config_;
+    rapid_json_wrapper config_, llm_config_, mllm_config_, cur_config_;
     LlmConfig() {}
     LlmConfig(const std::string& path) {
         // load config
@@ -214,6 +216,7 @@ class LlmConfig {
         } else {
             std::cerr << "Unable to open llm_config file: " << llm_config() << std::endl;
         }
+        mllm_config_ = config_.value("mllm");
     }
 
     // < model file config start
@@ -252,13 +255,13 @@ class LlmConfig {
     std::string visual_model() const {
         return base_dir_ + config_.value("visual_model", "visual.mnn");
     }
-    // model file config end >
 
-    // < generate config start
-    int max_all_tokens() const {
-        return config_.value("max_all_tokens", 2048);
+    std::string audio_model() const {
+        return base_dir_ + config_.value("audio_model", "audio.mnn");
     }
+    // model file config end >
 
+    // < generate config start
     int max_new_tokens() const {
         return config_.value("max_new_tokens", 512);
     }
@@ -269,22 +272,27 @@ class LlmConfig {
     // generate config end >
 
     // < backend config start
-    std::string backend_type() const {
+    std::string backend_type(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("backend_type", "cpu");
         return config_.value("backend_type", "cpu");
     }
 
-    int thread_num() const {
+    int thread_num(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("thread_num", 4);
         return config_.value("thread_num", 4);
     }
 
-    std::string precision() const {
+    std::string precision(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("precision", "low");
         return config_.value("precision", "low");
     }
-    std::string power() const {
+    std::string power(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("power", "normal");
         return config_.value("power", "normal");
     }
 
-    std::string memory() const {
+    std::string memory(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("memory", "low");
         return config_.value("memory", "low");
     }
 
@@ -306,6 +314,10 @@ class LlmConfig {
         return llm_config_.value("is_visual", false);
     }
 
+    bool is_audio() const {
+        return llm_config_.value("is_audio", false);
+    }
+
     bool use_mmap() const {
         return config_.value("use_mmap", false);
     }
@@ -335,96 +347,18 @@ class LlmConfig {
         return llm_config_.value("attention_fused", true);
     }
 
-    std::string system_prompt_template() const {
-        return llm_config_.value("system_prompt_template", "<|im_start|>system\n%s<|im_end|>\n");
-    }
-    std::string user_prompt_template() const {
-        return llm_config_.value("user_prompt_template", "<|im_start|>user\n%s<|im_end|>\n");
+    std::string chat_template() const {
+        return llm_config_.value("chat_template", "");
     }
-    std::string assistant_prefix() const {
-        return llm_config_.value("assistant_prefix", "<|im_start|>assistant\n");
-    }
-    std::string assistant_suffix() const {
-        return llm_config_.value("assistant_suffix", "<|im_end|>\n");
+
+    std::string prompt_template() const {
+        return llm_config_.value("prompt_template", "");
     }
 
     std::vector<int64_t> tie_embeddings() const {
         return llm_config_.value("tie_embeddings", std::vector<int64_t>{});
     }
     // llm model config end >
-
-    // < sampler config start
-    std::string sampler_type() const {
-        return config_.value("sampler_type", "mixed");
-    }
-
-    std::vector<std::string> mixed_samplers() const {
-        return config_.value("mixed_samplers", std::vector<std::string>({"topK", "tfs", "typical", "topP", "min_p", "temperature"}));
-    }
-
-    float temperature() const {
-        return config_.value("temperature", 1.0f);
-    }
-
-    int topK() const {
-        return config_.value("topK", 40);
-    }
-
-    float topP() const {
-        return config_.value("topP", 0.9f);
-    }
-
-    float minP() const {
-        return config_.value("minP", 0.1f);
-    }
-
-    float tfsZ() const {
-        return config_.value("tfsZ", 1.0f);
-    }
-
-    float typical() const {
-        return config_.value("typical", 1.0f);
-    }
-
-    float penalty() const {
-        return config_.value("penalty", 0.0f);
-    }
-
-    int ngram() const {
-        return config_.value("n_gram", 8);
-    }
-
-    float ngram_factor() const {
-        return config_.value("ngram_factor", 1.0f);
-    }
-
-    std::string penalty_sampler() const {
-        return config_.value("penalty_sampler", "greedy");
-    }
-    // sampler config end >
-
-    // < app config start
-    std::string app_type() const {
-        return config_.value("app_type", "chat");
-    }
-    std::string system_prompt() const {
-        return config_.value("system_prompt", "You are a helpful assistant!\n");
-    }
-    // app config end >
-
-    // < evaulation config start
-    int ppl_stride() const {
-        return config_.value("ppl_stride", 0);
-    }
-    std::string dataset() const {
-        return config_.value("dataset", "wikitext");
-    }
-    int dataset_sample_size() const {
-        return config_.value("dataset_sample_size", -1); // -1 stands for no sampling, use all.
-    }
-    // evaulation config end
 };
 } // Transformer
 } // MNN
-
-#endif
diff --git a/transformers/llm/engine/src/perplexity.cpp b/transformers/llm/engine/src/perplexity.cpp
deleted file mode 100644
index 4d04cdf31..000000000
--- a/transformers/llm/engine/src/perplexity.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-#include <algorithm>
-#include <vector>
-#include <cmath>
-#include <llm/llm.hpp>
-#include <iostream>
-#include <iomanip>
-
-#include "sampler.hpp"
-#include "perplexity.hpp"
-#include "llmconfig.hpp"
-#include "prompt.hpp"
-
-namespace MNN{
-namespace Transformer{
-
-
-/* -----------TextPPLMeasurer---------- */
-TextPPLMeasurer::TextPPLMeasurer(Llm* llm, std::shared_ptr<LlmConfig> llmConfig) {
-    mLlm = llm;
-    mConfig.max_all_tokens = llmConfig->max_all_tokens();
-    mConfig.max_new_tokens = llmConfig->max_new_tokens();
-    mDatasetType = llmConfig->dataset();
-    mStride = llmConfig->ppl_stride();
-    if (mStride == 0) {
-        // default stride for sliding window.
-        mStride = mConfig.max_all_tokens / 2;
-    } 
-}
-
-/* Implemented based on https://huggingface.co/docs/transformers/perplexity
-
- ******************** HuggingFace Python Version ************************
-
-import torch
-from tqdm import tqdm
-
-max_length = model.config.n_positions
-stride = 512
-seq_len = encodings.input_ids.size(1)
-
-nlls = []
-prev_end_loc = 0
-for begin_loc in tqdm(range(0, seq_len, stride)):
-    end_loc = min(begin_loc + max_length, seq_len)
-    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
-    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
-    target_ids = input_ids.clone()
-    target_ids[:, :-trg_len] = -100
-
-    with torch.no_grad():
-        outputs = model(input_ids, labels=target_ids)
-
-        # loss is calculated using CrossEntropyLoss which averages over valid labels
-        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
-        # to the left by 1.
-        neg_log_likelihood = outputs.loss
-
-    nlls.append(neg_log_likelihood)
-
-    prev_end_loc = end_loc
-    if end_loc == seq_len:
-        break
-
-ppl = torch.exp(torch.stack(nlls).mean())
-
- ******************** HuggingFace Python Version ************************ 
-*/
-
-float TextPPLMeasurer::perplexity_one(const std::vector<int>& prompt) {
-    int seq_len = prompt.size();
-    std::vector<float> nlls;
-    float ppl = 0.f;
-    
-    // start calculation 
-    int prev_end_loc = 1; // the first token start from id=1, do not count the first one.
-    for (int begin_loc = 0; begin_loc < seq_len; begin_loc += mStride) {
-        int end_loc = std::min(begin_loc + mConfig.max_all_tokens, seq_len);
-        // first token
-        std::vector<int> tokens(prev_end_loc - begin_loc);
-        for (int it = begin_loc; it < prev_end_loc; ++it) tokens[it - begin_loc] = prompt[it];
-        mLlm->mLlmSessionInfos[0].all_seq_len_ = tokens.size();
-        mLlm->mLlmSessionInfos[0].gen_seq_len_ = mLlm->mLlmSessionInfos[0].all_seq_len_;
-        auto logits = mLlm->forward(tokens, true);
-        logits = MNN::Express::_Softmax(logits);
-        nlls.push_back(-std::log(((float*)(logits->readMap<float>()))[prompt[prev_end_loc]]));
-        // std::cout << mLlm->tokenizer_decode(argmax(logits)) << "  " << mLlm->tokenizer_decode(prompt[prev_end_loc]) << "  " << -std::log(((float*)(logits->readMap<float>()))[prompt[prev_end_loc]]) << std::endl;
-        std::cout << -std::log(((float*)(logits->readMap<float>()))[prompt[prev_end_loc]]) << std::endl;
-        // decode following tokens
-        for (int it = prev_end_loc+1; it < end_loc; ++it) {
-            mLlm->mLlmSessionInfos[0].all_seq_len_ += 1;
-            mLlm->mLlmSessionInfos[0].gen_seq_len_ = mLlm->mLlmSessionInfos[0].all_seq_len_;
-            auto logits = mLlm->forward({prompt[it-1]}, false);
-            logits = MNN::Express::_Softmax(logits);
-            nlls.push_back(-std::log(((float*)(logits->readMap<float>()))[prompt[it]]));
-            // std::cout << mLlm->tokenizer_decode(argmax(logits)) << "  " << mLlm->tokenizer_decode(prompt[it]) << "  " << -std::log(((float*)(logits->readMap<float>()))[prompt[it]]) << std::endl;
-            std::cout << -std::log(((float*)(logits->readMap<float>()))[prompt[it]]) << std::endl;
-        }
-        // clean up once
-        mLlm->reset();
-        prev_end_loc = end_loc;
-        if (end_loc == seq_len) break;
-    } 
-
-    // calculate ppl
-    for (int j = 0; j < nlls.size(); ++j) ppl += nlls[j];
-    ppl /= nlls.size();
-    ppl = std::exp(ppl);
-    
-    // print 
-    std::cout << "PPL: " << std::setprecision(8) << ppl << std::endl;
-    return ppl;
-}
-
-std::vector<float> TextPPLMeasurer::perplexity(std::vector<std::vector<int>> prompts) {
-    std::vector<float> ppls;
-    for (auto prompt : prompts) {
-        ppls.push_back(perplexity_one(prompt));
-        mLlm->reset();
-    }
-    return ppls;
-}
-
-std::vector<float> TextPPLMeasurer::perplexity(std::vector<std::string> prompts) {
-    std::vector<std::vector<int>> tokens(prompts.size());
-    for (int p = 0; p < prompts.size(); ++p) tokens[p] = mLlm->tokenizer_encode(prompts[p], false);
-    return perplexity(tokens);
-}
-
-std::vector<float> TextPPLMeasurer::perplexity(std::string prompt_file, std::ostream* perfOS) {
-    // No performance will be printed!
-    std::vector<std::string> prompts;
-    if (mDatasetType == "wikitext") {
-        prompts = wikitext(prompt_file);
-    }
-    else if (mDatasetType == "plaintext") {
-        prompts = plaintext(prompt_file);
-    }
-    else if (mDatasetType == "rowsplit") {
-        prompts = rowsplit(prompt_file);
-    }
-    else {
-        MNN_ERROR("Dataset not suppoted");
-        exit(1);
-    }
-    std::cout << "prompt file loaded!" << std::endl;
-    return perplexity(prompts);
-}
-
-/* -----------ChatPPLMeasurer---------- */
-ChatPPLMeasurer::ChatPPLMeasurer(Llm* llm, std::shared_ptr<LlmConfig> llmConfig) {
-    mLlm = llm;
-    mConfig.max_all_tokens = llmConfig->max_all_tokens();
-    mConfig.max_new_tokens = llmConfig->max_new_tokens();
-    mDatasetType = llmConfig->dataset();
-    mDatasetSampleSize = llmConfig->dataset_sample_size();
-}
-
-void ChatPPLMeasurer::handleToken(int token) {
-    // CommonPrefix and Candidates managements
-    mLlm->mLlmSessionInfos[0].tokens.push_back(token);
-}
-
-std::vector<float> ChatPPLMeasurer::sample(const std::vector<int>& input_ids, const std::vector<int>& prompt, struct TimePerformance* time_perf) {
-    std::vector<float> nlls;
-    // initialization for time performance
-    PrefillTimePerformance prefill_time;
-    prefill_time.prefill_prev_token_ = mLlm->mLlmSessionInfos[0].tokens.size();
-    prefill_time.prefill_token_ = input_ids.size();
-    appendNewPromptRecord(time_perf, input_ids.size(), mLlm->reuse_kv());
-    // initialization
-    mLlm->mLlmSessionInfos[0].tokens.insert(mLlm->mLlmSessionInfos[0].tokens.end(), input_ids.begin(), input_ids.end());
-    // all_seq_len_ in sampler functions as kv_seq_len_, prev_seq_len_ = all_seq_len_ - seq_len
-    mLlm->mLlmSessionInfos[0].all_seq_len_ = mLlm->mLlmSessionInfos[0].tokens.size() - input_ids.size(); 
-    mLlm->mLlmSessionInfos[0].gen_seq_len_ = 0;
-    // prefill 
-    auto st = std::chrono::system_clock::now();
-    auto logits = mLlm->forward(input_ids, true);
-    logits = MNN::Express::_Softmax(logits);
-    nlls.push_back(-std::log(((float*)(logits->readMap<float>()))[prompt[mLlm->mLlmSessionInfos[0].gen_seq_len_]]));
-    // record time
-    auto et = std::chrono::system_clock::now();
-    prefill_time.prefill_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
-    time_perf->prefill_record_.push_back(prefill_time);
-    // handle the new token
-    handleToken(prompt[mLlm->mLlmSessionInfos[0].gen_seq_len_]);
-    // decode
-    while (mLlm->mLlmSessionInfos[0].gen_seq_len_ < prompt.size()) {
-        DecodeTimePerformance decode_time;
-        decode_time.decode_prev_token_ = mLlm->mLlmSessionInfos[0].tokens.size();
-        st = std::chrono::system_clock::now();
-        // next token
-        logits = mLlm->forward({mLlm->mLlmSessionInfos[0].tokens.back()}, false);
-        logits = MNN::Express::_Softmax(logits);
-        nlls.push_back(-std::log(((float*)(logits->readMap<float>()))[prompt[mLlm->mLlmSessionInfos[0].gen_seq_len_]]));
-        et = std::chrono::system_clock::now();
-        decode_time.decode_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
-        time_perf->decode_record_.push_back(decode_time);
-        handleToken(prompt[mLlm->mLlmSessionInfos[0].gen_seq_len_]);
-    }
-    // return nlls
-    return nlls;
-}
-
-float ChatPPLMeasurer::perplexity_one(const std::vector<std::vector<PromptItem>>& prompt, std::ostream* perfOS) {
-    // (turns, roles)
-    std::vector<float> nlls;
-    float ppl = 0.f;
-
-    // < simulated chat
-    mLlm->reset();
-    for (auto& turn : prompt) {
-        mLlm->mPromptLib->appendUserPrompt(turn[0].second);
-        std::vector<int> input_ids = mLlm->tokenizer_encode(mLlm->mPromptLib->getLLMInput(), false);
-        mLlm->generate_init();
-        auto turn_nlls = sample(input_ids, mLlm->tokenizer_encode(turn[1].second, false), &(mLlm->mLlmSessionInfos[0].mTimePerformance));
-        nlls.insert(nlls.end(), turn_nlls.begin(), turn_nlls.end());
-        mLlm->mPromptLib->appendLLMOutput(turn[1].second);
-    }
-
-    // record time performance to file
-    if (perfOS != nullptr) {
-        mLlm->mLlmSessionInfos[0].print_speed(perfOS);
-    }
-
-    mLlm->reset();
-    // simulated chat >
-
-    // calculate ppl
-    for (int j = 0; j < nlls.size(); ++j) ppl += nlls[j];
-    ppl /= nlls.size();
-    ppl = std::exp(ppl);
-    
-    // print 
-    std::cout << "PPL: " << std::setprecision(8) << ppl << std::endl;
-    return ppl;
-}
-
-
-std::vector<float> ChatPPLMeasurer::perplexity(const std::vector<std::vector<std::vector<PromptItem>>>& prompts, std::ostream* perfOS) {
-    std::vector<float> ppls;
-    for (auto& prompt : prompts) {
-        ppls.push_back(perplexity_one(prompt, perfOS));
-        mLlm->reset();
-    }
-    return ppls;
-}
-
-void ChatPPLMeasurer::getStats(const std::vector<std::vector<std::vector<PromptItem>>>& prompts) {
-    std::ofstream total_stats("total_stats.csv");
-    std::ofstream dialog_stats("dialog_stats.csv");
-    float average_turns=0, average_prefill=0, average_decode=0, average_total_tokens=0;
-    int max_turns=0;
-    std::vector<std::vector<std::vector<int>>> stats; // (dialog, turn, (prefill, decode))
-    std::cout << prompts.size() << std::endl;
-    int counter = 0;
-    for (auto& dialog : prompts) {
-        std::vector<std::vector<int>> dialog_stats;
-        if ((counter++) % std::max((int)prompts.size()/200, 1) == 0) std::cout << "*" << std::flush;
-        float prefill_len_turn = 0;
-        float decode_len_turn = 0;
-        for (auto& turn : dialog) {
-            // turn: prefill, decode
-            int prefill_len = mLlm->tokenizer_encode(turn[0].second, false).size();
-            int decode_len = mLlm->tokenizer_encode(turn[1].second, false).size();
-            prefill_len_turn += prefill_len;
-            decode_len_turn += decode_len;
-            average_total_tokens += prefill_len + decode_len;
-            dialog_stats.push_back({prefill_len, decode_len});
-        }
-        stats.push_back(dialog_stats);
-        average_prefill += prefill_len_turn / dialog.size(); // average over turns
-        average_decode += decode_len_turn / dialog.size(); // average over turns
-        average_turns += dialog.size();
-        max_turns = std::max(max_turns, (int)dialog.size());
-    }
-    average_turns /= prompts.size();
-    average_prefill /= prompts.size();
-    average_decode /= prompts.size();
-    average_total_tokens /= prompts.size();
-    total_stats << "total_dialogs," << "max_turns," << "avg_turns," \
-                 << "avg_prefill_tokens/turn," << "avg_decode_tokens/turn," \
-                  << "avg_total_tokens/dialog" << std::endl;
-    total_stats << prompts.size() << ","  << max_turns << "," << average_turns << "," \
-                 << average_prefill << "," << average_decode << "," \
-                  << average_total_tokens << std::endl;
-    for (int i=0; i<max_turns; ++i) dialog_stats <<  "prefill" << i << "," << "decode" << i << ","; // this creates an extra blank column at the end.
-    dialog_stats <<  std::endl;
-    for (auto& dialog : stats) {
-        for (auto& turn : dialog){
-            dialog_stats << turn[0] << "," << turn[1] << ",";
-        }
-        for (int i=dialog.size(); i<max_turns; ++i) {
-            dialog_stats <<  ",,";
-        }  
-        dialog_stats <<  std::endl;
-    }
-}   
-
-
-std::vector<float> ChatPPLMeasurer::perplexity(std::string prompt_file, std::ostream* perfOS) {
-    // No performance will be printed!
-    std::vector<std::vector<std::vector<PromptItem>>> prompts;
-    if (mDatasetType == "shareGPT") {
-        prompts = shareGPT(prompt_file, mDatasetSampleSize);
-    }
-    else {
-        MNN_ERROR("Dataset not suppoted");
-        exit(1);
-    }
-    std::cout << "prompt file loaded!" << std::endl;
-    getStats(prompts);
-    std::cout << "\nshareGPT statistics counted!" << std::endl;
-    return perplexity(prompts, perfOS);
-}
-
-
-} // Transformer
-} // MNN
\ No newline at end of file
diff --git a/transformers/llm/engine/src/perplexity.hpp b/transformers/llm/engine/src/perplexity.hpp
deleted file mode 100644
index ece414dd2..000000000
--- a/transformers/llm/engine/src/perplexity.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef PERPLEXITY_hpp
-#define PERPLEXITY_hpp
-
-#include <vector>
-#include <memory>
-#include <string>
-#include <fstream>
-#include <sstream>
-#include <iostream>
-#include <streambuf>
-#include <functional>
-#include <unordered_map>
-#include <utility>
-
-#include <MNN/expr/Expr.hpp>
-#include <MNN/expr/Module.hpp>
-#include <MNN/expr/MathOp.hpp>
-#include <MNN/expr/NeuralNetWorkOp.hpp>
-
-#include "sampler.hpp"
-#include "evaluation/dataset.hpp"
-
-namespace MNN {
-namespace Transformer {
-class Llm;
-
-class MNN_PUBLIC TextPPLMeasurer : public Sampler {
-protected:
-    Llm* mLlm;
-    int mStride;
-    std::string mDatasetType;
-    LlmSamplerConfig mConfig;
-public:
-    TextPPLMeasurer(Llm* llm, std::shared_ptr<LlmConfig> config);
-    float perplexity_one(const std::vector<int>& prompt);
-    std::vector<float> perplexity(std::vector<std::vector<int>> prompts);
-    std::vector<float> perplexity(std::vector<std::string> prompts);
-    virtual std::string sample(const std::vector<int>& input_ids, std::ostream* os = &std::cout, const char* end_with = nullptr, struct TimePerformance* time_perf = nullptr) override { return "perplexity evaluation!\n"; }
-    virtual std::vector<float> perplexity(std::string prompt_file, std::ostream* perfOS = nullptr) override;
-};
-
-class MNN_PUBLIC ChatPPLMeasurer : public Sampler {
-protected:
-    Llm* mLlm;
-    std::string mDatasetType;
-    int mDatasetSampleSize;
-    LlmSamplerConfig mConfig;
-    void handleToken(int token);
-    std::vector<float> sample(const std::vector<int>& input_ids, const std::vector<int>& prompt, struct TimePerformance* time_perf);
-public:
-    ChatPPLMeasurer(Llm* llm, std::shared_ptr<LlmConfig> config);
-    void getStats(const std::vector<std::vector<std::vector<PromptItem>>>& prompts);
-    float perplexity_one(const std::vector<std::vector<PromptItem>>& prompt, std::ostream* perfOS);
-    std::vector<float> perplexity(const std::vector<std::vector<std::vector<PromptItem>>>& prompts, std::ostream* perfOS);
-    virtual std::string sample(const std::vector<int>& input_ids, std::ostream* os = &std::cout, const char* end_with = nullptr, struct TimePerformance* time_perf = nullptr) override { return "perplexity evaluation!\n"; }
-    virtual std::vector<float> perplexity(std::string prompt_file, std::ostream* perfOS = nullptr) override;
-};
-
-
-
-} // Transformer
-} // MNN
-
-
-#endif // SAMPLER_hpp
\ No newline at end of file
diff --git a/transformers/llm/engine/src/prompt.cpp b/transformers/llm/engine/src/prompt.cpp
deleted file mode 100644
index 445c630ce..000000000
--- a/transformers/llm/engine/src/prompt.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-#include "prompt.hpp"
-
-namespace MNN {
-namespace Transformer {
-
-/* ----------PromptLib---------- */
-PromptLib* PromptLib::createPromptLib(Llm* llm, const std::string& config_path) {
-    return createPromptLib(llm, std::shared_ptr<LlmConfig>(new LlmConfig(config_path)));
-}
-PromptLib* PromptLib::createPromptLib(Llm* llm, std::shared_ptr<LlmConfig> config) {
-    if (config->app_type() == "chat" || config->app_type() == "perplexity") {
-        return new BaseChatPromptLib(llm, config);
-    } else {
-        std::cout << "PromptLib not Implemented!\n" << std::endl; 
-        return nullptr;
-    }
-}
-
-/* ----------BaseChatPromptLib---------- */
-BaseChatPromptLib::BaseChatPromptLib(Llm* llm, std::shared_ptr<LlmConfig> config) {
-    mLlm = llm;
-    mReuseKV = config->reuse_kv();
-    mDefaultSystemPrompt = config->system_prompt();
-    mSystemTemplate = config->system_prompt_template();
-    mUserTemplate = config->user_prompt_template();
-    mAssistantPrefix = config->assistant_prefix();
-    mAssistantSuffix = config->assistant_suffix();
-}
-
-void BaseChatPromptLib::appendSystemPrompt() {
-    appendSystemPrompt(mDefaultSystemPrompt);
-}
-void BaseChatPromptLib::appendSystemPrompt(const std::string sys_prompt) {
-    mLlm->mLlmSessionInfos[0].mHistory.emplace_back(std::make_pair("system", sys_prompt));
-    mLlm->mLlmSessionInfos[0].mInputs.emplace_back(std::make_pair("system", sys_prompt));
-}
-void BaseChatPromptLib::appendUserPrompt(const std::string user_prompt) {
-    if (mLlm->mLlmSessionInfos[0].mHistory.empty()) { appendSystemPrompt(); } // prevent no system prompt appendix.
-    mLlm->mLlmSessionInfos[0].mHistory.emplace_back(std::make_pair("user", user_prompt));
-    mLlm->mLlmSessionInfos[0].mInputs.emplace_back(std::make_pair("user", user_prompt));
-}
-void BaseChatPromptLib::appendLLMOutput(std::string out_str) {
-    mLlm->mLlmSessionInfos[0].mHistory.emplace_back(std::make_pair("assistant", out_str));
-    if (mReuseKV) {
-        // clear input
-        mLlm->mLlmSessionInfos[0].mInputs.clear();
-    } else {
-        // keep input, append output
-        mLlm->mLlmSessionInfos[0].mInputs.emplace_back(std::make_pair("assistant", out_str));
-    }
-}
-
-std::string BaseChatPromptLib::getLLMInput() {
-    std::string input_str;
-    if (mReuseKV) {
-        if (mLlm->mLlmSessionInfos[0].mHistory.size() != mLlm->mLlmSessionInfos[0].mInputs.size()) {
-            // 1.1 not first prefill, add end of speech.
-            input_str += mAssistantSuffix;
-        }
-    }
-    // 1.2 generate from template
-    input_str += applyTemplates(mLlm->mLlmSessionInfos[0].mInputs);
-    input_str += mAssistantPrefix;
-    return input_str;
-}
-
-std::string BaseChatPromptLib::applyTemplate(PromptItem item, std::string prompt_template, std::string placeholder) {
-    size_t start_pos = prompt_template.find(placeholder);
-    if (start_pos == std::string::npos) return item.first + "\n" + item.second + "\n";
-    else {
-        prompt_template.replace(start_pos, placeholder.length(), item.second);
-        return prompt_template;
-    }
-}
-
-std::string BaseChatPromptLib::applyTemplates(std::vector<PromptItem> inputs) {
-    std::string input_str;
-    for (auto input : inputs) {
-        if (input.first == "") continue;
-        if (input.first == "system") {
-            if (input.second == "") continue;
-            input_str += applyTemplate(input, mSystemTemplate, "%s");
-            continue;
-        } 
-        if (input.first == "user") {
-            input_str += applyTemplate(input, mUserTemplate, "%s");
-            continue;
-        }
-        if (input.first == "assistant") {
-            input_str += mAssistantPrefix + input.second + mAssistantSuffix;
-            continue;
-        }
-        // Invalid role!!!
-    }
-    return input_str;
-}
-
-std::string BaseChatPromptLib::applyTemplate(std::string user_content) {
-    std::vector<PromptItem> prompts;
-    prompts.push_back(std::make_pair("system", mDefaultSystemPrompt));
-    prompts.push_back(std::make_pair("user", user_content));
-    return applyTemplates(prompts) + mAssistantPrefix;
-}
-
-std::string BaseChatPromptLib::getAssistantSuffix() const {
-    return mAssistantSuffix;
-}
-
-}
-}
\ No newline at end of file
diff --git a/transformers/llm/engine/src/prompt.hpp b/transformers/llm/engine/src/prompt.hpp
deleted file mode 100644
index 3fcf42509..000000000
--- a/transformers/llm/engine/src/prompt.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-
-
-#ifndef PROMPT_Hpp
-#define PROMPT_Hpp
-
-#include "llm/llm.hpp"
-#include "llmconfig.hpp"
-#define MNN_OPEN_TIME_TRACE
-#include <MNN/AutoTime.hpp>
-#include <MNN/expr/ExecutorScope.hpp>
-#include <fstream>
-#include <sstream>
-#include <stdlib.h>
-
-
-namespace MNN {
-namespace Transformer {
-
-/* PromptLib: history organization + input organization */
-class MNN_PUBLIC PromptLib {
-protected:
-    Llm* mLlm;
-public:
-    static PromptLib* createPromptLib(Llm* llm, const std::string& config_path);
-    static PromptLib* createPromptLib(Llm* llm, std::shared_ptr<LlmConfig> config);
-    virtual std::string applyTemplate(std::string user_content) = 0;
-    virtual std::string getAssistantSuffix() const = 0;
-    virtual void appendSystemPrompt(const std::string sys_prompt) = 0;
-    virtual void appendSystemPrompt() = 0;
-    virtual void appendUserPrompt(const std::string use_prompt) = 0;
-    virtual void appendLLMOutput(std::string out_str) = 0;
-    virtual std::string getLLMInput() = 0;
-    virtual void reset(Llm* llm) { mLlm = llm; }
-};
-
-class MNN_PUBLIC BaseChatPromptLib : public PromptLib {
-protected:
-    bool mReuseKV;
-    std::string mDefaultSystemPrompt;
-    std::string mSystemTemplate;
-    std::string mUserTemplate;
-    std::string mAssistantPrefix;
-    std::string mAssistantSuffix;
-    std::string applyTemplate(PromptItem item, std::string prompt_template, std::string placeholder = "%s");
-    std::string applyTemplates(std::vector<PromptItem> inputs);
-public:
-    BaseChatPromptLib(Llm* llm, std::shared_ptr<LlmConfig> config);
-    virtual std::string applyTemplate(std::string user_content) override;
-    virtual std::string getAssistantSuffix() const override;
-    virtual void appendSystemPrompt(const std::string sys_prompt) override;
-    virtual void appendSystemPrompt() override;
-    virtual void appendUserPrompt(const std::string user_prompt) override;
-    virtual void appendLLMOutput(std::string out_str) override;
-    virtual std::string getLLMInput() override;
-};
-
-}
-}
-
-
-#endif
\ No newline at end of file
diff --git a/transformers/llm/engine/src/sampler.cpp b/transformers/llm/engine/src/sampler.cpp
deleted file mode 100644
index 87a9c28b9..000000000
--- a/transformers/llm/engine/src/sampler.cpp
+++ /dev/null
@@ -1,551 +0,0 @@
-#include <random>
-#include <queue>
-#include <algorithm>
-#include <cmath>
-#include <unordered_map>
-
-#include <MNN/expr/Executor.hpp>
-#include <MNN/expr/ExecutorScope.hpp>
-
-#include "llm/llm.hpp"
-#include "evaluation/dataset.hpp"
-#include "sampler.hpp"
-#include "perplexity.hpp"
-#include "llmconfig.hpp"
-
-namespace MNN{
-namespace Transformer{
-
-MNN::Express::VARP _TempratureSoftmax(MNN::Express::VARP logits, float temperature, int axis) {
-    return MNN::Express::_Softmax(logits * MNN::Express::_Scalar<float>(1.0f / temperature), axis);
-}
-
-/* ----------Sampler's members---------- */
-int Sampler::select(struct SubsetLogits& subset, int id) {
-    if (!(subset.is_subset)) return id;
-    return subset.index[id];
-}
-
-int Sampler::randomSelect(float* probs, size_t size) {
-    std::random_device rd;
-    std::mt19937 generator(rd());
-    std::uniform_real_distribution<float> distribution(0.0, 1.0);
-    float target = distribution(generator);
-    float cumulative = 0.0;
-    for (int i = 0; i < size; i++) {
-        cumulative += probs[i];
-        if (target < cumulative) {
-            return i;
-        }
-    }
-    return size - 1;
-}
-
-int Sampler::randomSelect(MNN::Express::VARP probs) {
-    return randomSelect((float*)(probs->readMap<float>()), probs->getInfo()->size);
-}
-
-int Sampler::reSoftmaxSelect(struct SubsetLogits subset, float temperature) {
-    int token_index_id = randomSelect(_TempratureSoftmax(subset.logits, temperature));
-    return ((subset.is_subset) ? subset.index[token_index_id] : token_index_id);
-}
-
-SubsetLogits Sampler::createSubsetLogits(MNN::Express::VARP logits) {
-    struct SubsetLogits subset;
-    subset.logits = logits;
-    subset.is_subset = false;
-    return subset;
-}
-
-SubsetLogits Sampler::createSubsetLogits(MNN::Express::VARP logits, const std::vector<int>& index) {
-    struct SubsetLogits subset;
-    subset.logits = logits;
-    subset.index = index;
-    subset.is_subset = true;
-    return subset;
-}
-
-SubsetLogits Sampler::createSubsetLogits(int size) {
-    struct SubsetLogits subset;
-    subset.logits = MNN::Express::_Input({size}, MNN::Express::NHWC);
-    subset.index.resize(size);
-    subset.is_subset = true;
-    return subset;
-}
-
-SubsetLogits Sampler::createSubsetLogits(const std::vector<float>& scores, const std::vector<int>& index) {
-    int size = (int)(index.size());
-    struct SubsetLogits subset;
-    subset.logits = MNN::Express::_Input({size}, MNN::Express::NHWC);
-    auto pointer = (float*)(subset.logits->writeMap<float>());
-    for (int i = 0; i < size; ++i) {
-        pointer[i] = scores[i];
-    }
-    subset.index = index;
-    subset.is_subset = true;
-    return subset;
-}
-
-void Sampler::transformIndex(struct SubsetLogits& superset, struct SubsetLogits& subset) {
-    if (!(superset.is_subset)) return;
-    for (auto& id : subset.index) {
-        id = superset.index[id];
-    }
-}
-
-Sampler* Sampler::createSampler(Llm* llm, const std::string& config_path) {
-    return createSampler(llm, std::shared_ptr<LlmConfig>(new LlmConfig(config_path)));
-}
-
-Sampler* Sampler::createSampler(Llm* llm, std::shared_ptr<LlmConfig> config) {
-    std::string sampler_type = config->sampler_type();
-    if (sampler_type == "greedy"
-        || sampler_type == "temperature"
-        || sampler_type == "penalty"
-        || sampler_type == "topK"
-        || sampler_type == "topP"
-        || sampler_type == "minP"
-        || sampler_type == "tfs"
-        || sampler_type == "typical"
-        || sampler_type == "mixed"
-        ) {
-        return new LocalSampler(llm, config);
-    } else if (config->app_type() == "perplexity") {
-        std::string ppl_type = getPPLType(config->dataset());
-        if (ppl_type == "text") { return new TextPPLMeasurer(llm, config); }
-        else if (ppl_type == "chat") { return new ChatPPLMeasurer(llm, config); }
-    } else {
-        std::cout << "Designated Sampler Not Supported yet!";
-        exit(1);
-    }
-    return nullptr;
-}
-
-
-/* ----------LocalSamplerConfig---------- */
-void LocalSampler::LocalSamplerConfig::configSampler( std::string sampler_type, std::shared_ptr<LlmConfig> llmConfig) {
-    if (sampler_type == "greedy"){
-        this->configGreedy(llmConfig);
-    } else if (sampler_type == "temperature"){
-        this->configTemperature(llmConfig);
-    } else if (sampler_type == "topK"){
-        this->configTopK(llmConfig);
-    } else if (sampler_type == "topP"){
-        this->configTopP(llmConfig);
-    } else if (sampler_type == "minP"){
-        this->configMinP(llmConfig);
-    } else if (sampler_type == "tfs"){
-        this->configTFS(llmConfig);
-    } else if (sampler_type == "typical"){
-        this->configTypical(llmConfig);
-    } else if (sampler_type == "penalty"){
-        this->configPenalty(llmConfig);
-    } else if (sampler_type == "mixed"){
-        this->configMixed(llmConfig);
-    }
-}
-void LocalSampler::LocalSamplerConfig::configGreedy(std::shared_ptr<LlmConfig> llmConfig) {
-    select_type = "greedy"; 
-}
-void LocalSampler::LocalSamplerConfig::configTemperature(std::shared_ptr<LlmConfig> llmConfig) {
-    temperature = llmConfig->temperature();
-    select_type = "temperature"; 
-}
-void LocalSampler::LocalSamplerConfig::configTopK(std::shared_ptr<LlmConfig> llmConfig) {
-    topK = llmConfig->topK();
-    select_type = "temperature"; 
-}
-void LocalSampler::LocalSamplerConfig::configTopP(std::shared_ptr<LlmConfig> llmConfig) {
-    topP = llmConfig->topP();
-    temperature = llmConfig->temperature();
-    select_type = "temperature"; 
-}
-void LocalSampler::LocalSamplerConfig::configMinP(std::shared_ptr<LlmConfig> llmConfig) {
-    minP = llmConfig->minP();
-    temperature = llmConfig->temperature();
-    select_type = "temperature"; 
-}
-void LocalSampler::LocalSamplerConfig::configTFS(std::shared_ptr<LlmConfig> llmConfig) {
-    tfsZ = llmConfig->tfsZ();
-    temperature = llmConfig->temperature();
-    select_type = "temperature"; 
-}
-void LocalSampler::LocalSamplerConfig::configTypical(std::shared_ptr<LlmConfig> llmConfig) {
-    typical = llmConfig->typical();
-    temperature = llmConfig->temperature();
-    select_type = "temperature";  
-}
-void LocalSampler::LocalSamplerConfig::configPenalty(std::shared_ptr<LlmConfig> llmConfig) {
-    penaltyConfig.penalty = llmConfig->penalty();
-    penaltyConfig.ngram = llmConfig->ngram();
-    penaltyConfig.ngram_factor = llmConfig->ngram_factor();
-    penaltyConfig.sampler = llmConfig->penalty_sampler();
-    select_type = penaltyConfig.sampler;
-}
-void LocalSampler::LocalSamplerConfig::configMixed(std::shared_ptr<LlmConfig> llmConfig) {
-    mixedSamplers = llmConfig->mixed_samplers();
-    std::cout << "Mixed Sampler Sequence: " << std::flush;
-    for (auto samplerName : mixedSamplers) {
-        this->configSampler(samplerName, llmConfig);
-        std::cout << samplerName << " " << std::flush;
-    }
-    std::cout << std::endl;
-    // set select type
-    // the final sampler select the token
-    if (mixedSamplers.back() == "greedy") select_type = "greedy";
-    else if(mixedSamplers.back()=="temperature") select_type = "temperature";
-    else select_type = "temperature"; // By default temperature is used.   
-}
-
-
-/* ----------LocalSampler's members---------- */ 
-LocalSampler::LocalSamplerConfig LocalSampler::getSamplerConfig(std::shared_ptr<LlmConfig> llmConfig) {
-    LocalSampler::LocalSamplerConfig samplerConfig;
-    samplerConfig.max_all_tokens = llmConfig->max_all_tokens();
-    samplerConfig.max_new_tokens = llmConfig->max_new_tokens();
-    samplerConfig.type = llmConfig->sampler_type();
-    std::string sampler_type = samplerConfig.type;
-    std::cout << "Sampler: " << sampler_type << std::endl;
-    samplerConfig.configSampler(sampler_type, llmConfig);
-    return samplerConfig;
-}
-
-LocalSampler::LocalSampler(Llm* llm, std::shared_ptr<LlmConfig> config) {
-    // initialize model and candidates
-    mLlm = llm;
-    // initialize config
-    mConfig = getSamplerConfig(config);
-}
-
-int LocalSampler::argmaxSelect(struct SubsetLogits superset) {
-    auto scores = (float*)(superset.logits->readMap<float>());
-    auto size = superset.logits->getInfo()->size;
-    float max_score = scores[0];
-    int token_id = 0;
-    for (int i = 0; i < size; i++) {
-        float score = scores[i];
-        if (score > max_score) {
-            max_score = score;
-            token_id = i;
-        }
-    }
-    return select(superset, token_id);
-}
-
-struct SubsetLogits LocalSampler::topK(struct SubsetLogits superset) {
-    int K = mConfig.topK;
-    auto scores = (float*)(superset.logits->readMap<float>());
-    auto size = superset.logits->getInfo()->size;
-    // 1. time complexity: O(nlogk)
-    std::priority_queue<IndexScore, std::vector<IndexScore>, IndexScoreCmpGreater> heap;
-    for (int i = 0; i < size; i++) {
-        IndexScore m;
-        m.index = i;
-        m.score = scores[i];
-        if (heap.size() < K) {
-            heap.push(m);
-        } 
-        else {
-            if (heap.top().score < m.score) {
-                heap.pop();
-                heap.push(m);
-            }
-        }
-    }
-    // 2. store top K results
-    auto subset = createSubsetLogits(K);
-    float* topKscores = (float*)(subset.logits->writeMap<float>());
-    for (int i = 0; i < K; i++) {
-        subset.index[K-i-1] = heap.top().index;
-        topKscores[K-i-1]  = heap.top().score;
-        heap.pop();
-    }
-    transformIndex(superset, subset);
-    return subset;
-}
-
-int LocalSampler::packSoftmax(MNN::Express::VARP logits, std::vector<IndexScore>& index_scores, float temperature) {
-    auto prob_varp = _TempratureSoftmax(logits, temperature);
-    auto probs = (float*)(prob_varp->readMap<float>());
-    auto size = prob_varp->getInfo()->size;
-    index_scores.resize(size);
-    for (int i = 0; i < size; i++) {
-        IndexScore m;
-        m.index = i;
-        m.score = probs[i];
-        index_scores[i] = m;
-    }
-    return size;
-}
-
-struct SubsetLogits LocalSampler::topP(struct SubsetLogits superset) {
-    float p = mConfig.topP, temperature = mConfig.temperature;
-    std::vector<IndexScore> index_scores;
-    int size = packSoftmax(superset.logits, index_scores, temperature);
-    // 1. make max heap
-    std::make_heap(index_scores.begin(), index_scores.end(), IndexScoreCmpLess());
-    // 2. top p algorithm
-    auto scores = (float*)(superset.logits->readMap<float>());
-    std::vector<int> index;
-    std::vector<float> subset_logits;
-    float cumulative = 0.0f; 
-    while (cumulative < p && !index_scores.empty()) {
-        std::pop_heap(index_scores.begin(), index_scores.end(), IndexScoreCmpLess());
-        IndexScore m = index_scores.back();
-        index_scores.pop_back();
-        index.push_back(m.index);
-        subset_logits.push_back(scores[m.index]);
-        cumulative += m.score;
-    }
-    auto subset = createSubsetLogits(subset_logits, index);
-    transformIndex(superset, subset);
-    return subset;
-}
-
-struct SubsetLogits LocalSampler::minP(struct SubsetLogits superset) {
-    float p = mConfig.minP, temperature = mConfig.temperature;
-    std::vector<IndexScore> index_scores;
-    int size = packSoftmax(superset.logits, index_scores, temperature);
-    // 1. make max heap
-    std::make_heap(index_scores.begin(), index_scores.end(), IndexScoreCmpLess());
-    // 2. min p algorithm
-    auto scores = (float*)(superset.logits->readMap<float>());
-    std::vector<int> index;
-    std::vector<float> subset_logits;
-    for (int i = 0; i < size; ++i) {
-        std::pop_heap(index_scores.begin(), index_scores.end(), IndexScoreCmpLess());
-        IndexScore m = index_scores.back();
-        if (m.score < p && !index.empty()) break;
-        index_scores.pop_back();
-        index.push_back(m.index);
-        subset_logits.push_back(scores[m.index]);
-    }
-    auto subset = createSubsetLogits(subset_logits, index);
-    transformIndex(superset, subset);
-    return subset;
-}
-
-struct SubsetLogits LocalSampler::tfs(struct SubsetLogits superset) {
-    float z = mConfig.tfsZ, temperature = mConfig.temperature;
-    // tfs algorithm
-    // 1. softmax
-    std::vector<IndexScore> index_scores;
-    int size = packSoftmax(superset.logits, index_scores, temperature);
-    // 2. sort
-    std::sort(index_scores.begin(), index_scores.end(), IndexScoreCmpGreater());
-    auto scores = (float*)(superset.logits->readMap<float>());
-    // 3. calculate derivatives
-    std::vector<float> derivatives(size - 2, 0.0f);
-    float first = index_scores[0].score - index_scores[1].score;
-    float second = index_scores[1].score - index_scores[2].score;
-    for (int i = 0; i < size - 2; ++i) {
-        second = index_scores[i+1].score - index_scores[i+2].score;
-        derivatives[i] = std::fabs(first - second);
-        first = second;
-    }
-    // 4. normalize derivatives
-    float derivatives_sum = 0.0;
-    for (int i = 0; i < size - 2; ++i) derivatives_sum += derivatives[i];
-    float derivatives_sum_rec = 1.0f / derivatives_sum;
-    for (int i = 0; i < size - 2; ++i) derivatives[i] *= derivatives_sum_rec;
-    // 5. cumulate, discard last 2 for sure.
-    float cumulative = 0.0; 
-    std::vector<int> index;
-    std::vector<float> subset_logits;
-    for (int i = 0; i < size - 2; ++i) {
-        IndexScore m = index_scores[i];
-        cumulative += derivatives[i];
-        if (cumulative >= z && !index.empty()) break;
-        index.push_back(m.index);
-        subset_logits.push_back(scores[m.index]);
-    }
-    auto subset = createSubsetLogits(subset_logits, index);
-    transformIndex(superset, subset);
-    return subset;
-}
-
-struct SubsetLogits LocalSampler::typical(struct SubsetLogits superset) {
-    float p = mConfig.typical, temperature = mConfig.temperature;
-    auto prob_varp = _TempratureSoftmax(superset.logits, temperature);
-    auto probs = (float*)(prob_varp->readMap<float>());
-    auto size = prob_varp->getInfo()->size;
-    std::vector<IndexScore> index_scores;
-    index_scores.resize(size);
-    // 1. calcaluate dist
-    float entropy = 0.0f;
-    for (int i = 0; i < size; i++) entropy -= probs[i] * std::log(probs[i]);
-    for (int i = 0; i < size; i++) {
-        IndexScore m;
-        m.index = i;
-        m.score = std::fabs(entropy + std::log(probs[i]));
-        index_scores[i] = m;
-    }
-    // 2. make min heap for dist
-    std::make_heap(index_scores.begin(), index_scores.end(), IndexScoreCmpGreater());
-    // 3. typical p algorithm
-    auto scores = (float*)(superset.logits->readMap<float>());
-    float cumulative = 0.0f;
-    std::vector<int> index;
-    std::vector<float> subset_logits;
-    for (int i = 0; i < size; ++i) {
-        std::pop_heap(index_scores.begin(), index_scores.end(), IndexScoreCmpGreater());
-        IndexScore m = index_scores.back();
-        cumulative += probs[m.index];
-        if (cumulative >= p && !index.empty()) break;
-        index_scores.pop_back();
-        index.push_back(m.index);
-        subset_logits.push_back(scores[m.index]);
-    }
-    auto subset = createSubsetLogits(subset_logits, index);
-    transformIndex(superset, subset);
-    return subset;
-}
-
-// presence penalty
-// no frequency penalty now!
-struct SubsetLogits LocalSampler::penalty(struct SubsetLogits subset) {
-    float penalty = mConfig.penaltyConfig.penalty;
-    int ngram = mConfig.penaltyConfig.ngram; 
-    float ngram_factor = mConfig.penaltyConfig.ngram_factor;
-    float temperature = mConfig.temperature;
-    bool penalizeNgram = (ngram_factor > 1.0f);
-    if (penalty <= 1.0f) return subset; // no penalty!
-    penalty = std::min(penalty, mConfig.penaltyConfig.max_penalty);
-    // initialization
-    std::vector<int>& prev = mLlm->mLlmSessionInfos[0].tokens;
-    std::unordered_map<int, float> penalty_map;
-    // 1. local ngram info, reversed order
-    std::vector<int> ngram_info(ngram-1);
-    if (penalizeNgram) {
-        for (int n = 0; n < ngram_info.size(); ++n) {
-            ngram_info[n] = prev[prev.size()-1-n];
-        }
-    }
-    // 2. generate penalty map
-    for (int i = 0; i < prev.size(); ++i) {
-        if (penalty_map.count(prev[i]) == 0) penalty_map[prev[i]] = penalty;
-        if (penalizeNgram) {
-            float ngram_penalty = penalty;
-            for (int j = i-1; i-j < ngram && j>=0; --j) {
-                int idx = i-j-1;
-                if (prev[j] != ngram_info[idx]) break;
-                ngram_penalty *= ngram_factor;
-                // no repeat larger than ngram!
-                if (idx == ngram_info.size()-1) ngram_penalty = mConfig.penaltyConfig.max_penalty;
-            }
-            if (ngram_penalty > penalty_map[prev[i]]) penalty_map[prev[i]] = ngram_penalty;
-        }
-    }
-    // 3. penalize logits according to penalty_map
-    auto scoresMap = (float*)(subset.logits->writeMap<float>());
-    for (auto it = penalty_map.begin(); it != penalty_map.end(); ++it) {
-        scoresMap[it->first] = (scoresMap[it->first] >= 0.0f) ? (scoresMap[it->first]/it->second) : (scoresMap[it->first]*it->second);
-    }
-    return subset;
-}
-
-struct SubsetLogits LocalSampler::mixed(struct SubsetLogits subset) {
-    for (auto sampler : mConfig.mixedSamplers) {
-        subset = subsetSampler(sampler, subset);
-    }
-    return subset;
-}
-
-struct SubsetLogits LocalSampler::subsetSampler(std::string sampler_type, struct SubsetLogits subset) {
-    if (sampler_type == "penalty") subset = penalty(subset);
-    if (sampler_type == "topK") subset = topK(subset);
-    if (sampler_type == "topP") subset = topP(subset);
-    if (sampler_type == "minP") subset = minP(subset);
-    if (sampler_type == "tfs") subset = tfs(subset);
-    if (sampler_type == "typical") subset = typical(subset);
-    if (sampler_type == "mixed") subset = mixed(subset);
-    // if greedy and temperate, just let the Selector handle it.
-    return subset;
-}
-
-int LocalSampler::handleSelect(struct SubsetLogits subset) {
-    if (mConfig.select_type == "greedy") return argmaxSelect(subset);
-    else if(mConfig.select_type =="temperature") return reSoftmaxSelect(subset, mConfig.temperature);
-    return 0;
-}
-
-int LocalSampler::algorithm(MNN::Express::VARP logits) {
-    struct SubsetLogits subset = createSubsetLogits(logits);
-    // process subsetSampler
-    subset = subsetSampler(mConfig.type, subset);
-    // select token from the subset
-    int res = handleSelect(subset); 
-    // return
-    Express::ExecutorScope::Current()->gc(Express::Executor::FULL);
-    return res;
-}
-
-std::string LocalSampler::handleToken(int token, std::ostream* os, const char* end_with) {
-    // CommonPrefix and Candidates managements
-    mLlm->mLlmSessionInfos[0].tokens.push_back(token);
-    std::string output_str = mLlm->tokenizer_decode(mLlm->mLlmSessionInfos[0].tokens.back());
-    // print
-    *os << output_str << std::flush;
-    return output_str;
-}
-
-std::string LocalSampler::sample(const std::vector<int>& input_ids, std::ostream* os, const char* end_with, struct TimePerformance* time_perf) {
-    // initialization for time performance
-    PrefillTimePerformance prefill_time;
-    prefill_time.prefill_prev_token_ = mLlm->mLlmSessionInfos[0].tokens.size();
-    prefill_time.prefill_token_ = input_ids.size();
-    appendNewPromptRecord(time_perf, input_ids.size(), mLlm->reuse_kv());
-    // initialization
-    std::string output_str; 
-    mLlm->mLlmSessionInfos[0].tokens.insert(mLlm->mLlmSessionInfos[0].tokens.end(), input_ids.begin(), input_ids.end());
-    // all_seq_len_ in sampler functions as kv_seq_len_, prev_seq_len_ = all_seq_len_ - seq_len
-    mLlm->mLlmSessionInfos[0].all_seq_len_ = mLlm->mLlmSessionInfos[0].tokens.size() - input_ids.size(); 
-    mLlm->mLlmSessionInfos[0].gen_seq_len_ = 0;
-    // prefill 
-    auto st = std::chrono::system_clock::now();
-    auto logits = mLlm->forward(input_ids, true);
-    if (nullptr == logits.get()) {
-        return "";
-    }
-    int token = algorithm(logits);
-    // record time
-    auto et = std::chrono::system_clock::now();
-    prefill_time.prefill_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
-    time_perf->prefill_record_.push_back(prefill_time);
-    // handle the new token
-    output_str += handleToken(token, os, end_with);
-    // decode
-    while (mLlm->mLlmSessionInfos[0].gen_seq_len_ < mConfig.max_new_tokens
-            && mLlm->mLlmSessionInfos[0].all_seq_len_ < mConfig.max_all_tokens) {
-        DecodeTimePerformance decode_time;
-        decode_time.decode_prev_token_ = mLlm->mLlmSessionInfos[0].tokens.size();
-        st = std::chrono::system_clock::now();
-        // next token
-        logits = mLlm->forward({mLlm->mLlmSessionInfos[0].tokens.back()}, false);
-        if (nullptr == logits.get()) {
-            return output_str;
-        }
-        if (logits->getInfo()->size == 0) {
-            return output_str;
-        }
-        token = algorithm(logits);
-        et = std::chrono::system_clock::now();
-        decode_time.decode_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
-        time_perf->decode_record_.push_back(decode_time);
-        if (mLlm->is_stop(token)) {
-            *os << end_with << std::flush;
-            break;
-        } else {
-            output_str += handleToken(token);
-        }
-    }
-    if (mLlm->mLlmSessionInfos[0].all_seq_len_ == mConfig.max_all_tokens) {
-        std::cout << "sequence length reaches maximum allowed." << std::endl;
-    }
-    // return output_str
-    return output_str;
-}
-
-
-} // Transformer
-} // MNN
\ No newline at end of file
diff --git a/transformers/llm/engine/src/sampler.hpp b/transformers/llm/engine/src/sampler.hpp
deleted file mode 100644
index 93266d580..000000000
--- a/transformers/llm/engine/src/sampler.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef SAMPLER_hpp
-#define SAMPLER_hpp
-
-#include <vector>
-#include <memory>
-#include <string>
-#include <fstream>
-#include <sstream>
-#include <iostream>
-#include <streambuf>
-#include <functional>
-#include <unordered_map>
-#include <utility>
-
-#include <MNN/expr/Expr.hpp>
-#include <MNN/expr/Module.hpp>
-#include <MNN/expr/MathOp.hpp>
-#include <MNN/expr/NeuralNetWorkOp.hpp>
-
-#include "evaluation/evaluation.hpp"
-#include "llmconfig.hpp"
-#include "llm/llm.hpp"
-
-
-namespace MNN {
-namespace Transformer {
-
-MNN_PUBLIC MNN::Express::VARP _TempratureSoftmax(MNN::Express::VARP logits, float temperature, int axis = -1);
-
-class Llm;
-
-// a index and its corresponding score
-struct IndexScore {
-    int index;
-    float score;
-};
-struct IndexScoreCmpLess{
-    bool operator()(IndexScore a, IndexScore b) {
-        return a.score < b.score;
-    }
-};
-struct IndexScoreCmpGreater{
-    bool operator()(IndexScore a, IndexScore b) {
-        return a.score > b.score;
-    }
-};
-// a series of index and their corresponding logits
-struct SubsetLogits{
-    std::vector<int> index;
-    MNN::Express::VARP logits;
-    bool is_subset;
-};
-
-class MNN_PUBLIC Sampler {
-public:
-    class LlmSamplerConfig {
-    public:
-        int max_new_tokens = 512;
-        int max_all_tokens = 2048;
-    };
-protected:
-    Llm* mLlm;
-    int select(struct SubsetLogits& subset, int id);
-    int randomSelect(float* probs, size_t size);
-    int randomSelect(MNN::Express::VARP probs);
-    int reSoftmaxSelect(struct SubsetLogits subset, float temperature=1.0);
-    SubsetLogits createSubsetLogits(MNN::Express::VARP logits);
-    SubsetLogits createSubsetLogits(MNN::Express::VARP logits, const std::vector<int>& index);
-    SubsetLogits createSubsetLogits(int size);
-    SubsetLogits createSubsetLogits(const std::vector<float>& scores, const std::vector<int>& index);
-    void transformIndex(struct SubsetLogits& superset, struct SubsetLogits& subset);
-public:
-    static Sampler* createSampler(Llm* llm, const std::string& config_path);
-    static Sampler* createSampler(Llm* llm, std::shared_ptr<LlmConfig> config);
-    virtual std::string sample(const std::vector<int>& input_ids, std::ostream* os = &std::cout, const char* end_with = nullptr, struct TimePerformance* time_perf = nullptr) = 0;
-    virtual std::vector<float> perplexity(std::string prompt_file, std::ostream* perfOS) { return std::vector<float>(); }
-    // prepare for another round of sampling
-    // in the future, only reset its own.
-    virtual void reset(Llm* llm) { mLlm = llm; }
-};
-
-
-class MNN_PUBLIC LocalSampler: public Sampler {
-public:
-    class LocalSamplerConfig : public LlmSamplerConfig {
-    public:
-        struct SamplerPenaltyConfig {
-            float penalty = 1.05;
-            int ngram = 8;
-            float ngram_factor = 1.02; // panalize repeated ngram with a multiplied ngram_factor.
-            float max_penalty = 10.;
-            std::string sampler = "temperature"; // "greedy", "temperature". 
-        };
-        std::string type = "temperature";
-        std::string select_type = "temperature";
-        float temperature = 0.8;
-        int topK = 40;
-        float topP = 0.9;
-        float minP = 0.05;
-        float tfsZ = 1.0;
-        float typical = 0.95;
-        struct SamplerPenaltyConfig penaltyConfig;
-        std::vector<std::string> mixedSamplers= {"topK", "tfs", "typical", "topP", "min_p", "temperature"};
-        void configSampler(std::string sampler_type, std::shared_ptr<LlmConfig> llmConfig);
-        void configGreedy(std::shared_ptr<LlmConfig> llmConfig);
-        void configTemperature(std::shared_ptr<LlmConfig> llmConfig);
-        void configTopK(std::shared_ptr<LlmConfig> llmConfig);
-        void configTopP(std::shared_ptr<LlmConfig> llmConfig);
-        void configMinP(std::shared_ptr<LlmConfig> llmConfig);
-        void configTFS(std::shared_ptr<LlmConfig> llmConfig);
-        void configTypical(std::shared_ptr<LlmConfig> llmConfig);
-        void configPenalty(std::shared_ptr<LlmConfig> llmConfig);
-        void configMixed(std::shared_ptr<LlmConfig> llmConfig);
-    };
-protected:
-    LocalSamplerConfig mConfig;
-    LocalSamplerConfig getSamplerConfig(std::shared_ptr<LlmConfig> llmConfig);
-    int argmaxSelect(struct SubsetLogits superset);
-    int packSoftmax(MNN::Express::VARP logits, std::vector<IndexScore>& index_scores, float temperature = 1.0);
-    struct SubsetLogits penalty(struct SubsetLogits superset);
-    struct SubsetLogits topK(struct SubsetLogits superset);
-    struct SubsetLogits topP(struct SubsetLogits superset);
-    struct SubsetLogits minP(struct SubsetLogits superset);
-    struct SubsetLogits tfs(struct SubsetLogits superset);
-    struct SubsetLogits typical(struct SubsetLogits superset);
-    struct SubsetLogits mixed(struct SubsetLogits subset);
-    struct SubsetLogits subsetSampler(std::string sampler_type, struct SubsetLogits subset);
-    int handleSelect(struct SubsetLogits subset);
-    std::string handleToken(int token, std::ostream* os = &std::cout, const char* end_with = nullptr);
-public:
-    LocalSampler(Llm* llm, std::shared_ptr<LlmConfig> config);
-    int algorithm(MNN::Express::VARP logits);
-    virtual std::string sample(const std::vector<int>& input_ids, std::ostream* os = &std::cout, const char* end_with = nullptr, struct TimePerformance* time_perf = nullptr) override;
-};
-
-
-} // Transformer
-} // MNN
-
-
-#endif // SAMPLER_hpp
\ No newline at end of file
diff --git a/transformers/llm/engine/src/tokenizer.cpp b/transformers/llm/engine/src/tokenizer.cpp
index 913afacfc..87f02c868 100644
--- a/transformers/llm/engine/src/tokenizer.cpp
+++ b/transformers/llm/engine/src/tokenizer.cpp
@@ -475,7 +475,7 @@ void Tiktoken::encode(const std::string& str, std::vector<int>& ids) {
         } else {
             // If no matching symbol is found, this typically means an error in the encoding
             // or the input text contains characters that the encoder doesn't know how to handle
-            std::cerr << "Error: No encoding found for the sequence starting at position " << i << " , symbol: " << str[i-2] << std::endl;
+            std::cerr << "Error: No encoding found for the sequence starting at position " << i << std::endl;
             return;
         }
     }
diff --git a/transformers/llm/engine/test/bench_en.txt b/transformers/llm/engine/test/bench_en.txt
deleted file mode 100644
index 6f6ecbe2d..000000000
--- a/transformers/llm/engine/test/bench_en.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-calculate 8*12
-translate the following into Chinese：It's a beautiful day to learn something new.
-Describe top 5 characters a leader needs, and explain why.
\ No newline at end of file
diff --git a/transformers/llm/eval/evaluate_perplexity.py b/transformers/llm/eval/evaluate_perplexity.py
index 50b5fe163..7b467bb58 100644
--- a/transformers/llm/eval/evaluate_perplexity.py
+++ b/transformers/llm/eval/evaluate_perplexity.py
@@ -17,7 +17,7 @@ def main(args):
     dataset_dir = eval_dataset.split("/")[1]
 
     dataset = load_dataset(dataset_name, dataset_dir, split="test")
-    input_ids = model.tokenizer_encode("\n\n".join(dataset["text"]), False)
+    input_ids = model.tokenizer_encode("\n\n".join(dataset["text"]))
     stride = 512
     context_length = stride + stride // 2
     seq_len = len(input_ids)
diff --git a/transformers/llm/export/.gitignore b/transformers/llm/export/.gitignore
deleted file mode 100644
index ddb5fb2d5..000000000
--- a/transformers/llm/export/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-*
-!.gitignore
-!llmexport.py
-!README.md
\ No newline at end of file
diff --git a/transformers/llm/export/README.md b/transformers/llm/export/README.md
index 371adab67..136f1329f 100644
--- a/transformers/llm/export/README.md
+++ b/transformers/llm/export/README.md
@@ -25,26 +25,18 @@ pip install .
 
 ## 用法
 
-1. 下载模型
+1. 将需要导出的LLM项目clone到本地，如：chatglm2-6b
 ```sh
-git clone https://huggingface.co/Qwen/Qwen2-1.5B-Instruct
+git clone https://huggingface.co/THUDM/chatglm2-6b
 # 如果huggingface下载慢可以使用modelscope
-git clone https://modelscope.cn/qwen/Qwen2-1.5B-Instruct.git
+git clone https://modelscope.cn/ZhipuAI/chatglm2-6b.git
 ```
-2. 测试模型
+2. 导出模型
 ```sh
-# 测试文本输入
-llmexport --path Qwen2-1.5B-Instruct --test "你好"
-# 测试图像文本
-llmexport --path Qwen2-VL-2B-Instruct  --test "<img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>介绍一下图片里的内容"
-```
-
-3. 导出模型
-```sh
-# 将Qwen2-1.5B-Instruct导出为onnx模型
-llmexport --path Qwen2-1.5B-Instruct --export onnx
-# 将Qwen2-1.5B-Instruct导出为mnn模型, 量化参数为4bit, blokc-wise = 128
-llmexport --path Qwen2-1.5B-Instruct --export mnn --quant_bit 4 --quant_block 128
+# 将chatglm2-6b导出为onnx模型
+llmexport --path ../chatglm2-6b --export onnx
+# 将chatglm2-6b导出为mnn模型, 量化参数为4bit, block-wise = 128
+llmexport --path ../chatglm2-6b --export mnn --quant_bit 4 --quant_block 128
 ```
 
 ## 功能
@@ -56,6 +48,14 @@ llmexport --path Qwen2-1.5B-Instruct --export mnn --quant_bit 4 --quant_block 12
 - 使用`--lm_quant_bit`来制定lm_head层权重的量化bit数，不指定则使用`--quant_bit`的量化bit数
 - 支持使用自己编译的`MNNConvert`，使用`--mnnconvert`
 
+`--test`测试示例
+```sh
+# 测试文本输入
+llmexport --path Qwen2-1.5B-Instruct --test "你好"
+# 测试图像文本
+llmexport --path Qwen2-VL-2B-Instruct  --test "<img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>介绍一下图片里的内容"
+```
+
 ## 参数
 ```
 usage: llmexport.py [-h] --path PATH [--type TYPE] [--lora_path LORA_PATH] [--dst_path DST_PATH] [--test TEST] [--export EXPORT]
@@ -90,8 +90,8 @@ options:
 
 ## 支持模型
 
-- llama/llama2/llama3/llama3.2/tinyllama
-- qwen/qwen1.5/qwen2/qwen-vl/qwen2-vl/qwen2.5
+- llama/llama2/llama3/tinyllama
+- qwen/qwen1.5/qwen2/qwen-vl
 - baichuan2/phi-2/internlm/yi/deepseek
 - chatglm/codegeex/chatglm2/chatglm3
 - phi-2/gemma-2
diff --git a/transformers/llm/export/llmexport.py b/transformers/llm/export/llmexport.py
index 7a7a72b76..a25c69c13 100644
--- a/transformers/llm/export/llmexport.py
+++ b/transformers/llm/export/llmexport.py
@@ -119,7 +119,8 @@ def regist_mllama(self):
                 'embed_': 'language_model.model.embed_tokens',
                 'blocks_': 'language_model.model.layers',
                 'final_layernorm_': 'language_model.model.norm',
-                'visual': 'vision_model'
+                'visual': 'vision_model',
+                'multi_modal_projector': 'multi_modal_projector'
             },
             'decoder': {
                 'self_attn': 'self_attn',
@@ -423,7 +424,6 @@ def pseudo_quantize_tensor(self, w: torch.Tensor):
 
     def quantize(self):
         for i in tqdm(range(len(self.modules)), desc="AWQ"):
-            # if i > 0: break
             # Move module and inputs to correct device
             common_device = next(self.modules[i].parameters()).device
             if common_device is None or str(common_device) == "cpu":
@@ -1254,10 +1254,10 @@ def build_weight(self, name, has_bias, ic, oc):
                linear.out_features == oc and
                (linear.bias is not None) == has_bias)
         weight_name, bias_name = f'{name}_weight', f'{name}_bias'
-        weight = linear.weight.data.transpose(1, 0).flatten().numpy()
+        weight = linear.weight.data.transpose(1, 0).flatten().float().numpy()
         self.make_external(weight_name, weight, [ic, oc])
         if has_bias:
-            bias = linear.bias.data.flatten().numpy()
+            bias = linear.bias.data.flatten().float().numpy()
             self.make_external(bias_name, bias, [oc])
         return weight_name, bias_name
 
@@ -1293,17 +1293,17 @@ class MNNConveter:
     def __init__(self, onnx_path, weight_ops, config):
         self.weight_ops = weight_ops
         self.config = config
-        self.quant_block = config.quant_block
-        self.quant_bit = config.quant_bit
-        self.lm_quant_bit = config.lm_quant_bit
-        self.symmetric = config.symmetric
+        self.quant_block = config.args.quant_block
+        self.quant_bit = config.args.quant_bit
+        self.lm_quant_bit = config.args.lm_quant_bit
+        self.symmetric = config.args.sym
         self.mnn_weight_offset = 0
         self.onnx_model_path = onnx_path
         self.mnn_name = os.path.basename(onnx_path).replace('.onnx', '.mnn')
-        self.mnn_model_path = os.path.join(config.dst_path, self.mnn_name)
+        self.mnn_model_path = os.path.join(config.args.dst_path, self.mnn_name)
         self.mnn_weight_path = f'{self.mnn_model_path}.weight'
-        if os.path.exists(config.mnnconvert):
-            self.mnnconvert = config.mnnconvert
+        if os.path.exists(config.args.mnnconvert):
+            self.mnnconvert = config.args.mnnconvert
         else:
             self.mnnconvert = None
 
@@ -1750,6 +1750,13 @@ def __init__(self, attn, layer_id, config):
                     self.q_proj.bias.data = torch.zeros(split_sizes[0])
                     self.k_proj.bias.data = torch.zeros(split_sizes[1])
                     self.v_proj.bias.data = torch.zeros(split_sizes[2])
+            self.q_proj.weight.requires_grad = False
+            self.k_proj.weight.requires_grad = False
+            self.v_proj.weight.requires_grad = False
+            self.q_proj.bias.requires_grad = False
+            self.k_proj.bias.requires_grad = False
+            self.v_proj.bias.requires_grad = False
+
 
     def forward(
         self,
@@ -1835,11 +1842,11 @@ def __init__(self, config):
             self.rotary_dim = config.rotary_dim
         if self.model_type == 'chatglm':
             self.rotary_dim = config.head_dim // 2
+        self.theta = 1.0 / (self.rope_theta ** (torch.arange(0, self.rotary_dim, 2, dtype=torch.float32) / self.rotary_dim))
 
     def forward(self, position_ids):
-        theta = 1.0 / (self.rope_theta ** (torch.arange(0, self.rotary_dim, 2, dtype=torch.float32) / self.rotary_dim))
         position_ids = position_ids.float().reshape(-1, 1)
-        idx_theta = position_ids * theta
+        idx_theta = position_ids * self.theta
         rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)])
         if self.model_type != 'chatglm2':
             rotary_pos_emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
@@ -1886,6 +1893,22 @@ def chatglm_rotary_pos(self, x, cos, sin):
         x2 = (x2 * cos2) + (rotate_half(x2) * sin2)
         return torch.cat((x1, x2), dim=-1)
 
+class VisionRotary(Rotary):
+    def __init__(self, config):
+        super().__init__(config)
+
+    # support [h_pos, w_pos]
+    def forward(self, position_ids):
+        # [2, patch_len, 1]
+        position_ids = position_ids.float().unsqueeze(-1)
+        idx_theta = position_ids * self.theta
+        # [patch_len, rotary_dim]
+        idx_theta = idx_theta.permute(1, 0, 2).reshape(-1, self.rotary_dim)
+        rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)])
+        rotary_pos_emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        rotary_pos_emb = rotary_pos_emb.unsqueeze(2).unsqueeze(1)
+        return rotary_pos_emb
+
 class Decoder(torch.nn.Module):
     def __init__(self, decoder, layer_id, config):
         super().__init__()
@@ -1942,7 +1965,7 @@ def forward(
             hidden_states = self.mlp(hidden_states)
             hidden_states = self.post_feedforward_layernorm(hidden_states)
             hidden_states = residual + hidden_states
-        elif cross_attention_mask is not None:
+        elif cross_attention_mask is not None and self.cross_decoder:
             hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states
             residual = hidden_states
             hidden_states = self.post_attention_layernorm(hidden_states)
@@ -1965,7 +1988,7 @@ def __init__(self, lm_, final_layernorm_, config):
         self.final_layernorm = final_layernorm_
         self.lm = lm_
         self.hidden_size = config.hidden_size
-        self.ppl = config.ppl
+        self.ppl = config.args.ppl
 
     def forward(self, hidden_states):
         if not self.ppl:
@@ -1975,9 +1998,11 @@ def forward(self, hidden_states):
         m_logits = self.lm(hidden_states)
         return m_logits
 
+# visual start
 class Visual(torch.nn.Module):
     def __init__(self, visual, base):
         super().__init__()
+        self.model_type = base.model_type
         self.visual = visual.eval()
         self.embed_ = base.embed
         self.tokenizer = base.tokenizer
@@ -2009,6 +2034,9 @@ def init_config(self):
         self.llm_config['image_mean'] = image_mean.tolist()
         self.llm_config['image_norm'] = image_norm.tolist()
 
+    def export(self, onnx_path):
+        raise NotImplementedError
+
     def load(self):
         raise NotImplementedError
 
@@ -2036,6 +2064,21 @@ def load(self):
         self.llm_config['vision_end'] = self.tokenizer.img_end_id
         self.llm_config['image_pad'] = self.tokenizer.img_pad_id
 
+    def export(self, onnx_path):
+        input_images = torch.randn((1, 3, self.image_size, self.image_size))
+        onnx_model = f'{onnx_path}/visual.onnx'
+        torch.onnx.export(self, (input_images),
+                        onnx_model,
+                        input_names=['input_images'],
+                        output_names=['image_embeds'],
+                        dynamic_axes={
+                            "input_images": { 0: "size" },
+                        },
+                        do_constant_folding=True,
+                        verbose=False,
+                        opset_version=15)
+        return onnx_model
+
     def forward(self, images):
         return self.visual(images).transpose(1, 0)
 
@@ -2062,7 +2105,8 @@ def __init__(self, visual, base):
         self.temporal_patch_size = 2
         self.patch_size = 14
         self.merge_size = 2
-        self.image_size = 420
+        self.image_height = 420
+        self.image_width = 420
         self.image_embeds = None
         super().__init__(visual, base)
 
@@ -2070,10 +2114,37 @@ def load(self):
         self.vision_start_id = self.config.vision_start_token_id
         self.vision_end_id = self.config.vision_end_token_id
         self.image_pad_id = self.config.image_token_id
-        self.llm_config['image_size'] = self.image_size
+        self.llm_config['image_size'] = self.image_height
         self.llm_config['vision_start'] = self.vision_start_id
         self.llm_config['vision_end'] = self.vision_end_id
         self.llm_config['image_pad'] = self.image_pad_id
+        # load model
+        config = self.visual.config
+        self.hidden_size = config.embed_dim
+        self.num_attention_heads = config.num_heads
+        self.num_key_value_heads = config.num_heads
+        self.head_dim = self.hidden_size // self.num_attention_heads
+        self.rope_theta = 10000.0
+        self.rotary_dim = self.head_dim // 2
+        self.rotary = VisionRotary(self)
+        self.model_map = {
+            'decoder': {
+                'self_attn': 'attn',
+                'mlp': 'mlp',
+                'input_layernorm': 'norm1',
+                'post_attention_layernorm': 'norm2'
+            },
+            'attention': {
+                'qkv_proj': 'qkv',
+                'o_proj': 'proj'
+            }
+        }
+        self.patch_embed = self.visual.patch_embed
+        self.blocks = []
+        for block in self.visual.blocks.children():
+            layer_id = len(self.blocks)
+            self.blocks.append(Decoder(block, layer_id, self))
+        self.merger = self.visual.merger
 
     def str_to_ids(self, prompt):
         if '<img>' in prompt and '</img>' in prompt:
@@ -2086,6 +2157,11 @@ def str_to_ids(self, prompt):
             for part in parts:
                 if re.match(pattern, part):
                     img_content = re.search(r'<img>(.*?)</img>', part).group(1)
+                    # optional <hw>height,width</hw> tag inside <img> overrides the resize target
+                    match = re.search(r'<hw>(.*?)</hw>', img_content)
+                    if match:  # no tag (re.search returned None): keep current image_height/image_width
+                        img_content = img_content[:match.start()] + img_content[match.end():]
+                        self.image_height, self.image_width = map(int, match.group(1).split(',')[:2])
                     if img_content.startswith('http://') or img_content.startswith('https://'):
                         image_obj = Image.open(requests.get(img_content, stream=True).raw)
                     img_pad_len = self.img_process(image_obj)
@@ -2099,7 +2175,18 @@ def str_to_ids(self, prompt):
         input_ids = self.tokenizer(txt_prompt, return_tensors="pt")['input_ids']
         return input_ids
 
-    def forward(self, images):
+    def forward(self, flatten_patches, position_ids, attention_mask):
+        rotary_pos_emb = self.rotary(position_ids)
+        hidden_states = self.patch_embed(flatten_patches)
+        if rotary_pos_emb.dtype != hidden_states.dtype:
+            rotary_pos_emb = rotary_pos_emb.to(hidden_states.dtype)
+        for blk in self.blocks:
+            hidden_states, _ = blk(hidden_states, rotary_pos_emb=rotary_pos_emb, attention_mask=attention_mask)
+        image_embeds = self.merger(hidden_states)
+        image_embeds = image_embeds.unsqueeze(1)
+        return image_embeds
+
+    def images_forward(self, images):
         images = [images] * self.temporal_patch_size
         patches = torch.concat(images, axis=0)
         _, channel, height, width = patches.shape
@@ -2121,13 +2208,53 @@ def forward(self, images):
             grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
         )
         image_grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
-        image_embeds = self.visual(flatten_patches, image_grid_thw)
-        image_embeds = image_embeds.unsqueeze(1)
-        return image_embeds
+        pos_ids = []
+        for t, h, w in image_grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.merge_size,
+                self.merge_size,
+                w // self.merge_size,
+                self.merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.merge_size,
+                self.merge_size,
+                w // self.merge_size,
+                self.merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids]))
+        position_ids = torch.cat(pos_ids, dim=0)
+        seq_len = grid_t * grid_h * grid_w
+        attention_mask = torch.zeros([1, seq_len, seq_len], dtype=torch.float)
+        return self.forward(flatten_patches, position_ids, attention_mask)
+
+    def smart_resize(self, height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280):
+        """Snap (height, width) to multiples of `factor`; rescale when the snapped area leaves [min_pixels, max_pixels]."""
+        if height < factor or width < factor:
+            raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+        ratio = max(height, width) / min(height, width)
+        if ratio > 200:
+            raise ValueError(f"absolute aspect ratio must be smaller than 200, got {ratio}")
+        h_bar, w_bar = round(height / factor) * factor, round(width / factor) * factor
+        area = h_bar * w_bar
+        if area > max_pixels:
+            # too large: shrink both sides by the same ratio, rounding down to a multiple of factor
+            beta = math.sqrt((height * width) / max_pixels)
+            h_bar, w_bar = math.floor(height / beta / factor) * factor, math.floor(width / beta / factor) * factor
+        elif area < min_pixels:
+            # too small: enlarge both sides by the same ratio, rounding up to a multiple of factor
+            beta = math.sqrt(min_pixels / (height * width))
+            h_bar, w_bar = math.ceil(height * beta / factor) * factor, math.ceil(width * beta / factor) * factor
+        return h_bar, w_bar
 
     def img_process(self, image):
-        resized_height = self.image_size
-        resized_width = self.image_size
         from transformers.image_transforms import (
             convert_to_rgb,
             resize,
@@ -2143,6 +2270,7 @@ def img_process(self, image):
         )
         image = convert_to_rgb(image)
         image = to_numpy_array(image)
+        resized_height, resized_width = self.smart_resize(self.image_height, self.image_width)
         format = infer_channel_dimension_format(image)
         resample = PILImageResampling.BICUBIC
         image = resize(image, size=(resized_height, resized_width), resample=resample, input_data_format=format)
@@ -2151,7 +2279,7 @@ def img_process(self, image):
         image = np.expand_dims(image, [0])
         image = image.transpose(0, 3, 1, 2)
         image = torch.from_numpy(image)
-        self.image_embeds = self.forward(image)
+        self.image_embeds = self.images_forward(image)
         return self.image_embeds.shape[0]
 
     def embed(self, input_ids, images = None, videos = None):
@@ -2161,9 +2289,29 @@ def embed(self, input_ids, images = None, videos = None):
             input_embeds[image_mask] = self.image_embeds
         return input_embeds
 
+    def export(self, onnx_path):
+        # Trace with dummy inputs: 900 patches (presumably a 30x30 grid, i.e. 420/14 per side) of 1176
+        # features each (presumably 3 channels * temporal_patch_size * patch_size**2) -- TODO confirm.
+        seq_len, patch_dim = 900, 1176
+        patch = torch.randn([seq_len, patch_dim])
+        position_ids = torch.zeros([2, seq_len], dtype=torch.int32)
+        attention_mask = torch.zeros([1, seq_len, seq_len], dtype=torch.float)
+        onnx_model = f'{onnx_path}/visual.onnx'
+        torch.onnx.export(self, (patch, position_ids, attention_mask), onnx_model,
+                        input_names=['patches', 'position_ids', 'attention_mask'],
+                        output_names=['image_embeds'],
+                        dynamic_axes={
+                            "patches": { 0: "size" },
+                            "position_ids": { 1: "size" },
+                            "attention_mask": { 1: "size", 2: "size" }
+                        },
+                        do_constant_folding=True, verbose=False, opset_version=15)
+        return onnx_model
+
 class MllamaVision(Visual):
     def __init__(self, visual, base):
         super().__init__(visual, base)
+        self.multi_modal_projector = base.multi_modal_projector
         self.image_objs = []
 
     def load(self):
@@ -2192,12 +2340,11 @@ def str_to_ids(self, prompt):
         input_ids = self.tokenizer(txt_prompt, return_tensors="pt")['input_ids']
         # image process
         for img in self.image_objs:
-            image_embeds = self.img_process(img)
-            print(image_embeds.shape)
-            pass
+            self.img_process(img)
         return input_ids
 
     def img_process(self, image):
+        self.image_size = 560
         resized_height = self.image_size
         resized_width = self.image_size
         from transformers.image_transforms import (
@@ -2224,19 +2371,164 @@ def img_process(self, image):
         image = np.expand_dims(image, [0, 1, 2])
         pad_val = np.zeros_like(image)
         image = np.concatenate([image, pad_val, pad_val, pad_val], axis=2)
-        print(image.shape)
         image = torch.from_numpy(image)
-        image_embeds = self.forward(image)
-        print(image_embeds.shape)
-        return image_embeds
+        self.cross_attention_states = self.forward(image)
 
     def forward(self, images):
         aspect_ratio_ids = torch.tensor([[1]])
         aspect_ratio_mask = torch.tensor([[[1, 0, 0, 0]]])
-        return self.visual(images, aspect_ratio_ids, aspect_ratio_mask)
+        vision_outputs = self.visual(images, aspect_ratio_ids, aspect_ratio_mask)
+        cross_attention_states = vision_outputs[0]
+        cross_attention_states = cross_attention_states.type(self.multi_modal_projector.weight.dtype)
+        cross_attention_states = self.multi_modal_projector(cross_attention_states).reshape(
+                -1, cross_attention_states.shape[-2], self.hidden_size)
+        return cross_attention_states
+
+    def embed(self, input_ids, images = None, videos = None):
+        txt_embeds = self.embed_(input_ids)
+        return txt_embeds
+# visual end
+
+# audio start
+class Audio(torch.nn.Module):
+    def __init__(self, audio, base):
+        super().__init__()
+        self.audio = audio
+        self.embed_ = base.embed
+        self.tokenizer = base.tokenizer
+        self.config = base.config
+        self.hidden_size = base.hidden_size
+        self.llm_config = base.llm_config
+        self.quant_bit = 16
+        self.init_config()
+        self.load()
+
+    @staticmethod
+    def get_audio(model_type):
+        audio_models = {
+            'qwen2_audio': Qwen2Audio,
+        }
+        if model_type in audio_models:
+            return audio_models[model_type]
+        return None
+
+    def init_config(self):
+        self.llm_config['is_audio'] = True
+
+    def load(self):
+        raise NotImplementedError
+
+    def str_to_ids(self, prompt):
+        input_ids = self.tokenizer(prompt, return_tensors="pt")['input_ids']
+        return input_ids
+
+    def forward(self, images):
+        raise NotImplementedError
 
     def embed(self, input_ids, images = None, videos = None):
-        return self.embed_(input_ids)
+        raise NotImplementedError
+
+class Qwen2Audio(Audio):
+    def __init__(self, audio, base):
+        super().__init__(audio, base)
+        self.audio_embeds = None
+        self.audio_pad_id = 151646
+        self.n_fft = 400
+        self.sampling_rate = 16000
+        self.hop_length = 160
+        self.chunk_length = 30
+        self.feature_size = 128
+        self.n_samples = self.chunk_length * self.sampling_rate
+        self.max_length = self.n_samples // self.hop_length
+        from transformers.audio_utils import mel_filter_bank
+        self.mel_filters = mel_filter_bank(
+            num_frequency_bins=1 + self.n_fft // 2,
+            num_mel_filters=self.feature_size,
+            min_frequency=0.0,
+            max_frequency=8000.0,
+            sampling_rate=self.sampling_rate,
+            norm="slaney",
+            mel_scale="slaney",
+        )
+
+    def load(self):
+        # model
+        self.audio_tower = self.audio.audio_tower
+        self.multi_modal_projector = self.audio.multi_modal_projector
+        # config
+        self.llm_config['is_audio'] = True
+
+    def str_to_ids(self, prompt):
+        """Tokenize `prompt`; each <audio>src</audio> span becomes as many '<|AUDIO|>' pads as audio_process reports."""
+        if '<audio>' in prompt and '</audio>' in prompt:
+            import re
+            from io import BytesIO
+            from urllib.request import urlopen
+            import librosa
+            pattern = r'(<audio>.*?</audio>)'
+            txt_prompt = ''
+            for part in re.split(pattern, prompt):
+                if re.match(pattern, part):
+                    audio_content = re.search(r'<audio>(.*?)</audio>', part).group(1)
+                    # URLs are downloaded; anything else is handed to librosa as a local path
+                    # (previously audio_obj stayed undefined for non-URL sources -> NameError)
+                    is_url = audio_content.startswith(('http://', 'https://'))
+                    audio_src = BytesIO(urlopen(audio_content).read()) if is_url else audio_content
+                    audio_obj = librosa.load(audio_src, sr=self.sampling_rate)[0]
+                    txt_prompt += '<|AUDIO|>' * self.audio_process(audio_obj)
+                else:
+                    txt_prompt += part
+        else:
+            txt_prompt = prompt
+        return self.tokenizer(txt_prompt, return_tensors="pt")['input_ids']
+
+    def forward(self, input_features):
+        input_features = input_features.to(dtype=self.audio_tower.conv1.weight.dtype, device=self.audio_tower.conv1.weight.device)
+        inputs_embeds = torch.nn.functional.gelu(self.audio_tower.conv1(input_features))
+        inputs_embeds = torch.nn.functional.gelu(self.audio_tower.conv2(inputs_embeds))
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+        _, seq_len, _ = inputs_embeds.shape
+        embed_pos = self.audio_tower.embed_positions.weight[:seq_len, :]
+        hidden_states = inputs_embeds + embed_pos
+        for encoder_layer in self.audio_tower.layers:
+            hidden_states = encoder_layer(hidden_states, None, None)[0]
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states = self.audio_tower.avg_pooler(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states = self.audio_tower.layer_norm(hidden_states)
+        audio_features = self.multi_modal_projector(hidden_states)
+        return audio_features
+
+    def _torch_extract_fbank_features(self, waveform):
+        window = torch.hann_window(self.n_fft)
+        stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
+        magnitudes = stft[..., :-1].abs() ** 2
+        mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
+        mel_spec = mel_filters.T @ magnitudes
+        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+        if waveform.dim() == 2:
+            max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0]
+            log_spec = torch.maximum(log_spec, max_val - 8.0)
+        else:
+            log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+        log_spec = (log_spec + 4.0) / 4.0
+        return log_spec
+
+    def audio_process(self, audio_obj):
+        # audio_obj = np.pad(audio_obj, (0, self.n_samples - audio_obj.shape[0]))
+        waveform = torch.from_numpy(audio_obj).type(torch.float32)
+        input_features = self._torch_extract_fbank_features(waveform).unsqueeze(0)
+        audio_embeds = self.forward(input_features)
+        self.audio_embeds = audio_embeds.permute([1, 0, 2])
+        return self.audio_embeds.shape[0]
+
+    def embed(self, input_ids, images = None, videos = None):
+        input_embeds = self.embed_(input_ids)
+        if self.audio_embeds is not None:
+            audio_mask = (input_ids == self.audio_pad_id).squeeze()
+            input_embeds[audio_mask] = self.audio_embeds.type(input_embeds.dtype)
+        return input_embeds
+# audio end
 
 class LlmExporter(torch.nn.Module):
     '''
@@ -2249,40 +2541,31 @@ def __init__(self, args):
         self.load_model(args.path)
 
     def init_from_args(self, args):
+        self.args = args
         self.max_length = 128
         self.stop_ids = []
-        self.visual = None
         self.dst_name = 'llm'
         # load config from args
-        self.path = args.path
-        self.dst_path = args.dst_path
-        self.onnx_path = os.path.join(self.dst_path, 'onnx')
-        self.tokenizer_path = args.tokenizer_path
-        self.lora_path = args.lora_path
-        self.need_onnx_slim = args.onnx_slim
-        self.ppl = args.ppl
-        self.awq = args.awq
-        self.quant_bit = args.quant_bit
-        self.quant_block = args.quant_block
-        self.symmetric = args.sym
-        self.mnnconvert = args.mnnconvert
-        if self.tokenizer_path is None:
-            self.tokenizer_path = self.path
-        if args.lm_quant_bit is not None:
-            self.lm_quant_bit = args.lm_quant_bit
-        else:
-            self.lm_quant_bit = self.quant_bit
+        self.onnx_path = os.path.join(self.args.dst_path, 'onnx')
+        if self.args.tokenizer_path is None:
+            self.args.tokenizer_path = self.args.path
+        if args.lm_quant_bit is None:
+            self.args.lm_quant_bit = self.args.quant_bit
         # init export dst dir
-        if not os.path.exists(self.dst_path):
-            os.makedirs(self.dst_path)
+        if not os.path.exists(self.args.dst_path):
+            os.makedirs(self.args.dst_path)
         if not os.path.exists(self.onnx_path):
             os.makedirs(self.onnx_path)
 
     def load_pretrained(self, model_path: str):
-        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path, trust_remote_code=True, use_fast=False)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_path, trust_remote_code=True, use_fast=False)
         if 'Qwen2-VL' in model_path:
             from transformers import Qwen2VLForConditionalGeneration
             self.model = Qwen2VLForConditionalGeneration.from_pretrained(model_path, torch_dtype='auto').eval()
+        elif 'Qwen2-Audio' in model_path:
+            from transformers import Qwen2AudioForConditionalGeneration
+            self.audio = Qwen2AudioForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
+            self.model = self.audio.language_model
         elif 'Llama-3.2' in model_path and 'Vision' in model_path:
             from transformers import MllamaForConditionalGeneration
             self.model = MllamaForConditionalGeneration.from_pretrained(model_path, torch_dtype='auto').eval()
@@ -2292,9 +2575,9 @@ def load_pretrained(self, model_path: str):
             except:
                 self.model = AutoModel.from_pretrained(model_path, torch_dtype='auto', trust_remote_code=True).eval()
         self.config = self.model.config
-        if self.lora_path is not None:
+        if self.args.lora_path is not None:
             from peft import PeftModel
-            adapter = PeftModel.from_pretrained(self.model, model_id=self.lora_path)
+            adapter = PeftModel.from_pretrained(self.model, model_id=self.args.lora_path)
             self.model = adapter.merge_and_unload(progressbar=True)
 
     @staticmethod
@@ -2322,13 +2605,15 @@ def load_model(self, model_path):
                     self.stop_ids.append(id)
         self.stop_ids = [stop_id for stop_id in self.stop_ids if stop_id is not None]
         self.stop_ids = list(set(self.stop_ids))
+        self.visual = None
         model_mapper = ModelMapper()
 
         self.tie_word_embeddings = (hasattr(self.config, 'tie_word_embeddings') and self.config.tie_word_embeddings)
         self.model_type, self.model_map = model_mapper.get_map(self.config)
-        if self.awq:
+
+        if self.args.awq:
             self.model.float()
-        else:
+        if self.args.export is not None:
             # set norm's weight as float for export
             def visit_module(module):
                 if not isinstance(module, torch.nn.Linear) and hasattr(module, 'weight'):
@@ -2337,6 +2622,7 @@ def visit_module(module):
                     visit_module(child)
             visit_module(self.model)
         # print(self.config, self.model_type, self.model_map, self.model)
+        # exit(0)
         # load config info
         ModelMapper.do_map(self, self.config, self.model_map['config'])
         if not hasattr(self, 'num_key_value_heads') or self.num_key_value_heads is None:
@@ -2365,16 +2651,12 @@ def visit_module(module):
             "position_ids" : { 1: "seq_len" },
             "past_key_values" : { 3: "history_len" }
         }
-        prompt_template = self.build_prompt_template()
         self.llm_config = {
             'hidden_size' : self.hidden_size,
             'layer_nums' : self.num_hidden_layers,
             'attention_mask': self.attention_mask_type,
             'key_value_shape': self.past_kv_shape[1:],
-            "system_prompt_template": prompt_template['system'].format(query='%s'),
-            'user_prompt_template': prompt_template['user'].format(query='%s'),
-            'assistant_prefix': prompt_template['assistant_prefix'],
-            'assistant_suffix': prompt_template['assistant_suffix'],
+            "prompt_template": self.build_prompt('%s'),
             'is_visual': False
         }
         # load modules
@@ -2400,8 +2682,13 @@ def visit_module(module):
         self.lm = Lm(self.lm_, self.final_layernorm_, self)
         # visual model
         if self.visual is not None:
-            self.visual.float()
+            if self.args.export is not None:
+                self.visual.float()
             self.visual = Visual.get_visual(self.model_type)(self.visual, self)
+        if hasattr(self, 'audio') and self.audio is not None:
+            self.audio = Audio.get_audio(self.audio.config.model_type)(self.audio, self)
+        else:
+            self.audio = None
         return model_path
 
     def get_attention_mask(self) -> torch.Tensor:
@@ -2440,9 +2727,14 @@ def chatglm_position_ids(self):
     def visual_embed(self, input_ids):
         return self.visual.embed(input_ids)
 
+    def audio_embed(self, input_ids):
+        return self.audio.embed(input_ids)
+
     def embedding(self, input_ids):
         if self.visual is not None and self.token_len == 0:
             input_embeds = self.visual_embed(input_ids)
+        elif self.audio is not None and self.token_len == 0:
+            input_embeds = self.audio_embed(input_ids)
         else:
             input_embeds = self.embed(input_ids)
         return input_embeds
@@ -2458,13 +2750,15 @@ def forward(self,
         hidden_states = input_ids # llm forward without embedding
         presents = [None for i in range(self.num_hidden_layers)]
         rotary_pos_emb = self.rotary(position_ids)
+        if self.args.test and rotary_pos_emb.dtype != hidden_states.dtype:
+            rotary_pos_emb = rotary_pos_emb.type(hidden_states.dtype)
         for i in range(self.num_hidden_layers):
             if self.blocks[i].cross_decoder and cross_attention_states is None:
                 continue
             hidden_states, kv = self.blocks[i](hidden_states, rotary_pos_emb, attention_mask, past_key_values[i])
             presents[i] = kv
         logits = self.lm(hidden_states)
-        if not self.ppl:
+        if not self.args.ppl:
             logits = logits.reshape(-1)
         if presents[0].shape == presents[-1].shape and None not in presents:
             presents = torch.stack(presents)
@@ -2473,136 +2767,79 @@ def forward(self,
         return logits, presents
 
     # some test functions
-    def build_prompt_template(self) -> Dict[str, str]:
-        template = {
-            'system': '',
-            'user': '',
-            'assistant_prefix': '',
-            'assistant_suffix': '',
-        }
+    def build_prompt(self, query):
         # just for test
-        if 'Qwen' in self.path or 'Qwen2' in self.path or 'QwQ' in self.path or 'reader' in self.path:
-            template['system'] = '<|im_start|>system\n{query}<|im_end|>\n'
-            template['user'] = '<|im_start|>user\n{query}<|im_end|>\n'
-            template['assistant_prefix'] = '<|im_start|>assistant\n'
-            template['assistant_suffix'] = '<|im_end|>\n'
-            return template
-        if 'Baichuan2' in self.path:
-            template['user'] = '<reserved_106>{query}<reserved_107>'
-            return template
-        if 'internlm' in self.path:
-            template['user'] = '<|User|>:{query}<eoh>\n'
-            template['assistant_prefix'] = '<|Bot|>:'
-            template['assistant_suffix'] = '<eoh>\n'
-            return template
-        if 'TinyLlama' in self.path:
-            template['system'] = '<s><|system|>\n{query}</s>\n'
-            template['user'] = '<|user|>\n{query}</s>\n'
-            template['assistant_prefix'] = '<|assistant|>\n'
-            template['assistant_suffix'] = '</s>\n'
-            return template
-        if 'Yi' in self.path:
-            template['user'] = '<|im_start|> user\n{query}<|im_end|>\n'
-            template['assistant_prefix'] = '<|im_start|> assistant\n'
-            template['assistant_suffix'] = '<|im_end|>\n'
-            return template
-        if 'deepseek' in self.path:
-            template['user'] = '<|begin_of_sentence|>User: {query}\n'
-            template['assistant_prefix'] = '\nAssistant: '
-            template['assistant_suffix'] = '\n<|end_of_sentence|>'
-            return template
-        if 'Llama-3.1' in self.path:
-            template['system'] = '<|start_header_id|>system<|end_header_id|>\n\n{query}<|eot_id|>'
-            template['user'] = '<|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|>'
-            template['assistant_prefix'] = '<|start_header_id|>assistant<|end_header_id|>\n\n'
-            template['assistant_suffix'] = '<|eot_id|>'
-            return template
-        if 'Llama-3' in self.path:
-            template['system'] = '<|start_header_id|>system<|end_header_id|>\n\n{query}<|eot_id|>'
-            template['user'] = '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|>'
-            template['assistant_prefix'] = '<|start_header_id|>assistant<|end_header_id|>\n\n'
-            template['assistant_suffix'] = '<|eot_id|>'
-            return template
-        if 'Llama-2' in self.path:
-            template['user'] = '[INST]{query}[/INST]'
-            return template
-        if 'chatglm2' in self.path:
-            template['user'] = '[Round 1]\n\n问：{query}\n\n'
-            template['assistant_prefix'] = '答：'
-            template['assistant_suffix'] = '\n\n'
-            return template
-        if 'chatglm3' in self.path or 'glm-4' in self.path:
-            template['user'] = '<|user|>\n{query}\n'
-            template['assistant_prefix'] = '<|assistant|>\n'
-            template['assistant_suffix'] = '\n'
-            return template
-        if 'chatglm' in self.path:
-            template['user'] = '{query}[gMASK]<sop>'
-            return template
-        if 'phi-2' in self.path:
-            template['user'] = 'Instruct: {query}\n'
-            template['assistant_prefix'] = 'Output:'
-            template['assistant_suffix'] = '\n'
-            return template
-        if 'gemma-2' in self.path:
-            template['system'] = '<start_of_turn>system\n{query}<end_of_turn>\n'
-            template['user'] = '<bos><start_of_turn>user\n{query}<end_of_turn>\n'
-            template['assistant_prefix'] = '<start_of_turn>model\n'
-            template['assistant_suffix'] = '<end_of_turn>\n'
-            return template
-        if 'OpenELM' in self.path:
-            template['user'] = '<s>{query}'
-            return template
-        if 'SmolLM2' in self.path:
-            template['system'] = '<|im_start|>system\n{query}<|im_end|>\n'
-            template['user'] = '<|im_start|>user\n{query}<|im_end|>\n'
-            template['assistant_prefix'] = '<|im_start|>assistant\n'
-            template['assistant_suffix'] = '<|im_end|>\n'
-            return template
-        # not matched
-        return template
-    
-    def build_prompt(self, queries, roles):
-        template = self.build_prompt_template(self)
-        prompt = ""
-        for item in zip(queries, roles):
-            query, role = item
-            if '{query}' in template[role]:
-                prompt += template[role].format(query=query)
-            else:
-                prompt += role + '\n' + query +'\n'
-        return prompt + template['assistant_prefix']
+        if 'Qwen2' in self.args.path or 'QwQ' in self.args.path or 'reader' in self.args.path:
+            return f'<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n'
+        if 'Qwen' in self.args.path:
+            return f'\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n'
+        if 'Baichuan2' in self.args.path:
+            return f'<reserved_106>{query}<reserved_107>'
+        if 'internlm' in self.args.path:
+            return f'<|User|>:{query}<eoh>\n<|Bot|>:'
+        if 'TinyLlama' in self.args.path:
+            return f'<s><|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s>\n<|user|>\n{query}</s>\n<|assistant|>\n'
+        if 'Yi' in self.args.path:
+            return f'<|im_start|> user\n{query}<|im_end|>\n<|im_start|> assistant\n'
+        if 'deepseek' in self.args.path:
+            return f'<|begin_of_sentence|>User: {query}\n\nAssistant:'
+        if 'Llama-3.1' in self.args.path:
+            return f'<|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
+        if 'Llama-3' in self.args.path:
+            return f'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
+        if 'Llama-2' in self.args.path:
+            return f'[INST]{query}[/INST]'
+        if 'chatglm2' in self.args.path:
+            return f'[Round 1]\n\n问：{query}\n\n答：'
+        if 'chatglm3' in self.args.path or 'glm-4' in self.args.path:
+            return f'<|user|>\n{query}\n<|assistant|>\n'
+        if 'chatglm' in self.args.path:
+            return f'{query}[gMASK]<sop>'
+        if 'phi-2' in self.args.path:
+            return f'Instruct: {query}\nOutput:'
+        if 'gemma-2' in self.args.path:
+            return f'<bos><start_of_turn>user\n{query}<end_of_turn>\n<start_of_turn>model\n'
+        if 'OpenELM' in self.args.path:
+            return f'<s>{query}'
+        if 'SmolLM2' in self.args.path:
+            return f'<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n'
+        return query
 
     def str_to_ids(self, prompt):
         if self.visual is not None:
             return self.visual.str_to_ids(prompt)
+        if self.audio is not None:
+            return self.audio.str_to_ids(prompt)
         input_ids = self.tokenizer(prompt, return_tensors="pt")['input_ids']
         return input_ids
 
     def id_to_str(self, token_id):
-        def contains_replacement(text): return '\uFFFD' in text
-        def decode_id(token_id):
-            return self.tokenizer.convert_tokens_to_string(
-                    self.tokenizer._convert_id_to_token(int(token_id)))
-        def decode_ids(token_ids):
-            return self.tokenizer.convert_tokens_to_string(
-                    self.tokenizer.convert_ids_to_tokens(token_ids))
-        word = decode_id(int(token_id))
-        # Smollm tokenizer will produce half chinese character, using buffer to decode
-        if contains_replacement(word):
-            self.decode_buffer.append(token_id)
-            buffer_txt = decode_ids(self.decode_buffer)
-            if not contains_replacement(buffer_txt):
-                word = buffer_txt
-                self.decode_buffer.clear()
-            else:
-                word = ''
+        try:
+            word = self.tokenizer.decode(int(token_id))
+        except:
+            def contains_replacement(text): return '\uFFFD' in text
+            def decode_id(token_id):
+                return self.tokenizer.convert_tokens_to_string(
+                        self.tokenizer._convert_id_to_token(int(token_id)))
+            def decode_ids(token_ids):
+                return self.tokenizer.convert_tokens_to_string(
+                        self.tokenizer.convert_ids_to_tokens(token_ids))
+            word = decode_id(int(token_id))
+            # Smollm tokenizer will produce half chinese character, using buffer to decode
+            if contains_replacement(word):
+                self.decode_buffer.append(token_id)
+                buffer_txt = decode_ids(self.decode_buffer)
+                if not contains_replacement(buffer_txt):
+                    word = buffer_txt
+                    self.decode_buffer.clear()
+                else:
+                    word = ''
         return word
 
     def response(self, query):
         # self.imitate_quant()
         self.decode_buffer = []
-        prompt = self.build_prompt(['You are a helpful assistant!', query], roles=['system', 'user'])
+        prompt = self.build_prompt(query)
         input_ids = self.str_to_ids(prompt)
         if self.visual is not None:
             cross_attention_states = self.visual.cross_attention_states
@@ -2636,14 +2873,20 @@ def response(self, query):
     def export_visual(self):
         if self.visual is None:
             return
-        input_images = torch.randn((1, 3, self.visual.image_size, self.visual.image_size))
-        model = self.visual
-        onnx_model = f'{self.onnx_path}/visual.onnx'
-        torch.onnx.export(model, (input_images),
+        return self.visual.export(self.onnx_path)
+
+    @spinner_run(f'export audio to ')
+    def export_audio(self):
+        if self.audio is None:
+            return
+        input_features = torch.randn((1, self.audio.feature_size, self.audio.max_length))
+        model = self.audio.float()
+        onnx_model = f'{self.onnx_path}/audio.onnx'
+        torch.onnx.export(model, (input_features),
                         onnx_model,
-                        input_names=['input_images'],
-                        output_names=['image_embeds'],
-                        dynamic_axes={"input_images": {
+                        input_names=['input_features'],
+                        output_names=['audio_embeds'],
+                        dynamic_axes={"input_features": {
                             0: "size"
                         }},
                         do_constant_folding=True,
@@ -2661,19 +2904,19 @@ def export_embed(self):
             tensor_data = self.embed.embed.weight.data.bfloat16()
         data_ptr = tensor_data.untyped_storage().data_ptr()
         buffer = (ctypes.c_byte * (tensor_data.numel() * 2)).from_address(data_ptr)
-        embedding_file = f'{self.dst_path}/embeddings_bf16.bin'
+        embedding_file = f'{self.args.dst_path}/embeddings_bf16.bin'
         with open(embedding_file, 'wb') as f:
             f.write(buffer)
         return embedding_file
 
     @spinner_run(f'export config to ')
     def export_config(self, mnn_config = False):
-        config_json = f'{self.dst_path}/llm_config.json'
+        config_json = f'{self.args.dst_path}/llm_config.json'
         with open(config_json, 'w', encoding='utf-8') as f:
             json.dump(self.llm_config, f, ensure_ascii=False, indent=4)
         if not mnn_config:
             return config_json
-        with open(f'{self.dst_path}/config.json', 'w', encoding='utf-8') as f:
+        with open(f'{self.args.dst_path}/config.json', 'w', encoding='utf-8') as f:
             config = {
                 "llm_model": f"{self.dst_name}.mnn",
                 "llm_weight": f"{self.dst_name}.mnn.weight",
@@ -2682,11 +2925,18 @@ def export_config(self, mnn_config = False):
                 "precision": "low",
                 "memory": "low"
             }
+            if self.visual is not None or self.audio is not None:
+                config['mllm'] = {
+                    'backend_type': "cpu",
+                    "thread_num": 4,
+                    "precision": "low",
+                    "memory": "low"
+                }
             json.dump(config, f, ensure_ascii=False, indent=4)
         return config_json
 
     def imitate_quant(self):
-        def quant_dequant(linear, quant_bit = self.quant_bit, quant_block = self.quant_block):
+        def quant_dequant(linear, quant_bit = self.args.quant_bit, quant_block = self.args.quant_block):
             weight = linear.weight.data
             oc, ic = weight.shape
             if quant_block == 0:
@@ -2742,7 +2992,7 @@ def onnx_load_param(self, onnx_path):
         return OnnxRebuilder(onnx_path, self.unloaded_ops).rebuild()
 
     @spinner_run(f'slim the graph of ')
-    def onnx_slim(self, onnx_model):
+    def slim_onnx(self, onnx_model):
         import onnxslim
         model = onnxslim.slim(onnx_model)
         onnx.save(model, onnx_model)
@@ -2782,7 +3032,7 @@ def awq_quant(self):
         self.is_awq_quantized = True
 
     def export(self, export_type):
-        if self.awq:
+        if self.args.awq:
             self.awq_quant()
         export_mnn = export_type == 'mnn'
         # export tokenizer
@@ -2793,14 +3043,16 @@ def export(self, export_type):
             self.export_embed()
         if self.visual:
             visual_onnx = self.export_visual()
-            #if self.need_onnx_slim:
-                #visual_onnx = self.onnx_slim(visual_onnx)
             if export_mnn:
                 MNNConveter(visual_onnx, None, self).export(quant_bit=self.visual.quant_bit)
+        if self.audio:
+            audio_onnx = self.export_audio()
+            if export_mnn:
+                MNNConveter(audio_onnx, None, self).export(quant_bit=self.audio.quant_bit)
         # export graph to llm.onnx
         onnx_model = self.export_onnx()
-        if self.need_onnx_slim:
-            self.onnx_slim(onnx_model)
+        if self.args.onnx_slim:
+            self.slim_onnx(onnx_model)
         if export_mnn:
             # convert onnx to mnn and quant weight
             MNNConveter(onnx_model, self.unloaded_ops, self).export()
@@ -2823,8 +3075,8 @@ def export(self, export_type):
     @spinner_run(f'export tokenizer to ')
     def export_tokenizer(self):
         # load tokenizer file
-        tokenizer_model = os.path.join(self.tokenizer_path, 'tokenizer.model')
-        ice_text_model = os.path.join(self.tokenizer_path, 'ice_text.model')
+        tokenizer_model = os.path.join(self.args.tokenizer_path, 'tokenizer.model')
+        ice_text_model = os.path.join(self.args.tokenizer_path, 'ice_text.model')
         try:
             import sentencepiece as spm
             if os.path.exists(tokenizer_model):
@@ -2835,7 +3087,7 @@ def export_tokenizer(self):
                 self.sp_model = None
         except:
             self.sp_model = None
-        merge_file = os.path.join(self.path, 'merges.txt')
+        merge_file = os.path.join(self.args.path, 'merges.txt')
         if os.path.exists(merge_file):
             self.merge_txt = merge_file
         else:
@@ -2854,7 +3106,7 @@ def write_header(fp, type, speicals, prefix = []):
             fp.write(f'{len(speicals)} {len(self.stop_ids)} {len(prefix)}\n')
             write_line(fp, speicals, self.stop_ids, prefix)
 
-        file_path = os.path.join(self.dst_path, "tokenizer.txt")
+        file_path = os.path.join(self.args.dst_path, "tokenizer.txt")
         special_list = list(self.tokenizer.added_tokens_decoder.keys())
         if hasattr(self.tokenizer, 'special_tokens'):
             for k, v in self.tokenizer.special_tokens.items():
@@ -2888,7 +3140,7 @@ def write_header(fp, type, speicals, prefix = []):
                     token_type = UNUSED
                 elif self.sp_model.IsByte(i):
                     token_type = BYTE
-                if self.path == 'Chatglm_6b':
+                if self.args.path == 'Chatglm_6b':
                     if '<n>' in token: token = '\n'
                     if '<|tab|>' in token: token = '\t'
                     if '<|blank_' in token: token = ' ' * int(token[8:token.find('|>')])
@@ -3099,8 +3351,8 @@ def export(self, export_type):
         self.export_config(export_mnn)
         self.export_embed()
         onnx_model = self.export_onnx()
-        if self.need_onnx_slim:
-            self.onnx_slim(onnx_model)
+        if self.args.onnx_slim:
+            self.slim_onnx(onnx_model)
         if export_mnn:
             MNNConveter(onnx_model, None, self).export()
 
diff --git a/transformers/llm/export/requirements.txt b/transformers/llm/export/requirements.txt
index 6d095790f..d0b77b53e 100644
--- a/transformers/llm/export/requirements.txt
+++ b/transformers/llm/export/requirements.txt
@@ -1,5 +1,5 @@
 datasets
-MNN>=3.0.0
+MNN
 onnx
 onnxslim
 onnxruntime
@@ -9,6 +9,6 @@ Requests
 sentencepiece
 torch
 tqdm
-transformers>=4.45
+transformers
 yaspin
 numpy