diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index db7e3890396..8883e5ee026 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -118,27 +118,29 @@ include_directories( # # declare targets # +add_library(executorch_backend INTERFACE) add_library(qcir INTERFACE qcir_schema_output) add_library(qcir_utils STATIC) -add_library(qnn_schema INTERFACE ${_qnn_schema__outputs}) -add_library(executorch_backend INTERFACE) +add_library(qnn_backend STATIC) +add_library(qnn_backend_cache STATIC) +add_library(qnn_context STATIC) +add_library(qnn_device STATIC) add_library(qnn_executorch_backend SHARED) add_library(qnn_executorch_header INTERFACE) add_library(qnn_executorch_logging STATIC) -add_library(qnn_manager STATIC) +add_library(qnn_factory STATIC) add_library(qnn_function_interface INTERFACE) +add_library(qnn_graph STATIC) +add_library(qnn_header INTERFACE) add_library(qnn_implementation STATIC) -add_library(qnn_sys_function_interface INTERFACE) -add_library(qnn_sys_implementation STATIC) add_library(qnn_logger STATIC) +add_library(qnn_manager STATIC) +add_library(qnn_mem_manager STATIC) add_library(qnn_profiler STATIC) -add_library(qnn_device STATIC) -add_library(qnn_context STATIC) -add_library(qnn_backend_cache STATIC) -add_library(qnn_graph STATIC) -add_library(qnn_backend STATIC) -add_library(qnn_factory STATIC) -add_library(qnn_header INTERFACE) +add_library(qnn_schema INTERFACE ${_qnn_schema__outputs}) +add_library(qnn_sys_function_interface INTERFACE) +add_library(qnn_sys_implementation STATIC) +add_library(shared_buffer STATIC) add_library(wrappers STATIC) add_library(utils STATIC) @@ -220,6 +222,13 @@ target_link_libraries(qnn_graph qnn_context qnn_profiler ) +target_link_libraries(qnn_mem_manager + PRIVATE + qnn_executorch_logging + qnn_implementation + qnn_context +) + target_link_libraries(qnn_factory PUBLIC qnn_header @@ -229,6 +238,7 @@ target_link_libraries(qnn_factory qnn_device qnn_context qnn_graph + qnn_mem_manager ) target_link_libraries(qnn_manager PRIVATE @@ -236,6 +246,7 @@ target_link_libraries(qnn_manager wrappers qnn_schema utils + shared_buffer ) target_link_libraries(qnn_executorch_backend PRIVATE @@ -249,7 +260,11 @@ target_link_libraries(utils PRIVATE qnn_executorch_logging ) - +target_link_libraries(shared_buffer + PRIVATE + qnn_executorch_logging + ${CMAKE_DL_LIBS} +) # # add linker option # diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index 2a2cda84c55..9d80fd735aa 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -105,6 +105,7 @@ TensorWrapper::TensorWrapper( Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) { if (data != nullptr) { + QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW; QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_; if (copy_data) { owned_data_ = std::make_unique(bytes_); @@ -144,6 +145,12 @@ Error TensorWrapper::SetName(const std::string& name) { return Error::Ok; } +Error TensorWrapper::SetMemHandle(Qnn_MemHandle_t mem_handle) { + QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(tensor_)->memHandle = mem_handle; + return Error::Ok; +} + // base function for Create TensorWrapper std::shared_ptr CreateTensorWrapper( const std::string& tensor_name, diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.h b/backends/qualcomm/aot/wrappers/TensorWrapper.h index 5c2be693486..c973196e9d5 100644 
--- a/backends/qualcomm/aot/wrappers/TensorWrapper.h +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.h @@ -59,16 +59,38 @@ class TensorWrapper { return QNN_VER_PTR(tensor_)->type == QNN_TENSOR_TYPE_STATIC; }; - const void* GetStaticTensorData() const { - return QNN_VER_PTR(tensor_)->clientBuf.data; + std::uint32_t* GetDims() const { + return QNN_VER_PTR(tensor_)->dimensions; + }; + + Qnn_DataType_t GetDataType() const { + return QNN_VER_PTR(tensor_)->dataType; + }; + + Qnn_MemHandle_t const GetMemHandle() { + return QNN_VER_PTR(tensor_)->memHandle; + }; + + Qnn_TensorMemType_t GetMemType() const { + return QNN_VER_PTR(tensor_)->memType; }; std::string GetName() const { return qnn_tensor_name_; }; + std::uint32_t GetRank() const { + return QNN_VER_PTR(tensor_)->rank; + }; + + const void* GetStaticTensorData() const { + return QNN_VER_PTR(tensor_)->clientBuf.data; + }; + Error SetName(const std::string& name); + Error SetMemHandle(Qnn_MemHandle_t mem_handle); + private: // need this to handle QNN_TENSOR_ERROR_NAME_HASH_COLLISION std::string qnn_tensor_name_; diff --git a/backends/qualcomm/passes/insert_io_qdq.py b/backends/qualcomm/passes/insert_io_qdq.py index e1dd55a916a..971e4895c36 100644 --- a/backends/qualcomm/passes/insert_io_qdq.py +++ b/backends/qualcomm/passes/insert_io_qdq.py @@ -38,6 +38,12 @@ def _ceate_args(self, target: torch.fx.node.Target, quant_attrs: Dict): arg_schemas = list(target._schema.arguments)[1:] for arg_schema in arg_schemas: name = arg_schema.name + # TODO: Due to the new parameter "out_dtype" in the dequantize node, + # it could not be found in the quant_attrs of other nodes, + # and it will cause a key error. For now, the output type + # of our dequantize node is only float. (by default in pytorch) + if name == "out_dtype": + continue value = quant_attrs[name] if type(arg_schema.type) == torch.tensor and type(value) in [int, float]: value = torch.tensor(value) diff --git a/backends/qualcomm/runtime/CMakeLists.txt b/backends/qualcomm/runtime/CMakeLists.txt index 615c6320b5d..3a59c3ba2b3 100644 --- a/backends/qualcomm/runtime/CMakeLists.txt +++ b/backends/qualcomm/runtime/CMakeLists.txt @@ -47,3 +47,10 @@ target_sources(utils PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Utils.cpp ) + +# shared_buffer +target_sources(shared_buffer + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.h + ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.cpp +) diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index e3c76742e2a..d54de1059d7 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -8,8 +8,10 @@ #pragma once #ifdef __cplusplus +#include #include #else +#include #include #endif @@ -31,6 +33,16 @@ typedef struct { } // clang-format on +/// Allocate specific tensors (usually graph inputs and outputs) on shared +/// memory. Users are responsible to allocate "enough" tensor bytes, and set +/// alignment as MemoryAllocator::kDefaultAlignment. +/// See runtime/core/memory_allocator.h. The function returns a valid pointer +/// if allocation is successful. +void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment); + +/// Free the allocated shared memory. 
+void QnnExecuTorchFreeCustomMem(void* buffer_ptr); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index b093c274c38..77449703c5f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -188,9 +188,14 @@ Error QnnExecuTorchBackend::execute( std::vector input_tensor_structs; std::vector output_tensor_structs; + input_tensor_structs.reserve(input_tensors.size()); for (int i = 0; i < input_tensors.size(); ++i) { - input_tensors[i]->FillDataBuffer( - args[i]->toTensor().const_data_ptr(), true /* copy_data */); + if (qnn_manager->RegisterMem( + args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) != + Error::Ok) { + input_tensors[i]->FillDataBuffer( + args[i]->toTensor().const_data_ptr(), true /* copy_data */); + } input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct()); } @@ -198,9 +203,12 @@ Error QnnExecuTorchBackend::execute( for (const auto& output_tensor : output_tensors) { // pos=0 limits the search to the prefix if (output_tensor->GetName().rfind("output_", 0) == 0) { - output_tensor->FillDataBuffer( - args[output_index]->toTensor().mutable_data_ptr(), - false /* copy_data */); + void* mutable_data_ptr = + args[output_index]->toTensor().mutable_data_ptr(); + if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) != + Error::Ok) { + output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */); + } output_index++; } output_tensor_structs.push_back(output_tensor->CloneTensorStruct()); diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 3303a08309d..dc3217fc1c8 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ #include +#include #include #include - #include #include #include @@ -54,7 +54,9 @@ QnnManager::QnnManager( "the size of qnn context binary: %d", qnn_executorch_context_binary.nbytes); QNN_EXECUTORCH_LOG_INFO( - "Is on-device graph construction: %d", options_->online_prepare()); + "Is on-device graph construction: %d", options->online_prepare()); + QNN_EXECUTORCH_LOG_INFO( + "Enable shared buffer: %d", options->shared_buffer()); } if (library_path.empty()) { @@ -82,6 +84,53 @@ Error QnnManager::LoadQnnLibrary() { return ret; } +Error QnnManager::RegisterMem( + void* data_ptr, + const std::shared_ptr& tensor_wrapper) { + SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); + // Not enable shared buffer + if (!options_->shared_buffer()) + return Error::Internal; + + if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) { + QNN_EXECUTORCH_LOG_WARN( + "Backend %s doesn't supported shared buffer.", + EnumNameQnnExecuTorchBackendType( + options_->backend_options()->backend_type())); + return Error::Internal; + } + + if (!shared_buffer_manager.IsAllocated(data_ptr)) { + // It means two scenarios here: + // 1. the input and output partitioned graph + // 2. 
Actually, user doesn't allocate shared buffer with + // QnnExecuTorchAllocCustomMem API + return Error::Internal; + } else if (backend_params_ptr_->qnn_mem_manager_ptr_->IsRegistered( + tensor_wrapper->GetMemHandle())) { + if (options_->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo) + QNN_EXECUTORCH_LOG_INFO( + "Tensor name %s has been registered shared memory.", + tensor_wrapper->GetName().c_str()); + return Error::Ok; + } + + int32_t mem_fd = SharedBuffer::GetSharedBufferManager().MemToFd(data_ptr); + if (mem_fd == -1) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor name %s is failed to get file descriptor.", + tensor_wrapper->GetName().c_str()); + return Error::Internal; + } + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_mem_manager_ptr_->RegisterMem( + tensor_wrapper, mem_fd) == Error::Ok, + Internal, + "Fail to register to shared memory."); + + return Error::Ok; +} + Error QnnManager::Init() { ET_CHECK_OR_RETURN_ERROR( LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library"); @@ -219,14 +268,6 @@ void QnnManager::Destroy() { qnn_loaded_backend_.TerminateAllBackends(); } -bool QnnManager::IsAvailable() { - return true; -} - -bool QnnManager::IsOnlinePrepare() { - return options_->online_prepare(); -} - bool QnnManager::IsNodeSupportedByBackend( std::vector>& op_wrappers) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -329,3 +370,14 @@ Error QnnManager::Compile( } // namespace qnn } // namespace executor } // namespace torch +void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) { + using torch::executor::qnn::SharedBuffer; + void* buffer_ptr = + SharedBuffer::GetSharedBufferManager().AllocMem(bytes, alignment); + return buffer_ptr; +} + +void QnnExecuTorchFreeCustomMem(void* buffer_ptr) { + using torch::executor::qnn::SharedBuffer; + SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr); +} diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index a0a5b35e14d..639d3534de4 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -42,14 +42,18 @@ class QnnManager { void Destroy(); - bool IsAvailable(); + bool IsAvailable() { + return true; + } + + bool IsOnlinePrepare() { + return options_->online_prepare(); + } bool IsTensorDump() { return options_->tensor_dump_output_path()->size() > 0; } - bool IsOnlinePrepare(); - bool IsNodeSupportedByBackend( std::vector>& op_wrappers); @@ -57,6 +61,10 @@ class QnnManager { std::vector>& op_wrappers, QnnExecuTorchContextBinary& qnn_executorch_context_binary); + Error RegisterMem( + void* data_ptr, + const std::shared_ptr& tensor_wrapper); + std::vector> GetGraphInputs() { return input_tensors_; } diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp new file mode 100644 index 00000000000..423c5d63723 --- /dev/null +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include + +// Refer to the QNN HTP Shared Buffer Tutorial +// in Qualcomm® AI Engine Direct document +constexpr uint8_t RPCMEM_HEAP_ID_SYSTEM = 25; +constexpr uint8_t RPCMEM_DEFAULT_FLAGS = 1; + +namespace torch { +namespace executor { +namespace qnn { + +namespace { + +intptr_t alignTo(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? 
offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +} // namespace + +std::mutex SharedBuffer::init_mutex_; + +SharedBuffer& SharedBuffer::GetSharedBufferManager() { + std::lock_guard lk(init_mutex_); + static SharedBuffer shared_buffer_manager; + if (!shared_buffer_manager.GetInitialize()) { + Error status = shared_buffer_manager.Load(); + if (status == Error::Ok) { + shared_buffer_manager.SetInitialize(true); + } + } + return shared_buffer_manager; +} + +SharedBuffer::~SharedBuffer() { + if (initialize_) { + SharedBuffer::GetSharedBufferManager().UnLoad(); + } +}; + +void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) { + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + return nullptr; + } + // do alignment: + auto allocate_bytes = static_cast(bytes + alignment); + void* buf = rpc_mem_alloc_( + RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory."); + return nullptr; + } + auto aligned_buf = reinterpret_cast( + alignTo(alignment, reinterpret_cast(buf))); + bool status = + restore_map_.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory."); + rpc_mem_free_(buf); + } + return aligned_buf; +} + +int32_t SharedBuffer::MemToFd(void* buf) { + int32_t memFd = -1; + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + } else { + memFd = rpc_mem_to_fd_(buf); + } + return memFd; +} + +void SharedBuffer::FreeMem(void* buf) { + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + } else if (restore_map_.count(buf) == 0) { + QNN_EXECUTORCH_LOG_WARN("Don't free an unallocated tensor."); + } else { + rpc_mem_free_(restore_map_[buf]); + restore_map_.erase(buf); + } +} + +bool SharedBuffer::IsAllocated(void* buf) { + return restore_map_.count(buf) != 0U; +} + +Error SharedBuffer::Load() { + // On Android, 32-bit and 64-bit libcdsprpc.so can be found at /vendor/lib/ + // and /vendor/lib64/ respectively. + lib_cdsp_rpc_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (lib_cdsp_rpc_ == nullptr) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to load shared buffer. dlerror(): %s", dlerror()); + return Error::Internal; + } + rpc_mem_alloc_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_alloc")); + rpc_mem_free_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_free")); + rpc_mem_to_fd_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_to_fd")); + if (nullptr == rpc_mem_alloc_ || nullptr == rpc_mem_free_ || + nullptr == rpc_mem_to_fd_) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to access symbols in shared buffer. dlerror(): %s", dlerror()); + dlclose(lib_cdsp_rpc_); + return Error::Internal; + } + return Error::Ok; +} + +Error SharedBuffer::UnLoad() { + if (dlclose(lib_cdsp_rpc_) != 0) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to close shared buffer. dlerror(): %s", dlerror()); + return Error::Internal; + }; + return Error::Ok; +} +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/SharedBuffer.h b/backends/qualcomm/runtime/SharedBuffer.h new file mode 100644 index 00000000000..1803e8af879 --- /dev/null +++ b/backends/qualcomm/runtime/SharedBuffer.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once +#include +#include +#include +#include +#include +#include + +using RpcMemAllocFn_t = void* (*)(int, uint32_t, int); +using RpcMemFreeFn_t = void (*)(void*); +using RpcMemToFdFn_t = int (*)(void*); + +namespace torch { +namespace executor { +namespace qnn { +class SharedBuffer final { + public: + SharedBuffer(const SharedBuffer&) = delete; + SharedBuffer& operator=(const SharedBuffer&) = delete; + SharedBuffer(SharedBuffer&&) = delete; + SharedBuffer& operator=(SharedBuffer&&) = delete; + ~SharedBuffer(); + + static SharedBuffer& GetSharedBufferManager(); + void* AllocMem(size_t bytes, size_t alignment); + // map a buffer allocated via RPCMem to a file descriptor so it can be + // registered with a backend via QnnMem_register() + int32_t MemToFd(void* buf); + + void FreeMem(void* buf); + + bool IsAllocated(void* buf); + + bool GetInitialize() { + return initialize_; + } + void SetInitialize(bool initialize) { + initialize_ = initialize; + } + + private: + SharedBuffer() = default; + + // dlopen RPCMem library and dlysm required functions + Error Load(); + + Error UnLoad(); + + // Pointer to the dlopen'd libcdsprpc.so shared library which contains + // rpcmem_alloc, rpcmem_free, rpcmem_to_fd APIs + void* lib_cdsp_rpc_; + // Function pointer to rpcmem_alloc + RpcMemAllocFn_t rpc_mem_alloc_; + // Function pointer to rpcmem_free + RpcMemFreeFn_t rpc_mem_free_; + // Function pointer to rpcmem_to_fd + RpcMemToFdFn_t rpc_mem_to_fd_; + std::unordered_map restore_map_; + std::atomic_bool initialize_{false}; + static std::mutex init_mutex_; +}; + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index 65871d22e14..6541989be15 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -109,6 +109,14 @@ target_sources(qnn_backend ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCommon.cpp ) +# qnn_mem_manager +target_sources(qnn_mem_manager + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/QnnMemManager.h + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/QnnMemManager.cpp +) + # qnn_factory target_sources(qnn_factory PUBLIC diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index d90f850386a..acb95524682 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -69,6 +69,8 @@ std::unique_ptr QnnBackendFactory::Create( options->graph_name()->str(), options->soc_info(), htp_options); + backend_params->qnn_mem_manager_ptr_ = std::make_unique( + implementation, backend_params->qnn_context_ptr_.get()); backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED; return backend_params; } break; diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h index bfed40d9aaa..ab47113a538 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ typedef struct BackendConfigParameters { std::unique_ptr qnn_context_ptr_; std::unique_ptr qnn_device_ptr_; std::unique_ptr qnn_graph_ptr_; + std::unique_ptr 
qnn_mem_manager_ptr_; // Default ctor BackendConfigParameters() @@ -40,10 +42,12 @@ typedef struct BackendConfigParameters { backend_init_state_(BackendInitializeState::UNINITIALIZED), qnn_context_ptr_(nullptr), qnn_device_ptr_(nullptr), - qnn_graph_ptr_(nullptr) {} + qnn_graph_ptr_(nullptr), + qnn_mem_manager_ptr_(nullptr) {} // Default dtor ~BackendConfigParameters() { qnn_graph_ptr_.reset(); + qnn_mem_manager_ptr_.reset(); qnn_context_ptr_.reset(); qnn_device_ptr_.reset(); qnn_backend_ptr_.reset(); diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.cpp b/backends/qualcomm/runtime/backends/QnnMemManager.cpp new file mode 100644 index 00000000000..8f8317e0136 --- /dev/null +++ b/backends/qualcomm/runtime/backends/QnnMemManager.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include + +namespace torch { +namespace executor { +namespace qnn { + +bool QnnMemManager::IsRegistered(Qnn_MemHandle_t handle) { + return registered_set_.count(handle) != 0U; +} + +Error QnnMemManager::RegisterMem( + const std::shared_ptr& tensor_wrapper, + int32_t mem_fd) { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + Qnn_MemDescriptor_t descriptor = { + {tensor_wrapper->GetRank(), tensor_wrapper->GetDims(), nullptr}, + tensor_wrapper->GetDataType(), + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + error = qnn_interface.qnn_mem_register( + context_->GetHandle(), + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor %s is failed to register shared memory. Error %d", + tensor_wrapper->GetName().c_str(), + QNN_GET_ERROR_CODE(error)); + return Error::Internal; + } + tensor_wrapper->SetMemHandle(handle); + registered_set_.insert(handle); + QNN_EXECUTORCH_LOG_INFO( + "Tensor %s is successfully registered to shared memory.", + tensor_wrapper->GetName().c_str()); + return Error::Ok; +} + +void QnnMemManager::DeRegisterMem() { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + for (auto& mem_handle : registered_set_) { + error = qnn_interface.qnn_mem_de_register(&mem_handle, /*numHandles=*/1); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_WARN( + "Failed to de-register shared memory. Error %d", + QNN_GET_ERROR_CODE(error)); + } + } + registered_set_.clear(); +} + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h new file mode 100644 index 00000000000..9d5949db16a --- /dev/null +++ b/backends/qualcomm/runtime/backends/QnnMemManager.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace qnn { + +class QnnMemManager { + public: + explicit QnnMemManager( + const QnnImplementation& implementation, + QnnContext* context) + : implementation_(implementation), context_(context) {} + ~QnnMemManager() { + DeRegisterMem(); + } + + Error RegisterMem( + const std::shared_ptr& tensor_wrapper, + int32_t mem_fd); + + bool IsRegistered(Qnn_MemHandle_t handle); + + private: + void DeRegisterMem(); + + const QnnImplementation& implementation_; + QnnContext* context_; + std::unordered_set registered_set_; +}; +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index b3b70328ae0..0f926fc0975 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -131,3 +131,4 @@ class QnnExecuTorchOptions: online_prepare: bool = False tensor_dump_output_path: str = "" profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff + shared_buffer: bool = False diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index c19bf681bbf..8c4d23172f0 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -172,6 +172,9 @@ table QnnExecuTorchOptions { /// Profiling level of the delegate and the backend. Default is off. profile_level:QnnExecuTorchProfileLevel; + + /// Enables usage of shared buffer between application and backend for graph I/O. + shared_buffer:bool; } root_type QnnExecuTorchOptions; diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index e36c6e5ecd8..66a3ad5c613 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -56,6 +56,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_arange(self): @@ -389,6 +390,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_conv1d_relu_log_softmax(self): @@ -484,6 +486,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_16a4w_conv2d(self): @@ -880,6 +883,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_conv1d_relu_log_softmax(self): @@ -1077,6 +1081,24 @@ def test_qnn_backend_profile_op(self): expected_profile_events=25, ) + def test_qnn_backend_shared_buffer(self): + TestQNN.shared_buffer = True + backend_options = generate_htp_compiler_spec( + use_fp16=True, + ) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + shared_buffer=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + ) + class TestQNNQuantizedUtils(TestQNN): # TODO: refactor to 
support different backends @@ -1179,6 +1201,25 @@ def test_qnn_backend_profile_op(self): expected_profile_events=26, ) + def test_qnn_backend_shared_buffer(self): + TestQNN.shared_buffer = True + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + shared_buffer=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + ) + class TestExampleScript(TestQNN): def required_envs(self, conditions=None) -> bool: @@ -1215,6 +1256,8 @@ def test_mobilenet_v2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1248,6 +1291,8 @@ def test_inception_v3(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1281,6 +1326,8 @@ def test_inception_v4(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1314,6 +1361,8 @@ def test_vit(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1346,6 +1395,8 @@ def test_edsr(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1378,6 +1429,8 @@ def test_deeplab_v3(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1411,6 +1464,8 @@ def test_dummy_llama2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1442,6 +1497,8 @@ def test_ptq_dummy_llama2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1475,6 +1532,8 @@ def test_mobilebert(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1515,6 +1574,8 @@ def test_ptq_mobilebert(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1585,6 +1646,7 @@ def setup_environment(): TestQNN.online_prepare = args.online_prepare TestQNN.enable_profile = args.enable_profile 
TestQNN.error_only = args.error_only + TestQNN.shared_buffer = args.shared_buffer return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index dc0da7f75dc..ee7d6a7a3b6 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -32,6 +32,7 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.program._program import ExecutorchProgram from executorch.sdk import generate_etrecord from executorch.sdk.inspector import Inspector @@ -64,6 +65,7 @@ class TestQNN(unittest.TestCase): use_8a8w: str = "8a8w" use_16a16w: str = "16a16w" use_16a4w: str = "16a4w" + shared_buffer: bool = False def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) @@ -183,7 +185,19 @@ def lower_module_and_test_output( delegated_program.exported_program = to_backend( delegated_program.exported_program, qnn_partitioner ) - exec_prog = delegated_program.to_executorch() + exec_prog = delegated_program.to_executorch( + exir.ExecutorchBackendConfig( + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. + memory_planning_pass=MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=not self.shared_buffer, + alloc_graph_output=not self.shared_buffer, + ) + ) + ) # Assert the backend name is qnn self.assertEqual( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 1af9572bd3b..7fa696efbac 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -190,6 +190,7 @@ def generate_qnn_executorch_compiler_spec( online_prepare: bool = False, tensor_dump_output_path: str = "", profile: bool = False, + shared_buffer: bool = False, ) -> List[CompileSpec]: """ Helper function generating compiler specs for Qualcomm AI Engine Direct @@ -215,6 +216,8 @@ def generate_qnn_executorch_compiler_spec( profile: Enable profile the performance of per operator. Note that for now only support kProfileDetailed to profile the performance of each operator with cycle unit. + shared_buffer: Enables usage of shared buffer between application + and backend for graph I/O. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. 
@@ -250,6 +253,9 @@ def generate_qnn_executorch_compiler_spec( else: qnn_executorch_options.profile_level = QnnExecuTorchProfileLevel.kProfileOff + if shared_buffer: + qnn_executorch_options.shared_buffer = True + if ( online_prepare and backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 905deca6445..54772f5c781 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -100,7 +100,7 @@ target_link_libraries(qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump - ${FLATCC_LIB} + ${FLATCCRT_LIB} gflags ) target_compile_options(qnn_executor_runner diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 0b13122e961..bd18cdc16b1 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -17,7 +17,10 @@ * Currently we assume that the outputs are all fp32 tensors. */ +#include #include +#include +#include #include #include #include @@ -25,6 +28,7 @@ #include #include #include + #include #include @@ -47,14 +51,55 @@ DEFINE_string( DEFINE_string(input_list_path, "input_list.txt", "Model input list path."); DEFINE_int32(iteration, 1, "Iterations of inference."); DEFINE_int32(warm_up, 0, "Pre-run before inference."); +DEFINE_bool( + shared_buffer, + false, + "Specifies to use shared buffers for zero-copy usecase between the application and device/co-processor associated with the backend."); DEFINE_string( etdump_path, "etdump.etdp", "If etdump generation is enabled an etdump will be written out to this path"); using namespace torch::executor; +using torch::executor::MemoryAllocator; using torch::executor::util::FileDataLoader; +class CustomMemory { + public: + CustomMemory(bool shared_buffer) : shared_buffer_(shared_buffer){}; + bool Allocate(size_t bytes, size_t alignment) { + if (shared_buffer_) { + ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment); + } else { + input_data_.resize(bytes); + ptr_ = input_data_.data(); + } + return ptr_ != nullptr; + } + + ~CustomMemory() { + if (shared_buffer_) { + if (ptr_ != nullptr) { + QnnExecuTorchFreeCustomMem(ptr_); + } + } + } + + void* GetPtr() { + return ptr_; + } + + CustomMemory(const CustomMemory&) = delete; + CustomMemory(CustomMemory&&) = delete; + CustomMemory& operator=(const CustomMemory&) = delete; + CustomMemory& operator=(CustomMemory&&) = delete; + + private: + bool shared_buffer_{false}; + void* ptr_{nullptr}; + std::vector input_data_; +}; + int main(int argc, char** argv) { runtime_init(); @@ -167,10 +212,58 @@ int main(int argc, char** argv) { ET_LOG(Info, "Method loaded."); // Prepare the inputs. - // Use ones-initialized inputs. 
- auto inputs = util::PrepareInputTensors(*method); + // Allocate data memory for inputs and outputs + std::vector> in_custom_mem; + std::vector> out_custom_mem; + in_custom_mem.reserve(method->inputs_size()); + out_custom_mem.reserve(method->outputs_size()); + + for (int input_index = 0; input_index < method->inputs_size(); + ++input_index) { + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = method_meta.input_tensor_meta(input_index); + in_custom_mem.push_back( + std::make_unique(FLAGS_shared_buffer)); + std::unique_ptr& custom_mem_ptr = in_custom_mem.back(); + ET_CHECK_MSG( + custom_mem_ptr->Allocate( + tensor_meta->nbytes(), MemoryAllocator::kDefaultAlignment), + "Failed to allocate custom memory. tensor index: %d, bytes: %zu", + input_index, + tensor_meta->nbytes()); + TensorImpl impl = TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + custom_mem_ptr->GetPtr(), + const_cast(tensor_meta->dim_order().data())); + Error ret = method->set_input(Tensor(&impl), input_index); + ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); + } + for (int output_index = 0; output_index < method->outputs_size(); + ++output_index) { + const exec_aten::Tensor& t = method->get_output(output_index).toTensor(); + out_custom_mem.push_back( + std::make_unique(FLAGS_shared_buffer)); + std::unique_ptr& custom_mem_ptr = out_custom_mem.back(); + ET_CHECK_MSG( + custom_mem_ptr->Allocate( + t.nbytes(), MemoryAllocator::kDefaultAlignment), + "Failed to allocate custom memory. tensor index: %d, bytes: %zu", + output_index, + t.nbytes()); + Error ret = method->set_output_data_ptr( + custom_mem_ptr->GetPtr(), t.nbytes(), output_index); + if (ret != Error::Ok) { + // This can error if the outputs are already pre-allocated. Ignore + // this error because it doesn't affect correctness, but log it. + ET_LOG( + Error, "ignoring error from set_output_data_ptr(): 0x%" PRIx32, ret); + } + } ET_LOG(Info, "Inputs prepared."); + // Fill in data for input std::ifstream input_list(FLAGS_input_list_path); if (input_list.is_open()) { size_t num_inputs = method->inputs_size(); @@ -205,31 +298,38 @@ int main(int argc, char** argv) { input_files.size()); for (int input_index = 0; input_index < num_inputs; ++input_index) { - exec_aten::Tensor& t = method->mutable_input(input_index).toTensor(); - std::vector input_data(t.nbytes()); + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = + method_meta.input_tensor_meta(input_index); + std::ifstream fin(input_files[input_index], std::ios::binary); fin.seekg(0, fin.end); size_t file_size = fin.tellg(); ET_CHECK_MSG( - file_size == t.nbytes(), + file_size == tensor_meta->nbytes(), "Input(%d) size mismatch. file bytes: %zu, tensor bytes: %zu", input_index, file_size, - t.nbytes()); + tensor_meta->nbytes()); fin.seekg(0, fin.beg); - fin.read(input_data.data(), file_size); + fin.read( + static_cast(in_custom_mem[input_index]->GetPtr()), + file_size); fin.close(); - std::vector sizes(t.dim()); - for (int i = 0; i < sizes.size(); ++i) { - sizes[i] = t.sizes().data()[i]; - } - - auto t_impl = TensorImpl( - t.scalar_type(), t.dim(), sizes.data(), input_data.data()); - Error ret = method->set_input(EValue(Tensor(&t_impl)), input_index); + // For pre-allocated use case, we need to call set_input + // to copy data for the input tensors since they doesn't + // share the data with in_custom_mem. 
+ TensorImpl impl = TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + in_custom_mem[input_index]->GetPtr(), + const_cast( + tensor_meta->dim_order().data())); + Error ret = method->set_input(Tensor(&impl), input_index); ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); } @@ -313,21 +413,5 @@ int main(int argc, char** argv) { ET_LOG(Info, "Model executed successfully."); } - // Dump the etdump data containing profiling/debugging data to the specified - // file. - etdump_result result = etdump_gen.get_etdump_data(); - if (result.buf != nullptr && result.size > 0) { - ET_LOG( - Info, - "Write etdump to %s, Size = %zu", - FLAGS_etdump_path.c_str(), - result.size); - FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); - fwrite((uint8_t*)result.buf, 1, result.size, f); - fclose(f); - free(result.buf); - } - - util::FreeInputs(inputs); return 0; } diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index 133e64d8568..4e08ab078c2 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -109,6 +109,7 @@ def get_dataset(data_size, dataset_dir, download): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -128,6 +129,7 @@ def get_dataset(data_size, dataset_dir, download): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/dummy_llama2.py b/examples/qualcomm/scripts/dummy_llama2.py index dd37f816004..8178ae5a5a4 100755 --- a/examples/qualcomm/scripts/dummy_llama2.py +++ b/examples/qualcomm/scripts/dummy_llama2.py @@ -128,6 +128,7 @@ def create_device_inputs(example_inputs, use_kv_cache): inputs, custom_annotations=(), quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -141,6 +142,7 @@ def create_device_inputs(example_inputs, use_kv_cache): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index f844b094c03..50639d41894 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -156,6 +156,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -175,6 +176,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index e93e13ac33f..cdb84f6e8c6 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -12,6 +12,7 @@ ) from executorch.backends.qualcomm.utils.utils import ( capture_program, + generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) from executorch.examples.models import MODEL_NAME_TO_MODEL @@ -71,12 +72,13 @@ edge_copy = 
copy.deepcopy(edge_program) # Delegate to QNN backend + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) qnn_partitioner = QnnPartitioner( generate_qnn_executorch_compiler_spec( - is_fp16=False, soc_model=QcomChipset.SM8550, - debug=False, - saver=False, + backend_options=backend_options, ) ) with validation_disabled(): diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 244e38edbe5..a3b5c41923d 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -111,6 +111,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -130,6 +131,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index db3feda2708..06b8047a18c 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -110,6 +110,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -129,6 +130,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index dc148afa8eb..84d130d4244 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -294,6 +294,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -313,6 +314,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index 5f214a6f8ca..e389c00b3ec 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -111,6 +111,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -130,6 +131,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index ff22f93c4f4..63e1480b625 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -150,6 +150,7 @@ def get_data_loader(): f"{args.artifact}/{pte_filename}", inputs, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) # setup required paths accordingly # qnn_sdk : QNN SDK path setup in environment variable @@ -165,6 
+166,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/utils.py b/examples/qualcomm/scripts/utils.py index c815867f2d6..4f8e5b419c6 100755 --- a/examples/qualcomm/scripts/utils.py +++ b/examples/qualcomm/scripts/utils.py @@ -32,6 +32,7 @@ ) from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -46,6 +47,7 @@ def __init__( soc_model, host_id=None, error_only=False, + shared_buffer=False, ): self.qnn_sdk = qnn_sdk self.artifact_path = artifact_path @@ -65,6 +67,7 @@ def __init__( } self.soc_model = arch_table[soc_model] self.error_only = error_only + self.shared_buffer = shared_buffer def _adb(self, cmd): if not self.host_id: @@ -123,6 +126,7 @@ def execute(self): f"--output_folder_path {self.output_folder}", f"--input_list_path {self.input_list_filename}", f"--etdump_path {self.etdump_path}", + "--shared_buffer" if self.shared_buffer else "", ] ) qnn_executor_runner_cmds = " ".join( @@ -157,6 +161,7 @@ def build_executorch_binary( skip_node_id_set=None, skip_node_op_set=None, quant_dtype: Optional[QuantDtype] = None, + shared_buffer=False, ): if quant_dtype: quantizer = QnnQuantizer() @@ -202,6 +207,7 @@ def build_executorch_binary( backend_options=backend_options, debug=False, saver=False, + shared_buffer=shared_buffer, ), skip_node_id_set, skip_node_op_set, @@ -209,7 +215,18 @@ def build_executorch_binary( edge_prog.exported_program = to_backend(edge_prog.exported_program, qnn_partitioner) edge_prog.exported_program.graph_module.graph.print_tabular() exec_prog = edge_prog.to_executorch( - config=ExecutorchBackendConfig(extract_constant_segment=False) + config=ExecutorchBackendConfig( + extract_constant_segment=False, + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. + memory_planning_pass=MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=not shared_buffer, + alloc_graph_output=not shared_buffer, + ), + ) ) with open(f"{file_name}.pte", "wb") as file: file.write(exec_prog.buffer) @@ -338,6 +355,13 @@ def setup_common_args_and_variables(): type=str, ) + parser.add_argument( + "-z", + "--shared_buffer", + help="Enables usage of shared buffer between application and backend for graph I/O.", + action="store_true", + ) + # QNN_SDK_ROOT might also be an argument, but it is used in various places. # So maybe it's fine to just use the environment. if "QNN_SDK_ROOT" not in os.environ:
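
For reference, a minimal sketch of how an application would drive the zero-copy path this patch introduces, distilled from the qnn_executor_runner.cpp changes above. It assumes the program was exported with shared_buffer=True (so the memory planner leaves graph inputs/outputs unallocated) and that a Method has already been loaded; the include paths and the Error::MemoryAllocationFailed return value are assumptions on my part, not something defined by this patch, and error handling is abbreviated.

    // Sketch: bind RPC-memory-backed shared buffers to a loaded Method.
    // Distilled from the qnn_executor_runner.cpp changes in this diff;
    // include paths are indicative only.
    #include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
    #include <executorch/runtime/core/memory_allocator.h>
    #include <executorch/runtime/executor/method.h>
    #include <executorch/runtime/executor/method_meta.h>

    using namespace torch::executor;

    Error bind_shared_io(Method& method) {
      MethodMeta meta = method.method_meta();

      // Inputs: allocate shared buffers and point the input tensors at them.
      // At execute() time the backend maps these pointers to QNN memory
      // handles instead of copying the data.
      for (size_t i = 0; i < method.inputs_size(); ++i) {
        Result<TensorInfo> info = meta.input_tensor_meta(i);
        void* buf = QnnExecuTorchAllocCustomMem(
            info->nbytes(), MemoryAllocator::kDefaultAlignment);
        if (buf == nullptr) {
          return Error::MemoryAllocationFailed;  // assumed error code
        }
        TensorImpl impl(
            info->scalar_type(),
            /*dim=*/info->sizes().size(),
            const_cast<TensorImpl::SizesType*>(info->sizes().data()),
            buf,
            const_cast<TensorImpl::DimOrderType*>(info->dim_order().data()));
        Error err = method.set_input(Tensor(&impl), i);
        if (err != Error::Ok) {
          return err;
        }
      }

      // Outputs: hand the backend shared buffers to write results into.
      for (size_t i = 0; i < method.outputs_size(); ++i) {
        const auto& t = method.get_output(i).toTensor();
        void* buf = QnnExecuTorchAllocCustomMem(
            t.nbytes(), MemoryAllocator::kDefaultAlignment);
        if (buf == nullptr) {
          return Error::MemoryAllocationFailed;  // assumed error code
        }
        Error err = method.set_output_data_ptr(buf, t.nbytes(), i);
        if (err != Error::Ok) {
          return err;
        }
      }
      // Buffers stay owned by the caller; release each one with
      // QnnExecuTorchFreeCustomMem(ptr) after inference completes.
      return Error::Ok;
    }

With buffers set up this way, QnnExecuTorchBackend::execute() passes the data pointers to QnnManager::RegisterMem(); because they came from QnnExecuTorchAllocCustomMem(), SharedBuffer::IsAllocated() succeeds, the buffer is converted to a file descriptor via rpcmem_to_fd, and QnnMemManager registers it through qnn_mem_register(), so the FillDataBuffer() copy path is skipped.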