diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index db7e3890396..8883e5ee026 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -118,27 +118,29 @@ include_directories( # # declare targets # +add_library(executorch_backend INTERFACE) add_library(qcir INTERFACE qcir_schema_output) add_library(qcir_utils STATIC) -add_library(qnn_schema INTERFACE ${_qnn_schema__outputs}) -add_library(executorch_backend INTERFACE) +add_library(qnn_backend STATIC) +add_library(qnn_backend_cache STATIC) +add_library(qnn_context STATIC) +add_library(qnn_device STATIC) add_library(qnn_executorch_backend SHARED) add_library(qnn_executorch_header INTERFACE) add_library(qnn_executorch_logging STATIC) -add_library(qnn_manager STATIC) +add_library(qnn_factory STATIC) add_library(qnn_function_interface INTERFACE) +add_library(qnn_graph STATIC) +add_library(qnn_header INTERFACE) add_library(qnn_implementation STATIC) -add_library(qnn_sys_function_interface INTERFACE) -add_library(qnn_sys_implementation STATIC) add_library(qnn_logger STATIC) +add_library(qnn_manager STATIC) +add_library(qnn_mem_manager STATIC) add_library(qnn_profiler STATIC) -add_library(qnn_device STATIC) -add_library(qnn_context STATIC) -add_library(qnn_backend_cache STATIC) -add_library(qnn_graph STATIC) -add_library(qnn_backend STATIC) -add_library(qnn_factory STATIC) -add_library(qnn_header INTERFACE) +add_library(qnn_schema INTERFACE ${_qnn_schema__outputs}) +add_library(qnn_sys_function_interface INTERFACE) +add_library(qnn_sys_implementation STATIC) +add_library(shared_buffer STATIC) add_library(wrappers STATIC) add_library(utils STATIC) @@ -220,6 +222,13 @@ target_link_libraries(qnn_graph qnn_context qnn_profiler ) +target_link_libraries(qnn_mem_manager + PRIVATE + qnn_executorch_logging + qnn_implementation + qnn_context +) + target_link_libraries(qnn_factory PUBLIC qnn_header @@ -229,6 +238,7 @@ target_link_libraries(qnn_factory qnn_device qnn_context qnn_graph + qnn_mem_manager ) target_link_libraries(qnn_manager PRIVATE @@ -236,6 +246,7 @@ target_link_libraries(qnn_manager wrappers qnn_schema utils + shared_buffer ) target_link_libraries(qnn_executorch_backend PRIVATE @@ -249,7 +260,11 @@ target_link_libraries(utils PRIVATE qnn_executorch_logging ) - +target_link_libraries(shared_buffer + PRIVATE + qnn_executorch_logging + ${CMAKE_DL_LIBS} +) # # add linker option # diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index 2a2cda84c55..9d80fd735aa 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -105,6 +105,7 @@ TensorWrapper::TensorWrapper( Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) { if (data != nullptr) { + QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW; QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_; if (copy_data) { owned_data_ = std::make_unique(bytes_); @@ -144,6 +145,12 @@ Error TensorWrapper::SetName(const std::string& name) { return Error::Ok; } +Error TensorWrapper::SetMemHandle(Qnn_MemHandle_t mem_handle) { + QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(tensor_)->memHandle = mem_handle; + return Error::Ok; +} + // base function for Create TensorWrapper std::shared_ptr CreateTensorWrapper( const std::string& tensor_name, diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.h b/backends/qualcomm/aot/wrappers/TensorWrapper.h index 5c2be693486..c973196e9d5 100644 
--- a/backends/qualcomm/aot/wrappers/TensorWrapper.h +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.h @@ -59,16 +59,38 @@ class TensorWrapper { return QNN_VER_PTR(tensor_)->type == QNN_TENSOR_TYPE_STATIC; }; - const void* GetStaticTensorData() const { - return QNN_VER_PTR(tensor_)->clientBuf.data; + std::uint32_t* GetDims() const { + return QNN_VER_PTR(tensor_)->dimensions; + }; + + Qnn_DataType_t GetDataType() const { + return QNN_VER_PTR(tensor_)->dataType; + }; + + Qnn_MemHandle_t const GetMemHandle() { + return QNN_VER_PTR(tensor_)->memHandle; + }; + + Qnn_TensorMemType_t GetMemType() const { + return QNN_VER_PTR(tensor_)->memType; }; std::string GetName() const { return qnn_tensor_name_; }; + std::uint32_t GetRank() const { + return QNN_VER_PTR(tensor_)->rank; + }; + + const void* GetStaticTensorData() const { + return QNN_VER_PTR(tensor_)->clientBuf.data; + }; + Error SetName(const std::string& name); + Error SetMemHandle(Qnn_MemHandle_t mem_handle); + private: // need this to handle QNN_TENSOR_ERROR_NAME_HASH_COLLISION std::string qnn_tensor_name_; diff --git a/backends/qualcomm/passes/insert_io_qdq.py b/backends/qualcomm/passes/insert_io_qdq.py index e1dd55a916a..971e4895c36 100644 --- a/backends/qualcomm/passes/insert_io_qdq.py +++ b/backends/qualcomm/passes/insert_io_qdq.py @@ -38,6 +38,12 @@ def _ceate_args(self, target: torch.fx.node.Target, quant_attrs: Dict): arg_schemas = list(target._schema.arguments)[1:] for arg_schema in arg_schemas: name = arg_schema.name + # TODO: Due to the new parameter "out_dtype" in the dequantize node, + # it could not be found in the quant_attrs of other nodes, + # and it will cause a key error. For now, the output type + # of our dequantize node is only float. (by default in pytorch) + if name == "out_dtype": + continue value = quant_attrs[name] if type(arg_schema.type) == torch.tensor and type(value) in [int, float]: value = torch.tensor(value) diff --git a/backends/qualcomm/runtime/CMakeLists.txt b/backends/qualcomm/runtime/CMakeLists.txt index 615c6320b5d..3a59c3ba2b3 100644 --- a/backends/qualcomm/runtime/CMakeLists.txt +++ b/backends/qualcomm/runtime/CMakeLists.txt @@ -47,3 +47,10 @@ target_sources(utils PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Utils.cpp ) + +# shared_buffer +target_sources(shared_buffer + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.h + ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.cpp +) diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index e3c76742e2a..d54de1059d7 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -8,8 +8,10 @@ #pragma once #ifdef __cplusplus +#include #include #else +#include #include #endif @@ -31,6 +33,16 @@ typedef struct { } // clang-format on +/// Allocate specific tensors (usually graph inputs and outputs) on shared +/// memory. Users are responsible to allocate "enough" tensor bytes, and set +/// alignment as MemoryAllocator::kDefaultAlignment. +/// See runtime/core/memory_allocator.h. The function returns a valid pointer +/// if allocation is successful. +void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment); + +/// Free the allocated shared memory. 
+void QnnExecuTorchFreeCustomMem(void* buffer_ptr); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index b093c274c38..77449703c5f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -188,9 +188,14 @@ Error QnnExecuTorchBackend::execute( std::vector input_tensor_structs; std::vector output_tensor_structs; + input_tensor_structs.reserve(input_tensors.size()); for (int i = 0; i < input_tensors.size(); ++i) { - input_tensors[i]->FillDataBuffer( - args[i]->toTensor().const_data_ptr(), true /* copy_data */); + if (qnn_manager->RegisterMem( + args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) != + Error::Ok) { + input_tensors[i]->FillDataBuffer( + args[i]->toTensor().const_data_ptr(), true /* copy_data */); + } input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct()); } @@ -198,9 +203,12 @@ Error QnnExecuTorchBackend::execute( for (const auto& output_tensor : output_tensors) { // pos=0 limits the search to the prefix if (output_tensor->GetName().rfind("output_", 0) == 0) { - output_tensor->FillDataBuffer( - args[output_index]->toTensor().mutable_data_ptr(), - false /* copy_data */); + void* mutable_data_ptr = + args[output_index]->toTensor().mutable_data_ptr(); + if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) != + Error::Ok) { + output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */); + } output_index++; } output_tensor_structs.push_back(output_tensor->CloneTensorStruct()); diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 3303a08309d..dc3217fc1c8 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ #include +#include #include #include - #include #include #include @@ -54,7 +54,9 @@ QnnManager::QnnManager( "the size of qnn context binary: %d", qnn_executorch_context_binary.nbytes); QNN_EXECUTORCH_LOG_INFO( - "Is on-device graph construction: %d", options_->online_prepare()); + "Is on-device graph construction: %d", options->online_prepare()); + QNN_EXECUTORCH_LOG_INFO( + "Enable shared buffer: %d", options->shared_buffer()); } if (library_path.empty()) { @@ -82,6 +84,53 @@ Error QnnManager::LoadQnnLibrary() { return ret; } +Error QnnManager::RegisterMem( + void* data_ptr, + const std::shared_ptr& tensor_wrapper) { + SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); + // Not enable shared buffer + if (!options_->shared_buffer()) + return Error::Internal; + + if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) { + QNN_EXECUTORCH_LOG_WARN( + "Backend %s doesn't supported shared buffer.", + EnumNameQnnExecuTorchBackendType( + options_->backend_options()->backend_type())); + return Error::Internal; + } + + if (!shared_buffer_manager.IsAllocated(data_ptr)) { + // It means two scenarios here: + // 1. the input and output partitioned graph + // 2. 
Actually, user doesn't allocate shared buffer with + // QnnExecuTorchAllocCustomMem API + return Error::Internal; + } else if (backend_params_ptr_->qnn_mem_manager_ptr_->IsRegistered( + tensor_wrapper->GetMemHandle())) { + if (options_->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo) + QNN_EXECUTORCH_LOG_INFO( + "Tensor name %s has been registered shared memory.", + tensor_wrapper->GetName().c_str()); + return Error::Ok; + } + + int32_t mem_fd = SharedBuffer::GetSharedBufferManager().MemToFd(data_ptr); + if (mem_fd == -1) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor name %s is failed to get file descriptor.", + tensor_wrapper->GetName().c_str()); + return Error::Internal; + } + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_mem_manager_ptr_->RegisterMem( + tensor_wrapper, mem_fd) == Error::Ok, + Internal, + "Fail to register to shared memory."); + + return Error::Ok; +} + Error QnnManager::Init() { ET_CHECK_OR_RETURN_ERROR( LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library"); @@ -219,14 +268,6 @@ void QnnManager::Destroy() { qnn_loaded_backend_.TerminateAllBackends(); } -bool QnnManager::IsAvailable() { - return true; -} - -bool QnnManager::IsOnlinePrepare() { - return options_->online_prepare(); -} - bool QnnManager::IsNodeSupportedByBackend( std::vector>& op_wrappers) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -329,3 +370,14 @@ Error QnnManager::Compile( } // namespace qnn } // namespace executor } // namespace torch +void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) { + using torch::executor::qnn::SharedBuffer; + void* buffer_ptr = + SharedBuffer::GetSharedBufferManager().AllocMem(bytes, alignment); + return buffer_ptr; +} + +void QnnExecuTorchFreeCustomMem(void* buffer_ptr) { + using torch::executor::qnn::SharedBuffer; + SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr); +} diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index a0a5b35e14d..639d3534de4 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -42,14 +42,18 @@ class QnnManager { void Destroy(); - bool IsAvailable(); + bool IsAvailable() { + return true; + } + + bool IsOnlinePrepare() { + return options_->online_prepare(); + } bool IsTensorDump() { return options_->tensor_dump_output_path()->size() > 0; } - bool IsOnlinePrepare(); - bool IsNodeSupportedByBackend( std::vector>& op_wrappers); @@ -57,6 +61,10 @@ class QnnManager { std::vector>& op_wrappers, QnnExecuTorchContextBinary& qnn_executorch_context_binary); + Error RegisterMem( + void* data_ptr, + const std::shared_ptr& tensor_wrapper); + std::vector> GetGraphInputs() { return input_tensors_; } diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp new file mode 100644 index 00000000000..423c5d63723 --- /dev/null +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include + +// Refer to the QNN HTP Shared Buffer Tutorial +// in Qualcomm® AI Engine Direct document +constexpr uint8_t RPCMEM_HEAP_ID_SYSTEM = 25; +constexpr uint8_t RPCMEM_DEFAULT_FLAGS = 1; + +namespace torch { +namespace executor { +namespace qnn { + +namespace { + +intptr_t alignTo(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? 
offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +} // namespace + +std::mutex SharedBuffer::init_mutex_; + +SharedBuffer& SharedBuffer::GetSharedBufferManager() { + std::lock_guard lk(init_mutex_); + static SharedBuffer shared_buffer_manager; + if (!shared_buffer_manager.GetInitialize()) { + Error status = shared_buffer_manager.Load(); + if (status == Error::Ok) { + shared_buffer_manager.SetInitialize(true); + } + } + return shared_buffer_manager; +} + +SharedBuffer::~SharedBuffer() { + if (initialize_) { + SharedBuffer::GetSharedBufferManager().UnLoad(); + } +}; + +void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) { + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + return nullptr; + } + // do alignment: + auto allocate_bytes = static_cast(bytes + alignment); + void* buf = rpc_mem_alloc_( + RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory."); + return nullptr; + } + auto aligned_buf = reinterpret_cast( + alignTo(alignment, reinterpret_cast(buf))); + bool status = + restore_map_.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory."); + rpc_mem_free_(buf); + } + return aligned_buf; +} + +int32_t SharedBuffer::MemToFd(void* buf) { + int32_t memFd = -1; + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + } else { + memFd = rpc_mem_to_fd_(buf); + } + return memFd; +} + +void SharedBuffer::FreeMem(void* buf) { + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + } else if (restore_map_.count(buf) == 0) { + QNN_EXECUTORCH_LOG_WARN("Don't free an unallocated tensor."); + } else { + rpc_mem_free_(restore_map_[buf]); + restore_map_.erase(buf); + } +} + +bool SharedBuffer::IsAllocated(void* buf) { + return restore_map_.count(buf) != 0U; +} + +Error SharedBuffer::Load() { + // On Android, 32-bit and 64-bit libcdsprpc.so can be found at /vendor/lib/ + // and /vendor/lib64/ respectively. + lib_cdsp_rpc_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (lib_cdsp_rpc_ == nullptr) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to load shared buffer. dlerror(): %s", dlerror()); + return Error::Internal; + } + rpc_mem_alloc_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_alloc")); + rpc_mem_free_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_free")); + rpc_mem_to_fd_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_to_fd")); + if (nullptr == rpc_mem_alloc_ || nullptr == rpc_mem_free_ || + nullptr == rpc_mem_to_fd_) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to access symbols in shared buffer. dlerror(): %s", dlerror()); + dlclose(lib_cdsp_rpc_); + return Error::Internal; + } + return Error::Ok; +} + +Error SharedBuffer::UnLoad() { + if (dlclose(lib_cdsp_rpc_) != 0) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to close shared buffer. dlerror(): %s", dlerror()); + return Error::Internal; + }; + return Error::Ok; +} +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/SharedBuffer.h b/backends/qualcomm/runtime/SharedBuffer.h new file mode 100644 index 00000000000..1803e8af879 --- /dev/null +++ b/backends/qualcomm/runtime/SharedBuffer.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once +#include +#include +#include +#include +#include +#include + +using RpcMemAllocFn_t = void* (*)(int, uint32_t, int); +using RpcMemFreeFn_t = void (*)(void*); +using RpcMemToFdFn_t = int (*)(void*); + +namespace torch { +namespace executor { +namespace qnn { +class SharedBuffer final { + public: + SharedBuffer(const SharedBuffer&) = delete; + SharedBuffer& operator=(const SharedBuffer&) = delete; + SharedBuffer(SharedBuffer&&) = delete; + SharedBuffer& operator=(SharedBuffer&&) = delete; + ~SharedBuffer(); + + static SharedBuffer& GetSharedBufferManager(); + void* AllocMem(size_t bytes, size_t alignment); + // map a buffer allocated via RPCMem to a file descriptor so it can be + // registered with a backend via QnnMem_register() + int32_t MemToFd(void* buf); + + void FreeMem(void* buf); + + bool IsAllocated(void* buf); + + bool GetInitialize() { + return initialize_; + } + void SetInitialize(bool initialize) { + initialize_ = initialize; + } + + private: + SharedBuffer() = default; + + // dlopen RPCMem library and dlysm required functions + Error Load(); + + Error UnLoad(); + + // Pointer to the dlopen'd libcdsprpc.so shared library which contains + // rpcmem_alloc, rpcmem_free, rpcmem_to_fd APIs + void* lib_cdsp_rpc_; + // Function pointer to rpcmem_alloc + RpcMemAllocFn_t rpc_mem_alloc_; + // Function pointer to rpcmem_free + RpcMemFreeFn_t rpc_mem_free_; + // Function pointer to rpcmem_to_fd + RpcMemToFdFn_t rpc_mem_to_fd_; + std::unordered_map restore_map_; + std::atomic_bool initialize_{false}; + static std::mutex init_mutex_; +}; + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index 65871d22e14..6541989be15 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -109,6 +109,14 @@ target_sources(qnn_backend ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCommon.cpp ) +# qnn_mem_manager +target_sources(qnn_mem_manager + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/QnnMemManager.h + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/QnnMemManager.cpp +) + # qnn_factory target_sources(qnn_factory PUBLIC diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index d90f850386a..acb95524682 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -69,6 +69,8 @@ std::unique_ptr QnnBackendFactory::Create( options->graph_name()->str(), options->soc_info(), htp_options); + backend_params->qnn_mem_manager_ptr_ = std::make_unique( + implementation, backend_params->qnn_context_ptr_.get()); backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED; return backend_params; } break; diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h index bfed40d9aaa..ab47113a538 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ typedef struct BackendConfigParameters { std::unique_ptr qnn_context_ptr_; std::unique_ptr qnn_device_ptr_; std::unique_ptr qnn_graph_ptr_; + std::unique_ptr 
qnn_mem_manager_ptr_; // Default ctor BackendConfigParameters() @@ -40,10 +42,12 @@ typedef struct BackendConfigParameters { backend_init_state_(BackendInitializeState::UNINITIALIZED), qnn_context_ptr_(nullptr), qnn_device_ptr_(nullptr), - qnn_graph_ptr_(nullptr) {} + qnn_graph_ptr_(nullptr), + qnn_mem_manager_ptr_(nullptr) {} // Default dtor ~BackendConfigParameters() { qnn_graph_ptr_.reset(); + qnn_mem_manager_ptr_.reset(); qnn_context_ptr_.reset(); qnn_device_ptr_.reset(); qnn_backend_ptr_.reset(); diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.cpp b/backends/qualcomm/runtime/backends/QnnMemManager.cpp new file mode 100644 index 00000000000..8f8317e0136 --- /dev/null +++ b/backends/qualcomm/runtime/backends/QnnMemManager.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include + +namespace torch { +namespace executor { +namespace qnn { + +bool QnnMemManager::IsRegistered(Qnn_MemHandle_t handle) { + return registered_set_.count(handle) != 0U; +} + +Error QnnMemManager::RegisterMem( + const std::shared_ptr& tensor_wrapper, + int32_t mem_fd) { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + Qnn_MemDescriptor_t descriptor = { + {tensor_wrapper->GetRank(), tensor_wrapper->GetDims(), nullptr}, + tensor_wrapper->GetDataType(), + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + error = qnn_interface.qnn_mem_register( + context_->GetHandle(), + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor %s is failed to register shared memory. Error %d", + tensor_wrapper->GetName().c_str(), + QNN_GET_ERROR_CODE(error)); + return Error::Internal; + } + tensor_wrapper->SetMemHandle(handle); + registered_set_.insert(handle); + QNN_EXECUTORCH_LOG_INFO( + "Tensor %s is successfully registered to shared memory.", + tensor_wrapper->GetName().c_str()); + return Error::Ok; +} + +void QnnMemManager::DeRegisterMem() { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + for (auto& mem_handle : registered_set_) { + error = qnn_interface.qnn_mem_de_register(&mem_handle, /*numHandles=*/1); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_WARN( + "Failed to de-register shared memory. Error %d", + QNN_GET_ERROR_CODE(error)); + } + } + registered_set_.clear(); +} + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h new file mode 100644 index 00000000000..9d5949db16a --- /dev/null +++ b/backends/qualcomm/runtime/backends/QnnMemManager.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace qnn { + +class QnnMemManager { + public: + explicit QnnMemManager( + const QnnImplementation& implementation, + QnnContext* context) + : implementation_(implementation), context_(context) {} + ~QnnMemManager() { + DeRegisterMem(); + } + + Error RegisterMem( + const std::shared_ptr& tensor_wrapper, + int32_t mem_fd); + + bool IsRegistered(Qnn_MemHandle_t handle); + + private: + void DeRegisterMem(); + + const QnnImplementation& implementation_; + QnnContext* context_; + std::unordered_set registered_set_; +}; +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index b3b70328ae0..0f926fc0975 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -131,3 +131,4 @@ class QnnExecuTorchOptions: online_prepare: bool = False tensor_dump_output_path: str = "" profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff + shared_buffer: bool = False diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index c19bf681bbf..8c4d23172f0 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -172,6 +172,9 @@ table QnnExecuTorchOptions { /// Profiling level of the delegate and the backend. Default is off. profile_level:QnnExecuTorchProfileLevel; + + /// Enables usage of shared buffer between application and backend for graph I/O. + shared_buffer:bool; } root_type QnnExecuTorchOptions; diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index e36c6e5ecd8..66a3ad5c613 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -56,6 +56,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_arange(self): @@ -389,6 +390,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_conv1d_relu_log_softmax(self): @@ -484,6 +486,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_16a4w_conv2d(self): @@ -880,6 +883,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_conv1d_relu_log_softmax(self): @@ -1077,6 +1081,24 @@ def test_qnn_backend_profile_op(self): expected_profile_events=25, ) + def test_qnn_backend_shared_buffer(self): + TestQNN.shared_buffer = True + backend_options = generate_htp_compiler_spec( + use_fp16=True, + ) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + shared_buffer=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + ) + class TestQNNQuantizedUtils(TestQNN): # TODO: refactor to 
support different backends @@ -1179,6 +1201,25 @@ def test_qnn_backend_profile_op(self): expected_profile_events=26, ) + def test_qnn_backend_shared_buffer(self): + TestQNN.shared_buffer = True + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + shared_buffer=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + ) + class TestExampleScript(TestQNN): def required_envs(self, conditions=None) -> bool: @@ -1215,6 +1256,8 @@ def test_mobilenet_v2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1248,6 +1291,8 @@ def test_inception_v3(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1281,6 +1326,8 @@ def test_inception_v4(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1314,6 +1361,8 @@ def test_vit(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1346,6 +1395,8 @@ def test_edsr(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1378,6 +1429,8 @@ def test_deeplab_v3(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1411,6 +1464,8 @@ def test_dummy_llama2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1442,6 +1497,8 @@ def test_ptq_dummy_llama2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1475,6 +1532,8 @@ def test_mobilebert(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1515,6 +1574,8 @@ def test_ptq_mobilebert(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1585,6 +1646,7 @@ def setup_environment(): TestQNN.online_prepare = args.online_prepare TestQNN.enable_profile = args.enable_profile 
TestQNN.error_only = args.error_only + TestQNN.shared_buffer = args.shared_buffer return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index dc0da7f75dc..ee7d6a7a3b6 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -32,6 +32,7 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.program._program import ExecutorchProgram from executorch.sdk import generate_etrecord from executorch.sdk.inspector import Inspector @@ -64,6 +65,7 @@ class TestQNN(unittest.TestCase): use_8a8w: str = "8a8w" use_16a16w: str = "16a16w" use_16a4w: str = "16a4w" + shared_buffer: bool = False def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) @@ -183,7 +185,19 @@ def lower_module_and_test_output( delegated_program.exported_program = to_backend( delegated_program.exported_program, qnn_partitioner ) - exec_prog = delegated_program.to_executorch() + exec_prog = delegated_program.to_executorch( + exir.ExecutorchBackendConfig( + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. + memory_planning_pass=MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=not self.shared_buffer, + alloc_graph_output=not self.shared_buffer, + ) + ) + ) # Assert the backend name is qnn self.assertEqual( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 1af9572bd3b..7fa696efbac 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -190,6 +190,7 @@ def generate_qnn_executorch_compiler_spec( online_prepare: bool = False, tensor_dump_output_path: str = "", profile: bool = False, + shared_buffer: bool = False, ) -> List[CompileSpec]: """ Helper function generating compiler specs for Qualcomm AI Engine Direct @@ -215,6 +216,8 @@ def generate_qnn_executorch_compiler_spec( profile: Enable profile the performance of per operator. Note that for now only support kProfileDetailed to profile the performance of each operator with cycle unit. + shared_buffer: Enables usage of shared buffer between application + and backend for graph I/O. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. 
@@ -250,6 +253,9 @@ def generate_qnn_executorch_compiler_spec( else: qnn_executorch_options.profile_level = QnnExecuTorchProfileLevel.kProfileOff + if shared_buffer: + qnn_executorch_options.shared_buffer = True + if ( online_prepare and backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 905deca6445..54772f5c781 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -100,7 +100,7 @@ target_link_libraries(qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump - ${FLATCC_LIB} + ${FLATCCRT_LIB} gflags ) target_compile_options(qnn_executor_runner diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 0b13122e961..bd18cdc16b1 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -17,7 +17,10 @@ * Currently we assume that the outputs are all fp32 tensors. */ +#include #include +#include +#include #include #include #include @@ -25,6 +28,7 @@ #include #include #include + #include #include @@ -47,14 +51,55 @@ DEFINE_string( DEFINE_string(input_list_path, "input_list.txt", "Model input list path."); DEFINE_int32(iteration, 1, "Iterations of inference."); DEFINE_int32(warm_up, 0, "Pre-run before inference."); +DEFINE_bool( + shared_buffer, + false, + "Specifies to use shared buffers for zero-copy usecase between the application and device/co-processor associated with the backend."); DEFINE_string( etdump_path, "etdump.etdp", "If etdump generation is enabled an etdump will be written out to this path"); using namespace torch::executor; +using torch::executor::MemoryAllocator; using torch::executor::util::FileDataLoader; +class CustomMemory { + public: + CustomMemory(bool shared_buffer) : shared_buffer_(shared_buffer){}; + bool Allocate(size_t bytes, size_t alignment) { + if (shared_buffer_) { + ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment); + } else { + input_data_.resize(bytes); + ptr_ = input_data_.data(); + } + return ptr_ != nullptr; + } + + ~CustomMemory() { + if (shared_buffer_) { + if (ptr_ != nullptr) { + QnnExecuTorchFreeCustomMem(ptr_); + } + } + } + + void* GetPtr() { + return ptr_; + } + + CustomMemory(const CustomMemory&) = delete; + CustomMemory(CustomMemory&&) = delete; + CustomMemory& operator=(const CustomMemory&) = delete; + CustomMemory& operator=(CustomMemory&&) = delete; + + private: + bool shared_buffer_{false}; + void* ptr_{nullptr}; + std::vector input_data_; +}; + int main(int argc, char** argv) { runtime_init(); @@ -167,10 +212,58 @@ int main(int argc, char** argv) { ET_LOG(Info, "Method loaded."); // Prepare the inputs. - // Use ones-initialized inputs. 
- auto inputs = util::PrepareInputTensors(*method); + // Allocate data memory for inputs and outputs + std::vector> in_custom_mem; + std::vector> out_custom_mem; + in_custom_mem.reserve(method->inputs_size()); + out_custom_mem.reserve(method->outputs_size()); + + for (int input_index = 0; input_index < method->inputs_size(); + ++input_index) { + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = method_meta.input_tensor_meta(input_index); + in_custom_mem.push_back( + std::make_unique(FLAGS_shared_buffer)); + std::unique_ptr& custom_mem_ptr = in_custom_mem.back(); + ET_CHECK_MSG( + custom_mem_ptr->Allocate( + tensor_meta->nbytes(), MemoryAllocator::kDefaultAlignment), + "Failed to allocate custom memory. tensor index: %d, bytes: %zu", + input_index, + tensor_meta->nbytes()); + TensorImpl impl = TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + custom_mem_ptr->GetPtr(), + const_cast(tensor_meta->dim_order().data())); + Error ret = method->set_input(Tensor(&impl), input_index); + ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); + } + for (int output_index = 0; output_index < method->outputs_size(); + ++output_index) { + const exec_aten::Tensor& t = method->get_output(output_index).toTensor(); + out_custom_mem.push_back( + std::make_unique(FLAGS_shared_buffer)); + std::unique_ptr& custom_mem_ptr = out_custom_mem.back(); + ET_CHECK_MSG( + custom_mem_ptr->Allocate( + t.nbytes(), MemoryAllocator::kDefaultAlignment), + "Failed to allocate custom memory. tensor index: %d, bytes: %zu", + output_index, + t.nbytes()); + Error ret = method->set_output_data_ptr( + custom_mem_ptr->GetPtr(), t.nbytes(), output_index); + if (ret != Error::Ok) { + // This can error if the outputs are already pre-allocated. Ignore + // this error because it doesn't affect correctness, but log it. + ET_LOG( + Error, "ignoring error from set_output_data_ptr(): 0x%" PRIx32, ret); + } + } ET_LOG(Info, "Inputs prepared."); + // Fill in data for input std::ifstream input_list(FLAGS_input_list_path); if (input_list.is_open()) { size_t num_inputs = method->inputs_size(); @@ -205,31 +298,38 @@ int main(int argc, char** argv) { input_files.size()); for (int input_index = 0; input_index < num_inputs; ++input_index) { - exec_aten::Tensor& t = method->mutable_input(input_index).toTensor(); - std::vector input_data(t.nbytes()); + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = + method_meta.input_tensor_meta(input_index); + std::ifstream fin(input_files[input_index], std::ios::binary); fin.seekg(0, fin.end); size_t file_size = fin.tellg(); ET_CHECK_MSG( - file_size == t.nbytes(), + file_size == tensor_meta->nbytes(), "Input(%d) size mismatch. file bytes: %zu, tensor bytes: %zu", input_index, file_size, - t.nbytes()); + tensor_meta->nbytes()); fin.seekg(0, fin.beg); - fin.read(input_data.data(), file_size); + fin.read( + static_cast(in_custom_mem[input_index]->GetPtr()), + file_size); fin.close(); - std::vector sizes(t.dim()); - for (int i = 0; i < sizes.size(); ++i) { - sizes[i] = t.sizes().data()[i]; - } - - auto t_impl = TensorImpl( - t.scalar_type(), t.dim(), sizes.data(), input_data.data()); - Error ret = method->set_input(EValue(Tensor(&t_impl)), input_index); + // For pre-allocated use case, we need to call set_input + // to copy data for the input tensors since they doesn't + // share the data with in_custom_mem. 
+ TensorImpl impl = TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + in_custom_mem[input_index]->GetPtr(), + const_cast( + tensor_meta->dim_order().data())); + Error ret = method->set_input(Tensor(&impl), input_index); ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); } @@ -313,21 +413,5 @@ int main(int argc, char** argv) { ET_LOG(Info, "Model executed successfully."); } - // Dump the etdump data containing profiling/debugging data to the specified - // file. - etdump_result result = etdump_gen.get_etdump_data(); - if (result.buf != nullptr && result.size > 0) { - ET_LOG( - Info, - "Write etdump to %s, Size = %zu", - FLAGS_etdump_path.c_str(), - result.size); - FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); - fwrite((uint8_t*)result.buf, 1, result.size, f); - fclose(f); - free(result.buf); - } - - util::FreeInputs(inputs); return 0; } diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index 133e64d8568..4e08ab078c2 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -109,6 +109,7 @@ def get_dataset(data_size, dataset_dir, download): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -128,6 +129,7 @@ def get_dataset(data_size, dataset_dir, download): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/dummy_llama2.py b/examples/qualcomm/scripts/dummy_llama2.py index dd37f816004..8178ae5a5a4 100755 --- a/examples/qualcomm/scripts/dummy_llama2.py +++ b/examples/qualcomm/scripts/dummy_llama2.py @@ -128,6 +128,7 @@ def create_device_inputs(example_inputs, use_kv_cache): inputs, custom_annotations=(), quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -141,6 +142,7 @@ def create_device_inputs(example_inputs, use_kv_cache): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index f844b094c03..50639d41894 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -156,6 +156,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -175,6 +176,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index e93e13ac33f..cdb84f6e8c6 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -12,6 +12,7 @@ ) from executorch.backends.qualcomm.utils.utils import ( capture_program, + generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) from executorch.examples.models import MODEL_NAME_TO_MODEL @@ -71,12 +72,13 @@ edge_copy = 
copy.deepcopy(edge_program) # Delegate to QNN backend + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) qnn_partitioner = QnnPartitioner( generate_qnn_executorch_compiler_spec( - is_fp16=False, soc_model=QcomChipset.SM8550, - debug=False, - saver=False, + backend_options=backend_options, ) ) with validation_disabled(): diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 244e38edbe5..a3b5c41923d 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -111,6 +111,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -130,6 +131,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index db3feda2708..06b8047a18c 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -110,6 +110,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -129,6 +130,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index dc148afa8eb..84d130d4244 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -294,6 +294,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -313,6 +314,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index 5f214a6f8ca..e389c00b3ec 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -111,6 +111,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -130,6 +131,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index ff22f93c4f4..63e1480b625 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -150,6 +150,7 @@ def get_data_loader(): f"{args.artifact}/{pte_filename}", inputs, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) # setup required paths accordingly # qnn_sdk : QNN SDK path setup in environment variable @@ -165,6 
+166,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/utils.py b/examples/qualcomm/scripts/utils.py index c815867f2d6..4f8e5b419c6 100755 --- a/examples/qualcomm/scripts/utils.py +++ b/examples/qualcomm/scripts/utils.py @@ -32,6 +32,7 @@ ) from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -46,6 +47,7 @@ def __init__( soc_model, host_id=None, error_only=False, + shared_buffer=False, ): self.qnn_sdk = qnn_sdk self.artifact_path = artifact_path @@ -65,6 +67,7 @@ def __init__( } self.soc_model = arch_table[soc_model] self.error_only = error_only + self.shared_buffer = shared_buffer def _adb(self, cmd): if not self.host_id: @@ -123,6 +126,7 @@ def execute(self): f"--output_folder_path {self.output_folder}", f"--input_list_path {self.input_list_filename}", f"--etdump_path {self.etdump_path}", + "--shared_buffer" if self.shared_buffer else "", ] ) qnn_executor_runner_cmds = " ".join( @@ -157,6 +161,7 @@ def build_executorch_binary( skip_node_id_set=None, skip_node_op_set=None, quant_dtype: Optional[QuantDtype] = None, + shared_buffer=False, ): if quant_dtype: quantizer = QnnQuantizer() @@ -202,6 +207,7 @@ def build_executorch_binary( backend_options=backend_options, debug=False, saver=False, + shared_buffer=shared_buffer, ), skip_node_id_set, skip_node_op_set, @@ -209,7 +215,18 @@ def build_executorch_binary( edge_prog.exported_program = to_backend(edge_prog.exported_program, qnn_partitioner) edge_prog.exported_program.graph_module.graph.print_tabular() exec_prog = edge_prog.to_executorch( - config=ExecutorchBackendConfig(extract_constant_segment=False) + config=ExecutorchBackendConfig( + extract_constant_segment=False, + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. + memory_planning_pass=MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=not shared_buffer, + alloc_graph_output=not shared_buffer, + ), + ) ) with open(f"{file_name}.pte", "wb") as file: file.write(exec_prog.buffer) @@ -338,6 +355,13 @@ def setup_common_args_and_variables(): type=str, ) + parser.add_argument( + "-z", + "--shared_buffer", + help="Enables usage of shared buffer between application and backend for graph I/O.", + action="store_true", + ) + # QNN_SDK_ROOT might also be an argument, but it is used in various places. # So maybe it's fine to just use the environment. if "QNN_SDK_ROOT" not in os.environ:
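
For reference, a minimal sketch of how an application would drive the zero-copy path this patch introduces, distilled from the qnn_executor_runner.cpp changes above. It assumes the program was exported with shared_buffer=True (so the memory planner leaves graph inputs/outputs unallocated) and that a Method has already been loaded; the include paths and the Error::MemoryAllocationFailed return value are assumptions on my part, not something defined by this patch, and error handling is abbreviated.

    // Sketch: bind RPC-memory-backed shared buffers to a loaded Method.
    // Distilled from the qnn_executor_runner.cpp changes in this diff;
    // include paths are indicative only.
    #include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
    #include <executorch/runtime/core/memory_allocator.h>
    #include <executorch/runtime/executor/method.h>
    #include <executorch/runtime/executor/method_meta.h>

    using namespace torch::executor;

    Error bind_shared_io(Method& method) {
      MethodMeta meta = method.method_meta();

      // Inputs: allocate shared buffers and point the input tensors at them.
      // At execute() time the backend maps these pointers to QNN memory
      // handles instead of copying the data.
      for (size_t i = 0; i < method.inputs_size(); ++i) {
        Result<TensorInfo> info = meta.input_tensor_meta(i);
        void* buf = QnnExecuTorchAllocCustomMem(
            info->nbytes(), MemoryAllocator::kDefaultAlignment);
        if (buf == nullptr) {
          return Error::MemoryAllocationFailed;  // assumed error code
        }
        TensorImpl impl(
            info->scalar_type(),
            /*dim=*/info->sizes().size(),
            const_cast<TensorImpl::SizesType*>(info->sizes().data()),
            buf,
            const_cast<TensorImpl::DimOrderType*>(info->dim_order().data()));
        Error err = method.set_input(Tensor(&impl), i);
        if (err != Error::Ok) {
          return err;
        }
      }

      // Outputs: hand the backend shared buffers to write results into.
      for (size_t i = 0; i < method.outputs_size(); ++i) {
        const auto& t = method.get_output(i).toTensor();
        void* buf = QnnExecuTorchAllocCustomMem(
            t.nbytes(), MemoryAllocator::kDefaultAlignment);
        if (buf == nullptr) {
          return Error::MemoryAllocationFailed;  // assumed error code
        }
        Error err = method.set_output_data_ptr(buf, t.nbytes(), i);
        if (err != Error::Ok) {
          return err;
        }
      }
      // Buffers stay owned by the caller; release each one with
      // QnnExecuTorchFreeCustomMem(ptr) after inference completes.
      return Error::Ok;
    }

With buffers set up this way, QnnExecuTorchBackend::execute() passes the data pointers to QnnManager::RegisterMem(); because they came from QnnExecuTorchAllocCustomMem(), SharedBuffer::IsAllocated() succeeds, the buffer is converted to a file descriptor via rpcmem_to_fd, and QnnMemManager registers it through qnn_mem_register(), so the FillDataBuffer() copy path is skipped.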