Skip to content

Commit 7e6a7d6

Browse files
Merge pull request #3 from neuropilot-captain/extract_share_runtime
Support weight sharing in MTK Runtime
2 parents a0bfa5d + a6da626 commit 7e6a7d6

File tree

2 files changed: +144 additions, −41 deletions

backends/mediatek/runtime/NeuronBackend.cpp

Lines changed: 63 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
#include "NeuronPayloadHeader.h"
1313
#include "api/NeuronAdapter.h"
1414

15+
#include <executorch/runtime/executor/pte_data_map.h>
1516
#include "executorch/runtime/core/error.h"
16-
#include "executorch/runtime/core/exec_aten/util/dim_order_util.h"
1717

1818
#include <algorithm>
1919
#include <memory>
@@ -24,6 +24,7 @@ namespace executorch {
2424
namespace backends {
2525
namespace neuron {
2626

27+
using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
2728
using executorch::runtime::ArrayRef;
2829
using executorch::runtime::BackendExecutionContext;
2930
using executorch::runtime::BackendInitContext;
@@ -37,12 +38,22 @@ using executorch::runtime::Result;
3738

3839
const char kHighAddrKey[] = "HighAddr";
3940
const char kImportForeverKey[] = "ImportForever";
41+
const char kSharedWeightsKey[] = "ExtractSharedBlobKey";
4042

4143
Result<DelegateHandle*> NeuronBackend::init(
4244
BackendInitContext& context,
4345
FreeableBuffer* processed,
4446
ArrayRef<CompileSpec> compile_specs) const {
4547
NeuronDelegateSetting setting;
48+
MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
49+
NeuronExecuTorchDelegate* delegate =
50+
runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
51+
if (delegate == nullptr) {
52+
return Error::MemoryAllocationFailed;
53+
}
54+
55+
new (delegate) NeuronExecuTorchDelegate();
56+
4657
for (auto& compile_spec : compile_specs) {
4758
if (std::strcmp(compile_spec.key, kHighAddrKey) == 0) {
4859
setting.mHighAddr = *static_cast<char*>(compile_spec.value.buffer);
@@ -53,11 +64,39 @@ Result<DelegateHandle*> NeuronBackend::init(
5364
"NeuronBackend",
5465
"IsImportForever Enable : %d",
5566
setting.mImportForever);
67+
} else if (std::strcmp(compile_spec.key, kSharedWeightsKey) == 0) {
68+
setting.mSharedWeights = true;
69+
std::string shared_weights_key(
70+
static_cast<char*>(compile_spec.value.buffer),
71+
compile_spec.value.nbytes);
72+
LogInfo(
73+
"NeuronBackend",
74+
"SharedWeights Enabled for %s",
75+
shared_weights_key.c_str());
76+
77+
const NamedDataMap* named_data_map = context.get_named_data_map();
78+
Result<FreeableBuffer> shared_weights =
79+
named_data_map->get_data(shared_weights_key.c_str());
80+
81+
if (shared_weights.ok()) {
82+
LogInfo(
83+
"NeuronBackend",
84+
"Loaded shared weights from named_data_map. Size: %zu",
85+
shared_weights.get().size());
86+
FreeableBuffer& buffer = shared_weights.get();
87+
delegate->SetSharedWeights(buffer);
88+
} else {
89+
LogError(
90+
"NeuronBackend",
91+
"Failed to load shared weights from named_data_map.");
92+
return Error::Internal;
93+
}
5694
} else {
5795
LogWarn("NeuronBackend", "unknown compile spec: %s", compile_spec.key);
5896
}
5997
}
6098
auto Payload = NeuronPayload(processed->data(), processed->size());
99+
61100
LogInfo(
62101
"NeuronBackend",
63102
"version %u, input %u, output %u, length %u, payload size: %zu",
@@ -67,19 +106,7 @@ Result<DelegateHandle*> NeuronBackend::init(
67106
Payload.Header.DataLen,
68107
processed->size());
69108

70-
MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
71-
NeuronExecuTorchDelegate* delegate =
72-
runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
73-
if (delegate == nullptr) {
74-
return Error::MemoryAllocationFailed;
75-
}
76-
77-
new (delegate) NeuronExecuTorchDelegate();
78-
79-
if (delegate == nullptr) {
80-
return nullptr;
81-
}
82-
auto res = delegate->LoadCompiledNetwork(Payload, setting);
109+
int res = delegate->LoadCompiledNetwork(Payload, setting);
83110
return res == NEURON_NO_ERROR ? delegate : nullptr;
84111
}
85112

@@ -111,21 +138,25 @@ Error NeuronExecuTorchDelegate::execute(
111138
return Error::InvalidState;
112139
};
113140

141+
ET_CHECK_OR_RETURN_ERROR(
142+
CheckDimOrder(args) == NEURON_NO_ERROR,
143+
Internal,
144+
"Expecting default dim_order but got a non default dim_order tensor input");
145+
146+
PrepareInputsOuputs(args);
147+
114148
auto allocator = dynamic_cast<torch::executor::neuron::BufferAllocator*>(
115149
context.get_temp_allocator());
116-
size_t inputCount = mInputSizes.size(), outputCount = mOutputSizes.size();
150+
151+
bool has_shared_weights_input = neuron_shared_weights_.size() > 0;
152+
153+
size_t inputCount =
154+
has_shared_weights_input ? mInputSizes.size() + 1 : mInputSizes.size();
155+
size_t outputCount = mOutputSizes.size();
117156

118157
for (int i = 0; i < inputCount; i++) {
119-
auto tensor_in = args[i]->toTensor();
120-
ET_CHECK_OR_RETURN_ERROR(
121-
runtime::is_contiguous_dim_order(
122-
tensor_in.dim_order().data(), tensor_in.dim()),
123-
Internal,
124-
"Expecting default dim_order but got a non default dim_order tensor for external input %u",
125-
i);
126-
127-
auto data_ptr = args[i]->toTensor().data_ptr();
128-
auto data_size = args[i]->toTensor().nbytes();
158+
auto data_ptr = mPreparedInputs[i].data_ptr;
159+
auto data_size = mPreparedInputs[i].size;
129160
if (IsCached</*isInput=*/true>(i, data_ptr)) {
130161
continue;
131162
};
@@ -140,22 +171,20 @@ Error NeuronExecuTorchDelegate::execute(
140171
}
141172
}
142173

143-
for (int o = inputCount; o < inputCount + outputCount; o++) {
144-
auto data_ptr = args[o]->toTensor().data_ptr();
145-
auto data_size = args[o]->toTensor().nbytes();
146-
auto output_index = o - inputCount;
147-
if (IsCached</*isInput=*/false>(output_index, data_ptr)) {
174+
for (int o = 0; o < outputCount; o++) {
175+
auto data_ptr = mPreparedOutputs[o].data_ptr;
176+
auto data_size = mPreparedOutputs[o].size;
177+
if (IsCached</*isInput=*/false>(o, data_ptr)) {
148178
continue;
149179
};
150180
auto unit = allocator != nullptr ? allocator->Find(data_ptr) : nullptr;
151181
if (unit) {
152-
UpdateCache</*isInput=*/false>(output_index, data_ptr);
182+
UpdateCache</*isInput=*/false>(o, data_ptr);
153183
size_t offset = (char*)data_ptr - (char*)unit->GetAddress();
154184
mExecutor.SetInputOutputFromMemory</*isInput*/ false>(
155-
output_index, unit->GetNeuronMemory(), offset, data_size);
185+
o, unit->GetNeuronMemory(), offset, data_size);
156186
} else {
157-
mExecutor.SetInputOutput</*isInput=*/false>(
158-
output_index, data_ptr, data_size);
187+
mExecutor.SetInputOutput</*isInput=*/false>(o, data_ptr, data_size);
159188
}
160189
}
161190

backends/mediatek/runtime/include/NeuronBackend.h

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <executorch/runtime/backend/interface.h>
1919
#include <executorch/runtime/core/error.h>
2020
#include <executorch/runtime/core/evalue.h>
21+
#include "executorch/runtime/core/exec_aten/util/dim_order_util.h"
2122

2223
#include <memory>
2324
#include <unordered_map>
@@ -27,6 +28,10 @@ namespace executorch {
2728
namespace backends {
2829
namespace neuron {
2930

31+
using executorch::runtime::EValue;
32+
using executorch::runtime::FreeableBuffer;
33+
using executorch::runtime::Result;
34+
3035
class NeuronBackend final : public ::executorch::runtime::BackendInterface {
3136
public:
3237
::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init(
@@ -53,6 +58,8 @@ struct NeuronDelegateSetting {
5358

5459
bool mImportForever = false;
5560

61+
bool mSharedWeights = false;
62+
5663
std::string ToRuntimeOption() {
5764
if (mHighAddr && mImportForever) {
5865
return "--apusys-config \"{ \\\"high_addr\\\": true, \\\"import_forever\\\": true }\"";
@@ -68,6 +75,14 @@ struct NeuronDelegateSetting {
6875

6976
class NeuronExecuTorchDelegate {
7077
public:
78+
struct InputOutputInfo {
79+
void* data_ptr;
80+
size_t size;
81+
82+
InputOutputInfo(void* ptr, size_t sz)
83+
: data_ptr(ptr), size(sz) {}
84+
};
85+
7186
class MemoryCache {
7287
public:
7388
template <bool isInput>
@@ -103,16 +118,22 @@ class NeuronExecuTorchDelegate {
103118
auto res = mExecutor.LoadFromCompiledNetwork(
104119
payload.CompiledNetwork,
105120
payload.Header.DataLen,
106-
payload.Header.InputCount,
121+
mSettings.mSharedWeights ? payload.Header.InputCount + 1
122+
: payload.Header.InputCount,
107123
payload.Header.OutputCount,
108124
runtimeOption);
109125
CHECK_NO_ERROR(res);
110126
CHECK_TRUE(mExecutor.IsValid());
111-
SummaryIoCounts();
127+
SummarizeIoSizes(payload.Header.InputCount, payload.Header.OutputCount);
112128
mPLock = std::unique_ptr<ScopePerformancer>(new ScopePerformancer);
113129
return NEURON_NO_ERROR;
114130
}
115131

132+
int SetSharedWeights(FreeableBuffer& buffer) {
133+
neuron_shared_weights_.push_back(std::move(buffer));
134+
return NEURON_NO_ERROR;
135+
}
136+
116137
::executorch::runtime::Error execute(
117138
ET_UNUSED ::executorch::runtime::BackendExecutionContext& context,
118139
::executorch::runtime::EValue** args) const;
@@ -128,33 +149,84 @@ class NeuronExecuTorchDelegate {
128149
mCache.UpdateCache<isInput>(index, ptr);
129150
}
130151

131-
int SummaryIoCounts() {
132-
for (int i = 0;; i++) {
152+
int SummarizeIoSizes(uint32_t input_count, uint32_t output_count) {
153+
for (int i = 0; i < input_count; i++) {
133154
size_t size = mExecutor.GetInputOutputPaddedSize</*isInput*/ true>(i);
134155
if (size == 0) {
135-
break;
156+
LogWarn("NeuronBackend", "Model input:%d got size: %lu", i, size);
136157
}
137158
LogInfo("NeuronBackend", "Model input:%d size: %lu", i, size);
138159
mInputSizes.push_back(size);
139160
}
140-
for (int o = 0;; o++) {
161+
for (int o = 0; o < output_count; o++) {
141162
size_t size = mExecutor.GetInputOutputPaddedSize</*isInput*/ false>(o);
142163
if (size == 0) {
143-
break;
164+
LogWarn("NeuronBackend", "Model output:%d got size: %lu", o, size);
144165
}
145166
LogInfo("NeuronBackend", "Model output:%d size: %lu", o, size);
146167
mOutputSizes.push_back(size);
147168
}
148169
return NEURON_NO_ERROR;
149170
}
150171

172+
int CheckDimOrder(EValue** args) const {
173+
size_t data_input_count = mInputSizes.size();
174+
for (int i = 0; i < data_input_count; i++) {
175+
auto tensor_in = args[i]->toTensor();
176+
LogInfo("NeuronBackend", "Checking dim order for input %d", i);
177+
if (!runtime::is_contiguous_dim_order(
178+
tensor_in.dim_order().data(), tensor_in.dim())) {
179+
return NEURON_BAD_DATA;
180+
}
181+
}
182+
183+
return NEURON_NO_ERROR;
184+
}
185+
186+
int PrepareInputsOuputs(EValue** args) const {
187+
bool has_shared_weights_input = neuron_shared_weights_.size() > 0;
188+
189+
size_t data_input_count = mInputSizes.size();
190+
size_t data_output_count = mOutputSizes.size();
191+
192+
// Prepare input data
193+
for (int i = 0; i < data_input_count; i++) {
194+
auto tensor_in = args[i]->toTensor();
195+
auto data_ptr = tensor_in.data_ptr();
196+
auto data_size = tensor_in.nbytes();
197+
mPreparedInputs.push_back(InputOutputInfo{data_ptr, data_size});
198+
}
199+
200+
// Prepare shared weights if any as the last model input
201+
if (has_shared_weights_input) {
202+
FreeableBuffer& buffer = neuron_shared_weights_.at(0);
203+
mPreparedInputs.push_back(
204+
InputOutputInfo{const_cast<void*>(buffer.data()), buffer.size()});
205+
}
206+
207+
// Prepare output data
208+
for (int o = data_output_count; o < data_input_count + data_output_count;
209+
o++) {
210+
auto tensor_out = args[o]->toTensor();
211+
auto data_ptr = tensor_out.data_ptr();
212+
auto data_size = tensor_out.nbytes();
213+
mPreparedOutputs.push_back(InputOutputInfo{data_ptr, data_size});
214+
}
215+
216+
return NEURON_NO_ERROR;
217+
}
218+
151219
int HintNeuronBackend(::executorch::runtime::EValue** args) const;
152220

153221
private:
154222
std::vector<size_t> mInputSizes;
155223

156224
std::vector<size_t> mOutputSizes;
157225

226+
mutable std::vector<InputOutputInfo> mPreparedInputs;
227+
228+
mutable std::vector<InputOutputInfo> mPreparedOutputs;
229+
158230
mutable MemoryCache mCache;
159231

160232
std::unique_ptr<ScopePerformancer> mPLock;
@@ -165,6 +237,8 @@ class NeuronExecuTorchDelegate {
165237

166238
mutable std::unordered_set<const void*> mHasImported;
167239

240+
mutable std::vector<FreeableBuffer> neuron_shared_weights_;
241+
168242
private:
169243
NeuronExecuTorchDelegate(const NeuronExecuTorchDelegate&);
170244

0 commit comments

Comments (0)