Skip to content

Commit 7e6a7d6

Browse files
Merge pull request #3 from neuropilot-captain/extract_share_runtime
Support weight sharing in MTK Runtime
2 parents a0bfa5d + a6da626 commit 7e6a7d6

File tree

2 files changed: +144 additions, −41 deletions

backends/mediatek/runtime/NeuronBackend.cpp

Lines changed: 63 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
#include "NeuronPayloadHeader.h"
1313
#include "api/NeuronAdapter.h"
1414

15+
#include <executorch/runtime/executor/pte_data_map.h>
1516
#include "executorch/runtime/core/error.h"
16-
#include "executorch/runtime/core/exec_aten/util/dim_order_util.h"
1717

1818
#include <algorithm>
1919
#include <memory>
@@ -24,6 +24,7 @@ namespace executorch {
2424
namespace backends {
2525
namespace neuron {
2626

27+
using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
2728
using executorch::runtime::ArrayRef;
2829
using executorch::runtime::BackendExecutionContext;
2930
using executorch::runtime::BackendInitContext;
@@ -37,12 +38,22 @@ using executorch::runtime::Result;
3738

3839
const char kHighAddrKey[] = "HighAddr";
3940
const char kImportForeverKey[] = "ImportForever";
41+
const char kSharedWeightsKey[] = "ExtractSharedBlobKey";
4042

4143
Result<DelegateHandle*> NeuronBackend::init(
4244
BackendInitContext& context,
4345
FreeableBuffer* processed,
4446
ArrayRef<CompileSpec> compile_specs) const {
4547
NeuronDelegateSetting setting;
48+
MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
49+
NeuronExecuTorchDelegate* delegate =
50+
runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
51+
if (delegate == nullptr) {
52+
return Error::MemoryAllocationFailed;
53+
}
54+
55+
new (delegate) NeuronExecuTorchDelegate();
56+
4657
for (auto& compile_spec : compile_specs) {
4758
if (std::strcmp(compile_spec.key, kHighAddrKey) == 0) {
4859
setting.mHighAddr = *static_cast<char*>(compile_spec.value.buffer);
@@ -53,11 +64,39 @@ Result<DelegateHandle*> NeuronBackend::init(
5364
"NeuronBackend",
5465
"IsImportForever Enable : %d",
5566
setting.mImportForever);
67+
} else if (std::strcmp(compile_spec.key, kSharedWeightsKey) == 0) {
68+
setting.mSharedWeights = true;
69+
std::string shared_weights_key(
70+
static_cast<char*>(compile_spec.value.buffer),
71+
compile_spec.value.nbytes);
72+
LogInfo(
73+
"NeuronBackend",
74+
"SharedWeights Enabled for %s",
75+
shared_weights_key.c_str());
76+
77+
const NamedDataMap* named_data_map = context.get_named_data_map();
78+
Result<FreeableBuffer> shared_weights =
79+
named_data_map->get_data(shared_weights_key.c_str());
80+
81+
if (shared_weights.ok()) {
82+
LogInfo(
83+
"NeuronBackend",
84+
"Loaded shared weights from named_data_map. Size: %zu",
85+
shared_weights.get().size());
86+
FreeableBuffer& buffer = shared_weights.get();
87+
delegate->SetSharedWeights(buffer);
88+
} else {
89+
LogError(
90+
"NeuronBackend",
91+
"Failed to load shared weights from named_data_map.");
92+
return Error::Internal;
93+
}
5694
} else {
5795
LogWarn("NeuronBackend", "unknown compile spec: %s", compile_spec.key);
5896
}
5997
}
6098
auto Payload = NeuronPayload(processed->data(), processed->size());
99+
61100
LogInfo(
62101
"NeuronBackend",
63102
"version %u, input %u, output %u, length %u, payload size: %zu",
@@ -67,19 +106,7 @@ Result<DelegateHandle*> NeuronBackend::init(
67106
Payload.Header.DataLen,
68107
processed->size());
69108

70-
MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
71-
NeuronExecuTorchDelegate* delegate =
72-
runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
73-
if (delegate == nullptr) {
74-
return Error::MemoryAllocationFailed;
75-
}
76-
77-
new (delegate) NeuronExecuTorchDelegate();
78-
79-
if (delegate == nullptr) {
80-
return nullptr;
81-
}
82-
auto res = delegate->LoadCompiledNetwork(Payload, setting);
109+
int res = delegate->LoadCompiledNetwork(Payload, setting);
83110
return res == NEURON_NO_ERROR ? delegate : nullptr;
84111
}
85112

@@ -111,21 +138,25 @@ Error NeuronExecuTorchDelegate::execute(
111138
return Error::InvalidState;
112139
};
113140

141+
ET_CHECK_OR_RETURN_ERROR(
142+
CheckDimOrder(args) == NEURON_NO_ERROR,
143+
Internal,
144+
"Expecting default dim_order but got a non default dim_order tensor input");
145+
146+
PrepareInputsOuputs(args);
147+
114148
auto allocator = dynamic_cast<torch::executor::neuron::BufferAllocator*>(
115149
context.get_temp_allocator());
116-
size_t inputCount = mInputSizes.size(), outputCount = mOutputSizes.size();
150+
151+
bool has_shared_weights_input = neuron_shared_weights_.size() > 0;
152+
153+
size_t inputCount =
154+
has_shared_weights_input ? mInputSizes.size() + 1 : mInputSizes.size();
155+
size_t outputCount = mOutputSizes.size();
117156

118157
for (int i = 0; i < inputCount; i++) {
119-
auto tensor_in = args[i]->toTensor();
120-
ET_CHECK_OR_RETURN_ERROR(
121-
runtime::is_contiguous_dim_order(
122-
tensor_in.dim_order().data(), tensor_in.dim()),
123-
Internal,
124-
"Expecting default dim_order but got a non default dim_order tensor for external input %u",
125-
i);
126-
127-
auto data_ptr = args[i]->toTensor().data_ptr();
128-
auto data_size = args[i]->toTensor().nbytes();
158+
auto data_ptr = mPreparedInputs[i].data_ptr;
159+
auto data_size = mPreparedInputs[i].size;
129160
if (IsCached</*isInput=*/true>(i, data_ptr)) {
130161
continue;
131162
};
@@ -140,22 +171,20 @@ Error NeuronExecuTorchDelegate::execute(
140171
}
141172
}
142173

143-
for (int o = inputCount; o < inputCount + outputCount; o++) {
144-
auto data_ptr = args[o]->toTensor().data_ptr();
145-
auto data_size = args[o]->toTensor().nbytes();
146-
auto output_index = o - inputCount;
147-
if (IsCached</*isInput=*/false>(output_index, data_ptr)) {
174+
for (int o = 0; o < outputCount; o++) {
175+
auto data_ptr = mPreparedOutputs[o].data_ptr;
176+
auto data_size = mPreparedOutputs[o].size;
177+
if (IsCached</*isInput=*/false>(o, data_ptr)) {
148178
continue;
149179
};
150180
auto unit = allocator != nullptr ? allocator->Find(data_ptr) : nullptr;
151181
if (unit) {
152-
UpdateCache</*isInput=*/false>(output_index, data_ptr);
182+
UpdateCache</*isInput=*/false>(o, data_ptr);
153183
size_t offset = (char*)data_ptr - (char*)unit->GetAddress();
154184
mExecutor.SetInputOutputFromMemory</*isInput*/ false>(
155-
output_index, unit->GetNeuronMemory(), offset, data_size);
185+
o, unit->GetNeuronMemory(), offset, data_size);
156186
} else {
157-
mExecutor.SetInputOutput</*isInput=*/false>(
158-
output_index, data_ptr, data_size);
187+
mExecutor.SetInputOutput</*isInput=*/false>(o, data_ptr, data_size);
159188
}
160189
}
161190

backends/mediatek/runtime/include/NeuronBackend.h

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <executorch/runtime/backend/interface.h>
1919
#include <executorch/runtime/core/error.h>
2020
#include <executorch/runtime/core/evalue.h>
21+
#include "executorch/runtime/core/exec_aten/util/dim_order_util.h"
2122

2223
#include <memory>
2324
#include <unordered_map>
@@ -27,6 +28,10 @@ namespace executorch {
2728
namespace backends {
2829
namespace neuron {
2930

31+
using executorch::runtime::EValue;
32+
using executorch::runtime::FreeableBuffer;
33+
using executorch::runtime::Result;
34+
3035
class NeuronBackend final : public ::executorch::runtime::BackendInterface {
3136
public:
3237
::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init(
@@ -53,6 +58,8 @@ struct NeuronDelegateSetting {
5358

5459
bool mImportForever = false;
5560

61+
bool mSharedWeights = false;
62+
5663
std::string ToRuntimeOption() {
5764
if (mHighAddr && mImportForever) {
5865
return "--apusys-config \"{ \\\"high_addr\\\": true, \\\"import_forever\\\": true }\"";
@@ -68,6 +75,14 @@ struct NeuronDelegateSetting {
6875

6976
class NeuronExecuTorchDelegate {
7077
public:
78+
struct InputOutputInfo {
79+
void* data_ptr;
80+
size_t size;
81+
82+
InputOutputInfo(void* ptr, size_t sz)
83+
: data_ptr(ptr), size(sz) {}
84+
};
85+
7186
class MemoryCache {
7287
public:
7388
template <bool isInput>
@@ -103,16 +118,22 @@ class NeuronExecuTorchDelegate {
103118
auto res = mExecutor.LoadFromCompiledNetwork(
104119
payload.CompiledNetwork,
105120
payload.Header.DataLen,
106-
payload.Header.InputCount,
121+
mSettings.mSharedWeights ? payload.Header.InputCount + 1
122+
: payload.Header.InputCount,
107123
payload.Header.OutputCount,
108124
runtimeOption);
109125
CHECK_NO_ERROR(res);
110126
CHECK_TRUE(mExecutor.IsValid());
111-
SummaryIoCounts();
127+
SummarizeIoSizes(payload.Header.InputCount, payload.Header.OutputCount);
112128
mPLock = std::unique_ptr<ScopePerformancer>(new ScopePerformancer);
113129
return NEURON_NO_ERROR;
114130
}
115131

132+
int SetSharedWeights(FreeableBuffer& buffer) {
133+
neuron_shared_weights_.push_back(std::move(buffer));
134+
return NEURON_NO_ERROR;
135+
}
136+
116137
::executorch::runtime::Error execute(
117138
ET_UNUSED ::executorch::runtime::BackendExecutionContext& context,
118139
::executorch::runtime::EValue** args) const;
@@ -128,33 +149,84 @@ class NeuronExecuTorchDelegate {
128149
mCache.UpdateCache<isInput>(index, ptr);
129150
}
130151

131-
int SummaryIoCounts() {
132-
for (int i = 0;; i++) {
152+
int SummarizeIoSizes(uint32_t input_count, uint32_t output_count) {
153+
for (int i = 0; i < input_count; i++) {
133154
size_t size = mExecutor.GetInputOutputPaddedSize</*isInput*/ true>(i);
134155
if (size == 0) {
135-
break;
156+
LogWarn("NeuronBackend", "Model input:%d got size: %lu", i, size);
136157
}
137158
LogInfo("NeuronBackend", "Model input:%d size: %lu", i, size);
138159
mInputSizes.push_back(size);
139160
}
140-
for (int o = 0;; o++) {
161+
for (int o = 0; o < output_count; o++) {
141162
size_t size = mExecutor.GetInputOutputPaddedSize</*isInput*/ false>(o);
142163
if (size == 0) {
143-
break;
164+
LogWarn("NeuronBackend", "Model output:%d got size: %lu", o, size);
144165
}
145166
LogInfo("NeuronBackend", "Model output:%d size: %lu", o, size);
146167
mOutputSizes.push_back(size);
147168
}
148169
return NEURON_NO_ERROR;
149170
}
150171

172+
int CheckDimOrder(EValue** args) const {
173+
size_t data_input_count = mInputSizes.size();
174+
for (int i = 0; i < data_input_count; i++) {
175+
auto tensor_in = args[i]->toTensor();
176+
LogInfo("NeuronBackend", "Checking dim order for input %d", i);
177+
if (!runtime::is_contiguous_dim_order(
178+
tensor_in.dim_order().data(), tensor_in.dim())) {
179+
return NEURON_BAD_DATA;
180+
}
181+
}
182+
183+
return NEURON_NO_ERROR;
184+
}
185+
186+
int PrepareInputsOuputs(EValue** args) const {
187+
bool has_shared_weights_input = neuron_shared_weights_.size() > 0;
188+
189+
size_t data_input_count = mInputSizes.size();
190+
size_t data_output_count = mOutputSizes.size();
191+
192+
// Prepare input data
193+
for (int i = 0; i < data_input_count; i++) {
194+
auto tensor_in = args[i]->toTensor();
195+
auto data_ptr = tensor_in.data_ptr();
196+
auto data_size = tensor_in.nbytes();
197+
mPreparedInputs.push_back(InputOutputInfo{data_ptr, data_size});
198+
}
199+
200+
// Prepare shared weights if any as the last model input
201+
if (has_shared_weights_input) {
202+
FreeableBuffer& buffer = neuron_shared_weights_.at(0);
203+
mPreparedInputs.push_back(
204+
InputOutputInfo{const_cast<void*>(buffer.data()), buffer.size()});
205+
}
206+
207+
// Prepare output data
208+
for (int o = data_output_count; o < data_input_count + data_output_count;
209+
o++) {
210+
auto tensor_out = args[o]->toTensor();
211+
auto data_ptr = tensor_out.data_ptr();
212+
auto data_size = tensor_out.nbytes();
213+
mPreparedOutputs.push_back(InputOutputInfo{data_ptr, data_size});
214+
}
215+
216+
return NEURON_NO_ERROR;
217+
}
218+
151219
int HintNeuronBackend(::executorch::runtime::EValue** args) const;
152220

153221
private:
154222
std::vector<size_t> mInputSizes;
155223

156224
std::vector<size_t> mOutputSizes;
157225

226+
mutable std::vector<InputOutputInfo> mPreparedInputs;
227+
228+
mutable std::vector<InputOutputInfo> mPreparedOutputs;
229+
158230
mutable MemoryCache mCache;
159231

160232
std::unique_ptr<ScopePerformancer> mPLock;
@@ -165,6 +237,8 @@ class NeuronExecuTorchDelegate {
165237

166238
mutable std::unordered_set<const void*> mHasImported;
167239

240+
mutable std::vector<FreeableBuffer> neuron_shared_weights_;
241+
168242
private:
169243
NeuronExecuTorchDelegate(const NeuronExecuTorchDelegate&);
170244

0 commit comments

Comments (0)