Skip to content

Commit 69d0da4

Browse files
DzAvrilXuZhi
authored and committed
[BYOC][ACL] Fix list is not supported as an input node (apache#10801)
* [BYOC][ACL] Fix list is not supported as an input node * fix clang lint error * fix compile warning * fix python module import error * rename concatenate test file * fix always MakeACLTensor with same eid 0 * do not offload concat default * fix concatenate test failure * fix test failure * fix lint error * fix lint * remove global var offload_concat * support concatenate with pattern table mechanism * disable pylint dangerous-default-value warning Co-authored-by: XuZhi <[email protected]>
1 parent 71b595d commit 69d0da4

File tree

8 files changed

+320
-30
lines changed

8 files changed

+320
-30
lines changed

python/tvm/relay/op/contrib/arm_compute_lib.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17-
# pylint: disable=invalid-name, unused-argument
17+
# pylint: disable=invalid-name, unused-argument, dangerous-default-value
1818
"""Arm Compute Library supported operators."""
1919
import tvm
2020
from tvm import relay
@@ -23,7 +23,7 @@
2323
from tvm.relay.build_module import bind_params_by_name
2424
from tvm.relay.expr import const
2525

26-
from ...dataflow_pattern import is_constant, is_expr, is_op, wildcard
26+
from ...dataflow_pattern import is_constant, is_expr, is_op, is_tuple, wildcard
2727
from ..strategy.generic import is_depthwise_conv2d
2828
from .register import register_pattern_table
2929

@@ -42,7 +42,7 @@ def is_arm_compute_runtime_enabled():
4242
return False
4343

4444

45-
def partition_for_arm_compute_lib(mod, params=None, **opts):
45+
def partition_for_arm_compute_lib(mod, params=None, disabled_ops=["concatenate"], **opts):
4646
"""Partition the graph greedily offloading supported
4747
operators to Arm Compute Library.
4848
@@ -52,6 +52,8 @@ def partition_for_arm_compute_lib(mod, params=None, **opts):
5252
The module to run passes on.
5353
params : Optional[Dict[str, NDArray]]
5454
Constant input parameters.
55+
disabled_ops : Optional[list]
56+
Ops do not want to offload to ACL.
5557
5658
Returns
5759
-------
@@ -63,7 +65,7 @@ def partition_for_arm_compute_lib(mod, params=None, **opts):
6365
seq = tvm.transform.Sequential(
6466
[
6567
transform.InferType(),
66-
transform.MergeComposite(arm_compute_lib_pattern_table()),
68+
transform.MergeComposite(arm_compute_lib_pattern_table(disabled_ops)),
6769
transform.AnnotateTarget("arm_compute_lib", False),
6870
transform.PartitionGraph(),
6971
]
@@ -128,7 +130,7 @@ def convert_conv(attrs, inputs, tinfos, desired_layouts):
128130

129131

130132
@register_pattern_table("arm_compute_lib")
131-
def arm_compute_lib_pattern_table():
133+
def arm_compute_lib_pattern_table(disabled_ops=["concatenate"]):
132134
"""Get the ACL pattern table."""
133135

134136
def conv_pattern():
@@ -220,6 +222,17 @@ def l2_pool2d_pattern():
220222
pattern = is_op("sqrt")(pattern)
221223
return pattern
222224

225+
def concatenate_pattern():
226+
"""Create an concatenate pattern from equivalent relay operators.
227+
228+
Returns
229+
-------
230+
pattern : dataflow_pattern.AltPattern
231+
Denotes the concatenate pattern.
232+
"""
233+
pattern = is_op("concatenate")(is_tuple(None))
234+
return pattern
235+
223236
def check_conv(extract):
224237
"""Check conv pattern is supported by ACL."""
225238
call = extract
@@ -266,6 +279,19 @@ def check_l2_pool2d(extract):
266279
pool = extract.args[0]
267280
return avg_pool2d(pool)
268281

282+
def check_concatenate(expr):
283+
"""Check concatenate pattern is supported by ACL."""
284+
if "concatenate" in disabled_ops:
285+
return False
286+
attrs, type_args = expr.attrs, expr.type_args
287+
for idx in range(len(type_args[0].fields)):
288+
if type_args[0].fields[idx].dtype not in ["float32", "uint8"]:
289+
return False
290+
# ACL concatenate only supports maximum 4 dimensions input tensor
291+
if attrs.axis not in [-4, -3, -2, -1, 0, 1, 2, 3]:
292+
return False
293+
return True
294+
269295
return [
270296
("arm_compute_lib.conv2d", conv_pattern(), check_conv),
271297
("arm_compute_lib.qnn_conv2d", qnn_conv_pattern(), check_qnn_conv),
@@ -274,6 +300,7 @@ def check_l2_pool2d(extract):
274300
("arm_compute_lib.qnn_conv2d", qnn_conv_pattern(), check_qnn_conv),
275301
("arm_compute_lib.avg_pool2d", avg_pool2d_pattern(), check_avg_pool2d),
276302
("arm_compute_lib.l2_pool2d", l2_pool2d_pattern(), check_l2_pool2d),
303+
("arm_compute_lib.concatenate", concatenate_pattern(), check_concatenate),
277304
]
278305

279306

src/relay/backend/contrib/arm_compute_lib/codegen.cc

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
9999
json_node = CreateCompositeAvgPool2DJSONNode(cn);
100100
} else if (name == "arm_compute_lib.l2_pool2d") {
101101
json_node = CreateCompositeL2Pool2DJSONNode(cn);
102+
} else if (name == "arm_compute_lib.concatenate") {
103+
return AddCommonSingleJSONNode(cn, "concatenate");
102104
} else {
103105
LOG(FATAL) << "Unrecognized Arm Compute Library pattern: " << name;
104106
}
@@ -342,6 +344,30 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
342344
SetCallNodeAttribute(json_node, avg_pool);
343345
return json_node;
344346
}
347+
348+
/*!
349+
* \brief Create a JSON representation of a single operator.
350+
* \param cn The call to be represented.
351+
* \param name The name of the operator.
352+
* \return A list of graph entry nodes.
353+
*/
354+
std::vector<JSONGraphNodeEntry> AddCommonSingleJSONNode(const CallNode* cn, std::string name) {
355+
std::vector<JSONGraphNodeEntry> inputs;
356+
for (const auto& arg : cn->args) {
357+
auto res = VisitExpr(arg);
358+
inputs.insert(inputs.end(), res.begin(), res.end());
359+
}
360+
auto node = std::make_shared<JSONGraphNode>(name, /* name_ */
361+
"kernel", /* op_type_ */
362+
inputs, 1 /* num_outputs_ */);
363+
364+
const auto* fn = cn->op.as<FunctionNode>();
365+
ICHECK(fn);
366+
const auto* callNode = fn->body.as<CallNode>();
367+
ICHECK(callNode);
368+
SetCallNodeAttribute(node, callNode);
369+
return AddNode(node, GetRef<Expr>(cn));
370+
}
345371
};
346372

347373
/*!

src/runtime/contrib/arm_compute_lib/acl_runtime.cc

Lines changed: 70 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB
3232
#include <arm_compute/core/Types.h>
3333
#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
34+
#include <arm_compute/runtime/NEON/functions/NEConcatenateLayer.h>
3435
#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
3536
#include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h>
3637
#include <arm_compute/runtime/NEON/functions/NEElementwiseOperations.h>
@@ -91,12 +92,21 @@ class ACLRuntime : public JSONRuntimeBase {
9192
* \return Status of inference.
9293
*/
9394
void Run() override {
94-
for (size_t i = 0; i < input_nodes_.size(); ++i) {
95-
auto nid = input_nodes_[i];
96-
uint32_t eid = EntryID(nid, 0);
95+
for (size_t nid_idx = 0; nid_idx < input_nodes_.size(); ++nid_idx) {
96+
auto nid = input_nodes_[nid_idx];
9797
if (nodes_[nid].GetOpType() == "input") {
98-
void* data = data_entry_[eid]->data;
99-
CheckACLError(layer_.inputs[i].allocator()->import_memory(data));
98+
for (uint32_t eid_idx = 0; eid_idx < nodes_[nid].GetNumOutput(); eid_idx++) {
99+
uint32_t eid = EntryID(nid, eid_idx);
100+
void* data = data_entry_[eid]->data;
101+
auto key = std::pair<uint32_t, uint32_t>(nid, eid_idx);
102+
if (layer_.json_inputid_to_layer_inputid.count(key) > 0) {
103+
CheckACLError(
104+
layer_.inputs[layer_.json_inputid_to_layer_inputid[key]].allocator()->import_memory(
105+
data));
106+
} else {
107+
CheckACLError(layer_.inputs[nid_idx].allocator()->import_memory(data));
108+
}
109+
}
100110
}
101111
}
102112

@@ -149,6 +159,8 @@ class ACLRuntime : public JSONRuntimeBase {
149159
CreateMaximumLayer(&layer_, node);
150160
} else if ("add" == op_name || "qnn.add" == op_name) {
151161
CreateAddLayer(&layer_, node);
162+
} else if ("concatenate" == op_name) {
163+
CreateConcatenateLayer(&layer_, node);
152164
} else {
153165
LOG(FATAL) << "Unsupported op: " << op_name;
154166
}
@@ -166,6 +178,9 @@ class ACLRuntime : public JSONRuntimeBase {
166178
std::shared_ptr<arm_compute::IFunction> function;
167179
std::vector<arm_compute::Tensor> inputs;
168180
std::vector<arm_compute::Tensor> outputs;
181+
// maps the input index of JSON node to the index of the ACL layer's inputs
182+
// this is optional (i.e.only when an operator uses the eid index)
183+
std::map<std::pair<uint32_t, uint32_t>, uint32_t> json_inputid_to_layer_inputid;
169184
};
170185

171186
/*!
@@ -175,17 +190,25 @@ class ACLRuntime : public JSONRuntimeBase {
175190
* \param tensor The tensor to represent.
176191
* \param scale (optional) The scale of the tensor as an input.
177192
* \param offset (optional) The offset of the tensor as an input.
193+
* \param apply_dim_correction (Optional) Flag to state whether apply dimension correction after
194+
* setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but
195+
* _num_dimensions should be 3 rather than 1.
196+
* \param increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of
197+
* dimensions of the shape.
178198
* \return ACL Tensor.
179199
*/
180200
arm_compute::Tensor MakeACLTensorFromJSONEntry(const JSONGraphNodeEntry& tensor,
181201
JSONGraphNodeEntry* scale = nullptr,
182-
JSONGraphNodeEntry* offset = nullptr) {
202+
JSONGraphNodeEntry* offset = nullptr,
203+
bool apply_dim_correction = true,
204+
bool increase_dim_unit = true) {
183205
JSONGraphNode node = nodes_[tensor.id_];
184206
void* node_data = nullptr;
185207
if (node.GetOpType() == "const") {
186208
node_data = data_entry_[EntryID(tensor)]->data;
187209
}
188-
return MakeACLTensorFromJSONNode(node, scale, offset, node_data);
210+
return MakeACLTensorFromJSONNode(node, scale, offset, node_data, apply_dim_correction,
211+
increase_dim_unit, tensor.index_);
189212
}
190213

191214
/*!
@@ -196,19 +219,26 @@ class ACLRuntime : public JSONRuntimeBase {
196219
* \param scale (optional) The scale of the tensor as an input.
197220
* \param offset (optional) The offset of the tensor as an input.
198221
* \param data (optional) Constant data of input node.
222+
* \param apply_dim_correction (Optional) Flag to state whether apply dimension correction after
223+
* setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but
224+
* _num_dimensions should be 3 rather than 1.
225+
* \param increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of
226+
* dimensions of the shape.
227+
* \param entry_index The entry index.
199228
* \return ACL Tensor.
200229
*/
201-
arm_compute::Tensor MakeACLTensorFromJSONNode(const JSONGraphNode& node,
202-
JSONGraphNodeEntry* scale = nullptr,
203-
JSONGraphNodeEntry* offset = nullptr,
204-
void* data = nullptr) {
230+
arm_compute::Tensor MakeACLTensorFromJSONNode(
231+
const JSONGraphNode& node, JSONGraphNodeEntry* scale = nullptr,
232+
JSONGraphNodeEntry* offset = nullptr, void* data = nullptr, bool apply_dim_correction = true,
233+
bool increase_dim_unit = true, uint32_t entry_index = 0) {
205234
const DLTensor* scale_data = nullptr;
206235
const DLTensor* offset_data = nullptr;
207236
if (scale && offset) {
208237
scale_data = data_entry_[EntryID(*scale)];
209238
offset_data = data_entry_[EntryID(*offset)];
210239
}
211-
return MakeACLTensor(node, data, scale_data, offset_data);
240+
return MakeACLTensor(node, data, scale_data, offset_data, apply_dim_correction,
241+
increase_dim_unit, entry_index);
212242
}
213243

214244
/*!
@@ -510,6 +540,34 @@ class ACLRuntime : public JSONRuntimeBase {
510540
layer->function = f;
511541
}
512542

543+
/*!
544+
* \brief Create a Concatenate layer.
545+
*
546+
* \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.c
547+
* \param node The JSON representation of the operator.
548+
*/
549+
void CreateConcatenateLayer(CachedLayer* layer, const JSONGraphNode& node) {
550+
std::vector<std::string> axis = node.GetAttr<std::vector<std::string>>("axis");
551+
std::vector<const arm_compute::ITensor*> inputs;
552+
for (auto input : node.GetInputs()) {
553+
layer->inputs.push_back(MakeACLTensorFromJSONEntry(input, nullptr, nullptr, false));
554+
layer->json_inputid_to_layer_inputid[std::pair<uint32_t, uint32_t>(input.id_, input.index_)] =
555+
layer->inputs.size() - 1;
556+
}
557+
for (size_t i = 0; i < layer->inputs.size(); i++) {
558+
inputs.push_back(&layer->inputs[i]);
559+
}
560+
layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
561+
int dimNum = layer->inputs[0].info()->num_dimensions();
562+
auto function = std::make_shared<arm_compute::NEConcatenateLayer>();
563+
// the shape of input tensor will be reversed after passing to ACL
564+
// for example a tensor with shape [1, 2, 3, 4] will be changed to
565+
// [4, 3, 2, 1] at ACL side. So the axis here should be preprocessed.
566+
auto a = std::stoi(axis[0]);
567+
function->configure(inputs, &layer->outputs[0], a < 0 ? -a - 1 : dimNum - a - 1);
568+
layer->function = function;
569+
}
570+
513571
/*! \brief Allow ACL functions to request auxiliary memory from TVM. */
514572
ACLAllocator allocator_;
515573
/*!

src/runtime/contrib/arm_compute_lib/acl_utils.cc

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,14 @@ void CheckACLError(const arm_compute::Status& status) {
4040
}
4141

4242
arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data,
43-
const DLTensor* scale, const DLTensor* offset) {
43+
const DLTensor* scale, const DLTensor* offset,
44+
bool apply_dim_correction, bool increase_dim_unit,
45+
uint32_t entry_index) {
4446
arm_compute::Tensor tensor;
45-
std::vector<int64_t> shape = tensor_rep.GetOpShape()[0];
46-
DLDataType dtype = tensor_rep.GetOpDataType()[0];
47-
arm_compute::TensorInfo info = MakeACLTensorInfo(shape, dtype, scale, offset);
47+
std::vector<int64_t> shape = tensor_rep.GetOpShape()[entry_index];
48+
DLDataType dtype = tensor_rep.GetOpDataType()[entry_index];
49+
arm_compute::TensorInfo info =
50+
MakeACLTensorInfo(shape, dtype, scale, offset, apply_dim_correction, increase_dim_unit);
4851
info.set_is_resizable(false);
4952
tensor.allocator()->init(info);
5053
if (data != nullptr) {
@@ -55,10 +58,11 @@ arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data,
5558

5659
arm_compute::TensorInfo MakeACLTensorInfo(const std::vector<int64_t>& shape,
5760
const DLDataType& dtype, const DLTensor* scale,
58-
const DLTensor* offset) {
61+
const DLTensor* offset, bool apply_dim_correction,
62+
bool increase_dim_unit) {
5963
arm_compute::TensorShape acl_shape;
6064
for (unsigned int i = shape.size(); i > 0; --i) {
61-
acl_shape.set(shape.size() - i, shape[i - 1]);
65+
acl_shape.set(shape.size() - i, shape[i - 1], apply_dim_correction, increase_dim_unit);
6266
}
6367
arm_compute::DataType acl_dtype = MakeACLDataType(dtype);
6468
arm_compute::TensorInfo info(acl_shape, 1, acl_dtype, arm_compute::DataLayout::NHWC);

src/runtime/contrib/arm_compute_lib/acl_utils.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,9 @@ void CheckACLError(const arm_compute::Status& status);
6363
* \return arm_compute::Tensor.
6464
*/
6565
arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data = nullptr,
66-
const DLTensor* scale = nullptr,
67-
const DLTensor* offset = nullptr);
66+
const DLTensor* scale = nullptr, const DLTensor* offset = nullptr,
67+
bool apply_dim_correction = true, bool increase_dim_unit = true,
68+
uint32_t entry_index = 0);
6869

6970
/*!
7071
* \brief Make an acl tensor info object from JSON tensor
@@ -78,7 +79,9 @@ arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data =
7879
*/
7980
arm_compute::TensorInfo MakeACLTensorInfo(const std::vector<int64_t>& shape,
8081
const DLDataType& dtype, const DLTensor* scale = nullptr,
81-
const DLTensor* offset = nullptr);
82+
const DLTensor* offset = nullptr,
83+
bool apply_dim_correction = true,
84+
bool increase_dim_unit = true);
8285

8386
/*!
8487
* \brief Create a memory manager for use with a layer that

src/runtime/contrib/json/json_runtime.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ class JSONRuntimeBase : public ModuleNode {
186186
for (size_t j = 0; j < nodes_[nid].GetOpShape().size(); ++j) {
187187
input_var_eid_.push_back(EntryID(nid, j));
188188
}
189+
nodes_[nid].SetNumOutput(nodes_[nid].GetOpShape().size());
189190
} else {
190191
ICHECK_EQ(nodes_[nid].op_type_, "const");
191192
auto pos = std::find(std::begin(const_names_), std::end(const_names_), name);

0 commit comments

Comments
 (0)