[ARM CPU] Fix dnnl separation #2

Closed
wants to merge 10 commits into from
737 changes: 251 additions & 486 deletions src/plugins/intel_cpu/src/nodes/deconv.cpp

Large diffs are not rendered by default.
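
Since the deconv.cpp rewrite itself is collapsed, the following is only a rough sketch of the control flow it is expected to implement on top of the new executor interface shown below. The helper name pickDeconvExecutor and the local variables are assumptions for illustration; only DeconvAttrs, DeconvExecutor, AclDeconvExecutorBuilder and Deconvolution::execPtrDeconv appear in the actual diffs.

// Sketch only; the real logic lives in the collapsed deconv.cpp diff.
// prepareParams() is expected to pick an executor through the builder interface
// introduced in this PR, and execute() then forwards the memory objects to it.
std::shared_ptr<DeconvExecutor> pickDeconvExecutor(const DeconvAttrs& attrs,            // hypothetical helper
                                                   const std::vector<MemoryDescPtr>& srcDescs,
                                                   const std::vector<MemoryDescPtr>& dstDescs,
                                                   const dnnl::primitive_attr& attr) {
    AclDeconvExecutorBuilder builder;                 // on x86 a oneDNN-based builder would be tried instead
    if (!builder.isSupported(attrs, srcDescs, dstDescs))
        return nullptr;                               // caller keeps the legacy oneDNN path
    auto executor = builder.makeExecutor();
    if (!executor->init(attrs, srcDescs, dstDescs, attr))
        return nullptr;
    return executor;                                  // stored in Deconvolution::execPtrDeconv
}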

46 changes: 7 additions & 39 deletions src/plugins/intel_cpu/src/nodes/deconv.h
@@ -9,7 +9,7 @@
#include <memory>
#include <string>
#include <vector>
#include "common/dnnl_executor.h"
#include "executors/deconv_list.hpp"

namespace ov {
namespace intel_cpu {
@@ -20,6 +20,7 @@ class Deconvolution : public Node {
Deconvolution(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context);

void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) override;
void createPrimitive() override;
@@ -41,7 +42,7 @@ class Deconvolution : public Node {
bool canFuse(const NodePtr& node) const override;

const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); }
const std::vector<ptrdiff_t>& getStride() const { return stride; }
const std::vector<ptrdiff_t>& getStride() const { return deconvAttrs.stride; }

void prepareParams() override;
void execute(dnnl::stream strm) override;
@@ -58,46 +59,14 @@ class Deconvolution : public Node {

private:
using executorPtr = std::shared_ptr<DnnlExecutor>;
executorPtr execPtr = nullptr;

class DeconvExecutorDefault : public DnnlExecutor {
public:
DeconvExecutorDefault(const dnnl::convolution_backward_data::primitive_desc& pd,
const dnnl::memory::desc& inMemDesc,
const dnnl::memory::desc& weightMemDesc,
const dnnl::memory::desc& outMemDesc,
const dnnl::engine& engine);
};

class DeconvExecutorInt8 : public DnnlExecutor {
public:
DeconvExecutorInt8(const dnnl::deconvolution_forward::primitive_desc& pd,
const dnnl::memory::desc& inMemDesc,
const dnnl::memory::desc& weightMemDesc,
const dnnl::memory::desc& outMemDesc,
const dnnl::engine& engine);
};
std::shared_ptr<DeconvExecutor> execPtrDeconv = nullptr;

// have to hold reference (shared_ptr) to forward convolution primitive_desc
// since backward one uses the reference to it as a hint
std::vector<dnnl::convolution_forward::primitive_desc> fwdConvPD;

bool withGroups = false;
bool isDW = false;
bool isInt8 = false;
bool autoPad = false;
bool externOutShape = false;
size_t groupNum = 1;
size_t IC = 0;
size_t OC = 0;
std::vector<ptrdiff_t> kernel;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
ov::CoordinateDiff paddingL;
ov::CoordinateDiff paddingR;
ov::CoordinateDiff outputPadding;
std::vector<int32_t> lastOutputSpatialDims;
VectorDims int8WeightDims;
VectorDims expectedBiasDims {};
bool useACL = false;
DeconvAttrs deconvAttrs;

Shape inShape;

@@ -112,7 +81,6 @@ class Deconvolution : public Node {
void initPaddingR(const Shape &inShape, const Shape &outShape);
std::vector<int32_t> readOutputSpatialDims() const;
std::pair<VectorDims, VectorDims> makeDummyInOutShape();
bool withBiases = false;
size_t biasPort;

std::string errorPrefix;
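Most of the per-node members deleted above are consolidated into the shared DeconvAttrs structure consumed by the executors. DeconvAttrs itself is defined in executors/deconv.hpp, which is not part of this page; the snippet below is only a sketch of what it presumably carries, inferred from the deleted fields and from the deconvAttrs.* accesses in acl_deconv.cpp further down.

// Presumed shape of DeconvAttrs (see executors/deconv.hpp, not shown in this diff).
struct DeconvAttrs {
    std::vector<ptrdiff_t> kernel;
    std::vector<ptrdiff_t> stride;
    std::vector<ptrdiff_t> dilation;   // presumably kept in oneDNN zero-based form, hence the +1 in acl_deconv.cpp
    ov::CoordinateDiff paddingL;
    ov::CoordinateDiff paddingR;
    ov::CoordinateDiff outputPadding;
    bool withBiases = false;
    // remaining fields omitted; the full definition lives in executors/deconv.hpp
};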
128 changes: 128 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp
@@ -0,0 +1,128 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_deconv.hpp"
#include "acl_utils.hpp"
#include "ie_parallel.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

//FIXME: add context
AclDeconvExecutor::AclDeconvExecutor() : DeconvExecutor() {}

bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
this->deconvAttrs = deconvAttrs;
auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto weiDims = srcDescs[1]->getShape().getStaticDims();
// swap the input and output channel dimensions to align with ACL:
// the weights tensor shape is changed because ACL expects a [W, H, I, O] tensor while OV uses [I, O, H, W]
std::swap(weiDims[0], weiDims[1]);
auto dstDims = dstDescs[0]->getShape().getStaticDims();

VectorDims biasDims;
TensorInfo biasTensorInfo;
if (deconvAttrs.withBiases) {
biasDims = srcDescs[2]->getShape().getStaticDims();
// the bias precision is i32 in OV, but ACL requires the bias to use the same precision as the input
biasTensorInfo = TensorInfo(shapeCast(biasDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2]));
}

TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
TensorInfo weiTensorInfo = TensorInfo(shapeCast(weiDims), 1,
precisionToAclDataType(srcDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[1]));
TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

unsigned int pad_l = deconvAttrs.paddingL.at(1);
unsigned int pad_r = deconvAttrs.paddingR.at(1);
unsigned int pad_t = deconvAttrs.paddingL.at(0);
unsigned int pad_b = deconvAttrs.paddingR.at(0);
unsigned int stride_x = deconvAttrs.stride.at(1);
unsigned int stride_y = deconvAttrs.stride.at(0);
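// deconvAttrs.dilation is presumably stored in oneDNN form (zero-based, 0 == no dilation),
// while ACL's Size2D expects the actual dilation factor, hence the +1 below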
unsigned int dilation_x = deconvAttrs.dilation.at(1) + 1;
unsigned int dilation_y = deconvAttrs.dilation.at(0) + 1;

arm_compute::PadStrideInfo deconv_info(stride_x, stride_y, pad_l, pad_r, pad_t, pad_b, arm_compute::DimensionRoundingType::FLOOR);
arm_compute::Size2D dilation(dilation_x, dilation_y);

arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo,
&weiTensorInfo,
deconvAttrs.withBiases ? &biasTensorInfo : nullptr,
&dstTensorInfo,
deconv_info);
if (!status) {
DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description());
return false;
}

srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);
if (deconvAttrs.withBiases)
biasTensor.allocator()->init(biasTensorInfo);

deconv = std::make_unique<arm_compute::NEDeconvolutionLayer>();
deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiases ? &biasTensor : nullptr, &dstTensor, deconv_info);

return true;
}

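// helper: transposes the first two dimensions of a 4D tensor (permutation {1, 0, 2, 3});
// used in exec() below to swap the I and O dimensions of the weights before handing them to ACL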
static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector<float>& dst_data) {
const auto src_data = reinterpret_cast<float*>(srcMemPtr->GetPtr());

const int DIM0 = srcMemPtr->getStaticDims()[0];
const int DIM1 = srcMemPtr->getStaticDims()[1];
const int DIM2 = srcMemPtr->getStaticDims()[2];
const int DIM3 = srcMemPtr->getStaticDims()[3];

parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) {
for (int dim3 = 0; dim3 < DIM3; ++dim3) {
const int src_off = dim0 * DIM1 * DIM2 * DIM3 +
dim1 * DIM2 * DIM3 +
dim2 * DIM3 +
dim3;
const int dst_off = dim1 * DIM0 * DIM2 * DIM3 +
dim0 * DIM2 * DIM3 +
dim2 * DIM3 +
dim3;

dst_data[dst_off] = src_data[src_off];
}
});
}

void AclDeconvExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst,
const void *post_ops_data_, const dnnl::stream &strm) {
// the weights tensor is transposed here because ACL expects a [W, H, I, O] tensor while OV uses [I, O, H, W]
std::vector<float> weiBuffer(src[1]->getStaticDims()[0] *
src[1]->getStaticDims()[1] *
src[1]->getStaticDims()[2] *
src[1]->getStaticDims()[3]);
transpose_to_1023(src[1], weiBuffer);

srcTensor.allocator()->import_memory(src[0]->GetPtr());
dstTensor.allocator()->import_memory(dst[0]->GetPtr());
weiTensor.allocator()->import_memory(weiBuffer.data());
if (deconvAttrs.withBiases)
biasTensor.allocator()->import_memory(src[2]->GetPtr());

deconv->run();

srcTensor.allocator()->free();
dstTensor.allocator()->free();
weiTensor.allocator()->free();
if (deconvAttrs.withBiases)
biasTensor.allocator()->free();
}

} // namespace intel_cpu
} // namespace ov
112 changes: 112 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp
@@ -0,0 +1,112 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/deconv.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

class AclDeconvExecutor : public DeconvExecutor {
public:
AclDeconvExecutor();

bool init(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
const void *post_ops_data_,
const dnnl::stream &strm) override;

impl_desc_type getImplType() const override {
return implType;
}

private:
DeconvAttrs deconvAttrs;
impl_desc_type implType = impl_desc_type::acl;

arm_compute::Tensor srcTensor;
arm_compute::Tensor weiTensor;
arm_compute::Tensor biasTensor;
arm_compute::Tensor dstTensor;
std::unique_ptr<arm_compute::NEDeconvolutionLayer> deconv = nullptr;
};

class AclDeconvExecutorBuilder : public DeconvExecutorBuilder {
public:
bool isSupported(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
if (srcDescs[0]->getShape().getDims().size() != 2 &&
srcDescs[1]->getShape().getDims().size() != 2 &&
dstDescs[0]->getShape().getDims().size() != 2) {
DEBUG_LOG("AclDeconvExecutor does not support dimension:",
" src[0]=", srcDescs[0]->getPrecision(),
" src[1]=", srcDescs[1]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
if ((srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
srcDescs[1]->getPrecision() != InferenceEngine::Precision::FP32 &&
dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP32) &&
(srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
srcDescs[1]->getPrecision() != InferenceEngine::Precision::FP16 &&
dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP16)) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" src[1]=", srcDescs[1]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
if (deconvAttrs.withBiases &&
srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[2]=", srcDescs[2]->getPrecision());
return false;
}

if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
if (deconvAttrs.withBiases &&
!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[2]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
srcDescs[2]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" src[2]=", srcDescs[2]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
return true;
}

DeconvExecutorPtr makeExecutor() const override {
return std::make_shared<AclDeconvExecutor>();
}
};

} // namespace intel_cpu
} // namespace ov
67 changes: 67 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/deconv.cpp
@@ -0,0 +1,67 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "deconv.hpp"

namespace ov {
namespace intel_cpu {

using namespace InferenceEngine;

DeconvExecutor::DeconvExecutor() {}

size_t DeconvKey::hash() const {
using namespace dnnl::impl;
using namespace dnnl::impl::primitive_hashing;

size_t seed = 0;

for (const auto& ptr : {inp0, inp1, bias, out}) {
if (ptr) {
seed = hash_combine(seed, get_md_hash(*ptr->getDnnlDesc().get()));
}
}

seed = get_vector_hash(seed, stride);
seed = get_vector_hash(seed, dilation);
seed = get_vector_hash(seed, paddingL);
seed = get_vector_hash(seed, paddingR);

seed = hash_combine(seed, isInt8);

seed = hash_combine(seed, get_attr_hash(*attr.get()));
seed = hash_combine(seed, implType);
return seed;
}

bool DeconvKey::operator==(const DeconvKey &rhs) const {
bool retVal = true;
if (inp0 != rhs.inp0) {
retVal = retVal && inp0 && rhs.inp0 && inp0->getDnnlDesc() == rhs.inp0->getDnnlDesc();
}
if (inp1 != rhs.inp1) {
retVal = retVal && inp1 && rhs.inp1 && inp1->getDnnlDesc() == rhs.inp1->getDnnlDesc();
}

if (bias != rhs.bias) {
retVal = retVal && bias && rhs.bias && bias->getDnnlDesc() == rhs.bias->getDnnlDesc();
}

if (out != rhs.out) {
retVal = retVal && out && rhs.out && out->getDnnlDesc() == rhs.out->getDnnlDesc();
}

retVal = retVal && stride == rhs.stride;
retVal = retVal && dilation == rhs.dilation;
retVal = retVal && paddingL == rhs.paddingL;
retVal = retVal && paddingR == rhs.paddingR;

retVal = retVal && isInt8 == rhs.isInt8;

retVal = retVal && *attr.get() == *rhs.attr.get() && implType == rhs.implType;
return retVal;
}

} // namespace intel_cpu
} // namespace ov
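
DeconvKey's hash() and operator== exist so that an executor or primitive built for a given shape/attribute combination can be looked up in the plugin's parameter cache instead of being recreated on every prepareParams() call. The actual call site is in the collapsed deconv.cpp diff; the sketch below only illustrates the usual intel_cpu getParamsCache()->getOrCreate() pattern, with the key field order inferred from hash() above and buildDnnlDeconvPrimitive as a hypothetical helper.

// Sketch only: how DeconvKey is typically consumed by the node's prepareParams().
DeconvKey key = {srcDesc, weiDesc, biasDesc, dstDesc,
                 deconvAttrs.stride, deconvAttrs.dilation,
                 deconvAttrs.paddingL, deconvAttrs.paddingR,
                 isInt8, attr, implType};

auto builder = [&](const DeconvKey& k) {
    return buildDnnlDeconvPrimitive(k);          // hypothetical helper that creates the primitive
};

auto cache = context->getParamsCache();          // MultiCache shared through GraphContext
auto result = cache->getOrCreate(key, builder);  // hash() and operator== are used here
auto execPtr = result.first;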