Skip to content

Commit

Permalink
Bf16 crop layer (#4)
Browse files Browse the repository at this point in the history
* [IE TESTS][CPU] Cpu specific test for the Crop layer has been created.

* [IE TESTS][CPU] Deprecated Crop single layer test removed.

* [CPU BF16] Bfloat16 precision was added to the Crop layer.

* [CPU BF16] Crop layer minor code improvements.

* [IE TESTS][CPU] Crop layer test added 2D tensor tests.

* [IE TESTS][CPU] Crop layer test, obsolete comment removed.

* [IE TESTS][CPU] Fixed CropIE include path.

* Crop test fix for older gcc compiler.
  • Loading branch information
maxnick committed Nov 16, 2020
1 parent 9b9f84b commit 4422150
Show file tree
Hide file tree
Showing 5 changed files with 293 additions and 293 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class INFERENCE_ENGINE_API_CLASS(CropIE) : public Op {
void validate_and_infer_types() override;

std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
bool evaluate(const HostTensorVector& outputs,
const HostTensorVector& inputs) const override;

std::vector<int64_t> axes, dim, offset;
};
Expand Down
77 changes: 77 additions & 0 deletions inference-engine/src/legacy_api/src/ngraph_ops/crop_ie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,80 @@ void op::CropIE::validate_and_infer_types() {

set_output_type(0, get_input_element_type(0), PartialShape(output_shape));
}

// Host-side evaluation of the crop: copies the sub-tensor selected by `offset`
// from inputs[0] into outputs[0]. The data is moved as raw bytes, so any element
// type is supported as long as input and output use the same one.
//
// The tensor is addressed through a fixed N, C, D, H, W scheme in which missing
// dimensions collapse to extent 1 / offset 0.
// NOTE(review): with this mapping only ranks 1, 2, 4 and 5 are addressed
// consistently (for rank 3 the "H" slot aliases index 1, i.e. the "C" slot) -
// confirm that other ranks never reach this function.
//
// Returns true once the copy has been done.
// Throws ngraph_error when the element types differ or when the requested window
// does not fit inside the input along any of the five dimensions.
bool op::CropIE::evaluate(const HostTensorVector &outputs, const HostTensorVector &inputs) const {
    if (inputs.front()->get_element_type() != outputs.front()->get_element_type()) {
        throw ngraph_error("Input and output data types must be the same!");
    }

    const int ndims = dim.size();

    // Start of the cropped window along each logical dimension.
    const size_t OFFSET_N = (ndims > 0) ? offset.at(0) : 0;
    const size_t OFFSET_C = (ndims > 1) ? offset.at(1) : 0;
    const size_t OFFSET_D = (ndims > 4) ? offset.at(ndims - 3) : 0;
    const size_t OFFSET_H = (ndims > 2) ? offset.at(ndims - 2) : 0;
    const size_t OFFSET_W = (ndims > 3) ? offset.at(ndims - 1) : 0;

    auto outputShape = get_output_partial_shape(0).get_shape();

    const size_t ON = (ndims > 0) ? outputShape.at(0) : 1;
    const size_t OC = (ndims > 1) ? outputShape.at(1) : 1;
    const size_t OD = (ndims > 4) ? outputShape.at(ndims - 3) : 1;
    const size_t OH = (ndims > 2) ? outputShape.at(ndims - 2) : 1;
    const size_t OW = (ndims > 3) ? outputShape.at(ndims - 1) : 1;

    auto inputShape = get_input_partial_shape(0).get_shape();

    const size_t IN = (ndims > 0) ? inputShape.at(0) : 1;
    const size_t IC = (ndims > 1) ? inputShape.at(1) : 1;
    const size_t ID = (ndims > 4) ? inputShape.at(ndims - 3) : 1;
    const size_t IH = (ndims > 2) ? inputShape.at(ndims - 2) : 1;
    const size_t IW = (ndims > 3) ? inputShape.at(ndims - 1) : 1;

    // All operands are unsigned, so `in - off` would wrap around when the offset is
    // larger than the input extent (this also covers a negative int64 offset that was
    // converted to size_t above). Reject that case before subtracting.
    auto check_window = [](size_t in_extent, size_t window_offset, size_t out_extent) {
        if (window_offset > in_extent || in_extent - window_offset < out_extent) {
            throw ngraph_error("Wrong offset!");
        }
    };
    check_window(IN, OFFSET_N, ON);
    check_window(IC, OFFSET_C, OC);
    check_window(ID, OFFSET_D, OD);
    check_window(IH, OFFSET_H, OH);
    check_window(IW, OFFSET_W, OW);

    // Element indices of the first item of a row (w == 0 in the output, w == OFFSET_W in the input).
    auto dst_row = [=](size_t n, size_t c, size_t d, size_t h) -> size_t {
        return (n * OC * OD * OH * OW + c * OD * OH * OW + d * OH * OW + h * OW);
    };
    auto src_row = [=](size_t n, size_t c, size_t d, size_t h) -> size_t {
        return (n * IC * ID * IH * IW + c * ID * IH * IW + d * IH * IW + h * IW + OFFSET_W);
    };

    const size_t dataSize = inputs.front()->get_element_type().size();
    // The innermost (W) run is contiguous in both buffers, so a whole row is copied at once.
    const size_t rowBytes = OW * dataSize;

    if (rowBytes != 0) {
        auto *dst_ptr = outputs.front()->get_data_ptr<uint8_t>();
        auto src_ptr = inputs.front()->get_data_ptr<const uint8_t>();

        for (size_t n = 0; n < ON; ++n) {
            for (size_t c = 0; c < OC; ++c) {
                for (size_t d = 0; d < OD; ++d) {
                    for (size_t h = 0; h < OH; ++h) {
                        memcpy(dst_ptr + dataSize * dst_row(n, c, d, h),
                               src_ptr + dataSize * src_row(n + OFFSET_N, c + OFFSET_C, d + OFFSET_D, h + OFFSET_H),
                               rowBytes);
                    }
                }
            }
        }
    }

    return true;
}
64 changes: 18 additions & 46 deletions inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,12 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() {
return;

InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
if (precision != InferenceEngine::Precision::FP32)
precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
precision = getCnnLayer()->outData[0]->getPrecision();
if (precision != InferenceEngine::Precision::FP32)
precision = InferenceEngine::Precision::FP32;
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
if (inputDataType != outputDataType) {
outputDataType = inputDataType; // Crop doesn't convert precisions, only moves data
}

auto& inDims = getParentEdgeAt(0)->getDims();
if (inDims.ndims() != 2 && inDims.ndims() != 4 && inDims.ndims() != 5) {
Expand Down Expand Up @@ -125,19 +124,19 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
if (!MKLDNNMemory::IsPlainFormat(parentMem.GetFormat())) {
m_block_size = parentMem.GetDescriptor().data.layout_desc.blocking.block_dims[1];
}
int m_inner_dim = dims[dims.size() - 1] * m_block_size;
const int m_inner_dim = dims[dims.size() - 1] * m_block_size;

const memory &dst_d = getChildEdgeAt(0)->getMemory().GetPrimitive();

int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;
const int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;

// TODO: Rewrite it in general case. For every tensor
// and rank, without using letter N,C,D,H,W
int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0;
int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0;
int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0;
int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0;
int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0;
const int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0;
const int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0;
const int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0;
const int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0;
const int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0;

// TODO: Check applicability of dyn_batch_lim in early steps.
// crop of batch dimension doesn't support dyn batch.
Expand All @@ -155,42 +154,16 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
const int IH = (src_ndims > 2) ? src_dims[src_dims.size() - 2] : 1;
const int IW = (src_ndims > 3) ? src_dims[src_dims.size() - 1] : 1;

const auto *src_data = reinterpret_cast<const float*>(parentMem.GetData()) +
parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
float *dst_data = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
const uint8_t itemSize = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(parentMem.GetDataType()));

const auto *src_data = reinterpret_cast<const uint8_t *>(parentMem.GetData()) +
itemSize * parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
auto *dst_data = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetData()) +
itemSize * getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;

#ifdef _WIN32
if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
for (int n = 0; n < ON; ++n) {
cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float));
}
} else {
for (int n = 0; n < ON; ++n) {
for (int c = 0; c < OC; c += m_block_size) {
for (int d = 0; d < OD; ++d) {
for (int h = 0; h < OH; ++h) {
int dst_ind =
n*OC*OD*OH*OW + c*OD*OH*OW + d*OH*OW*m_block_size +
h*OW*m_block_size;

int src_ind =
(n+OFFSET_N)*IC*ID*IH*IW +
(c+OFFSET_C)*ID*IH*IW +
(d+OFFSET_D)*IH*IW*m_block_size +
(h+OFFSET_H)*IW*m_block_size +
OFFSET_W*m_block_size;

cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float));
}
}
}
}
}
#else
if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
parallel_for(ON, [&](int n) {
cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float));
cpu_memcpy(dst_data + itemSize * n * OC, src_data + itemSize *((n+OFFSET_N)*IC + OFFSET_C), OC * itemSize);
});
} else {
parallel_for2d(ON, (OC / m_block_size), [&](int n, int c) {
Expand All @@ -201,15 +174,14 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
((d+OFFSET_D)*IH*IW + OFFSET_H*IW + OFFSET_W)*m_block_size;

for (int h = 0; h < OH; ++h) {
cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float));
cpu_memcpy(dst_data + itemSize * dst_ind, src_data + itemSize * src_ind, m_inner_dim * itemSize);

src_ind += IW * m_block_size;
dst_ind += OW * m_block_size;
}
}
});
}
#endif
}

bool MKLDNNCropNode::created() const {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "../src/legacy_api/include/legacy/ngraph_ops/crop_ie.hpp"
#include "ngraph_functions/builders.hpp"
#include "test_utils/cpu_test_utils.hpp"

using namespace InferenceEngine;
using namespace CPUTestUtils;

namespace CPULayerTestsDefinitions {

// Geometry of a single crop scenario, in tuple order.
using testCaseParams = std::tuple<
        std::vector<size_t>,     // input shape
        std::vector<int64_t>,    // dims
        std::vector<int64_t>>;   // offset

// Complete parameter pack consumed by one CropLayerCPUTest instance, in tuple order.
using CropLayerCPUTestParamSet = std::tuple<
        testCaseParams,                       // Crop geometry: input shape, dims, offset
        InferenceEngine::Precision,           // Net precision. It is the only precision used because the primitive is not supposed to convert precisions.
        std::string,                          // Device name
        std::map<std::string, std::string>,   // Additional network configuration
        CPUSpecificParams>;                   // CPU-specific parameters

// Parameterized fixture that builds a one-node CropIE function for the CPU plugin tests.
class CropLayerCPUTest : public testing::WithParamInterface<CropLayerCPUTestParamSet>,
                         virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
    // Composes the gtest case name from the crop geometry, the net precision, the device
    // name and the CPU-specific parameters. The additional configuration map is not part
    // of the name.
    static std::string getTestCaseName(testing::TestParamInfo<CropLayerCPUTestParamSet> obj) {
        testCaseParams cropCase = std::get<0>(obj.param);
        InferenceEngine::Precision precision = std::get<1>(obj.param);
        std::string deviceName = std::get<2>(obj.param);
        CPUSpecificParams cpuSpec = std::get<4>(obj.param);

        std::ostringstream caseName;
        caseName << "inShape=" << CommonTestUtils::vec2str(std::get<0>(cropCase)) << "_";
        caseName << "dims=" << CommonTestUtils::vec2str(std::get<1>(cropCase)) << "_";
        caseName << "offset=" << CommonTestUtils::vec2str(std::get<2>(cropCase)) << "_";
        caseName << "netPRC=" << precision.name() << "_";
        caseName << "targetDevice=" << deviceName;
        caseName << CPUTestsBase::getTestCaseName(cpuSpec);

        return caseName.str();
    }

protected:
    // Unpacks the test parameters into the fixture state and assembles the function
    // Parameter -> CropIE -> Result that the test runs.
    void SetUp() override {
        const auto& allParams = this->GetParam();

        testCaseParams cropCase = std::get<0>(allParams);
        InferenceEngine::Precision netPrecision = std::get<1>(allParams);
        targetDevice = std::get<2>(allParams);
        std::map<std::string, std::string> extraConfig = std::get<3>(allParams);
        std::tie(inFmts, outFmts, priority, selectedType) = std::get<4>(allParams);

        std::vector<size_t> inpShape = std::get<0>(cropCase);
        std::vector<int64_t> dims = std::get<1>(cropCase);
        std::vector<int64_t> offset = std::get<2>(cropCase);

        // The same precision is used on both sides of the node.
        inPrc = outPrc = netPrecision;

        configuration.insert(extraConfig.begin(), extraConfig.end());

        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
        auto params = ngraph::builder::makeParams(ngPrc, {inpShape});
        auto paramOuts = ngraph::helpers::convert2OutputVector(
                ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));

        // Every input dimension is listed as a crop axis: 0, 1, ..., rank - 1.
        std::vector<int64_t> axes;
        for (size_t axis = 0; axis < inpShape.size(); ++axis) {
            axes.push_back(axis);
        }
        auto cropNode = std::make_shared<ngraph::op::CropIE>(paramOuts[0], axes, dims, offset);

        // The expected primitive type is "unknown_<precision>"; the suffix stays empty for
        // precisions other than BF16 and FP32. This overrides the value unpacked from the
        // CPU-specific parameters above.
        const std::string strExpectedPrc =
                (Precision::BF16 == inPrc) ? "BF16" : ((Precision::FP32 == inPrc) ? "FP32" : "");
        selectedType = "unknown_" + strExpectedPrc;

        cropNode->get_rt_info() = getCPUInfo();

        ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(cropNode)};
        function = std::make_shared<ngraph::Function>(results, params, "Crop");
    }
};

// Runs the function built in SetUp() and then checks the CPU implementation
// that was used for the "Crop" node.
TEST_P(CropLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()

Run();
CheckCPUImpl(executableNetwork, "Crop");
}

namespace {
// Within the test scope we don't need any implicit bf16 optimisations, so let's run the network as is.
std::map<std::string, std::string> additional_config = {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}};

// Every group below is instantiated for both precisions.
std::vector<Precision> netPrc = {Precision::BF16, Precision::FP32};

// ---- 2D tensors, no CPU-specific layout requested ----
std::vector<testCaseParams> testCasesPlain2D = {testCaseParams{{32, 32}, {32, 10}, {0, 20}},
                                                testCaseParams{{32, 20}, {30, 10}, {2, 10}}};

const auto CropParamsPlain2D = ::testing::Combine(
        ::testing::ValuesIn(testCasesPlain2D),
        ::testing::ValuesIn(netPrc),
        ::testing::Values(CommonTestUtils::DEVICE_CPU),
        ::testing::Values(additional_config),
        ::testing::Values(emptyCPUSpec));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Plain_2D, CropLayerCPUTest, CropParamsPlain2D, CropLayerCPUTest::getTestCaseName);

// ---- 4D tensors, plain (nchw) layout ----
std::vector<testCaseParams> testCasesPlain4D = {testCaseParams{{1, 5, 32, 32}, {1, 2, 23, 23}, {0, 2, 5, 4}},
                                                testCaseParams{{1, 5, 32, 32}, {1, 5, 5, 5}, {0, 0, 20, 20}},
                                                testCaseParams{{1, 5, 32, 32}, {1, 5, 32, 10}, {0, 0, 0, 20}},
                                                testCaseParams{{1, 5, 32, 20}, {1, 5, 30, 10}, {0, 0, 2, 10}}};

// Index 0 is the blocked (nChw16c) spec, index 1 is the plain (nchw) spec.
std::vector<CPUSpecificParams> cpuParams_4D = {
        CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
        CPUSpecificParams({nchw}, {nchw}, {}, {})
};

const auto CropParamsPlain4D = ::testing::Combine(
        ::testing::ValuesIn(testCasesPlain4D),
        ::testing::ValuesIn(netPrc),
        ::testing::Values(CommonTestUtils::DEVICE_CPU),
        ::testing::Values(additional_config),
        ::testing::Values(cpuParams_4D.at(1)));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Plain_4D, CropLayerCPUTest, CropParamsPlain4D, CropLayerCPUTest::getTestCaseName);

// ---- 4D tensors, blocked layout ----
std::vector<testCaseParams> testCasesBlocked4D = {testCaseParams{{1, 16, 32, 32}, {1, 16, 5, 5}, {0, 0, 20, 20}},
                                                  testCaseParams{{1, 32, 32, 32}, {1, 16, 32, 10}, {0, 0, 0, 20}}};

// NOTE(review): filterCPUSpecificParams presumably drops specs the current CPU cannot run;
// .front() assumes at least one spec survives the filter - confirm.
const auto CropParamsBlocked4D = ::testing::Combine(
        ::testing::ValuesIn(testCasesBlocked4D),
        ::testing::ValuesIn(netPrc),
        ::testing::Values(CommonTestUtils::DEVICE_CPU),
        ::testing::Values(additional_config),
        ::testing::Values(filterCPUSpecificParams(cpuParams_4D).front()));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Blocked_4D, CropLayerCPUTest, CropParamsBlocked4D, CropLayerCPUTest::getTestCaseName);

// ---- 4D tensors, plain (nchw) layout, dynamic batch enabled ----
std::vector<testCaseParams> testCasesPlain4DynBatch = {testCaseParams{{10, 5, 32, 32}, {1, 2, 23, 23}, {0, 2, 5, 4}},
                                                       testCaseParams{{10, 5, 32, 32}, {1, 5, 5, 5}, {0, 0, 20, 20}},
                                                       testCaseParams{{10, 5, 32, 32}, {1, 5, 32, 10}, {0, 0, 0, 20}},
                                                       testCaseParams{{10, 5, 32, 20}, {1, 5, 30, 10}, {0, 0, 2, 10}}};

std::map<std::string, std::string> additional_config_dyn_batch = {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO},
                                                                  {PluginConfigParams::KEY_DYN_BATCH_ENABLED, PluginConfigParams::YES}};

const auto CropParamsPlain4DynBatch = ::testing::Combine(
        ::testing::ValuesIn(testCasesPlain4DynBatch),
        ::testing::ValuesIn(netPrc),
        ::testing::Values(CommonTestUtils::DEVICE_CPU),
        ::testing::Values(additional_config_dyn_batch),
        ::testing::Values(cpuParams_4D.at(1)));

// The suite prefix says "Plain" because this group runs the plain nchw spec (cpuParams_4D.at(1)),
// in line with the Plain/Blocked naming of the other instantiations.
INSTANTIATE_TEST_CASE_P(CompareWithRefs_Plain_4DynBatch, CropLayerCPUTest, CropParamsPlain4DynBatch, CropLayerCPUTest::getTestCaseName);

// ---- 5D tensors, plain (ncdhw) layout ----
std::vector<testCaseParams> testCasesPlain5D = {testCaseParams{{1, 5, 32, 20, 14}, {1, 5, 30, 10, 8}, {0, 0, 2, 10, 6}},
                                                testCaseParams{{5, 9, 32, 20, 14}, {2, 5, 30, 10, 8}, {3, 4, 2, 10, 6}}};

// Index 0 is the blocked (nCdhw16c) spec, index 1 is the plain (ncdhw) spec.
std::vector<CPUSpecificParams> cpuParams_5D = {
        CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}),
        CPUSpecificParams({ncdhw}, {ncdhw}, {}, {})
};

const auto CropParamsPlain5D = ::testing::Combine(
        ::testing::ValuesIn(testCasesPlain5D),
        ::testing::ValuesIn(netPrc),
        ::testing::Values(CommonTestUtils::DEVICE_CPU),
        ::testing::Values(additional_config),
        ::testing::Values(cpuParams_5D.at(1)));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Plain_5D, CropLayerCPUTest, CropParamsPlain5D, CropLayerCPUTest::getTestCaseName);

// ---- 5D tensors, blocked (nCdhw16c) layout ----
std::vector<testCaseParams> testCasesBlocked5D = {testCaseParams{{1, 32, 32, 20, 14}, {1, 16, 30, 10, 8}, {0, 0, 2, 10, 6}},
                                                  testCaseParams{{5, 32, 32, 20, 14}, {2, 32, 30, 10, 8}, {3, 0, 2, 10, 6}}};

const auto CropParamsBlocked5D = ::testing::Combine(
        ::testing::ValuesIn(testCasesBlocked5D),
        ::testing::ValuesIn(netPrc),
        ::testing::Values(CommonTestUtils::DEVICE_CPU),
        ::testing::Values(additional_config),
        ::testing::Values(cpuParams_5D.at(0)));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Blocked_5D, CropLayerCPUTest, CropParamsBlocked5D, CropLayerCPUTest::getTestCaseName);

} // namespace
} // namespace CPULayerTestsDefinitions

Loading

0 comments on commit 4422150

Please sign in to comment.