Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[PERFORMANCE] [v1.x] Layer normalization code from Marian for CPU #19601

Merged
merged 13 commits into from
Jan 5, 2021
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT
option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON
option(USE_LAPACK "Build with lapack support" ON)
option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently slower than internal. No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
option(USE_MKLDNN "Build with MKL-DNN support" ON)
else()
Expand Down Expand Up @@ -279,6 +280,9 @@ if(ENABLE_TESTCOVERAGE)
link_libraries(gcov)
endif()

if(USE_MKL_LAYERNORM)
add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
endif()
if(USE_MKLDNN)
# CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
if(MSVC)
Expand Down Expand Up @@ -447,6 +451,16 @@ if(USE_OPENMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
# Enable pragma omp simd
# "While the name of this switch is 'experimental', the switch itself, and
# the functionality it enables is fully supported and production-ready.
# The name reflects that it doesn’t enable any complete subset or
# version of an OpenMP standard."
# -- https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
if(MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
endif()
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
add_definitions(-DMXNET_USE_OPENMP=1)
Expand Down
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@
docs/python_docs/themes/mx-theme
3rdparty/intgemm
3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
src/operator/nn/layer_norm.cc

=======================================================================================
3-clause BSD license
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,11 @@ ifeq ($(USE_MKLDNN), 1)
LIB_DEP += $(MKLDNNROOT)/lib/libdnnl.a
endif

# Use MKL's layernorm implementation. Only has an impact if MKL is compiled in.
ifeq ($(USE_MKL_LAYERNORM), 1)
CFLAGS += -DMXNET_USE_MKL_LAYERNORM=1
endif

# setup opencv
ifeq ($(USE_OPENCV), 1)
CFLAGS += -DMXNET_USE_OPENCV=1
Expand Down
1 change: 1 addition & 0 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ build_ubuntu_cpu_mkl() {
DEV=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=mkl \
USE_MKL_LAYERNORM=1 \
USE_TVM_OP=1 \
USE_MKLDNN=0 \
USE_INTEL_PATH=/opt/intel \
Expand Down
174 changes: 156 additions & 18 deletions src/operator/nn/layer_norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,37 @@
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* Function LayerNormCPUKernel is adapated from Marian
* https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
* under the MIT license
* MIT License
*
* Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
* Mickiewicz University
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* All or part of this file was contributed by Intel under license:
* Copyright (C) 2017-2018 Intel Corporation
* SPDX-License-Identifier: MIT
*
*/

/*!
Expand All @@ -27,10 +58,6 @@
#include <nnvm/op_attr_types.h>
#include "../elemwise_op_common.h"

#if MSHADOW_USE_MKL == 1
#include "../mkl_functions-inl.h"
#endif

namespace mxnet {
namespace op {

Expand Down Expand Up @@ -68,23 +95,126 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
return true;
}

template<>
void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
/* CPU optimized kernel for LayerNorm assuming axis = -1.
* Data is the underlying storage data type.
* Accum is the type to use for accumulation.
* Apparently there isn't a reduction operator for half_t and anyway it isn't
* efficient to use on the CPU, so use float for reduction of half_t.
*
* width is the number of values being summed to compute a mean.
* instances is how many independent layer normalization problems are packed into the tensors.
*
* Inputs:
* data is instances x width
* gamma is width
* beta is width
*
* Outputs:
* out is instances x width, can be same as data
* mean is instances: means of each problem
* std is instances: standard deviation of each problem
*
*/
template <typename Data, typename Accum = typename
/* By default accumulate in float32 for float16. Otherwise use same type. */
std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
float,
Data>::type>
void LayerNormCPUKernel(size_t width,
size_t instances,
Data eps,
const Data *data,
const Data *gamma,
const Data *beta,
Data *out,
Data *mean,
Data *std) {
// Parallelize over independent instances to normalize.
// MSVC says index variable in OpenMP 'for' statement must have signed integral type.
const mshadow::index_t signed_instances = static_cast<mshadow::index_t>(instances);
#pragma omp parallel for
for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
const Data *from = data + j * width;

// Sum the values to compute mean.
Accum sum = 0.f;
#pragma omp simd reduction(+ : sum)
for (size_t i = 0; i < width; ++i) {
sum += from[i];
}
Accum mean_value = sum / width;
mean[j] = static_cast<Data>(mean_value);

// Sum squares from mean to compute stddev.
Accum squares = 0.f;
#pragma omp simd reduction(+ : squares)
for (size_t i = 0; i < width; ++i) {
Accum off = from[i] - mean_value;
squares += off * off;
}
Accum sigma = std::sqrt(squares / width + eps);
std[j] = static_cast<Data>(sigma);

// Write normalized values.
Data *to = out + j * width;
#pragma omp simd
for (size_t i = 0; i < width; ++i) {
to[i] = (from[i] - mean_value) * gamma[i] / sigma + beta[i];
}
}
}

#if MSHADOW_USE_MKL == 1
void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
/* Wrap the above LayerNormCPUKernel in MXNet's API. Returns true if it
* is able to run.
*/
bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
CHECK_EQ(inputs.size(), 3U);
CHECK_EQ(outputs.size(), 3U);

switch (req[layernorm::kOut]) {
case kNullOp:
return true;
case kWriteTo:
break;
case kWriteInplace:
break;
default:
// Should only be kAddTo, which isn't supported by the others implementation either.
return false;
}
// Axis must be the last one.
int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
if (axis != inputs[layernorm::kData].ndim() - 1) {
return false;
}
MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
LayerNormCPUKernel<DType>(
inputs[layernorm::kData].shape_[axis],
outputs[layernorm::kMean].Size(),
param.eps,
inputs[layernorm::kData].dptr<DType>(),
inputs[layernorm::kGamma].dptr<DType>(),
inputs[layernorm::kBeta].dptr<DType>(),
outputs[layernorm::kOut].dptr<DType>(),
outputs[layernorm::kMean].dptr<DType>(),
outputs[layernorm::kStd].dptr<DType>());
});
return true;
}

#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
if (req[0] == kNullOp) return;
if (req[0] == kNullOp) return true;
CHECK_NE(req[0], kAddTo);
CHECK_EQ(inputs.size(), 3U);
int axis = GetRealAxis(param.axis, inputs[0].ndim());
Expand Down Expand Up @@ -113,13 +243,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
outputs[layernorm::kStd].dptr<DType>(),
static_cast<DType>(param.eps));
});
return true;
} else {
// fallback
LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
return false;
}
}
#endif

template<>
void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
#endif
if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
}

template<>
void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
Expand Down Expand Up @@ -175,11 +317,7 @@ axis to be the last item in the input shape.
})
.set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
#if MSHADOW_USE_MKL == 1
.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
#else
.set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
#endif
.set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
const std::vector<nnvm::NodeEntry>& ograds) {
std::vector<nnvm::NodeEntry> heads;
Expand Down