From 9653b67c18f6a2d69724b504574039acc3bc6d81 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 9 Apr 2019 01:48:57 -0700 Subject: [PATCH 001/105] Beginning of RTC of pointwise ops --- src/operator/fusion/fused_op-inl.h | 420 +++++++++++++++++++++++++++++ src/operator/fusion/fused_op.cc | 59 ++++ src/operator/fusion/fused_op.cu | 298 ++++++++++++++++++++ src/operator/fusion/fused_op.h | 110 ++++++++ 4 files changed, 887 insertions(+) create mode 100644 src/operator/fusion/fused_op-inl.h create mode 100644 src/operator/fusion/fused_op.cc create mode 100644 src/operator/fusion/fused_op.cu create mode 100644 src/operator/fusion/fused_op.h diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h new file mode 100644 index 000000000000..26a0ec93b6f0 --- /dev/null +++ b/src/operator/fusion/fused_op-inl.h @@ -0,0 +1,420 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ +#define MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ + +#include + +namespace mxnet { + +namespace detail { + +const std::string fp16_support_string = R"code( +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) +#if defined(__cplusplus) + struct __align__(2) __half { + __host__ __device__ __half() { } + protected: + unsigned short __x; + }; + /* All intrinsic functions are only available to nvcc compilers */ + #if defined(__CUDACC__) + /* Definitions of intrinsics */ + __device__ inline __half __float2half(const float f) { + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; + } + __device__ inline float __half2float(const __half h) { + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); + return val; + } + #endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ +#undef __HALF_TO_US +#undef __HALF_TO_CUS +typedef __half half; +)code"; + +const std::string fused_op_function_definitions = R"code( +template +struct LoadType { + using Type = DType; +}; + +template <> +struct LoadType { + using Type = float; +}; + +template +inline typename LoadType::Type load(const DType * input, int i) { + return input[i]; +} + +template <> +inline float load(const half * input, int i) { + return __half2float(input[i]); +} + +template +inline void store(const typename LoadType::Type value, int i, DType * output) { + output[i] = value; +} + +template <> +inline void store(const float value, int i, half * output) { + output[i] = __float2half(value); +} + +template +inline DType add(const DType a, const DType2 b) { + return a + b; +} + +template +inline DType sub(const DType a, const DType2 b) { + return a - b; +} + +template +inline DType mul(const DType a, const DType2 b) { + return a * b; +} + +template +inline DType div(const DType a, const DType2 b) { + return a / b; +} + +template +inline 
DType pow(const DType a, const DType2 b) { + return powf(a, b); +} + +template +inline DType max(const DType a, const DType2 b) { + return a > b ? a : b; +} + +template +inline DType min(const DType a, const DType2 b) { + return a < b ? a : b; +} + +template +inline OutType cast(const DType val) { + return static_cast(val); +} + +// activations + +template +inline DType relu(const DType val) { + return val > 0 ? val : 0; +} + +template +inline DType backward_relu(const DType val, const DType grad) { + return val > 0 ? grad : 0; +} + +template +inline DType sigmoid(const DType val) { + return 1.f/(1 + expf(-val)); +} + +template +inline DType backward_sigmoid(const DType val, const DType grad) { + const DType ep1 = 1 + expf(-val); + return grad * expf(-val)/(ep1*ep1); +} + +template +inline DType softrelu(const DType val) { + return logf(1 + expf(val)); +} + +template +inline DType backward_softrelu(const DType val, const DType grad) { + return grad * sigmoid(val); +} + +template +inline DType softsign(const DType val) { + return val / (1 + absf(val)); +} + +template +inline DType backward_softsign(const DType val, const DType grad) { + const DType ap1 = 1 + absf(val); + return grad / (ap1 * ap1); +} + +// exp and log + +template +inline DType exp(const DType val) { + return expf(val); +} + +template +inline DType backward_exp(const DType val, const DType grad) { + return grad * expf(val); +} + +template +inline DType expm1(const DType val) { + return expm1f(val); +} + +template +inline DType backward_expm1(const DType val, const DType grad) { + return grad * expf(val); +} + +template +inline DType log(const DType val) { + return logf(val); +} + +template +inline DType backward_log(const DType val, const DType grad) { + return grad / val; +} + +template +inline DType log10(const DType val) { + return log10f(val); +} + +template +inline DType backward_log10(const DType val, const DType grad) { + return grad / (val * logf(10)); +} + +template +inline DType log2(const 
DType val) { + return log2f(val); +} + +template +inline DType backward_log2(const DType val, const DType grad) { + return grad / (val * logf(2)); +} + +template +inline DType log1p(const DType val) { + return log1pf(val); +} + +template +inline DType backward_log1p(const DType val, const DType grad) { + return grad / (1 + val); +} + +// trigonometric + +template +inline DType sin(const DType val) { + return sinf(val); +} + +template +inline DType backward_sin(const DType val, const DType grad) { + return grad * cosf(val); +} + +template +inline DType cos(const DType val) { + return cosf(val); +} + +template +inline DType backward_cos(const DType val, const DType grad) { + return -grad * sinf(val); +} + +template +inline DType tan(const DType val) { + return tanf(val); +} + +// Uses output from tan +template +inline DType backward_tan(const DType val, const DType grad) { + return grad * (val * val + 1); +} + +template +inline DType arcsin(const DType val) { + return asinf(val); +} + +template +inline DType backward_arcsin(const DType val, const DType grad) { + return grad / sqrtf(1 - val*val); +} + +template +inline DType arccos(const DType val) { + return acosf(val); +} + +template +inline DType backward_arccos(const DType val, const DType grad) { + return -grad / sqrtf(1 - val*val); +} + +template +inline DType arctan(const DType val) { + return atanf(val); +} + +template +inline DType backward_arctan(const DType val, const DType grad) { + return grad / (1 + val*val); +} + +template +inline DType sinh(const DType val) { + return sinhf(val); +} + +template +inline DType backward_sinh(const DType val, const DType grad) { + return grad * coshf(val); +} + +template +inline DType cosh(const DType val) { + return coshf(val); +} + +template +inline DType backward_cosh(const DType val, const DType grad) { + return grad * sinhf(val); +} + +template +inline DType tanh(const DType val) { + return tanhf(val); +} + +// Uses tanh output +template +inline DType 
backward_tanh(const DType val, const DType grad) { + return grad * (1 - val * val); +} + +template +inline DType arcsinh(const DType val) { + return asinhf(val); +} + +template +inline DType backward_arcsinh(const DType val, const DType grad) { + return grad / sqrtf(val * val + 1); +} + +template +inline DType arccosh(const DType val) { + return acoshf(val); +} + +template +inline DType backward_arccosh(const DType val, const DType grad) { + return grad / sqrtf(val * val - 1); +} + +template +inline DType arctanh(const DType val) { + return atanhf(val); +} + +template +inline DType backward_arctanh(const DType val, const DType grad) { + return grad / (1 - val * val); +} + +// sqrt + +template +inline DType sqrt(const DType val) { + return sqrtf(val); +} + +template +inline DType backward_sqrt(const DType val, const DType grad) { + return 0.5 * grad * rsqrtf(val); +} + +template +inline DType rsqrt(const DType val) { + return rsqrtf(val); +} + +template +inline DType backward_rsqrt(const DType val, const DType grad) { + const DType inv = 1 / val; + return -0.5 * grad * sqrtf(inv) * inv; +} + +template +inline DType cbrt(const DType val) { + return cbrtf(val); +} + +template +inline DType backward_cbrt(const DType val, const DType grad) { + const DType inv = rcbrtf(val); + return 1.f/3.f * grad * inv * inv; +} + +template +inline DType rcbrt(const DType val) { + return rcbrtf(val); +} + +template +inline DType backward_rcbrt(const DType val, const DType grad) { + const DType inv = 1 / val; + return -1.f/3.f * grad * cbrtf(inv) * inv; +} + +)code"; + +const std::string fused_op_kernel_begin = R"code( +const int tid = threadIdx.x + blockIdx.x * blockDim.x; +for (int i = tid; i < N; i+= gridDim.x * blockDim.x) { +)code"; + +const std::string fused_op_kernel_end = R"code( +} +} +)code"; + +} // namespace detail + +} // namespace mxnet + +#endif // MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc new file 
mode 100644 index 000000000000..ced2da00f234 --- /dev/null +++ b/src/operator/fusion/fused_op.cc @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "./fused_op.h" +#include "../operator_common.h" + +namespace mxnet { + +DMLC_REGISTER_PARAMETER(FusedOpConfig); + +void FusedOpParamParser(nnvm::NodeAttrs* attrs) { + FusedOpConfig param; + try { + param.Init(attrs->dict); + } catch (const dmlc::ParamError& e) { + std::ostringstream os; + os << e.what(); + os << ", in operator " << attrs->op->name << "(" + << "name=\"" << attrs->name << "\""; + for (const auto& k : attrs->dict) { + os << ", " << k.first << "=\"" << k.second << "\""; + } + os << ")"; + throw dmlc::ParamError(os.str()); + } + if (!param.code.empty()) { + attrs->parsed = FusedOpPtr(new FusedOp(param)); + } +} + +NNVM_REGISTER_OP(FusedOp) +.set_num_inputs([](const NodeAttrs& attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + return op->num_inputs(); + }) +.set_num_outputs([](const NodeAttrs& attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + return op->num_outputs(); + }) +.set_attr_parser(FusedOpParamParser) +.add_argument("data", "NDArray-or-Symbol[]", "Data"); + +} // namespace mxnet diff --git 
a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu new file mode 100644 index 000000000000..8390d899c16b --- /dev/null +++ b/src/operator/fusion/fused_op.cu @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include "./fused_op.h" +#include "./fused_op-inl.h" +#include "../operator_common.h" +#include "../elemwise_op_common.h" +#include "../../executor/exec_pass.h" +#include + +namespace mxnet { + +namespace detail { + +inline std::string mshadowTypeToString(int type) { + switch (type) { + case mshadow::kFloat32: + return "float"; + case mshadow::kFloat64: + return "double"; + case mshadow::kFloat16: + return "half"; + case mshadow::kUint8: + return "unsigned char"; + case mshadow::kInt8: + return "char"; + case mshadow::kInt32: + return "int"; + case mshadow::kInt64: + return "long long"; + default: + LOG(FATAL) << "Unknown type enum " << type; + } + return ""; +} + +} // namespace detail + +FusedOp::FusedOp(const FusedOpConfig& config) { + this->code_ = config.code; + this->inputs_ = std::vector(config.num_inputs); + this->outputs_ = std::vector(config.num_outputs); + this->symbol_ = nnvm::pass::LoadJSON(config.symbol_json); + this->initialized_ = false; +} + +template <> +void FusedOp::Forward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mshadow; + CHECK_GE(outputs.size(), 1) << "There needs to be at least 1 output."; + + if (!initialized_) { + int device; + CUdevice cuDevice; + CUcontext context; + CUmodule module; + CUDA_CALL(cudaGetDevice(&device)) + CUDA_DRIVER_CALL(cuDeviceGet(&cuDevice, device)); + CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cuDevice)); + CUDA_DRIVER_CALL(cuModuleLoadData(&module, &ptx_[0])); + CUDA_DRIVER_CALL(cuModuleGetFunction(&kernel_, + module, + kernel_name_.c_str())); + initialized_ = true; + } + Stream* s = ctx.get_stream(); + auto stream = Stream::GetStream(s); + std::vector args; + size_t N = outputs[0].shape_.Size(); + args.push_back(&N); + unsigned int num_blocks = (N + FusedOp::NTHREADS - 1) / FusedOp::NTHREADS; + std::vector ptrs; + for (const auto &data : inputs) { + 
MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { + Tensor tensor = data.FlatTo1D(s); + ptrs.push_back(tensor.dptr_); + }); + } + for (const auto &data : outputs) { + MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { + Tensor tensor = data.FlatTo1D(s); + ptrs.push_back(tensor.dptr_); + }); + } + for (auto &ptr : ptrs) { + args.push_back(reinterpret_cast(&ptr)); + } + CUDA_DRIVER_CALL( + cuLaunchKernel(kernel_, + num_blocks, 1, 1, // grid dim + FusedOp::NTHREADS, 1, 1, // block dim + 0, stream, // shared mem and stream + &(args[0]), 0)); // arguments +} + +template <> +void FusedOp::Backward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + std::cout << "Backward!" << std::endl; +} + +template <> +bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + std::vector input_shapes(*in_attrs); + this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), + std::move(input_shapes), + "__shape__"); + + const auto& g = this->symbol_.indexed_graph(); + + std::vector out_shapes; + const std::vector shapes = this->symbol_.GetAttr("shape"); + for (auto& e : g.outputs()) { + out_shapes.push_back(shapes[g.entry_id(e)]); + } + CHECK_EQ(out_shapes.size(), out_attrs->size()); + for (size_t i = 0; i < out_attrs->size(); ++i) { + op::shape_assign(&(out_attrs->at(i)), out_shapes[i]); + } + bool inferred = true; + for (const auto& attr : *in_attrs) { + inferred = inferred && !op::shape_is_none(attr); + } + for (const auto& attr : *out_attrs) { + inferred = inferred && !op::shape_is_none(attr); + } + return inferred; +} + +template <> +bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + std::vector input_types(*in_attrs); + this->symbol_ = mxnet::exec::InferType(std::move(this->symbol_), + std::move(input_types), + "__dtype__"); + + const auto& g = this->symbol_.indexed_graph(); + + 
std::vector out_types; + const std::vector types = this->symbol_.GetAttr("dtype"); + for (auto& e : g.outputs()) { + out_types.push_back(types[g.entry_id(e)]); + } + CHECK_EQ(out_types.size(), out_attrs->size()); + for (size_t i = 0; i < out_attrs->size(); ++i) { + op::type_assign(&(out_attrs->at(i)), out_types[i]); + } + bool inferred = true; + for (const auto& attr : *in_attrs) { + inferred = inferred && !op::type_is_none(attr); + } + for (const auto& attr : *out_attrs) { + inferred = inferred && !op::type_is_none(attr); + } + const bool types_known = inferred; + if (types_known) { + LOG(INFO) << "Without types"; + LOG(INFO) << code_; + LOG(INFO) << "Filling type information"; + std::string aux_code = ""; + std::string kernel_params = ""; + size_t num_params = in_attrs->size() + out_attrs->size(); + size_t i = 0; + for (const auto &type : *in_attrs) { + std::string type_name = detail::mshadowTypeToString(type); + aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; + kernel_params += "DType" + std::to_string(i) + "* input" + std::to_string(i); + ++i; + if (i < num_params) { + kernel_params += ", "; + } + } + for (const auto &type : *out_attrs) { + std::string type_name = detail::mshadowTypeToString(type); + aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; + kernel_params += "DType" + std::to_string(i) + "* output" + + std::to_string(i - in_attrs->size()); + ++i; + if (i < num_params) { + kernel_params += ", "; + } + } + code_ = detail::fp16_support_string + "\n" + + detail::fused_op_function_definitions + "\n" + + aux_code + "\n" + + "__global__ void FusedKernel_" + attrs.name + + "(size_t N, " + kernel_params + ") {\n" + + detail::fused_op_kernel_begin + "\n" + + code_ + "\n" + + detail::fused_op_kernel_end; + LOG(INFO) << code_; + nvrtcProgram program; + NVRTC_CALL( + nvrtcCreateProgram(&program, // prog + &code_[0], // buffer + (attrs.name + "_kernel.cu").c_str(), // name + 0, // numHeaders + 
NULL, // headers + NULL)); // includeNames + const char *opts[] = {"--gpu-architecture=compute_70", + "--std=c++11", + "-default-device"}; + const std::string kernel_name_demangled = "FusedKernel_" + attrs.name; + NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); + + nvrtcResult compileResult = nvrtcCompileProgram(program, // prog + 3, // numOptions + opts); // options + // Obtain compilation log from the program. + size_t logSize; + NVRTC_CALL(nvrtcGetProgramLogSize(program, &logSize)); + std::string log(logSize, '\0'); + NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); + CHECK_EQ(compileResult, NVRTC_SUCCESS) << "NVRTC Compilation failed.\n" << log; + // Obtain PTX from the program. + size_t ptxSize; + NVRTC_CALL(nvrtcGetPTXSize(program, &ptxSize)); + ptx_.reserve(ptxSize); + NVRTC_CALL(nvrtcGetPTX(program, &ptx_[0])); + const char *name; + NVRTC_CALL(nvrtcGetLoweredName(program, + kernel_name_demangled.c_str(), + &name)); + kernel_name_ = name; + // Destroy the program. 
+ NVRTC_CALL(nvrtcDestroyProgram(&program)); + } + return types_known; +} + + + +void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + op->Forward(attrs, ctx, inputs, req, outputs); +} +void FusedOpBackwardGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + op->Backward(attrs, ctx, inputs, req, outputs); +} + +bool FusedOpInferShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + return op->InferShape(attrs, in_attrs, out_attrs); +} + +bool FusedOpInferType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + return op->InferType(attrs, in_attrs, out_attrs); +} + +NNVM_REGISTER_OP(FusedOp) +.set_attr("FInferShape", FusedOpInferShape) +.set_attr("FInferType", FusedOpInferType) +.set_attr("FCompute", FusedOpForwardGPU); + +} // namespace mxnet diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h new file mode 100644 index 000000000000..decf2026ce4b --- /dev/null +++ b/src/operator/fusion/fused_op.h @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef MXNET_OPERATOR_FUSION_FUSED_OP_H_ +#define MXNET_OPERATOR_FUSION_FUSED_OP_H_ + +#include +#include +#include +#include + +namespace mxnet { + +struct FusedOpConfig : public dmlc::Parameter { + std::string code; + std::string symbol_json; + int num_inputs; + int num_outputs; + DMLC_DECLARE_PARAMETER(FusedOpConfig) { + DMLC_DECLARE_FIELD(code) + .describe("Generated code."); + DMLC_DECLARE_FIELD(symbol_json) + .describe("JSON of the replaced symbol."); + DMLC_DECLARE_FIELD(num_inputs) + .describe("Number of inputs."); + DMLC_DECLARE_FIELD(num_outputs) + .describe("Number of outputs."); + } +}; + +struct FusedOpEntry {}; + +class FusedOp { + public: + static const int NTHREADS = 512; + + explicit FusedOp(const FusedOpConfig& config); + ~FusedOp() {} + uint32_t num_inputs() const { + return inputs_.size(); + } + uint32_t num_outputs() const { + return outputs_.size(); + } + uint32_t num_backward_inputs() const { + return backward_inputs_.size(); + } + uint32_t num_backward_outputs() const { + return backward_outputs_.size(); + } + + template + void Forward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); + + template + void Backward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); + + template + bool InferShape(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs); + + template + bool InferType(const nnvm::NodeAttrs &attrs, + 
std::vector *in_attrs, + std::vector *out_attrs); + + private: + std::vector inputs_; + std::vector outputs_; + std::vector backward_inputs_; + std::vector backward_outputs_; + + std::string code_; + nnvm::Graph symbol_; + std::string ptx_; + std::string kernel_name_; + bool initialized_; + CUfunction kernel_; +}; + +using FusedOpPtr = std::shared_ptr; + +} // namespace mxnet + +#endif // MXNET_OPERATOR_FUSION_FUSED_OP_H_ From 0e1774f55f19439715f3ca40d76d5531c1c6760c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 29 Apr 2019 05:53:38 -0700 Subject: [PATCH 002/105] Code generation from the given JSON --- src/operator/fusion/fused_op-inl.h | 327 +++++++++++++++++++++++++++-- src/operator/fusion/fused_op.cc | 5 +- src/operator/fusion/fused_op.cu | 319 +++++++++++++++++++--------- src/operator/fusion/fused_op.h | 27 +-- 4 files changed, 546 insertions(+), 132 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 26a0ec93b6f0..e82afeec63eb 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -21,6 +21,9 @@ #define MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ #include +#include +#include +#include "../nn/activation-inl.h" namespace mxnet { @@ -55,6 +58,173 @@ const std::string fp16_support_string = R"code( typedef __half half; )code"; +const std::string type_support_string = R"code( +using float32 = float; +using float64 = double; +using float16 = half; +using uint8 = unsigned char; +using int8 = char; +using int32 = int; +using int64 = long long; +)code"; + +const std::map fused_op_binary_ops = { + {"elemwise_add", "add"}, + {"_plus" , "add"}, + {"_Plus" , "add"}, + {"_add" , "add"}, + {"elemwise_sub", "sub"}, + {"_minus" , "sub"}, + {"_Minus" , "sub"}, + {"_sub" , "sub"}, + {"elemwise_mul", "mul"}, + {"_mul" , "mul"}, + {"_Mul" , "mul"}, + {"elemwise_div", "div"}, + {"_div" , "div"}, + {"_Div" , "div"}, + {"_Power" , "pow"}, + {"_power" , "pow"}, + {"_Maximum" , "max"}, + 
{"_maximum" , "max"}, + {"_Minimum" , "min"}, + {"_minimum" , "min"} +}; + +const std::map fused_op_unary_ops = { + {"amp_cast" , "identity"}, + {"relu" , "relu"}, + {"sigmoid" , "sigmoid"}, + {"softsign" , "softsign"}, + {"exp" , "exp"}, + {"expm1" , "expm1"}, + {"log" , "log"}, + {"log10" , "log10"}, + {"log2" , "log2"}, + {"log1p" , "log1p"}, + {"degrees" , "degrees"}, + {"radians" , "radians"}, + {"sin" , "sin"}, + {"cos" , "cos"}, + {"tan" , "tan"}, + {"arcsin" , "arcsin"}, + {"arccos" , "arccos"}, + {"arccos" , "arccos"}, + {"arctan" , "arctan"}, + {"sinh" , "sinh"}, + {"cosh" , "cosh"}, + {"tanh" , "tanh"}, + {"arcsinh" , "arcsinh"}, + {"arccosh" , "arccosh"}, + {"arctanh" , "arctanh"}, + {"sqrt" , "sqrt"}, + {"rsqrt" , "rsqrt"}, + {"cbrt" , "cbrt"}, + {"rcbrt" , "rcbrt"}, + {"square" , "square"}, + {"squeeze" , "identity"}, + {"zeros_like" , "zero"}, + {"ones_like" , "one"}, + {"flatten" , "identity"}, + {"Reshape" , "identity"}, + {"reshape" , "identity"}, + {"expand_dims", "identity"}, + {"round" , "round"}, + {"rint" , "rint"}, + {"fix" , "fix"}, + {"floor" , "floor"}, + {"ceil" , "ceil"}, + {"trunc" , "trunc"}, + {"sign" , "sign"}, + {"reciprocal" , "reciprocal"}, + {"abs" , "abs"}, + {"gamma" , "gamma"}, + {"gammaln" , "gammaln"}, + {"erf" , "erf"}, + {"erfinv" , "erfinv"}, + {"_copy" , "identity"} +}; + +const std::map> fused_op_special_ops = { + {"_plus_scalar", {"add(%, %)", "_0", "scalar"}}, + {"_PlusScalar", {"add(%, %)", "_0", "scalar"}}, + {"_minus_scalar", {"sub(%, %)", "_0", "scalar"}}, + {"_MinusScalar", {"sub(%, %)", "_0", "scalar"}}, + {"_rminus_scalar", {"(-sub(%, %))", "_0", "scalar"}}, + {"_RMinusScalar", {"(-sub(%, %))", "_0", "scalar"}}, + {"_mul_scalar", {"mul(%, %)", "_0", "scalar"}}, + {"_MulScalar", {"mul(%, %)", "_0", "scalar"}}, + {"_div_scalar", {"div(%, %)", "_0", "scalar"}}, + {"_DivScalar", {"div(%, %)", "_0", "scalar"}}, + {"_rdiv_scalar", {"rdiv(%, %)", "_0", "scalar"}}, + {"_RDivScalar", {"rdiv(%, %)", "_0", "scalar"}}, + 
{"Cast", {"cast<%>(%)", "dtype", "_0"}}, + {"cast", {"cast<%>(%)", "dtype", "_0"}}, + {"Activation", {"%(%)", "act_type", "_0"}}, + {"clip", {"clip(%, %, %)", "_0", "a_min", "a_max"}}, + {"_zeros", {"zero<%>(0)", "dtype"}}, + {"_ones", {"one<%>(0)", "dtype"}}, + {"negative", {"(-%)", "_0"}}, + {"_hypot", {"hypot(%, %)", "_0", "_1"}}, + {"_hypot_scalar", {"hypot(%, %)", "_0", "scalar"}}, + {"_backward_relu", {"backward_relu(%, %)", "_1", "_0"}}, + {"_backward_sigmoid", {"backward_sigmoid(%, %)", "_1", "_0"}}, + {"_backward_Activation", {"((% == " + std::to_string(mxnet::op::activation::kReLU) + + " || % == " + std::to_string(mxnet::op::activation::kSigmoid) + + " || % == " + std::to_string(mxnet::op::activation::kTanh) + + ") ? backward_%(%, %) : backward_%(%, %))", + "act_type", "act_type", "act_type", "act_type", + "_1", "_0", "_2", "_0"}}, + {"_backward_expm1", {"backward_expm1(%, %)", "_1", "_0"}}, + {"_backward_log", {"backward_log(%, %)", "_1", "_0"}}, + {"_backward_log10", {"backward_log10(%, %)", "_1", "_0"}}, + {"_backward_log2", {"backward_log2(%, %)", "_1", "_0"}}, + {"_backward_log1p", {"backward_log1p(%, %)", "_1", "_0"}}, + {"_backward_sin", {"backward_sin(%, %)", "_1", "_0"}}, + {"_backward_cos", {"backward_cos(%, %)", "_1", "_0"}}, + {"_backward_tan", {"backward_tan(%, %)", "_1", "_0"}}, + {"_backward_arcsin", {"backward_arcsin(%, %)", "_1", "_0"}}, + {"_backward_arccos", {"backward_arccos(%, %)", "_1", "_0"}}, + {"_backward_arctan", {"backward_arctan(%, %)", "_1", "_0"}}, + {"_backward_sinh", {"backward_sinh(%, %)", "_1", "_0"}}, + {"_backward_cosh", {"backward_cosh(%, %)", "_1", "_0"}}, + {"_backward_tanh", {"backward_tanh(%, %)", "_1", "_0"}}, + {"_backward_arcsinh", {"backward_arcsinh(%, %)", "_1", "_0"}}, + {"_backward_arccosh", {"backward_arccosh(%, %)", "_1", "_0"}}, + {"_backward_arctanh", {"backward_arctanh(%, %)", "_1", "_0"}}, + {"_backward_sqrt", {"backward_sqrt(%, %)", "_1", "_0"}}, + {"_backward_rsqrt", {"backward_rsqrt(%, %)", "_1", 
"_0"}}, + {"_backward_cbrt", {"backward_cbrt(%, %)", "_1", "_0"}}, + {"_backward_rcbrt", {"backward_rcbrt(%, %)", "_1", "_0"}}, + {"_backward_square", {"backward_square(%, %)", "_1", "_0"}}, + {"_backward_div_scalar", {"(% / %)", "_0", "scalar"}}, + {"_backward_div_scalar", {"(% / %)", "_0", "scalar"}}, + {"_backward_rdiv_scalar", {"(-% * % / (% * %))", "_0", "scalar", "_1", "_1"}}, + {"_backward_hypot_scalar", {"(% * % / hypot(%, %))", "_0", "_1", "_1", "scalar"}} + // TODO(ptredak): arange +}; + +// Multiple inputs/multiple outputs +const std::map>> fused_op_mimo_ops = { + {"_backward_sub", {{"(%)", "_0"}, + {"(-(%))", "_0"}}}, + {"_backward_mul", {{"(% * %)", "_0", "_2"}, + {"(% * %)", "_0", "_1"}}}, + {"_backward_mul_scalar", {{"(% * %)", "_0", "scalar"}}}, + {"_backward_div", {{"(% / %)", "_0", "_2"}, + {"(-% * % / (% * %))", "_0", "_1", "_2", "_2"}}}, + {"_backward_power", {{"(% * % * powf(%, % - 1))", "_0", "_2", "_1", "_2"}, + {"(% * powf(%, %) & logf(%))", "_0", "_1", "_2", "_1"}}}, + {"_backward_power_scalar", {{"(% * % * powf(%, % - 1))", "_0", "scalar", "_1", "scalar"}}}, + {"_backward_rpower_scalar", {{"(% * powf(%, %) & logf(%))", "_0", "scalar", "_2", "scalar"}}}, + {"_backward_maximum", {{"((% > %) ? % : 0)", "_1", "_2", "_0"}, + {"((% > %) ? 0 : %)", "_1", "_2", "_0"}}}, + {"_backward_minimum", {{"((% < %) ? % : 0)", "_1", "_2", "_0"}, + {"((% < %) ? 
0 : %)", "_1", "_2", "_0"}}}, + {"_backward_hypot", {{"(% * % / hypot(%, %))", "_0", "_1", "_1", "_2"}, + {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} +}; + const std::string fused_op_function_definitions = R"code( template struct LoadType { @@ -86,6 +256,11 @@ inline void store(const float value, int i, half * output) { output[i] = __float2half(value); } +template +inline DType identity(const DType val) { + return val; +} + template inline DType add(const DType a, const DType2 b) { return a + b; @@ -106,6 +281,11 @@ inline DType div(const DType a, const DType2 b) { return a / b; } +template +inline DType rdiv(const DType a, const DType2 b) { + return b / a; +} + template inline DType pow(const DType a, const DType2 b) { return powf(a, b); @@ -121,9 +301,21 @@ inline DType min(const DType a, const DType2 b) { return a < b ? a : b; } +template +inline DType hypot(const DType a, const DType2 b) { + return hypotf(a, b); +} + template -inline OutType cast(const DType val) { - return static_cast(val); +inline typename LoadType::Type cast(const DType val) { + return static_cast::Type>(val); +} + +// TODO(ptredak): this is not exactly identity, needs type inference +// in the middle of the graph to do it right +template +inline DType amp_multicast(const DType val) { + return val; } // activations @@ -144,9 +336,8 @@ inline DType sigmoid(const DType val) { } template -inline DType backward_sigmoid(const DType val, const DType grad) { - const DType ep1 = 1 + expf(-val); - return grad * expf(-val)/(ep1*ep1); +inline DType backward_sigmoid(const DType out, const DType grad) { + return grad * out * (1 - out); } template @@ -161,12 +352,12 @@ inline DType backward_softrelu(const DType val, const DType grad) { template inline DType softsign(const DType val) { - return val / (1 + absf(val)); + return val / (1 + fabsf(val)); } template inline DType backward_softsign(const DType val, const DType grad) { - const DType ap1 = 1 + absf(val); + const DType ap1 = 1 + fabsf(val); 
return grad / (ap1 * ap1); } @@ -234,6 +425,18 @@ inline DType backward_log1p(const DType val, const DType grad) { // trigonometric +constexpr double pi = 3.14159265358979323846; + +template +inline DType degrees(const DType val) { + return (val / pi) * 180; +} + +template +inline DType radians(const DType val) { + return (val / 180.0) * pi; +} + template inline DType sin(const DType val) { return sinf(val); @@ -261,8 +464,8 @@ inline DType tan(const DType val) { // Uses output from tan template -inline DType backward_tan(const DType val, const DType grad) { - return grad * (val * val + 1); +inline DType backward_tan(const DType out, const DType grad) { + return grad * (out * out + 1); } template @@ -322,8 +525,8 @@ inline DType tanh(const DType val) { // Uses tanh output template -inline DType backward_tanh(const DType val, const DType grad) { - return grad * (1 - val * val); +inline DType backward_tanh(const DType out, const DType grad) { + return grad * (1 - out * out); } template @@ -364,8 +567,8 @@ inline DType sqrt(const DType val) { } template -inline DType backward_sqrt(const DType val, const DType grad) { - return 0.5 * grad * rsqrtf(val); +inline DType backward_sqrt(const DType out, const DType grad) { + return 0.5 * grad / out; } template @@ -385,9 +588,8 @@ inline DType cbrt(const DType val) { } template -inline DType backward_cbrt(const DType val, const DType grad) { - const DType inv = rcbrtf(val); - return 1.f/3.f * grad * inv * inv; +inline DType backward_cbrt(const DType out, const DType grad) { + return grad / (3.0f * out * out); } template @@ -401,6 +603,99 @@ inline DType backward_rcbrt(const DType val, const DType grad) { return -1.f/3.f * grad * cbrtf(inv) * inv; } +template +inline DType square(const DType val) { + return val * val; +} + +template +inline DType backward_square(const DType val, const DType grad) { + return 2 * val * grad; +} + +template +inline DType zero(const DType val) { + return 0; +} + +template +inline DType one(const 
DType val) { + return 1; +} + +template +inline DType round(const DType val) { + return roundf(val); +} + +template +inline DType rint(const DType val) { + return rintf(val); +} + +template +inline DType fix(const DType val) { + const auto floor = floorf(val); + const auto ceil = ceilf(val); + return (floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? floor : ceil; +} + +template +inline DType floor(const DType val) { + return floorf(val); +} + +template +inline DType ceil(const DType val) { + return ceilf(val); +} + +template +inline DType trunc(const DType val) { + return truncf(val); +} + +template +inline DType clip(const DType val, const float a_min, const float a_max) { + return max(min(val, a_max), a_min); +} + +template +inline DType sign(const DType val) { + if (val < 0) return -1; + return val > 0 ? 1 : 0; +} + +template +inline DType reciprocal(const DType val) { + return 1.0f / val; +} + +template +inline DType abs(const DType val) { + return fabsf(val); +} + +template +inline DType gamma(const DType val) { + return tgammaf(val); +} + +template +inline DType gammaln(const DType val) { + return lgammaf(val); +} + +template +inline DType erf(const DType val) { + return erff(val); +} + +template +inline DType erfinv(const DType val) { + return erfinvf(val); +} + )code"; const std::string fused_op_kernel_begin = R"code( diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index ced2da00f234..48ce8b2a182a 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -39,9 +39,8 @@ void FusedOpParamParser(nnvm::NodeAttrs* attrs) { os << ")"; throw dmlc::ParamError(os.str()); } - if (!param.code.empty()) { - attrs->parsed = FusedOpPtr(new FusedOp(param)); - } + CHECK(!param.symbol_json.empty()); + attrs->parsed = FusedOpPtr(new FusedOp(param)); } NNVM_REGISTER_OP(FusedOp) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 8390d899c16b..31692d9dc1d5 100644 --- 
a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -24,6 +24,7 @@ #include "../operator_common.h" #include "../elemwise_op_common.h" #include "../../executor/exec_pass.h" +#include "../../common/cuda_utils.h" #include namespace mxnet { @@ -55,11 +56,132 @@ inline std::string mshadowTypeToString(int type) { } // namespace detail FusedOp::FusedOp(const FusedOpConfig& config) { - this->code_ = config.code; this->inputs_ = std::vector(config.num_inputs); this->outputs_ = std::vector(config.num_outputs); this->symbol_ = nnvm::pass::LoadJSON(config.symbol_json); this->initialized_ = false; + this->cc_major_ = -1; + this->cc_minor_ = -1; + + this->GenerateCode(); +} + +void FusedOp::GenerateCode() { + const auto& g = this->symbol_.indexed_graph(); + std::string code = ""; + int temp_name_counter = 0; + using NodeEntry = nnvm::IndexedGraph::NodeEntry; + std::map, std::string> variables; + + std::vector outputs(g.num_nodes()); + + for (size_t i = 0; i < g.num_nodes(); ++i) { + const auto& node = g[i]; + if (node.source != nullptr) { + outputs[i] = node.source->num_outputs(); + } else { + outputs[i] = 0; + } + } + + for (size_t i = 0; i < g.num_nodes(); ++i) { + const auto& node = g[i]; + const auto* source = node.source; + if (source != nullptr) { + std::string var_name = "temp" + std::to_string(temp_name_counter++); + if (source->is_variable()) { + code += "const auto " + var_name + " = load(" + source->attrs.name + ", i);\n"; + CHECK_EQ(outputs[i], 1); + variables[{i, 0}] = var_name; + } else { + std::string op_name = source->op()->name; + if (detail::fused_op_binary_ops.find(op_name) != detail::fused_op_binary_ops.end()) { + std::string op = detail::fused_op_binary_ops.at(op_name); + const auto& arg1 = variables[{node.inputs[0].node_id, node.inputs[0].index}]; + const auto& arg2 = variables[{node.inputs[1].node_id, node.inputs[1].index}]; + code += "const auto " + var_name + " = " + op + + "(" + arg1 + ", " + arg2 + ");\n"; + CHECK_EQ(outputs[i], 
1); + variables[{i, 0}] = var_name; + continue; + } + + if (detail::fused_op_unary_ops.find(op_name) != detail::fused_op_unary_ops.end()) { + std::string op = detail::fused_op_unary_ops.at(op_name); + const auto& arg1 = variables[{node.inputs[0].node_id, node.inputs[0].index}]; + code += "const auto " + var_name + " = " + op + + "(" + arg1 + ");\n"; + CHECK_EQ(outputs[i], 1); + variables[{i, 0}] = var_name; + continue; + } + + if (detail::fused_op_special_ops.find(op_name) != detail::fused_op_special_ops.end()) { + const std::vector& op_desc = detail::fused_op_special_ops.at(op_name); + std::string fmt = op_desc[0]; + for (size_t j = 1; j < op_desc.size(); ++j) { + const std::string& desc = op_desc[j]; + std::string sub; + if (desc[0] == '_') { + // Argument + int arg_id = std::stoi(desc.substr(1)); + sub = variables[{node.inputs[arg_id].node_id, node.inputs[arg_id].index}]; + } else { + sub = source->attrs.dict.at(desc); + } + size_t pos = fmt.find("%"); + CHECK_NE(pos, std::string::npos); + fmt.replace(pos, 1, sub); + } + code += "const auto " + var_name + " = " + fmt + ";\n"; + CHECK_EQ(outputs[i], 1); + variables[{i, 0}] = var_name; + continue; + } + + if (detail::fused_op_mimo_ops.find(op_name) != detail::fused_op_mimo_ops.end()) { + const std::vector>& op_descs = + detail::fused_op_mimo_ops.at(op_name); + CHECK_EQ(outputs[i], op_descs.size()); + size_t count = 0; + for (const auto& op_desc : op_descs) { + var_name = "temp" + std::to_string(temp_name_counter++); + std::string fmt = op_desc[0]; + for (size_t j = 1; j < op_desc.size(); ++j) { + const std::string& desc = op_desc[j]; + std::string sub; + if (desc[0] == '_') { + // Argument + int arg_id = std::stoi(desc.substr(1)); + sub = variables[{node.inputs[arg_id].node_id, node.inputs[arg_id].index}]; + } else { + sub = source->attrs.dict.at(desc); + } + size_t pos = fmt.find("%"); + CHECK_NE(pos, std::string::npos); + fmt.replace(pos, 1, sub); + } + code += "const auto " + var_name + " = " + fmt + ";\n"; + 
variables[{i, count}] = var_name; + ++count; + } + continue; + } + LOG(FATAL) << "Unrecognized op " + op_name; + } + } else { + LOG(FATAL) << "Encountered node with NULL source."; + } + } + + int counter = 0; + for (const auto& entry : g.outputs()) { + const std::string& var = variables[{entry.node_id, entry.index}]; + code += "store(" + var + ", i, output" + std::to_string(counter) + ");\n"; + ++counter; + } + + this->code_ = code; } template <> @@ -71,7 +193,108 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, using namespace mshadow; CHECK_GE(outputs.size(), 1) << "There needs to be at least 1 output."; + std::vector in_dtypes; + std::vector out_dtypes; + + size_t counter = 0; + for (const auto& blob : inputs) { + in_dtypes.push_back(blob.type_flag_); + initialized_ = initialized_ && (blob.type_flag_ == inputs_[counter].dtype); + inputs_[counter].dtype = blob.type_flag_; + ++counter; + } + + counter = 0; + for (const auto& blob : outputs) { + out_dtypes.push_back(blob.type_flag_); + initialized_ = initialized_ && (blob.type_flag_ == outputs_[counter].dtype); + outputs_[counter].dtype = blob.type_flag_; + ++counter; + } + + // Get compute capability of the current GPU + int dev_id = ctx.run_ctx.ctx.dev_id; + int cc_major = ComputeCapabilityMajor(dev_id); + int cc_minor = ComputeCapabilityMinor(dev_id); + + initialized_ = initialized_ && cc_major == this->cc_major_; + initialized_ = initialized_ && cc_minor == this->cc_minor_; + this->cc_major_ = cc_major; + this->cc_minor_ = cc_minor; + if (!initialized_) { + LOG(INFO) << code_; + std::string aux_code = ""; + std::string kernel_params = ""; + size_t num_params = in_dtypes.size() + out_dtypes.size(); + size_t i = 0; + for (const auto &type : in_dtypes) { + std::string type_name = detail::mshadowTypeToString(type); + aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; + kernel_params += "DType" + std::to_string(i) + "* input" + std::to_string(i); + ++i; + if (i < num_params) { 
+ kernel_params += ", "; + } + } + for (const auto &type : out_dtypes) { + std::string type_name = detail::mshadowTypeToString(type); + aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; + kernel_params += "DType" + std::to_string(i) + "* output" + + std::to_string(i - in_dtypes.size()); + ++i; + if (i < num_params) { + kernel_params += ", "; + } + } + code_ = detail::fp16_support_string + "\n" + + detail::type_support_string + "\n" + + detail::fused_op_function_definitions + "\n" + + aux_code + "\n" + + "__global__ void FusedKernel_" + attrs.name + + "(size_t N, " + kernel_params + ") {\n" + + detail::fused_op_kernel_begin + "\n" + + code_ + "\n" + + detail::fused_op_kernel_end; + nvrtcProgram program; + NVRTC_CALL( + nvrtcCreateProgram(&program, // prog + &code_[0], // buffer + (attrs.name + "_kernel.cu").c_str(), // name + 0, // numHeaders + NULL, // headers + NULL)); // includeNames + std::string gpu_arch = "--gpu-architecture=compute_" + + std::to_string(this->cc_major_) + + std::to_string(this->cc_minor_); + + const char *opts[] = {gpu_arch.c_str(), + "--std=c++11", + "-default-device"}; + const std::string kernel_name_demangled = "FusedKernel_" + attrs.name; + NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); + + nvrtcResult compileResult = nvrtcCompileProgram(program, // prog + 3, // numOptions + opts); // options + // Obtain compilation log from the program. + size_t logSize; + NVRTC_CALL(nvrtcGetProgramLogSize(program, &logSize)); + std::string log(logSize, '\0'); + NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); + CHECK_EQ(compileResult, NVRTC_SUCCESS) << "NVRTC Compilation failed.\n" << log; + // Obtain PTX from the program. 
+ size_t ptxSize; + NVRTC_CALL(nvrtcGetPTXSize(program, &ptxSize)); + ptx_.reserve(ptxSize); + NVRTC_CALL(nvrtcGetPTX(program, &ptx_[0])); + const char *name; + NVRTC_CALL(nvrtcGetLoweredName(program, + kernel_name_demangled.c_str(), + &name)); + kernel_name_ = name; + // Destroy the program. + NVRTC_CALL(nvrtcDestroyProgram(&program)); int device; CUdevice cuDevice; CUcontext context; @@ -115,15 +338,6 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, &(args[0]), 0)); // arguments } -template <> -void FusedOp::Backward(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - std::cout << "Backward!" << std::endl; -} - template <> bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, @@ -181,84 +395,9 @@ bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, for (const auto& attr : *out_attrs) { inferred = inferred && !op::type_is_none(attr); } - const bool types_known = inferred; - if (types_known) { - LOG(INFO) << "Without types"; - LOG(INFO) << code_; - LOG(INFO) << "Filling type information"; - std::string aux_code = ""; - std::string kernel_params = ""; - size_t num_params = in_attrs->size() + out_attrs->size(); - size_t i = 0; - for (const auto &type : *in_attrs) { - std::string type_name = detail::mshadowTypeToString(type); - aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - kernel_params += "DType" + std::to_string(i) + "* input" + std::to_string(i); - ++i; - if (i < num_params) { - kernel_params += ", "; - } - } - for (const auto &type : *out_attrs) { - std::string type_name = detail::mshadowTypeToString(type); - aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - kernel_params += "DType" + std::to_string(i) + "* output" + - std::to_string(i - in_attrs->size()); - ++i; - if (i < num_params) { - kernel_params += ", "; - } - } - code_ = detail::fp16_support_string + "\n" + - 
detail::fused_op_function_definitions + "\n" + - aux_code + "\n" + - "__global__ void FusedKernel_" + attrs.name + - "(size_t N, " + kernel_params + ") {\n" + - detail::fused_op_kernel_begin + "\n" + - code_ + "\n" + - detail::fused_op_kernel_end; - LOG(INFO) << code_; - nvrtcProgram program; - NVRTC_CALL( - nvrtcCreateProgram(&program, // prog - &code_[0], // buffer - (attrs.name + "_kernel.cu").c_str(), // name - 0, // numHeaders - NULL, // headers - NULL)); // includeNames - const char *opts[] = {"--gpu-architecture=compute_70", - "--std=c++11", - "-default-device"}; - const std::string kernel_name_demangled = "FusedKernel_" + attrs.name; - NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); - - nvrtcResult compileResult = nvrtcCompileProgram(program, // prog - 3, // numOptions - opts); // options - // Obtain compilation log from the program. - size_t logSize; - NVRTC_CALL(nvrtcGetProgramLogSize(program, &logSize)); - std::string log(logSize, '\0'); - NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); - CHECK_EQ(compileResult, NVRTC_SUCCESS) << "NVRTC Compilation failed.\n" << log; - // Obtain PTX from the program. - size_t ptxSize; - NVRTC_CALL(nvrtcGetPTXSize(program, &ptxSize)); - ptx_.reserve(ptxSize); - NVRTC_CALL(nvrtcGetPTX(program, &ptx_[0])); - const char *name; - NVRTC_CALL(nvrtcGetLoweredName(program, - kernel_name_demangled.c_str(), - &name)); - kernel_name_ = name; - // Destroy the program. 
- NVRTC_CALL(nvrtcDestroyProgram(&program)); - } - return types_known; + return inferred; } - - void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, @@ -267,14 +406,6 @@ void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, const FusedOpPtr& op = nnvm::get(attrs.parsed); op->Forward(attrs, ctx, inputs, req, outputs); } -void FusedOpBackwardGPU(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - const FusedOpPtr& op = nnvm::get(attrs.parsed); - op->Backward(attrs, ctx, inputs, req, outputs); -} bool FusedOpInferShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index decf2026ce4b..f257f5f4b06d 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -28,13 +28,10 @@ namespace mxnet { struct FusedOpConfig : public dmlc::Parameter { - std::string code; std::string symbol_json; int num_inputs; int num_outputs; DMLC_DECLARE_PARAMETER(FusedOpConfig) { - DMLC_DECLARE_FIELD(code) - .describe("Generated code."); DMLC_DECLARE_FIELD(symbol_json) .describe("JSON of the replaced symbol."); DMLC_DECLARE_FIELD(num_inputs) @@ -44,7 +41,10 @@ struct FusedOpConfig : public dmlc::Parameter { } }; -struct FusedOpEntry {}; +struct FusedOpEntry { + FusedOpEntry() : dtype(-1) {} + int dtype; +}; class FusedOp { public: @@ -58,12 +58,6 @@ class FusedOp { uint32_t num_outputs() const { return outputs_.size(); } - uint32_t num_backward_inputs() const { - return backward_inputs_.size(); - } - uint32_t num_backward_outputs() const { - return backward_outputs_.size(); - } template void Forward(const nnvm::NodeAttrs& attrs, @@ -72,13 +66,6 @@ class FusedOp { const std::vector &req, const std::vector &outputs); - template - void Backward(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - 
const std::vector &outputs); - template bool InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, @@ -90,10 +77,10 @@ class FusedOp { std::vector *out_attrs); private: + void GenerateCode(); + std::vector inputs_; std::vector outputs_; - std::vector backward_inputs_; - std::vector backward_outputs_; std::string code_; nnvm::Graph symbol_; @@ -101,6 +88,8 @@ class FusedOp { std::string kernel_name_; bool initialized_; CUfunction kernel_; + int cc_major_; + int cc_minor_; }; using FusedOpPtr = std::shared_ptr; From 8bf294584541f1b5ac0a96ca6177a6a97c87e2cf Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Sat, 4 May 2019 20:26:26 -0700 Subject: [PATCH 003/105] add initial simple_partition_pass and use it for pointwise fusion --- src/executor/exec_pass.h | 9 + src/executor/graph_executor.cc | 26 ++ src/executor/node_entry_count.cc | 46 ++++ src/executor/simple_partition_pass.h | 386 +++++++++++++++++++++++++++ 4 files changed, 467 insertions(+) create mode 100644 src/executor/node_entry_count.cc create mode 100644 src/executor/simple_partition_pass.h diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index f544d6ba3392..976c592358fa 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -170,6 +170,15 @@ void AttachOpResources(const Graph& g, */ Graph DetectInplaceAddTo(Graph g); +using NodeEntryMapCounter = + std::unordered_map; +/*!\brief + * This is to count how many time each output is used by another node (or the output of the graph) + */ +NodeEntryMapCounter GetNodeEntryCount(const Graph& g); + +Graph FusePointwise(Graph&& g); + /*! * \brief Infer shapes in the graph given the information. * \param graph The input graph. 
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index efcb58231ccc..c3567f934f4b 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -784,6 +784,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol, const nnvm::NodeEntryMap& feed_dict) { nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, grad_req_types); + // The following code of shape and dtype inferences and argument // initialization is for simple_bind only. Regular bind operation // should do this differently. @@ -980,6 +981,31 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, const std::vector& grad_req_types) { // setup gradient nnvm::Graph g = InitFullGraph(symbol, grad_req_types); + DFSVisit(g.outputs, [](const nnvm::NodePtr n) { + if (n->op() == nullptr) { + LOG(INFO) << n->attrs.name; + } else { + LOG(INFO) << n->attrs.name << ": " << n->op()->name; + } + for (auto e : n->inputs) { + LOG(INFO) << " - " << e.node->attrs.name; + } + }); + + g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); + g = FusePointwise(std::move(g)); + LOG(INFO) << "\n\n"; + LOG(INFO) << "AFTER:"; + DFSVisit(g.outputs, [](const nnvm::NodePtr n) { + if (n->op() == nullptr) { + LOG(INFO) << n->attrs.name; + } else { + LOG(INFO) << n->attrs.name << ": " << n->op()->name; + } + for (auto e : n->inputs) { + LOG(INFO) << " - " << e.node->attrs.name; + } + }); // create "device" and "context" attrs for the graph g = AssignContext(g, default_ctx, ctx_map, diff --git a/src/executor/node_entry_count.cc b/src/executor/node_entry_count.cc new file mode 100644 index 000000000000..938b005bb46f --- /dev/null +++ b/src/executor/node_entry_count.cc @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file node_entry_count.cc + * \brief function that count how many times a node entry is used + * \author Clement Fuji Tsang + */ +#include "./exec_pass.h" + +// TODO(cfujitsang): should this be pushed to nnvm repository ? +namespace mxnet { +namespace exec { + +NodeEntryMapCounter GetNodeEntryCount(const nnvm::Graph& g) { + NodeEntryMapCounter outputs; + DFSVisit(g.outputs, [&outputs](const nnvm::NodePtr& node) { + for (auto e : node->inputs) { + outputs[e]++; + } + }); + for (auto e : g.outputs) { + outputs[e]++; + } + return outputs; +} + +} // namespace exec +} // namespace mxnet diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h new file mode 100644 index 000000000000..ec7f160f9c5f --- /dev/null +++ b/src/executor/simple_partition_pass.h @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file simple_partition_pass.h + * \brief + * \author Clement Fuji Tsang + */ +#ifndef MXNET_EXECUTOR_SIMPLE_PARTITION_PASS_H_ +#define MXNET_EXECUTOR_SIMPLE_PARTITION_PASS_H_ + +#include +#include +#include +#include + +#include "exec_pass.h" + +namespace mxnet { +namespace exec { + + +/*! + * \brief Custom graph class, which will contain bi-directional nodes + * we need to compute DFS and reverse DFS for graph partitioning + */ +class BidirectionalGraph { + public: + struct Node { + nnvm::Node* nnvmptr; + std::vector inputs; + std::vector outputs; + }; + + explicit BidirectionalGraph(const Graph &g) { + auto& idx = g.indexed_graph(); + auto num_nodes = idx.num_nodes(); + nodes.reserve(num_nodes); + nnvm2nid.reserve(num_nodes); + outputs.reserve(idx.outputs().size()); + DFSVisit(g.outputs, [this](const nnvm::NodePtr& n) { + Node new_node; + new_node.nnvmptr = n.get(); + nnvm2nid[n.get()] = static_cast(nodes.size()); + nodes.emplace_back(std::move(new_node)); + }); + for (const auto& it : nnvm2nid) { + nnvm::Node* nnvmnode = it.first; + uint32_t nid = it.second; + for (auto& n : nnvmnode->inputs) { + uint32_t input_nid = nnvm2nid[n.node.get()]; + nodes[input_nid].outputs.emplace_back(&nodes[nid]); + nodes[nid].inputs.emplace_back(&nodes[input_nid]); + } + } + for (auto& e : g.outputs) { + uint32_t nid = nnvm2nid[e.node.get()]; + outputs.emplace_back(&nodes[nid]); + } + } + + template + std::vector> get_subsets(FCompatible is_compatible) { + std::vector> subgraphs; + std::unordered_set 
incomp_set; + std::unordered_set all_set(nodes.size()); + std::vector separation_sets; + LOG(INFO) << "1"; + for (Node& node : nodes) { + if (!is_compatible(node.nnvmptr)) { + incomp_set.insert(&node); + std::unordered_set in_graph; + std::unordered_set out_graph; + std::vector dummy_head; + dummy_head.emplace_back(&node); + DFS(dummy_head, false, [&out_graph](Node* node) { + out_graph.insert(node); + }); + DFS(dummy_head, true, [&in_graph](Node* node) { + in_graph.insert(node); + }); + if (!(in_graph.empty() || out_graph.empty())) + separation_sets.push_back(std::make_pair(in_graph, out_graph)); + } + all_set.emplace(&node); + } + LOG(INFO) << "2"; + IncompMap incomp_map; + std::unordered_set comp_set; + comp_set.insert(all_set.begin(), all_set.end()); + for (Node* n : incomp_set) { + comp_set.erase(n); + } + LOG(INFO) << "3"; + for (Node* n : comp_set) { + for (PairSet p : separation_sets) { + if (p.first.count(n)) { + incomp_map[n].insert(p.second.begin(), p.second.end()); + } else if (p.second.count(n)) { + incomp_map[n].insert(p.first.begin(), p.first.end()); + } + } + for (Node* incomp_n : incomp_set) { + incomp_map[n].erase(incomp_n); + } + } + std::unordered_set unused_set; + unused_set.reserve(comp_set.size()); + + for (auto& n : comp_set) { + unused_set.insert(n); + } + LOG(INFO) << "4"; + std::unordered_set visited; + std::deque stack(outputs.begin(), outputs.end()); + while (!stack.empty()) { + Node* vertex = stack.front(); + stack.pop_front(); + if (!visited.count(vertex)) { + visited.insert(vertex); + if (unused_set.count(vertex)) { + subgraphs.emplace_back(naive_grow_subgraph(vertex, &unused_set, &incomp_map)); + } + for (Node* input : vertex->inputs) { + stack.emplace_back(input); + } + } + } + LOG(INFO) << "5"; + return subgraphs; + } + + private: + using PairSet = std::pair, std::unordered_set>; + using PairVec = std::pair, std::vector>; + using IncompMap = std::unordered_map>; + + template + void DFS(const std::vector& heads, bool reverse, FVisit 
fvisit) { + std::unordered_set visited; + std::vector vec(heads.begin(), heads.end()); + visited.reserve(heads.size()); + while (!vec.empty()) { + Node* vertex = vec.back(); + vec.pop_back(); + if (visited.count(vertex) == 0) { + visited.insert(vertex); + fvisit(vertex); + std::vector nexts = reverse ? vertex->inputs : vertex->outputs; + for (Node* node : nexts) { + if (visited.count(node) == 0) { + vec.emplace_back(node); + } + } + } + } + } + + std::unordered_set naive_grow_subgraph(Node* head, + std::unordered_set* unused_set, + IncompMap* incomp_map) { + std::unordered_set subgraph; + std::unordered_set incomp_set; + std::deque stack; + stack.emplace_back(head); + while (!stack.empty()) { + Node* vertex = stack.back(); + stack.pop_back(); + if (unused_set->count(vertex) && !incomp_set.count(vertex)) { + unused_set->erase(vertex); + subgraph.insert(vertex); + incomp_set.insert((*incomp_map)[vertex].begin(), (*incomp_map)[vertex].end()); + for (Node* input : vertex->inputs) { + if (unused_set->count(input) && !incomp_set.count(input)) { + stack.emplace_back(input); + } + } + for (Node* output : vertex->outputs) { + if (unused_set->count(output) && !incomp_set.count(output)) { + stack.emplace_back(output); + } + } + } + } + return subgraph; + } + + friend class Graph; + + std::vector nodes; + std::unordered_map nnvm2nid; + std::vector outputs; + +}; // class BidirectionalGraph + +using NodeEntrySet = std::unordered_set; +using NodeRawPtrSet = std::unordered_set; + +/*! 
+ * \brief get the output nodes of the subgraph in the main graph + * \return a map between the node in the main graph and the output index of the subgraph node +*/ +// std::vector +nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_set) { + //std::vector outputs; + //NodeEntrySet _outputs; + nnvm::NodeEntryMap outputs; + uint32_t count = 0; + for (auto& e : g.outputs) { + if (subgraph_set.count(e.node.get()) && !outputs.count(e)) { + outputs.insert({e, count++}); + } + } + DFSVisit(g.outputs, [&subgraph_set, &outputs, &count](const nnvm::NodePtr &node){ + if (!subgraph_set.count(node.get())) { + for (auto& e : node->inputs) { + if (subgraph_set.count(e.node.get()) && !outputs.count(e)) { + outputs.insert({e, count++}); + } + } + } + }); + //outputs.insert(outputs.begin(), _outputs.begin(), _outputs.end()); + return outputs; +} + +/*! + * \brief create new input nodes of the subgraph and plug them + * \return the inputs of the subgraph node in the main graph +*/ +std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_set) { + std::vector inputs; + nnvm::NodeEntryMap entry_map; + DFSVisit(g.outputs, [&subgraph_set, &inputs, &entry_map](const nnvm::NodePtr &node){ + if (subgraph_set.count(node.get())) { + for (auto &e : node->inputs) { + if (!subgraph_set.count(e.node.get())) { + if (entry_map.count(e)) { + e = entry_map[e]; + } else { + auto new_node = nnvm::Node::Create(); + new_node->attrs.name = e.node->attrs.name + std::to_string(e.index); + entry_map.insert({e, nnvm::NodeEntry{new_node, 0, 0}}); + inputs.push_back(entry_map.at(e)); + e.node = new_node; + e.index = 0; + } + } + } + } + }); + return inputs; +} + +std::unordered_map GetGraphInputsMap(const Graph& g) { + std::unordered_map outputs; + auto& idx = g.indexed_graph(); + outputs.reserve(idx.num_nodes()); + std::vector input_nodes = idx.input_nodes(); + for (size_t i = 0; i < input_nodes.size(); ++i) { + outputs[input_nodes[i]] = static_cast(i); + } + return outputs; +} + +/*! 
+ * \brief helper function to display what nodes are in a specific subset + */ +void dispNodesSet(Graph g, NodeRawPtrSet s) { + DFSVisit(g.outputs, [&s](const nnvm::NodePtr n){ + if (s.count(n.get())) { + std::cout << " Y " << n->attrs.name << std::endl; + } else { + std::cout << " N " << n->attrs.name << std::endl; + } + }); +} + +/*! + * \brief Replace a set of nodes by a subgraph node + */ +template +Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_sets, + FCreateNode create_subgraph_node) { + for (auto subgraph_set : subgraph_sets) { + // Create MXNet subgraph + Graph subgraph; + const auto sub_outputs_in_main = GetSubgraphOutputs(g, subgraph_set); + subgraph.outputs.resize(sub_outputs_in_main.size()); + for (auto p : sub_outputs_in_main) { + subgraph.outputs[p.second] = p.first; + } + // To generate a subgraph an input have to be replace by data node (no op) + // and it have to be agnostic to the node from which it's an output + // (For exemple even if two inputs are two different outputs from the same node) + auto inputs = GetSubgraphInputs(subgraph, subgraph_set); + auto subgraph_node = create_subgraph_node(subgraph); + subgraph_node->inputs = inputs; + // replug inputs of node out of subgraph to be output of the subgraph node + // if it was a node in the subgraph + DFSVisit(g.outputs, + [&subgraph_node, &subgraph_set, &sub_outputs_in_main](const nnvm::NodePtr node) { + if (!subgraph_set.count(node.get())) { + for (auto &e : node->inputs) { + auto it = sub_outputs_in_main.find(e); + if (it != sub_outputs_in_main.end()) { + e.node = subgraph_node; + e.index = it->second; + } + } + } + }); + // replug outputs of the graph to be output of the subgraph node + // if it was a node in the subgraph + for (auto &e : g.outputs) { + auto it = sub_outputs_in_main.find(e); + if (it != sub_outputs_in_main.end()) { + e.node = subgraph_node; + e.index = it->second; + } + } + // move control dependencies between nodes of the subgraph and out of the subgraph + // 
to a dependencies between the subgraph node and the nodes out of the subgraph + DFSVisit(g.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { + for (auto &e : node->control_deps) { + if (subgraph_set.count(e.get())) + e = subgraph_node; + } + }); + DFSVisit(subgraph.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { + auto it = node->control_deps.begin(); + while (it != node->control_deps.end()) { + if (subgraph_set.count(it->get())) { + ++it; + } else { + subgraph_node->control_deps.push_back(*it); + it = node->control_deps.erase(it); + } + } + }); + } + Graph new_graph; + new_graph.outputs = g.outputs; + return new_graph; +} + +template +std::vector GetCompatibleSubsets(const Graph& g, FCompatible is_compatible) { + BidirectionalGraph biG = BidirectionalGraph(g); + std::vector> subsets = biG.get_subsets(is_compatible); + std::vector nnvm_subsets; + nnvm_subsets.reserve(subsets.size()); + for (auto& subset : subsets) { + if (subset.size() > 1) { + NodeRawPtrSet node_set; + node_set.reserve(subset.size()); + for (auto& n : subset) { + node_set.insert(n->nnvmptr); + } + nnvm_subsets.push_back(node_set); + } + } + return nnvm_subsets; +} + +} // namespace exec +} // namespace mxnet +#endif // MXNET_EXECUTOR_SIMPLE_PARTITION_PASS_H_ From 5cbb50d61632a81f513e169f936d8875d85b4ea1 Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Sat, 4 May 2019 22:12:31 -0700 Subject: [PATCH 004/105] fix the fusion, use a symbol.Copy() at the beginning of binding function, use the name of input nodes in the cuda code --- src/executor/graph_executor.cc | 29 +++------------------------- src/executor/simple_partition_pass.h | 7 +------ src/operator/fusion/fused_op.cu | 4 +++- 3 files changed, 7 insertions(+), 33 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index c3567f934f4b..d99e2b00edca 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -981,32 +981,9 @@ Graph 
GraphExecutor::InitGraph(nnvm::Symbol symbol, const std::vector& grad_req_types) { // setup gradient nnvm::Graph g = InitFullGraph(symbol, grad_req_types); - DFSVisit(g.outputs, [](const nnvm::NodePtr n) { - if (n->op() == nullptr) { - LOG(INFO) << n->attrs.name; - } else { - LOG(INFO) << n->attrs.name << ": " << n->op()->name; - } - for (auto e : n->inputs) { - LOG(INFO) << " - " << e.node->attrs.name; - } - }); g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); g = FusePointwise(std::move(g)); - LOG(INFO) << "\n\n"; - LOG(INFO) << "AFTER:"; - DFSVisit(g.outputs, [](const nnvm::NodePtr n) { - if (n->op() == nullptr) { - LOG(INFO) << n->attrs.name; - } else { - LOG(INFO) << n->attrs.name << ": " << n->op()->name; - } - for (auto e : n->inputs) { - LOG(INFO) << " - " << e.node->attrs.name; - } - }); - // create "device" and "context" attrs for the graph g = AssignContext(g, default_ctx, ctx_map, in_arg_ctxes, @@ -1923,7 +1900,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, arg_stype_map, default_ctx, group2ctx, &tmp_in_arg_ctxes, &tmp_arg_grad_ctxes, &tmp_grad_req_types, &tmp_aux_state_ctxes); } - exec->Init(symbol, default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes, + exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes, tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map, tmp_grad_req_types, shared_arg_names, in_args, arg_grads, aux_states, shared_buffer, shared_exec); return exec; @@ -1948,8 +1925,8 @@ Executor *Executor::Bind(nnvm::Symbol symbol, exec::BuildSubgraph(symbol, exec->subgraph_property(), default_ctx, group2ctx, &tmp_in_args, &tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states); } - exec->Init(symbol, default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store, tmp_grad_req_type, - tmp_aux_states, reinterpret_cast(shared_exec)); + exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store, + tmp_grad_req_type, tmp_aux_states, 
reinterpret_cast(shared_exec)); return exec; } } // namespace mxnet diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index ec7f160f9c5f..09aa5625be3d 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -82,7 +82,6 @@ class BidirectionalGraph { std::unordered_set incomp_set; std::unordered_set all_set(nodes.size()); std::vector separation_sets; - LOG(INFO) << "1"; for (Node& node : nodes) { if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); @@ -101,14 +100,12 @@ class BidirectionalGraph { } all_set.emplace(&node); } - LOG(INFO) << "2"; IncompMap incomp_map; std::unordered_set comp_set; comp_set.insert(all_set.begin(), all_set.end()); for (Node* n : incomp_set) { comp_set.erase(n); } - LOG(INFO) << "3"; for (Node* n : comp_set) { for (PairSet p : separation_sets) { if (p.first.count(n)) { @@ -127,7 +124,6 @@ class BidirectionalGraph { for (auto& n : comp_set) { unused_set.insert(n); } - LOG(INFO) << "4"; std::unordered_set visited; std::deque stack(outputs.begin(), outputs.end()); while (!stack.empty()) { @@ -143,7 +139,6 @@ class BidirectionalGraph { } } } - LOG(INFO) << "5"; return subgraphs; } @@ -259,7 +254,7 @@ std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_s auto new_node = nnvm::Node::Create(); new_node->attrs.name = e.node->attrs.name + std::to_string(e.index); entry_map.insert({e, nnvm::NodeEntry{new_node, 0, 0}}); - inputs.push_back(entry_map.at(e)); + inputs.push_back(e); e.node = new_node; e.index = 0; } diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 31692d9dc1d5..55bf8d69ae03 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -226,12 +226,14 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, LOG(INFO) << code_; std::string aux_code = ""; std::string kernel_params = ""; + const nnvm::Symbol& sym = *attrs.subgraphs[0]; + const std::vector input_names = 
sym.ListInputNames(nnvm::Symbol::kAll); size_t num_params = in_dtypes.size() + out_dtypes.size(); size_t i = 0; for (const auto &type : in_dtypes) { std::string type_name = detail::mshadowTypeToString(type); aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - kernel_params += "DType" + std::to_string(i) + "* input" + std::to_string(i); + kernel_params += "DType" + std::to_string(i) + "* " +input_names[i]; ++i; if (i < num_params) { kernel_params += ", "; From fcf23c7f7e409da12fe62f158b6eec2f56ad8ce2 Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Sun, 5 May 2019 05:43:51 -0700 Subject: [PATCH 005/105] Fixes --- src/executor/graph_executor.cc | 5 +- src/executor/infer_graph_attr_pass.cc | 8 +++ src/executor/pointwise_fusion_pass.cc | 96 +++++++++++++++++++++++++++ src/operator/fusion/fused_op.cu | 19 +++--- src/operator/fusion/fused_op.h | 4 +- 5 files changed, 120 insertions(+), 12 deletions(-) create mode 100644 src/executor/pointwise_fusion_pass.cc diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index d99e2b00edca..225c41134df5 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -365,6 +365,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, for (const auto &e : g_grad.outputs) { g.outputs.push_back(e); } + return g; } @@ -983,7 +984,9 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, nnvm::Graph g = InitFullGraph(symbol, grad_req_types); g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); - g = FusePointwise(std::move(g)); + if (dmlc::GetEnv("MXNET_USE_FUSION", true)) { + g = FusePointwise(std::move(g)); + } // create "device" and "context" attrs for the graph g = AssignContext(g, default_ctx, ctx_map, in_arg_ctxes, diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index d72325392604..1361abc147ff 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ 
b/src/executor/infer_graph_attr_pass.cc @@ -656,6 +656,10 @@ nnvm::Graph InferShape(nnvm::Graph&& graph, if (shape_attr_key.length() != 0) { graph.attrs["shape_attr_key"] = std::make_shared(shape_attr_key); } + std::cout << "Graph attributes before infershape" << std::endl; + for (const auto& kv : graph.attrs) { + std::cout << kv.first << std::endl; + } return InferShapeAttr( std::move(graph), mxnet::TShape(), "FInferShape", "shape_inputs", "shape_attr_key", @@ -686,6 +690,10 @@ nnvm::Graph InferType(nnvm::Graph&& graph, if (dtype_attr_key.length() != 0) { graph.attrs["dtype_attr_key"] = std::make_shared(dtype_attr_key); } + std::cout << "Graph attributes before infertype" << std::endl; + for (const auto& kv : graph.attrs) { + std::cout << kv.first << std::endl; + } return InferAttr( std::move(graph), -1, "FInferType", "dtype_inputs", "dtype_attr_key", diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc new file mode 100644 index 000000000000..8c6361dcd671 --- /dev/null +++ b/src/executor/pointwise_fusion_pass.cc @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file pointwise_fusion_pass.cc + * \brief + * \author Clement Fuji Tsang + */ +#include +#include +#include +#include +#include +#include "./simple_partition_pass.h" +#include "../operator/fusion/fused_op-inl.h" + +namespace mxnet { +namespace exec { +namespace { + bool IsFusionCompatible(nnvm::Node* n) { + using namespace mxnet::detail; + if (n->op() == nullptr) + return false; + std::string op_name = n->op()->name; + if (fused_op_binary_ops.count(op_name)) + return true; + if (fused_op_unary_ops.count(op_name)) + return true; + if (fused_op_special_ops.count(op_name)) + return true; + if (fused_op_mimo_ops.count(op_name)) + return true; + return false; + } + + nnvm::NodePtr CreateSubgraphNode(const Graph& subgraph) { + nnvm::Symbol subgraph_sym; + auto node = nnvm::Node::Create(); + subgraph_sym.outputs = subgraph.outputs; + node->attrs.subgraphs.emplace_back(std::make_shared(subgraph_sym)); + std::ostringstream name_oss, params_oss; + // the name of the new node will be the concatenation of all the node names in the subgraph + DFSVisit(subgraph.outputs, [&name_oss](const nnvm::NodePtr n) { + if (n->op() != nullptr) + name_oss << n->attrs.name << "_"; + }); + auto subgraph_name = name_oss.str(); + subgraph_name.pop_back(); + node->attrs.name = subgraph_name; + // in case the subgraph contains some of the weights + for (auto &e : subgraph_sym.ListInputNames(nnvm::Symbol::kAll)) { + params_oss << e << ";"; + } + auto params_names = params_oss.str(); + params_names.pop_back(); + //node->attrs.dict["subgraph_params_names"] = params_names; + node->attrs.dict["symbol_json"] = nnvm::pass::SaveJSON(subgraph); + node->attrs.dict["num_inputs"] = + std::to_string(subgraph.indexed_graph().input_nodes().size()); + node->attrs.dict["num_outputs"] = std::to_string(subgraph.outputs.size()); + node->attrs.op = Op::Get("FusedOp"); + node->op()->attr_parser(&(node->attrs)); + return node; + } +} + +Graph FusePointwise(Graph &&g) { + const 
auto & num_forward_output = g.GetAttr("num_forward_outputs"); + Graph fg; + fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), + g.outputs.begin() + num_forward_output); + auto subsets = GetCompatibleSubsets(fg, IsFusionCompatible); + g = ReplaceSubgraphs(std::move(g), subsets, CreateSubgraphNode); + + return g; +} + +} +} diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 55bf8d69ae03..84cde01e665d 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -226,7 +226,8 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, LOG(INFO) << code_; std::string aux_code = ""; std::string kernel_params = ""; - const nnvm::Symbol& sym = *attrs.subgraphs[0]; + nnvm::Symbol sym; + sym.outputs = this->symbol_.outputs; const std::vector input_names = sym.ListInputNames(nnvm::Symbol::kAll); size_t num_params = in_dtypes.size() + out_dtypes.size(); size_t i = 0; @@ -342,17 +343,17 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, template <> bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - std::vector input_shapes(*in_attrs); + std::vector *in_attrs, + std::vector *out_attrs) { + std::vector input_shapes(*in_attrs); this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), std::move(input_shapes), "__shape__"); const auto& g = this->symbol_.indexed_graph(); - std::vector out_shapes; - const std::vector shapes = this->symbol_.GetAttr("shape"); + std::vector out_shapes; + const std::vector shapes = this->symbol_.GetAttr("shape"); for (auto& e : g.outputs()) { out_shapes.push_back(shapes[g.entry_id(e)]); } @@ -410,8 +411,8 @@ void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, } bool FusedOpInferShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_attrs, + std::vector *out_attrs) { const FusedOpPtr& op = nnvm::get(attrs.parsed); return op->InferShape(attrs, in_attrs, out_attrs); } @@ 
-424,7 +425,7 @@ bool FusedOpInferType(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(FusedOp) -.set_attr("FInferShape", FusedOpInferShape) +.set_attr("FInferShape", FusedOpInferShape) .set_attr("FInferType", FusedOpInferType) .set_attr("FCompute", FusedOpForwardGPU); diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index f257f5f4b06d..625996b54e63 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -68,8 +68,8 @@ class FusedOp { template bool InferShape(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs); + std::vector *in_attrs, + std::vector *out_attrs); template bool InferType(const nnvm::NodeAttrs &attrs, From 892c18f519af55b4f14bf1c4232f22df365558ce Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 23 May 2019 13:44:35 -0700 Subject: [PATCH 006/105] Adding support for attribute inference for backward nodes when fusing --- src/executor/exec_pass.h | 9 + src/executor/infer_graph_attr_pass.cc | 247 +++++++++++++++++--------- src/executor/pointwise_fusion_pass.cc | 84 ++++++++- src/operator/fusion/fused_op.cc | 1 + src/operator/fusion/fused_op.cu | 52 ++++++ src/operator/fusion/fused_op.h | 15 ++ 6 files changed, 322 insertions(+), 86 deletions(-) diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 976c592358fa..7c8daa00835f 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -38,6 +38,15 @@ namespace mxnet { namespace exec { +template +using FAccessSubgraphAttr = std::function, std::vector> + (const NodeAttrs& attrs)>; + +using FAccessSubgraphShape = FAccessSubgraphAttr; +using FAccessSubgraphType = FAccessSubgraphAttr; +using FAccessSubgraphStorageType = FAccessSubgraphAttr; +using TIsFusion = bool; + /*! 
\brief reuse graph definition */ using nnvm::Graph; diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 1361abc147ff..a6cdbe6eb8ca 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -73,6 +73,7 @@ bool ApplyOpInferAttr(const nnvm::Graph& g, * \param ret graph used for attribute inference * \param emmpty_val empty value of the attribute * \param infer_name name of the function used for attribute inference + * \param infer_fusion_name name of the function used for accessing attributes in fused nodes * \param input_name name of the attribute in the graph used to store the * input data for attribute inference * \param attr_key_name name of the attribute used for inference for variable nodes @@ -90,10 +91,12 @@ bool ApplyOpInferAttr(const nnvm::Graph& g, * \param default_mode_val default value of the dispatch mode attribute on the node. Used * for storage type inference */ -template +template nnvm::Graph InferAttr(nnvm::Graph &&ret, const AttrType empty_val, const char* infer_name, + const char* infer_fusion_name, const char* input_name, const char* attr_key_name, const char* attr_name, @@ -209,51 +212,86 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, op::dispatch_mode_assign(&dispatch_modes[nid], default_mode_val); } } else if (is_backward.get(inode.source->op(), false) && - inode.control_deps.size() && bwd_identity_assign) { + inode.source->control_deps.size() && bwd_identity_assign) { CHECK(dispatch_mode_name == nullptr) << "Backward inference for node attributes is not available"; - CHECK_GE(inode.control_deps.size(), 1U) + CHECK_GE(inode.source->control_deps.size(), 1U) << "BackwardOp need to have control_deps to its forward op"; - const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; - // use gradient function to find out the correspondence. 
- std::vector ograd(fwd_ptr->num_outputs()); - for (size_t i = 0; i < ograd.size(); ++i) { - ograd[i].index = static_cast(i); - } - // input gradient list - auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); - const nnvm::Node* igrad_node = nullptr; - // Input gradient assignement - for (size_t i = 0; i < igrad.size(); ++i) { - if (igrad[i].node->op() == inode.source->op()) { - uint32_t eid = idx.entry_id(nid, igrad[i].index); - if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; - } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { - // Need to skip empty forward shape, because it may not be - // available now and it is possible to infer the forward - // shape in one of the next a few passes - CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) - << "Backward shape inconsistent with the forward shape"; + + static auto& is_fusion = Op::GetAttr("TIsFusion"); + if (!is_fusion.get(fwd_ptr->op(), false)) { + const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; + // use gradient function to find out the correspondence. 
+ std::vector ograd(fwd_ptr->num_outputs()); + for (size_t i = 0; i < ograd.size(); ++i) { + ograd[i].index = static_cast(i); + } + // input gradient list + auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); + const nnvm::Node* igrad_node = nullptr; + // Input gradient assignement + for (size_t i = 0; i < igrad.size(); ++i) { + if (igrad[i].node->op() == inode.source->op()) { + uint32_t eid = idx.entry_id(nid, igrad[i].index); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; + } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { + // Need to skip empty forward shape, because it may not be + // available now and it is possible to infer the forward + // shape in one of the next a few passes + CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + << "Backward shape inconsistent with the forward shape"; + } + if (igrad_node == nullptr) { + igrad_node = igrad[i].node.get(); + } else { + CHECK(igrad_node == igrad[i].node.get()); + } } - if (igrad_node == nullptr) { - igrad_node = igrad[i].node.get(); - } else { - CHECK(igrad_node == igrad[i].node.get()); + } + // out grad entries + CHECK(igrad_node != nullptr) + << "Cannot find matching backward op for " << inode.source->attrs.name; + for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { + const nnvm::NodeEntry& e = igrad_node->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; + } } } - } - // out grad entries - CHECK(igrad_node != nullptr) - << "Cannot find matching backward op for " << inode.source->attrs.name; - for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { - const nnvm::NodeEntry& e = igrad_node->inputs[i]; - if (e.node == nullptr) { - uint32_t eid = idx.entry_id(inode.inputs[i]); + } else { + static auto& finfer_fused_shape = Op::GetAttr(infer_fusion_name); + auto finfer = finfer_fused_shape.get(fwd_ptr->op(), 
nullptr); + CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << + " is marked as Fusion but does not allow accessing attributes"; + const auto& inferred_attrs = finfer(fwd_ptr->attrs); + const auto& input_attrs = inferred_attrs.first; + const auto& output_attrs = inferred_attrs.second; + CHECK(input_attrs.size() == inode.source->op()->num_outputs) << + "Number of outputs of the gradient node " << inode.source->attrs.name << + " does not match the number of inputs of the corresponding forward node"; + for (size_t i = 0; i < input_attrs.size(); ++i) { + uint32_t eid = idx.entry_id(nid, i); if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; + rshape[eid] = input_attrs[i]; + } else if (!fis_none(input_attrs[i])) { + CHECK_EQ(rshape[eid], input_attrs[i]) + << "Backward shape inconsistent with the forward shape"; + } + } + for (size_t i = 0; i < output_attrs.size(); ++i) { + // We assume that the first inputs to the + // backward op are the output gradients + const auto& e = inode.source->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if (fis_none(rshape[eid])) { + rshape[eid] = output_attrs[i]; + } } } } @@ -500,51 +538,96 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, op::dispatch_mode_assign(&dispatch_modes[nid], default_mode_val); } } else if (is_backward.get(inode.source->op(), false) && - inode.control_deps.size() && bwd_identity_assign) { + inode.source->control_deps.size() && bwd_identity_assign) { CHECK(dispatch_mode_name == nullptr) << "Backward inference for node attributes is not available"; - CHECK_GE(inode.control_deps.size(), 1U) + CHECK_GE(inode.source->control_deps.size(), 1U) << "BackwardOp need to have control_deps to its forward op"; - const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; - // use gradient function to 
find out the correspondence. - std::vector ograd(fwd_ptr->num_outputs()); - for (size_t i = 0; i < ograd.size(); ++i) { - ograd[i].index = static_cast(i); - } - // input gradient list - auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); - const nnvm::Node* igrad_node = nullptr; - // Input gradient assignement - for (size_t i = 0; i < igrad.size(); ++i) { - if (igrad[i].node->op() == inode.source->op()) { - uint32_t eid = idx.entry_id(nid, igrad[i].index); - if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; - } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { - // Need to skip empty forward shape, because it may not be - // available now and it is possible to infer the forward - // shape in one of the next a few passes - CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) - << "Backward shape inconsistent with the forward shape"; + + static auto& is_fusion = Op::GetAttr("TIsFusion"); + if (!is_fusion.get(fwd_ptr->op(), false)) { + const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; + std::cout << inode.source->attrs.name << ": No fusion!" << std::endl; + // use gradient function to find out the correspondence. 
+ std::vector ograd(fwd_ptr->num_outputs()); + for (size_t i = 0; i < ograd.size(); ++i) { + ograd[i].index = static_cast(i); + } + // input gradient list + auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); + const nnvm::Node* igrad_node = nullptr; + // Input gradient assignement + for (size_t i = 0; i < igrad.size(); ++i) { + if (igrad[i].node->op() == inode.source->op()) { + uint32_t eid = idx.entry_id(nid, igrad[i].index); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; + } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { + // Need to skip empty forward shape, because it may not be + // available now and it is possible to infer the forward + // shape in one of the next a few passes + CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + << "Backward shape inconsistent with the forward shape"; + } + if (igrad_node == nullptr) { + igrad_node = igrad[i].node.get(); + } else { + CHECK(igrad_node == igrad[i].node.get()); + } } - if (igrad_node == nullptr) { - igrad_node = igrad[i].node.get(); - } else { - CHECK(igrad_node == igrad[i].node.get()); + } + // out grad entries + CHECK(igrad_node != nullptr) + << "Cannot find matching backward op for " << inode.source->attrs.name; + for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { + const nnvm::NodeEntry& e = igrad_node->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; + } } } - } - // out grad entries - CHECK(igrad_node != nullptr) - << "Cannot find matching backward op for " << inode.source->attrs.name; - for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { - const nnvm::NodeEntry& e = igrad_node->inputs[i]; - if (e.node == nullptr) { - uint32_t eid = idx.entry_id(inode.inputs[i]); + } else { + std::cout << inode.source->attrs.name << ": Fusion!" 
<< std::endl; + static auto& finfer_fused_shape = Op::GetAttr("FAccessSubgraphShape"); + auto finfer = finfer_fused_shape.get(fwd_ptr->op(), nullptr); + CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << + " is marked as Fusion but does not allow accessing attributes"; + const auto& inferred_attrs = finfer(fwd_ptr->attrs); + const auto& input_attrs = inferred_attrs.first; + const auto& output_attrs = inferred_attrs.second; + std::cout << "Input attrs: " << input_attrs.size() << std::endl; + for (const auto& attr : input_attrs) { + std::cout << attr << std::endl; + } + std::cout << "Output attrs: " << output_attrs.size() << std::endl; + for (const auto& attr : output_attrs) { + std::cout << attr << std::endl; + } + CHECK(input_attrs.size() == inode.source->op()->num_outputs) << + "Number of outputs of the gradient node " << inode.source->attrs.name << + " does not match the number of inputs of the corresponding forward node"; + for (size_t i = 0; i < input_attrs.size(); ++i) { + uint32_t eid = idx.entry_id(nid, i); if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; + rshape[eid] = input_attrs[i]; + } else if (!fis_none(input_attrs[i])) { + CHECK_EQ(rshape[eid], input_attrs[i]) + << "Backward shape inconsistent with the forward shape"; + } + } + for (size_t i = 0; i < output_attrs.size(); ++i) { + // We assume that the first inputs to the + // backward op are the output gradients + const auto& e = inode.source->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if (fis_none(rshape[eid])) { + rshape[eid] = output_attrs[i]; + } } } } @@ -622,7 +705,9 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, last_num_unknown = num_unknown; num_unknown = 0; for (size_t j = entry_start; j < entry_end; ++j) { + std::cout << "Checking entry " << j << std::endl; if (fis_none(rshape[j])) { + std::cout << "Entry " << j << " is none: " << rshape[j] << std::endl; num_unknown += 
fnum_unknown(rshape[j]); } } @@ -656,10 +741,6 @@ nnvm::Graph InferShape(nnvm::Graph&& graph, if (shape_attr_key.length() != 0) { graph.attrs["shape_attr_key"] = std::make_shared(shape_attr_key); } - std::cout << "Graph attributes before infershape" << std::endl; - for (const auto& kv : graph.attrs) { - std::cout << kv.first << std::endl; - } return InferShapeAttr( std::move(graph), mxnet::TShape(), "FInferShape", "shape_inputs", "shape_attr_key", @@ -690,13 +771,9 @@ nnvm::Graph InferType(nnvm::Graph&& graph, if (dtype_attr_key.length() != 0) { graph.attrs["dtype_attr_key"] = std::make_shared(dtype_attr_key); } - std::cout << "Graph attributes before infertype" << std::endl; - for (const auto& kv : graph.attrs) { - std::cout << kv.first << std::endl; - } - return InferAttr( + return InferAttr( std::move(graph), -1, - "FInferType", "dtype_inputs", "dtype_attr_key", + "FInferType", "FAccessSubgraphType", "dtype_inputs", "dtype_attr_key", "dtype", "dtype_num_unknown_nodes", [](const int t) { return t == -1; }, common::SameType, true, nullptr); @@ -727,9 +804,9 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph, } // for storage type, the backward attr is not necessarily the same as it's correspondence - nnvm::Graph ret = InferAttr( + nnvm::Graph ret = InferAttr( std::move(graph), -1, - "FInferStorageType", "storage_type_inputs", "storage_type_attr_key", + "FInferStorageType", "FAccessSubgraphStorageType", "storage_type_inputs", "storage_type_attr_key", "storage_type", "storage_type_num_unknown_nodes", [](const int t) { return t == -1; }, common::DefaultStorageType, false, "dispatch_mode", DispatchMode::kVariable); diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 8c6361dcd671..e4ed9cc9bb5d 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -30,6 +30,8 @@ #include #include "./simple_partition_pass.h" #include "../operator/fusion/fused_op-inl.h" +#include 
"../operator/fusion/fused_op.h" +#include "../operator/operator_common.h" namespace mxnet { namespace exec { @@ -81,13 +83,93 @@ namespace { } } +/*! + * \brief Replace a set of nodes by a subgraph node + */ +template +Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& subgraph_sets, + FCreateNode create_subgraph_node) { + for (auto subgraph_set : subgraph_sets) { + // Create MXNet subgraph + Graph subgraph; + const auto sub_outputs_in_main = GetSubgraphOutputs(g, subgraph_set); + subgraph.outputs.resize(sub_outputs_in_main.size()); + for (auto p : sub_outputs_in_main) { + subgraph.outputs[p.second] = p.first; + } + // To generate a subgraph an input have to be replace by data node (no op) + // and it have to be agnostic to the node from which it's an output + // (For exemple even if two inputs are two different outputs from the same node) + auto inputs = GetSubgraphInputs(subgraph, subgraph_set); + auto subgraph_node = create_subgraph_node(subgraph); + subgraph_node->inputs = inputs; + // replug inputs of node out of subgraph to be output of the subgraph node + // if it was a node in the subgraph + DFSVisit(g.outputs, + [&subgraph_node, &subgraph_set, &sub_outputs_in_main](const nnvm::NodePtr node) { + if (!subgraph_set.count(node.get())) { + for (auto &e : node->inputs) { + auto it = sub_outputs_in_main.find(e); + if (it != sub_outputs_in_main.end()) { + e.node = subgraph_node; + e.index = it->second; + } + } + } + }); + // replug outputs of the graph to be output of the subgraph node + // if it was a node in the subgraph + for (auto &e : g.outputs) { + auto it = sub_outputs_in_main.find(e); + if (it != sub_outputs_in_main.end()) { + e.node = subgraph_node; + e.index = it->second; + } + } + // move control dependencies between nodes of the subgraph and out of the subgraph + // to a dependencies between the subgraph node and the nodes out of the subgraph + const auto& index = g.indexed_graph(); + DFSVisit(g.outputs, [&subgraph_node, &subgraph_set, 
&index](const nnvm::NodePtr& node) { + for (auto &e : node->control_deps) { + if (subgraph_set.count(e.get())) { + uint32_t node_id = index.node_id(e.get()); + auto helper_node = op::MakeNode("_FusedOpHelper", + subgraph_node->attrs.name + "_helper", + nullptr, + nullptr, + nullptr); + helper_node->attrs.parsed = + FusedOpHelperParamPtr(new FusedOpHelperParam( + nnvm::get(subgraph_node->attrs.parsed), + node_id)); + e = helper_node; + } + } + }); + DFSVisit(subgraph.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { + auto it = node->control_deps.begin(); + while (it != node->control_deps.end()) { + if (subgraph_set.count(it->get())) { + ++it; + } else { + subgraph_node->control_deps.push_back(*it); + it = node->control_deps.erase(it); + } + } + }); + } + Graph new_graph; + new_graph.outputs = g.outputs; + return new_graph; +} + Graph FusePointwise(Graph &&g) { const auto & num_forward_output = g.GetAttr("num_forward_outputs"); Graph fg; fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), g.outputs.begin() + num_forward_output); auto subsets = GetCompatibleSubsets(fg, IsFusionCompatible); - g = ReplaceSubgraphs(std::move(g), subsets, CreateSubgraphNode); + g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); return g; } diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 48ce8b2a182a..d2a26ec13f94 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -19,6 +19,7 @@ #include "./fused_op.h" #include "../operator_common.h" +#include "../../executor/exec_pass.h" namespace mxnet { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 84cde01e665d..c146523fe854 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -401,6 +401,35 @@ bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, return inferred; } +template +std::pair, std::vector> FusedOp::GetAttrs(const std::string& attr_name, + const uint32_t 
node_id) { + const auto& g = this->symbol_.indexed_graph(); + const std::vector attrs = this->symbol_.GetAttr>(attr_name); + const auto& node = g[node_id]; + std::vector inputs, outputs; + for (const auto& e : node.inputs) { + inputs.emplace_back(attrs[g.entry_id(e)]); + } + outputs.resize(node.source->num_outputs()); + for (size_t i = 0; i < g.num_nodes(); ++i) { + if (i == node_id) continue; + const auto& other_node = g[i]; + for (const auto& e : other_node.inputs) { + if (e.node_id == node_id) { + outputs[e.index] = attrs[g.entry_id(e)]; + } + } + } + for (const auto& e : g.outputs()) { + if (e.node_id == node_id) { + outputs[e.index] = attrs[g.entry_id(e)]; + } + } + + return {inputs, outputs}; +} + void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, @@ -429,4 +458,27 @@ NNVM_REGISTER_OP(FusedOp) .set_attr("FInferType", FusedOpInferType) .set_attr("FCompute", FusedOpForwardGPU); +std::pair, std::vector> +FusedOpHelperShape(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAttrs("shape", node_id); +} + +std::pair, std::vector> +FusedOpHelperType(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAttrs("dtype", node_id); +} + +NNVM_REGISTER_OP(_FusedOpHelper) +.set_num_inputs(0) +.set_num_outputs(0) +.set_attr("TIsFusion", true) +.set_attr("FAccessSubgraphShape", FusedOpHelperShape) +.set_attr("FAccessSubgraphType", FusedOpHelperType); + } // namespace mxnet diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 625996b54e63..047ee8d6f303 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -76,6 +76,10 @@ class FusedOp { std::vector *in_attrs, std::vector *out_attrs); + template + std::pair, std::vector> GetAttrs(const std::string& attr_name, + const uint32_t 
node_id); + private: void GenerateCode(); @@ -94,6 +98,17 @@ class FusedOp { using FusedOpPtr = std::shared_ptr; +struct FusedOpHelperParam { + FusedOpPtr op; + uint32_t node_id; + + FusedOpHelperParam(FusedOpPtr op, uint32_t node_id) : + op(op), + node_id(node_id) {} +}; + +using FusedOpHelperParamPtr = std::shared_ptr; + } // namespace mxnet #endif // MXNET_OPERATOR_FUSION_FUSED_OP_H_ From 0a342a0c7bcc9134e27d7984c466264b02799b44 Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Fri, 24 May 2019 13:31:46 -0700 Subject: [PATCH 007/105] keep proper input ordering for fused Op --- src/executor/simple_partition_pass.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 09aa5625be3d..7699da823252 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -202,7 +202,6 @@ class BidirectionalGraph { std::vector nodes; std::unordered_map nnvm2nid; std::vector outputs; - }; // class BidirectionalGraph using NodeEntrySet = std::unordered_set; * \brief get the output nodes of the subgraph in the main graph * \return a map between the node in the main graph and the output index of the subgraph node */ -// std::vector nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_set) { //std::vector outputs; //NodeEntrySet _outputs; @@ -243,6 +241,7 @@ nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_ */ std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_set) { std::vector inputs; + const auto &idx = g.indexed_graph(); nnvm::NodeEntryMap entry_map; DFSVisit(g.outputs, [&subgraph_set, &inputs, &entry_map](const nnvm::NodePtr &node){ if (subgraph_set.count(node.get())) { @@ -262,6 +261,11 @@ std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_s } } }); + // Fix ordering of w.r.t to topology + std::sort(inputs.begin(), inputs.end(), + [&idx](const nnvm::NodeEntry lhs, const nnvm::NodeEntry 
rhs) { + return idx.entry_id(lhs) < idx.entry_id(rhs); + }); return inputs; } From 07de80093e4b33cf7eda98a1597a03a57a690dfa Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Fri, 24 May 2019 15:16:16 -0700 Subject: [PATCH 008/105] instantiate the indexed_graph before starting the subgraph replacement, return a new graph to reset the indexed_graph --- src/executor/pointwise_fusion_pass.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index e4ed9cc9bb5d..b38db1997fe2 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -51,7 +51,7 @@ namespace { return true; return false; } - + nnvm::NodePtr CreateSubgraphNode(const Graph& subgraph) { nnvm::Symbol subgraph_sym; auto node = nnvm::Node::Create(); @@ -97,7 +97,7 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub for (auto p : sub_outputs_in_main) { subgraph.outputs[p.second] = p.first; } - // To generate a subgraph an input have to be replace by data node (no op) + // To generate a subgraph an input have to be replaced by data node (no op) // and it have to be agnostic to the node from which it's an output // (For exemple even if two inputs are two different outputs from the same node) auto inputs = GetSubgraphInputs(subgraph, subgraph_set); @@ -164,14 +164,16 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub } Graph FusePointwise(Graph &&g) { + Graph ret; + const auto& idx = g.indexed_graph(); const auto & num_forward_output = g.GetAttr("num_forward_outputs"); Graph fg; fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), g.outputs.begin() + num_forward_output); auto subsets = GetCompatibleSubsets(fg, IsFusionCompatible); g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); - - return g; + ret.outputs = g.outputs; + return ret; } } From 975e8a65e43fed24fe745d61abfd2dec7aeb50ff Mon Sep 17 00:00:00 2001 From: 
Przemek Tredak Date: Mon, 27 May 2019 08:26:48 -0700 Subject: [PATCH 009/105] Fuse backward --- src/executor/exec_pass.h | 3 +- src/executor/graph_executor.cc | 3 +- src/executor/infer_graph_attr_pass.cc | 44 ++++++++++- src/executor/pointwise_fusion_pass.cc | 25 +++++- src/executor/simple_partition_pass.h | 9 +++ src/operator/fusion/fused_op-inl.h | 107 ++++++++++++++------------ src/operator/fusion/fused_op.cc | 5 +- src/operator/fusion/fused_op.cu | 83 +++++++++++++++++++- src/operator/fusion/fused_op.h | 2 +- 9 files changed, 218 insertions(+), 63 deletions(-) diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 7c8daa00835f..46322bf8d9c1 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -186,7 +186,8 @@ using NodeEntryMapCounter = */ NodeEntryMapCounter GetNodeEntryCount(const Graph& g); -Graph FusePointwise(Graph&& g); +Graph FusePointwiseForward(Graph&& g); +Graph FusePointwiseBackward(Graph&& g); /*! * \brief Infer shapes in the graph given the information. 
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 225c41134df5..e126a0d8e801 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -985,7 +985,8 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); if (dmlc::GetEnv("MXNET_USE_FUSION", true)) { - g = FusePointwise(std::move(g)); + g = FusePointwiseForward(std::move(g)); + g = FusePointwiseBackward(std::move(g)); } // create "device" and "context" attrs for the graph g = AssignContext(g, default_ctx, ctx_map, diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index a6cdbe6eb8ca..e1a2053b9d30 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -381,6 +381,41 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, return ret; } + +inline void PrintFullGraph(const nnvm::Graph& g) { + const auto& index = g.indexed_graph(); + + for (size_t i = 0; i < index.num_nodes(); ++i) { + const auto& node = index[i]; + std::cout << "Node " << i << std::endl; + const auto* source = node.source; + if (source != nullptr) { + std::cout << source->attrs.name << std::endl; + if (source->is_variable()) { + std::cout << "Variable!" << std::endl; + } + std::cout << "Inputs: " << node.inputs.size() << std::endl; + for (size_t j = 0; j < node.inputs.size(); ++j) { + std::cout << node.inputs[j].node_id << " (" << + index[node.inputs[j].node_id].source->attrs.name << ") " << + node.inputs[j].index << ". Entry id: " << + index.entry_id(node.inputs[j]) << std::endl; + } + std::cout << "Outputs: " << source->num_outputs() << std::endl; + } else { + std::cout << "NULLPTR in source" << std::endl; + } + } + + std::cout << "Graph outputs" << std::endl; + for (const auto& entry : index.outputs()) { + std::cout << entry.node_id << " (" << + index[entry.node_id].source->attrs.name << ") " << + entry.index << ". 
Entry id: " << + index.entry_id(entry) << std::endl; + } +} + /*!\brief * This is a version of the InferAttr function specifically for shape inference. * @@ -516,9 +551,11 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, auto infer_step = [&](uint32_t nid, bool last_iter) { const auto& inode = idx[nid]; const std::string name = inode.source->attrs.name; + std::cout << "InferStep: " << nid << " " << name << std::endl; const uint32_t num_inputs = inode.inputs.size(); const uint32_t num_outputs = inode.source->num_outputs(); if (inode.source->is_variable()) { + std::cout << "InferStep: Variable!" << std::endl; // Variable node. No operator. Only one output entry. CHECK(inode.source->op() == nullptr); CHECK_EQ(num_outputs, 1U); @@ -539,6 +576,7 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } else if (is_backward.get(inode.source->op(), false) && inode.source->control_deps.size() && bwd_identity_assign) { + std::cout << "InferStep: Backward!" << std::endl; CHECK(dispatch_mode_name == nullptr) << "Backward inference for node attributes is not available"; CHECK_GE(inode.source->control_deps.size(), 1U) @@ -549,7 +587,7 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, static auto& is_fusion = Op::GetAttr("TIsFusion"); if (!is_fusion.get(fwd_ptr->op(), false)) { const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; - std::cout << inode.source->attrs.name << ": No fusion!" << std::endl; + LOG(INFO) << inode.source->attrs.name << ": No fusion!" << std::endl; // use gradient function to find out the correspondence. std::vector ograd(fwd_ptr->num_outputs()); for (size_t i = 0; i < ograd.size(); ++i) { @@ -632,6 +670,7 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } } else { + std::cout << "InferStep: Dispatch!" << std::endl; DispatchMode* dispatch_mode = nullptr; bool forward_known = true; // Forward operator inference. 
@@ -704,6 +743,8 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } last_num_unknown = num_unknown; num_unknown = 0; + std::cout << "Will be checking entries." << std::endl; + PrintFullGraph(ret); for (size_t j = entry_start; j < entry_end; ++j) { std::cout << "Checking entry " << j << std::endl; if (fis_none(rshape[j])) { @@ -735,6 +776,7 @@ nnvm::Graph InferShape(nnvm::Graph&& graph, mxnet::ShapeVector&& shape_inputs, const std::string& shape_attr_key) { using dmlc::any; + std::cout << "Calling InferShape!" << std::endl; if (shape_inputs.size() != 0) { graph.attrs["shape_inputs"] = std::make_shared(std::move(shape_inputs)); } diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index b38db1997fe2..f38e99a1e472 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -23,6 +23,7 @@ * \brief * \author Clement Fuji Tsang */ +#include #include #include #include @@ -41,6 +42,7 @@ namespace { if (n->op() == nullptr) return false; std::string op_name = n->op()->name; + std::cout << "Visiting " << op_name << std::endl; if (fused_op_binary_ops.count(op_name)) return true; if (fused_op_unary_ops.count(op_name)) @@ -49,6 +51,12 @@ namespace { return true; if (fused_op_mimo_ops.count(op_name)) return true; + if (std::find(fused_op_variable_io_ops.begin(), + fused_op_variable_io_ops.end(), + op_name) != + fused_op_variable_io_ops.end()) + return true; + std::cout << "It was not in any list" << std::endl; return false; } @@ -89,6 +97,7 @@ namespace { template Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& subgraph_sets, FCreateNode create_subgraph_node) { + std::cout << "Fusion sets: " << subgraph_sets.size() << std::endl; for (auto subgraph_set : subgraph_sets) { // Create MXNet subgraph Graph subgraph; @@ -153,7 +162,8 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub ++it; } else { subgraph_node->control_deps.push_back(*it); - it = node->control_deps.erase(it); + 
//it = node->control_deps.erase(it); + ++it; } } }); @@ -163,9 +173,9 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub return new_graph; } -Graph FusePointwise(Graph &&g) { +Graph FusePointwiseForward(Graph &&g) { Graph ret; - const auto& idx = g.indexed_graph(); + g.indexed_graph(); const auto & num_forward_output = g.GetAttr("num_forward_outputs"); Graph fg; fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), @@ -176,5 +186,14 @@ Graph FusePointwise(Graph &&g) { return ret; } +Graph FusePointwiseBackward(Graph &&g) { + Graph ret; + g.indexed_graph(); + auto subsets = GetCompatibleSubsets(g, IsFusionCompatible); + g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); + ret.outputs = g.outputs; + return ret; +} + } } diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 7699da823252..7f086d556b13 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -83,6 +83,7 @@ class BidirectionalGraph { std::unordered_set all_set(nodes.size()); std::vector separation_sets; for (Node& node : nodes) { + std::cout << "Looking at " << node.nnvmptr->attrs.name << std::endl; if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); std::unordered_set in_graph; @@ -128,10 +129,13 @@ class BidirectionalGraph { std::deque stack(outputs.begin(), outputs.end()); while (!stack.empty()) { Node* vertex = stack.front(); + std::cout << "Checking " << vertex->nnvmptr->attrs.name << std::endl; stack.pop_front(); if (!visited.count(vertex)) { + std::cout << "Not visited!" << std::endl; visited.insert(vertex); if (unused_set.count(vertex)) { + std::cout << "Adding to subgraphs!" 
<< std::endl; subgraphs.emplace_back(naive_grow_subgraph(vertex, &unused_set, &incomp_map)); } for (Node* input : vertex->inputs) { @@ -175,11 +179,16 @@ class BidirectionalGraph { std::unordered_set incomp_set; std::deque stack; stack.emplace_back(head); + std::cout << "naive grow subgraph" << std::endl; while (!stack.empty()) { Node* vertex = stack.back(); + std::cout << "Naive sees " << vertex->nnvmptr->attrs.name << std::endl; stack.pop_back(); + std::cout << "Unused: " << unused_set->count(vertex) << std::endl; + std::cout << "Compatible: " << !incomp_set.count(vertex) << std::endl; if (unused_set->count(vertex) && !incomp_set.count(vertex)) { unused_set->erase(vertex); + std::cout << "Put into subgraph!" << std::endl; subgraph.insert(vertex); incomp_set.insert((*incomp_map)[vertex].begin(), (*incomp_map)[vertex].end()); for (Node* input : vertex->inputs) { diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index e82afeec63eb..aa9a1b912edc 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -92,57 +92,58 @@ const std::map fused_op_binary_ops = { }; const std::map fused_op_unary_ops = { - {"amp_cast" , "identity"}, - {"relu" , "relu"}, - {"sigmoid" , "sigmoid"}, - {"softsign" , "softsign"}, - {"exp" , "exp"}, - {"expm1" , "expm1"}, - {"log" , "log"}, - {"log10" , "log10"}, - {"log2" , "log2"}, - {"log1p" , "log1p"}, - {"degrees" , "degrees"}, - {"radians" , "radians"}, - {"sin" , "sin"}, - {"cos" , "cos"}, - {"tan" , "tan"}, - {"arcsin" , "arcsin"}, - {"arccos" , "arccos"}, - {"arccos" , "arccos"}, - {"arctan" , "arctan"}, - {"sinh" , "sinh"}, - {"cosh" , "cosh"}, - {"tanh" , "tanh"}, - {"arcsinh" , "arcsinh"}, - {"arccosh" , "arccosh"}, - {"arctanh" , "arctanh"}, - {"sqrt" , "sqrt"}, - {"rsqrt" , "rsqrt"}, - {"cbrt" , "cbrt"}, - {"rcbrt" , "rcbrt"}, - {"square" , "square"}, - {"squeeze" , "identity"}, - {"zeros_like" , "zero"}, - {"ones_like" , "one"}, - {"flatten" , "identity"}, - 
{"Reshape" , "identity"}, - {"reshape" , "identity"}, - {"expand_dims", "identity"}, - {"round" , "round"}, - {"rint" , "rint"}, - {"fix" , "fix"}, - {"floor" , "floor"}, - {"ceil" , "ceil"}, - {"trunc" , "trunc"}, - {"sign" , "sign"}, - {"reciprocal" , "reciprocal"}, - {"abs" , "abs"}, - {"gamma" , "gamma"}, - {"gammaln" , "gammaln"}, - {"erf" , "erf"}, - {"erfinv" , "erfinv"}, - {"_copy" , "identity"} + {"amp_cast" , "identity"}, + {"relu" , "relu"}, + {"sigmoid" , "sigmoid"}, + {"softsign" , "softsign"}, + {"exp" , "exp"}, + {"expm1" , "expm1"}, + {"log" , "log"}, + {"log10" , "log10"}, + {"log2" , "log2"}, + {"log1p" , "log1p"}, + {"degrees" , "degrees"}, + {"radians" , "radians"}, + {"sin" , "sin"}, + {"cos" , "cos"}, + {"tan" , "tan"}, + {"arcsin" , "arcsin"}, + {"arccos" , "arccos"}, + {"arccos" , "arccos"}, + {"arctan" , "arctan"}, + {"sinh" , "sinh"}, + {"cosh" , "cosh"}, + {"tanh" , "tanh"}, + {"arcsinh" , "arcsinh"}, + {"arccosh" , "arccosh"}, + {"arctanh" , "arctanh"}, + {"sqrt" , "sqrt"}, + {"rsqrt" , "rsqrt"}, + {"cbrt" , "cbrt"}, + {"rcbrt" , "rcbrt"}, + {"square" , "square"}, + {"squeeze" , "identity"}, + {"zeros_like" , "zero"}, + {"ones_like" , "one"}, + {"flatten" , "identity"}, + {"Reshape" , "identity"}, + {"reshape" , "identity"}, + {"expand_dims" , "identity"}, + {"round" , "round"}, + {"rint" , "rint"}, + {"fix" , "fix"}, + {"floor" , "floor"}, + {"ceil" , "ceil"}, + {"trunc" , "trunc"}, + {"sign" , "sign"}, + {"reciprocal" , "reciprocal"}, + {"abs" , "abs"}, + {"gamma" , "gamma"}, + {"gammaln" , "gammaln"}, + {"erf" , "erf"}, + {"erfinv" , "erfinv"}, + {"_copy" , "identity"}, + {"_identity_with_attr_like_rhs" , "identity"} }; const std::map> fused_op_special_ops = { @@ -225,6 +226,10 @@ const std::map>> fused_op_mimo {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} }; +const std::vector fused_op_variable_io_ops = { + "add_n" +}; + const std::string fused_op_function_definitions = R"code( template struct LoadType { diff --git 
a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index d2a26ec13f94..80a39c07bc42 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -26,6 +26,7 @@ namespace mxnet { DMLC_REGISTER_PARAMETER(FusedOpConfig); void FusedOpParamParser(nnvm::NodeAttrs* attrs) { + std::cout << "Parser!" << std::endl; FusedOpConfig param; try { param.Init(attrs->dict); @@ -41,7 +42,9 @@ void FusedOpParamParser(nnvm::NodeAttrs* attrs) { throw dmlc::ParamError(os.str()); } CHECK(!param.symbol_json.empty()); - attrs->parsed = FusedOpPtr(new FusedOp(param)); + std::cout << "JSON: " << param.symbol_json << std::endl; + attrs->parsed = FusedOpPtr(new FusedOp(attrs, param)); + std::cout << "Empty: " << attrs->parsed.empty() < #include #include #include "./fused_op.h" @@ -55,10 +56,12 @@ inline std::string mshadowTypeToString(int type) { } // namespace detail -FusedOp::FusedOp(const FusedOpConfig& config) { +FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { this->inputs_ = std::vector(config.num_inputs); this->outputs_ = std::vector(config.num_outputs); - this->symbol_ = nnvm::pass::LoadJSON(config.symbol_json); + //this->symbol_ = nnvm::pass::LoadJSON(config.symbol_json); + this->symbol_ = nnvm::Graph(); + this->symbol_.outputs = attrs->subgraphs[0]->outputs; this->initialized_ = false; this->cc_major_ = -1; this->cc_minor_ = -1; @@ -90,7 +93,8 @@ void FusedOp::GenerateCode() { if (source != nullptr) { std::string var_name = "temp" + std::to_string(temp_name_counter++); if (source->is_variable()) { - code += "const auto " + var_name + " = load(" + source->attrs.name + ", i);\n"; + code += "const auto " + var_name + " = load(input_" + source->attrs.name + ", i);\n"; + code += "printf(\"%d: %f\\n\", i, " + var_name + ");\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = var_name; } else { @@ -167,6 +171,21 @@ void FusedOp::GenerateCode() { } continue; } + + // Special cases with variable number + // of inputs/outputs, 
listed in + // detail::fused_op_variable_io_ops + if (op_name == "add_n") { + CHECK_EQ(outputs[i], 1); + const auto& arg = variables[{node.inputs[0].node_id, node.inputs[0].index}]; + code += "auto " + var_name + " = " + arg + ";\n"; + for (size_t inp = 1; inp < node.inputs.size(); ++inp) { + const auto& temp_arg = variables[{node.inputs[inp].node_id, node.inputs[inp].index}]; + code += var_name + " = add(" + var_name + ", " + temp_arg + ");\n"; + } + variables[{i,0}] = var_name; + continue; + } LOG(FATAL) << "Unrecognized op " + op_name; } } else { @@ -229,12 +248,15 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, nnvm::Symbol sym; sym.outputs = this->symbol_.outputs; const std::vector input_names = sym.ListInputNames(nnvm::Symbol::kAll); + for (const auto& name : input_names) { + LOG(INFO) << name; + } size_t num_params = in_dtypes.size() + out_dtypes.size(); size_t i = 0; for (const auto &type : in_dtypes) { std::string type_name = detail::mshadowTypeToString(type); aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - kernel_params += "DType" + std::to_string(i) + "* " +input_names[i]; + kernel_params += "DType" + std::to_string(i) + "* input_" +input_names[i]; ++i; if (i < num_params) { kernel_params += ", "; @@ -341,16 +363,54 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, &(args[0]), 0)); // arguments } +inline void PrintFullGraph(const nnvm::Graph& g) { + const auto& index = g.indexed_graph(); + + for (size_t i = 0; i < index.num_nodes(); ++i) { + const auto& node = index[i]; + std::cout << "Node " << i << std::endl; + const auto* source = node.source; + if (source != nullptr) { + std::cout << source->attrs.name << std::endl; + if (source->is_variable()) { + std::cout << "Variable!" 
<< std::endl; + } + std::cout << "Inputs: " << node.inputs.size() << std::endl; + for (size_t j = 0; j < node.inputs.size(); ++j) { + std::cout << node.inputs[j].node_id << " (" << + index[node.inputs[j].node_id].source->attrs.name << ") " << + node.inputs[j].index << ". Entry id: " << + index.entry_id(node.inputs[j]) << std::endl; + } + std::cout << "Outputs: " << source->num_outputs() << std::endl; + } else { + std::cout << "NULLPTR in source" << std::endl; + } + } + + std::cout << "Graph outputs" << std::endl; + for (const auto& entry : index.outputs()) { + std::cout << entry.node_id << " (" << + index[entry.node_id].source->attrs.name << ") " << + entry.index << ". Entry id: " << + index.entry_id(entry) << std::endl; + } +} + template <> bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { std::vector input_shapes(*in_attrs); + std::cout << "InferShape in FusedOp! " << attrs.name << std::endl; + PrintFullGraph(this->symbol_); this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), std::move(input_shapes), "__shape__"); + std::cout << "End of infershape in FusedOp " << attrs.name << std::endl; const auto& g = this->symbol_.indexed_graph(); + const auto& input_nids = g.input_nodes(); std::vector out_shapes; const std::vector shapes = this->symbol_.GetAttr("shape"); @@ -361,6 +421,13 @@ bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, for (size_t i = 0; i < out_attrs->size(); ++i) { op::shape_assign(&(out_attrs->at(i)), out_shapes[i]); } + + // assign to in_attrs + for (size_t i = 0; i < in_attrs->size(); ++i) { + const auto eid = g.entry_id(input_nids[i], 0); + SHAPE_ASSIGN_CHECK(*in_attrs, i, shapes[eid]); + } + bool inferred = true; for (const auto& attr : *in_attrs) { inferred = inferred && !op::shape_is_none(attr); @@ -381,6 +448,7 @@ bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, "__dtype__"); const auto& g = this->symbol_.indexed_graph(); + const auto& input_nids = g.input_nodes(); 
std::vector out_types; const std::vector types = this->symbol_.GetAttr("dtype"); @@ -391,6 +459,13 @@ bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, for (size_t i = 0; i < out_attrs->size(); ++i) { op::type_assign(&(out_attrs->at(i)), out_types[i]); } + + // assign to in_attrs + for (size_t i = 0; i < in_attrs->size(); ++i) { + const auto eid = g.entry_id(input_nids[i], 0); + TYPE_ASSIGN_CHECK(*in_attrs, i, types[eid]); + } + bool inferred = true; for (const auto& attr : *in_attrs) { inferred = inferred && !op::type_is_none(attr); diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 047ee8d6f303..6f4fe3500170 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -50,7 +50,7 @@ class FusedOp { public: static const int NTHREADS = 512; - explicit FusedOp(const FusedOpConfig& config); + explicit FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config); ~FusedOp() {} uint32_t num_inputs() const { return inputs_.size(); From 6d9c0bf01fbc80bb5199c3c0cd9e1f62de8ccc23 Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Wed, 29 May 2019 09:48:28 -0400 Subject: [PATCH 010/105] fix ordering of subgraph node inputs using subgraph topological ordering instead of main graph topological ordering, add tvm.patch --- src/executor/simple_partition_pass.h | 8 +++++--- tvm.patch | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 tvm.patch diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 7f086d556b13..3bea957c4853 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -250,7 +250,6 @@ nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_ */ std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_set) { std::vector inputs; - const auto &idx = g.indexed_graph(); nnvm::NodeEntryMap entry_map; DFSVisit(g.outputs, [&subgraph_set, &inputs, &entry_map](const 
nnvm::NodePtr &node){ if (subgraph_set.count(node.get())) { @@ -271,9 +270,12 @@ std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_s } }); // Fix ordering of w.r.t to topology + Graph _g; + _g.outputs = g.outputs; + const auto &idx = _g.indexed_graph(); std::sort(inputs.begin(), inputs.end(), - [&idx](const nnvm::NodeEntry lhs, const nnvm::NodeEntry rhs) { - return idx.entry_id(lhs) < idx.entry_id(rhs); + [&idx, &entry_map](const nnvm::NodeEntry lhs, const nnvm::NodeEntry rhs) { + return idx.entry_id(entry_map.at(lhs)) < idx.entry_id(entry_map.at(rhs)); }); return inputs; } diff --git a/tvm.patch b/tvm.patch new file mode 100644 index 000000000000..c18edf864c77 --- /dev/null +++ b/tvm.patch @@ -0,0 +1,21 @@ +diff --git nnvm/src/core/graph.cc nnvm/src/core/graph.cc +index b8bcae7..af7bb4a 100644 +--- nnvm/src/core/graph.cc ++++ nnvm/src/core/graph.cc +@@ -57,6 +57,8 @@ IndexedGraph::IndexedGraph(const Graph &g) { + + DFSVisit(g.outputs, [this, &inputs_rptr, &control_rptr, &subgraphs] + (const NodePtr& n) { ++ const auto& is_fusion = Op::GetAttr("TIsFusion"); ++ if (!n->is_variable() && is_fusion.get(n->op(), false)) return; + CHECK_LT(nodes_.size(), std::numeric_limits::max()); + uint32_t nid = static_cast(nodes_.size()); + for (const auto &subgraph : n->attrs.subgraphs) +@@ -83,6 +85,7 @@ IndexedGraph::IndexedGraph(const Graph &g) { + inputs_rptr.push_back(input_entries_.size()); + // control deps + for (const auto& nptr : n->control_deps) { ++ if (!nptr->is_variable() && is_fusion.get(nptr->op(), false)) continue; + auto it = node2index_.find(nptr.get()); + CHECK(it != node2index_.end() && it->first == nptr.get()); + control_deps_.push_back(it->second); From 384fbb00f5eaacafa9d2da07d9ba0871f198e0f0 Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Wed, 29 May 2019 11:16:01 -0400 Subject: [PATCH 011/105] excluse forward node fusion during the fusion of the nodes in the backward graph --- 3rdparty/ps-lite | 2 +- src/executor/graph_executor.cc | 3 ++- 
src/executor/pointwise_fusion_pass.cc | 18 +++++++++++++++--- src/executor/simple_partition_pass.h | 10 ++++++---- src/operator/fusion/fused_op.cu | 1 - 5 files changed, 24 insertions(+), 10 deletions(-) diff --git a/3rdparty/ps-lite b/3rdparty/ps-lite index 8a763892a973..f45e2e78a743 160000 --- a/3rdparty/ps-lite +++ b/3rdparty/ps-lite @@ -1 +1 @@ -Subproject commit 8a763892a973afc1acd3d4b469d05bb338a83a6e +Subproject commit f45e2e78a7430be09f76264d2f4073fb2b1d54a2 diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index e126a0d8e801..916654282c46 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -983,9 +983,10 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, // setup gradient nnvm::Graph g = InitFullGraph(symbol, grad_req_types); - g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); if (dmlc::GetEnv("MXNET_USE_FUSION", true)) { + g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); g = FusePointwiseForward(std::move(g)); + g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); g = FusePointwiseBackward(std::move(g)); } // create "device" and "context" attrs for the graph diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index f38e99a1e472..a86d5d92eb2c 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -176,10 +176,10 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub Graph FusePointwiseForward(Graph &&g) { Graph ret; g.indexed_graph(); - const auto & num_forward_output = g.GetAttr("num_forward_outputs"); + const auto& num_forward_outputs = g.GetAttr("num_forward_outputs"); Graph fg; fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), - g.outputs.begin() + num_forward_output); + g.outputs.begin() + num_forward_outputs); auto subsets = GetCompatibleSubsets(fg, IsFusionCompatible); g = ReplaceSubgraphsPointwise(std::move(g), subsets, 
CreateSubgraphNode); ret.outputs = g.outputs; @@ -189,7 +189,19 @@ Graph FusePointwiseForward(Graph &&g) { Graph FusePointwiseBackward(Graph &&g) { Graph ret; g.indexed_graph(); - auto subsets = GetCompatibleSubsets(g, IsFusionCompatible); + const auto& num_forward_outputs = g.GetAttr("num_forward_outputs"); + Graph fg; + fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), + g.outputs.begin() + num_forward_outputs); + std::unordered_set exclusion_set; + DFSVisit(fg.outputs, [&exclusion_set](const nnvm::NodePtr& n) { + exclusion_set.insert(n.get()); + }); + auto subsets = GetCompatibleSubsets(g, [&exclusion_set](nnvm::Node* n) { + if (exclusion_set.count(n)) + return false; + return IsFusionCompatible(n); + }); g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); ret.outputs = g.outputs; return ret; diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 3bea957c4853..1dc82933fa2e 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -90,11 +90,13 @@ class BidirectionalGraph { std::unordered_set out_graph; std::vector dummy_head; dummy_head.emplace_back(&node); - DFS(dummy_head, false, [&out_graph](Node* node) { - out_graph.insert(node); + DFS(dummy_head, false, [&out_graph, &is_compatible](Node* node) { + if (is_compatible(node->nnvmptr)) + out_graph.insert(node); }); - DFS(dummy_head, true, [&in_graph](Node* node) { - in_graph.insert(node); + DFS(dummy_head, true, [&in_graph, is_compatible](Node* node) { + if (is_compatible(node->nnvmptr)) + in_graph.insert(node); }); if (!(in_graph.empty() || out_graph.empty())) separation_sets.push_back(std::make_pair(in_graph, out_graph)); diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 277dfed3d49c..8083fe23727f 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -94,7 +94,6 @@ void FusedOp::GenerateCode() { std::string var_name = "temp" + 
std::to_string(temp_name_counter++); if (source->is_variable()) { code += "const auto " + var_name + " = load(input_" + source->attrs.name + ", i);\n"; - code += "printf(\"%d: %f\\n\", i, " + var_name + ");\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = var_name; } else { From b9506ff30d0110585cdcf319313680dea8f53362 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 3 Jun 2019 16:03:18 -0700 Subject: [PATCH 012/105] Dealing with fused backward nodes inferattr --- src/executor/exec_pass.h | 10 ++++ src/executor/infer_graph_attr_pass.cc | 66 +++++++++++++++++++++++---- src/executor/pointwise_fusion_pass.cc | 25 ++++++++-- src/operator/fusion/fused_op.cu | 57 ++++++++++++++++++++++- src/operator/fusion/fused_op.h | 28 ++++++++++++ tvm.patch | 2 +- 6 files changed, 173 insertions(+), 15 deletions(-) diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 46322bf8d9c1..3a7c04debce7 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -45,7 +45,17 @@ using FAccessSubgraphAttr = std::function, std::vect using FAccessSubgraphShape = FAccessSubgraphAttr; using FAccessSubgraphType = FAccessSubgraphAttr; using FAccessSubgraphStorageType = FAccessSubgraphAttr; + +template +using FProvideSubgraphAttr = std::function> &in_attrs, + const std::vector> &out_attrs)>; +using FProvideSubgraphShape = FProvideSubgraphAttr; +using FProvideSubgraphType = FProvideSubgraphAttr; +using FProvideSubgraphStorageType = FProvideSubgraphAttr; + using TIsFusion = bool; +using TIsFusionHelper = bool; /*! 
\brief reuse graph definition */ using nnvm::Graph; diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index e1a2053b9d30..22d29910b8e8 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -92,11 +92,12 @@ bool ApplyOpInferAttr(const nnvm::Graph& g, * for storage type inference */ template + typename FProvideSubgraphType, typename IsNone, typename FDefault> nnvm::Graph InferAttr(nnvm::Graph &&ret, const AttrType empty_val, const char* infer_name, const char* infer_fusion_name, + const char* provide_fusion_name, const char* input_name, const char* attr_key_name, const char* attr_name, @@ -220,7 +221,7 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; - static auto& is_fusion = Op::GetAttr("TIsFusion"); + static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); if (!is_fusion.get(fwd_ptr->op(), false)) { const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; // use gradient function to find out the correspondence. @@ -318,6 +319,28 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, if (finfer != nullptr) { // Call inference function of the operator. 
try { + static auto& is_fusion = Op::GetAttr("TIsFusion"); + if (is_fusion.get(inode.source->op(), false)) { + std::vector> in_attrs; + std::vector> out_attrs; + for (const auto& dep_node : inode.source->control_deps) { + in_attrs.push_back({}); + out_attrs.push_back({}); + auto &current_in_attrs = in_attrs.back(); + auto &current_out_attrs = out_attrs.back(); + std::cout << "Control deps: " << dep_node->attrs.name << std::endl; + uint32_t dep_node_id = idx.node_id(dep_node.get()); + for (const auto& e : idx[dep_node_id].inputs) { + current_in_attrs.push_back(rshape[idx.entry_id(e)]); + } + for (size_t i = 0; i < dep_node->num_outputs(); ++i) { + current_out_attrs.push_back(rshape[idx.entry_id(dep_node_id, i)]); + } + } + auto provide = Op::GetAttr(provide_fusion_name).get(inode.source->op(), nullptr); + CHECK(provide != nullptr) << "Encountered Fusion operator that does not implement providing subgraph attr " << provide_fusion_name << "."; + provide(inode.source->attrs, in_attrs, out_attrs); + } forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, nid, &ishape, &oshape, dispatch_mode); } catch (const std::exception& e) { @@ -584,7 +607,7 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; - static auto& is_fusion = Op::GetAttr("TIsFusion"); + static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); if (!is_fusion.get(fwd_ptr->op(), false)) { const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; LOG(INFO) << inode.source->attrs.name << ": No fusion!" << std::endl; @@ -703,6 +726,28 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, if (finfer != nullptr) { // Call inference function of the operator. 
try { + static auto& is_fusion = Op::GetAttr("TIsFusion"); + if (is_fusion.get(inode.source->op(), false)) { + std::vector> in_attrs; + std::vector> out_attrs; + for (const auto& dep_node : inode.source->control_deps) { + in_attrs.push_back({}); + out_attrs.push_back({}); + auto &current_in_attrs = in_attrs.back(); + auto &current_out_attrs = out_attrs.back(); + std::cout << "Control deps: " << dep_node->attrs.name << std::endl; + uint32_t dep_node_id = idx.node_id(dep_node.get()); + for (const auto& e : idx[dep_node_id].inputs) { + current_in_attrs.push_back(rshape[idx.entry_id(e)]); + } + for (size_t i = 0; i < dep_node->num_outputs(); ++i) { + current_out_attrs.push_back(rshape[idx.entry_id(dep_node_id, i)]); + } + } + auto provide = Op::GetAttr("FProvideSubgraphShape").get(inode.source->op(), nullptr); + CHECK(provide != nullptr) << "Encountered Fusion operator that does not implement providing subgraph shape."; + provide(inode.source->attrs, in_attrs, out_attrs); + } forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, nid, &ishape, &oshape, dispatch_mode); } catch (const std::exception& e) { @@ -813,10 +858,11 @@ nnvm::Graph InferType(nnvm::Graph&& graph, if (dtype_attr_key.length() != 0) { graph.attrs["dtype_attr_key"] = std::make_shared(dtype_attr_key); } - return InferAttr( + return InferAttr( std::move(graph), -1, - "FInferType", "FAccessSubgraphType", "dtype_inputs", "dtype_attr_key", - "dtype", "dtype_num_unknown_nodes", + "FInferType", "FAccessSubgraphType", "FProvideSubgraphType", + "dtype_inputs", "dtype_attr_key", "dtype", "dtype_num_unknown_nodes", [](const int t) { return t == -1; }, common::SameType, true, nullptr); } @@ -846,10 +892,12 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph, } // for storage type, the backward attr is not necessarily the same as it's correspondence - nnvm::Graph ret = InferAttr( + nnvm::Graph ret = InferAttr( std::move(graph), -1, - "FInferStorageType", "FAccessSubgraphStorageType", "storage_type_inputs", 
"storage_type_attr_key", - "storage_type", "storage_type_num_unknown_nodes", + "FInferStorageType", "FAccessSubgraphStorageType", "FProvideSubgraphStorageType", + "storage_type_inputs", "storage_type_attr_key", "storage_type", + "storage_type_num_unknown_nodes", [](const int t) { return t == -1; }, common::DefaultStorageType, false, "dispatch_mode", DispatchMode::kVariable); diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index a86d5d92eb2c..98d87cec527b 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -143,7 +143,8 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub if (subgraph_set.count(e.get())) { uint32_t node_id = index.node_id(e.get()); auto helper_node = op::MakeNode("_FusedOpHelper", - subgraph_node->attrs.name + "_helper", + subgraph_node->attrs.name + "_" + + node->attrs.name + "_helper", nullptr, nullptr, nullptr); @@ -157,15 +158,33 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub }); DFSVisit(subgraph.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { auto it = node->control_deps.begin(); + static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); + std::vector new_control_deps; while (it != node->control_deps.end()) { if (subgraph_set.count(it->get())) { ++it; } else { - subgraph_node->control_deps.push_back(*it); - //it = node->control_deps.erase(it); + if ((*it)->is_variable() || !is_fusion.get((*it)->op(), false)) { + uint32_t node_id = subgraph_node->control_deps.size(); + subgraph_node->control_deps.push_back(*it); + auto helper_node = op::MakeNode("_FusedOpOutHelper", + subgraph_node->attrs.name + "_" + + node->attrs.name + "_outhelper", + nullptr, + nullptr, + nullptr); + helper_node->attrs.parsed = + FusedOpHelperParamPtr(new FusedOpHelperParam( + nnvm::get(subgraph_node->attrs.parsed), + node_id)); + new_control_deps.push_back(helper_node); + } else { + new_control_deps.push_back(*it); + } ++it; } 
} + node->control_deps = new_control_deps; }); } Graph new_graph; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 8083fe23727f..e95e40e6647f 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -69,8 +69,13 @@ FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { this->GenerateCode(); } +nnvm::Graph FusedOp::GetGraphWithoutControlDeps(nnvm::Graph &old) { + return old; +} + void FusedOp::GenerateCode() { - const auto& g = this->symbol_.indexed_graph(); + const auto& codegen_graph = GetGraphWithoutControlDeps(this->symbol_); + const auto& g = codegen_graph.indexed_graph(); std::string code = ""; int temp_name_counter = 0; using NodeEntry = nnvm::IndexedGraph::NodeEntry; @@ -527,7 +532,30 @@ bool FusedOpInferType(const nnvm::NodeAttrs& attrs, return op->InferType(attrs, in_attrs, out_attrs); } +void FusedOpProvideShape(const nnvm::NodeAttrs& attrs, + const std::vector> &in_attrs, + const std::vector> &out_attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + op->ProvideShape(in_attrs, out_attrs); +} + +void FusedOpProvideType(const nnvm::NodeAttrs& attrs, + const std::vector> &in_attrs, + const std::vector> &out_attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + op->ProvideType(in_attrs, out_attrs); +} + +void FusedOpProvideStorageType(const nnvm::NodeAttrs& attrs, + const std::vector> &in_attrs, + const std::vector> &out_attrs) {} + + NNVM_REGISTER_OP(FusedOp) +.set_attr("TIsFusion", true) +.set_attr("FProvideSubgraphShape", FusedOpProvideShape) +.set_attr("FProvideSubgraphType", FusedOpProvideType) +.set_attr("FProvideSubgraphStorageType", FusedOpProvideStorageType) .set_attr("FInferShape", FusedOpInferShape) .set_attr("FInferType", FusedOpInferType) .set_attr("FCompute", FusedOpForwardGPU); @@ -551,8 +579,33 @@ FusedOpHelperType(const NodeAttrs& attrs) { NNVM_REGISTER_OP(_FusedOpHelper) .set_num_inputs(0) .set_num_outputs(0) -.set_attr("TIsFusion", true) 
+.set_attr("TIsGhost", true) +.set_attr("TIsFusionHelper", true) .set_attr("FAccessSubgraphShape", FusedOpHelperShape) .set_attr("FAccessSubgraphType", FusedOpHelperType); + +std::pair, std::vector> +FusedOpOutHelperShape(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAuxShape(node_id); +} + +std::pair, std::vector> +FusedOpOutHelperType(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAuxType(node_id); +} + +NNVM_REGISTER_OP(_FusedOpOutHelper) +.set_num_inputs(0) +.set_num_outputs(0) +.set_attr("TIsGhost", true) +.set_attr("TIsFusionHelper", true) +.set_attr("FAccessSubgraphShape", FusedOpOutHelperShape) +.set_attr("FAccessSubgraphType", FusedOpOutHelperType); } // namespace mxnet diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 6f4fe3500170..3ea091e78430 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -80,14 +80,42 @@ class FusedOp { std::pair, std::vector> GetAttrs(const std::string& attr_name, const uint32_t node_id); + void ProvideShape(const std::vector> &in_attrs, + const std::vector> &out_attrs) { + aux_in_shapes = in_attrs; + aux_out_shapes = out_attrs; + } + + void ProvideType(const std::vector> &in_attrs, + const std::vector> &out_attrs) { + aux_in_types = in_attrs; + aux_out_types = out_attrs; + } + + std::pair, std::vector> + GetAuxShape(const int node_id) const { + return {aux_in_shapes[node_id], aux_out_shapes[node_id]}; + } + + std::pair, std::vector> GetAuxType(const int node_id) const { + return {aux_in_types[node_id], aux_out_types[node_id]}; + } + private: void GenerateCode(); + nnvm::Graph GetGraphWithoutControlDeps(nnvm::Graph &old); std::vector inputs_; std::vector outputs_; std::string code_; nnvm::Graph symbol_; + + std::vector> aux_in_shapes; + std::vector> aux_out_shapes; + 
std::vector> aux_in_types; + std::vector> aux_out_types; + std::string ptx_; std::string kernel_name_; bool initialized_; diff --git a/tvm.patch b/tvm.patch index c18edf864c77..77841544b985 100644 --- a/tvm.patch +++ b/tvm.patch @@ -6,7 +6,7 @@ index b8bcae7..af7bb4a 100644 DFSVisit(g.outputs, [this, &inputs_rptr, &control_rptr, &subgraphs] (const NodePtr& n) { -+ const auto& is_fusion = Op::GetAttr("TIsFusion"); ++ const auto& is_fusion = Op::GetAttr("TIsGhost"); + if (!n->is_variable() && is_fusion.get(n->op(), false)) return; CHECK_LT(nodes_.size(), std::numeric_limits::max()); uint32_t nid = static_cast(nodes_.size()); From f30fbbb02e831f01ea0bde7eb798070ac5e69a4c Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Tue, 4 Jun 2019 11:55:45 -0400 Subject: [PATCH 013/105] use subgraph.indexed_graph() instead of main for _FusedOpHelper nodes node_id, invert control_deps loop to modify topology of subgraph before calling its indexed_graph(), check that all node of the first DFSVisit are actually in the subgraph --- src/executor/pointwise_fusion_pass.cc | 82 +++++++++++++++------------ src/operator/fusion/fused_op.cu | 2 +- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 98d87cec527b..ea550b01f221 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -60,7 +60,7 @@ namespace { return false; } - nnvm::NodePtr CreateSubgraphNode(const Graph& subgraph) { + nnvm::NodePtr CreateSubgraphNode(const Graph& subgraph, size_t inputs_size) { nnvm::Symbol subgraph_sym; auto node = nnvm::Node::Create(); subgraph_sym.outputs = subgraph.outputs; @@ -82,8 +82,7 @@ namespace { params_names.pop_back(); //node->attrs.dict["subgraph_params_names"] = params_names; node->attrs.dict["symbol_json"] = nnvm::pass::SaveJSON(subgraph); - node->attrs.dict["num_inputs"] = - std::to_string(subgraph.indexed_graph().input_nodes().size()); + 
node->attrs.dict["num_inputs"] = std::to_string(inputs_size); node->attrs.dict["num_outputs"] = std::to_string(subgraph.outputs.size()); node->attrs.op = Op::Get("FusedOp"); node->op()->attr_parser(&(node->attrs)); @@ -110,7 +109,7 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub // and it have to be agnostic to the node from which it's an output // (For exemple even if two inputs are two different outputs from the same node) auto inputs = GetSubgraphInputs(subgraph, subgraph_set); - auto subgraph_node = create_subgraph_node(subgraph); + auto subgraph_node = create_subgraph_node(subgraph, inputs.size()); subgraph_node->inputs = inputs; // replug inputs of node out of subgraph to be output of the subgraph node // if it was a node in the subgraph @@ -137,7 +136,50 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub } // move control dependencies between nodes of the subgraph and out of the subgraph // to a dependencies between the subgraph node and the nodes out of the subgraph - const auto& index = g.indexed_graph(); + DFSVisit(subgraph.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { + std::cout << "Visiting node " << node->attrs.name << std::endl; + if (subgraph_set.count(node.get())) { + std::cout << "It is in the set!" << std::endl; + std::cout << "It has " << node->control_deps.size() << " control deps!" << std::endl; + auto it = node->control_deps.begin(); + static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); + std::vector new_control_deps; + while (it != node->control_deps.end()) { + if (subgraph_set.count(it->get())) { + std::cout << "Control dep " << it->get()->attrs.name << " in the subgraph!" 
<< std::endl; + ++it; + } else { + if ((*it)->is_variable() || !is_fusion.get((*it)->op(), false)) { + std::cout << "Control dep " << it->get()->attrs.name << " not variable nor fusion" << std::endl; + uint32_t node_id = subgraph_node->control_deps.size(); + subgraph_node->control_deps.push_back(*it); + auto helper_node = op::MakeNode("_FusedOpOutHelper", + subgraph_node->attrs.name + "_" + + node->attrs.name + "_outhelper", + nullptr, + nullptr, + nullptr); + helper_node->attrs.parsed = + FusedOpHelperParamPtr(new FusedOpHelperParam( + nnvm::get(subgraph_node->attrs.parsed), + node_id)); + new_control_deps.push_back(helper_node); + } else { + std::cout << "Control dep " << it->get()->attrs.name << " variable or fusion" << std::endl; + new_control_deps.push_back(*it); + } + ++it; + } + } + node->control_deps = new_control_deps; + std::cout << "New control deps size: " << node->control_deps.size() << std::endl; + for (const auto& d : node->control_deps) { + std::cout << "Control dep: " << d->attrs.name << std::endl; + } + } + }); + + const auto& index = subgraph.indexed_graph(); DFSVisit(g.outputs, [&subgraph_node, &subgraph_set, &index](const nnvm::NodePtr& node) { for (auto &e : node->control_deps) { if (subgraph_set.count(e.get())) { @@ -156,36 +198,6 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub } } }); - DFSVisit(subgraph.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { - auto it = node->control_deps.begin(); - static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); - std::vector new_control_deps; - while (it != node->control_deps.end()) { - if (subgraph_set.count(it->get())) { - ++it; - } else { - if ((*it)->is_variable() || !is_fusion.get((*it)->op(), false)) { - uint32_t node_id = subgraph_node->control_deps.size(); - subgraph_node->control_deps.push_back(*it); - auto helper_node = op::MakeNode("_FusedOpOutHelper", - subgraph_node->attrs.name + "_" - + node->attrs.name + "_outhelper", - nullptr, - nullptr, - 
nullptr); - helper_node->attrs.parsed = - FusedOpHelperParamPtr(new FusedOpHelperParam( - nnvm::get(subgraph_node->attrs.parsed), - node_id)); - new_control_deps.push_back(helper_node); - } else { - new_control_deps.push_back(*it); - } - ++it; - } - } - node->control_deps = new_control_deps; - }); } Graph new_graph; new_graph.outputs = g.outputs; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index e95e40e6647f..2206fbfbf931 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -66,7 +66,6 @@ FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { this->cc_major_ = -1; this->cc_minor_ = -1; - this->GenerateCode(); } nnvm::Graph FusedOp::GetGraphWithoutControlDeps(nnvm::Graph &old) { @@ -246,6 +245,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, this->cc_minor_ = cc_minor; if (!initialized_) { + this->GenerateCode(); LOG(INFO) << code_; std::string aux_code = ""; std::string kernel_params = ""; From 1a2e30d0e18d49364480e232ee58f1e8e04c2db2 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 4 Jun 2019 11:51:13 -0700 Subject: [PATCH 014/105] Adding support for other reqs in codegen --- src/operator/fusion/fused_op-inl.h | 11 +++++++++++ src/operator/fusion/fused_op.cu | 15 ++++++++++++--- src/operator/fusion/fused_op.h | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index aa9a1b912edc..ce3ec3095610 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -261,6 +261,17 @@ inline void store(const float value, int i, half * output) { output[i] = __float2half(value); } +template +inline void storeadd(const typename LoadType::Type value, int i, DType * output) { + output[i] += value; +} + +template <> +inline void storeadd(const float value, int i, half * output) { + const auto previous = load(output, i); + output[i] = __float2half(value + 
previous); +} + template inline DType identity(const DType val) { return val; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 2206fbfbf931..0f8698bcc029 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -72,7 +72,7 @@ nnvm::Graph FusedOp::GetGraphWithoutControlDeps(nnvm::Graph &old) { return old; } -void FusedOp::GenerateCode() { +void FusedOp::GenerateCode(const std::vector &req) { const auto& codegen_graph = GetGraphWithoutControlDeps(this->symbol_); const auto& g = codegen_graph.indexed_graph(); std::string code = ""; @@ -199,7 +199,13 @@ void FusedOp::GenerateCode() { int counter = 0; for (const auto& entry : g.outputs()) { const std::string& var = variables[{entry.node_id, entry.index}]; - code += "store(" + var + ", i, output" + std::to_string(counter) + ");\n"; + if (req[counter] == kWriteTo || req[counter] == kWriteInplace) { + code += "store(" + var + ", i, output" + std::to_string(counter) + ");\n"; + } else if (req[counter] == kAddTo) { + code += "storeadd(" + var + ", i, output" + std::to_string(counter) + ");\n"; + } else { + LOG(FATAL) << "Encountered unexpected req"; + } ++counter; } @@ -245,8 +251,11 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, this->cc_minor_ = cc_minor; if (!initialized_) { - this->GenerateCode(); + this->GenerateCode(req); LOG(INFO) << code_; + for (const auto &r : req) { + std::cout << r << std::endl; + } std::string aux_code = ""; std::string kernel_params = ""; nnvm::Symbol sym; diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 3ea091e78430..7c14f1fadecc 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -102,7 +102,7 @@ class FusedOp { } private: - void GenerateCode(); + void GenerateCode(const std::vector &req); nnvm::Graph GetGraphWithoutControlDeps(nnvm::Graph &old); std::vector inputs_; From 15fbed5c5096dea06a6997a4b2ef322847d8ae3e Mon Sep 17 00:00:00 2001 From: Przemek 
Tredak Date: Tue, 4 Jun 2019 12:54:27 -0700 Subject: [PATCH 015/105] Fix --- src/operator/fusion/fused_op-inl.h | 9 ++------- src/operator/fusion/fused_op.cu | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index ce3ec3095610..594c74cbfd7d 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -170,12 +170,6 @@ const std::map> fused_op_special_ops = { {"_hypot_scalar", {"hypot(%, %)", "_0", "scalar"}}, {"_backward_relu", {"backward_relu(%, %)", "_1", "_0"}}, {"_backward_sigmoid", {"backward_sigmoid(%, %)", "_1", "_0"}}, - {"_backward_Activation", {"((% == " + std::to_string(mxnet::op::activation::kReLU) + - " || % == " + std::to_string(mxnet::op::activation::kSigmoid) + - " || % == " + std::to_string(mxnet::op::activation::kTanh) + - ") ? backward_%(%, %) : backward_%(%, %))", - "act_type", "act_type", "act_type", "act_type", - "_1", "_0", "_2", "_0"}}, {"_backward_expm1", {"backward_expm1(%, %)", "_1", "_0"}}, {"_backward_log", {"backward_log(%, %)", "_1", "_0"}}, {"_backward_log10", {"backward_log10(%, %)", "_1", "_0"}}, @@ -227,7 +221,8 @@ const std::map>> fused_op_mimo }; const std::vector fused_op_variable_io_ops = { - "add_n" + "add_n", + "_backward_Activation" }; const std::string fused_op_function_definitions = R"code( diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 0f8698bcc029..b39290f1d2f5 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -126,16 +126,20 @@ void FusedOp::GenerateCode(const std::vector &req) { if (detail::fused_op_special_ops.find(op_name) != detail::fused_op_special_ops.end()) { const std::vector& op_desc = detail::fused_op_special_ops.at(op_name); std::string fmt = op_desc[0]; + std::cout << "Generating for " << op_name << std::endl; for (size_t j = 1; j < op_desc.size(); ++j) { const std::string& desc = op_desc[j]; 
+ std::cout << "desc: " << desc << std::endl; std::string sub; if (desc[0] == '_') { // Argument int arg_id = std::stoi(desc.substr(1)); + std::cout << "arg_id: " << arg_id << std::endl; sub = variables[{node.inputs[arg_id].node_id, node.inputs[arg_id].index}]; } else { sub = source->attrs.dict.at(desc); } + std::cout << "sub: " << sub << std::endl; size_t pos = fmt.find("%"); CHECK_NE(pos, std::string::npos); fmt.replace(pos, 1, sub); @@ -189,6 +193,25 @@ void FusedOp::GenerateCode(const std::vector &req) { variables[{i,0}] = var_name; continue; } + + if (op_name == "_backward_Activation") { + CHECK_EQ(outputs[i], 1); + std::string act_type = node.source->attrs.dict.at("act_type"); + std::string rhs, lhs; + rhs = variables[{node.inputs[0].node_id, node.inputs[0].index}]; + if (act_type == "relu" || + act_type == "sigmoid" || + act_type == "tanh") { + lhs = variables[{node.inputs[1].node_id, node.inputs[1].index}]; + } else { + lhs = variables[{node.inputs[2].node_id, node.inputs[2].index}]; + } + code += "const auto " + var_name + " = backward_" + act_type + + "(" + lhs + ", " + rhs + ");\n"; + + variables[{i,0}] = var_name; + continue; + } LOG(FATAL) << "Unrecognized op " + op_name; } } else { From 506b1263654b14d988af048e2197c3868fe6dcfd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 4 Jun 2019 13:58:39 -0700 Subject: [PATCH 016/105] Cleaning --- src/executor/infer_graph_attr_pass.cc | 56 --------------------------- src/executor/pointwise_fusion_pass.cc | 17 +------- src/executor/simple_partition_pass.h | 9 ----- src/operator/fusion/fused_op.cc | 3 -- src/operator/fusion/fused_op.cu | 44 --------------------- 5 files changed, 2 insertions(+), 127 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 22d29910b8e8..19f75c7b6374 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -328,7 +328,6 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, 
out_attrs.push_back({}); auto &current_in_attrs = in_attrs.back(); auto &current_out_attrs = out_attrs.back(); - std::cout << "Control deps: " << dep_node->attrs.name << std::endl; uint32_t dep_node_id = idx.node_id(dep_node.get()); for (const auto& e : idx[dep_node_id].inputs) { current_in_attrs.push_back(rshape[idx.entry_id(e)]); @@ -404,41 +403,6 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, return ret; } - -inline void PrintFullGraph(const nnvm::Graph& g) { - const auto& index = g.indexed_graph(); - - for (size_t i = 0; i < index.num_nodes(); ++i) { - const auto& node = index[i]; - std::cout << "Node " << i << std::endl; - const auto* source = node.source; - if (source != nullptr) { - std::cout << source->attrs.name << std::endl; - if (source->is_variable()) { - std::cout << "Variable!" << std::endl; - } - std::cout << "Inputs: " << node.inputs.size() << std::endl; - for (size_t j = 0; j < node.inputs.size(); ++j) { - std::cout << node.inputs[j].node_id << " (" << - index[node.inputs[j].node_id].source->attrs.name << ") " << - node.inputs[j].index << ". Entry id: " << - index.entry_id(node.inputs[j]) << std::endl; - } - std::cout << "Outputs: " << source->num_outputs() << std::endl; - } else { - std::cout << "NULLPTR in source" << std::endl; - } - } - - std::cout << "Graph outputs" << std::endl; - for (const auto& entry : index.outputs()) { - std::cout << entry.node_id << " (" << - index[entry.node_id].source->attrs.name << ") " << - entry.index << ". Entry id: " << - index.entry_id(entry) << std::endl; - } -} - /*!\brief * This is a version of the InferAttr function specifically for shape inference. 
* @@ -574,11 +538,9 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, auto infer_step = [&](uint32_t nid, bool last_iter) { const auto& inode = idx[nid]; const std::string name = inode.source->attrs.name; - std::cout << "InferStep: " << nid << " " << name << std::endl; const uint32_t num_inputs = inode.inputs.size(); const uint32_t num_outputs = inode.source->num_outputs(); if (inode.source->is_variable()) { - std::cout << "InferStep: Variable!" << std::endl; // Variable node. No operator. Only one output entry. CHECK(inode.source->op() == nullptr); CHECK_EQ(num_outputs, 1U); @@ -599,7 +561,6 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } else if (is_backward.get(inode.source->op(), false) && inode.source->control_deps.size() && bwd_identity_assign) { - std::cout << "InferStep: Backward!" << std::endl; CHECK(dispatch_mode_name == nullptr) << "Backward inference for node attributes is not available"; CHECK_GE(inode.source->control_deps.size(), 1U) @@ -610,7 +571,6 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); if (!is_fusion.get(fwd_ptr->op(), false)) { const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; - LOG(INFO) << inode.source->attrs.name << ": No fusion!" << std::endl; // use gradient function to find out the correspondence. std::vector ograd(fwd_ptr->num_outputs()); for (size_t i = 0; i < ograd.size(); ++i) { @@ -652,7 +612,6 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } } else { - std::cout << inode.source->attrs.name << ": Fusion!" 
<< std::endl; static auto& finfer_fused_shape = Op::GetAttr("FAccessSubgraphShape"); auto finfer = finfer_fused_shape.get(fwd_ptr->op(), nullptr); CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << @@ -660,14 +619,6 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, const auto& inferred_attrs = finfer(fwd_ptr->attrs); const auto& input_attrs = inferred_attrs.first; const auto& output_attrs = inferred_attrs.second; - std::cout << "Input attrs: " << input_attrs.size() << std::endl; - for (const auto& attr : input_attrs) { - std::cout << attr << std::endl; - } - std::cout << "Output attrs: " << output_attrs.size() << std::endl; - for (const auto& attr : output_attrs) { - std::cout << attr << std::endl; - } CHECK(input_attrs.size() == inode.source->op()->num_outputs) << "Number of outputs of the gradient node " << inode.source->attrs.name << " does not match the number of inputs of the corresponding forward node"; @@ -693,7 +644,6 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } } else { - std::cout << "InferStep: Dispatch!" << std::endl; DispatchMode* dispatch_mode = nullptr; bool forward_known = true; // Forward operator inference. @@ -735,7 +685,6 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, out_attrs.push_back({}); auto &current_in_attrs = in_attrs.back(); auto &current_out_attrs = out_attrs.back(); - std::cout << "Control deps: " << dep_node->attrs.name << std::endl; uint32_t dep_node_id = idx.node_id(dep_node.get()); for (const auto& e : idx[dep_node_id].inputs) { current_in_attrs.push_back(rshape[idx.entry_id(e)]); @@ -788,12 +737,8 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } last_num_unknown = num_unknown; num_unknown = 0; - std::cout << "Will be checking entries." 
<< std::endl; - PrintFullGraph(ret); for (size_t j = entry_start; j < entry_end; ++j) { - std::cout << "Checking entry " << j << std::endl; if (fis_none(rshape[j])) { - std::cout << "Entry " << j << " is none: " << rshape[j] << std::endl; num_unknown += fnum_unknown(rshape[j]); } } @@ -821,7 +766,6 @@ nnvm::Graph InferShape(nnvm::Graph&& graph, mxnet::ShapeVector&& shape_inputs, const std::string& shape_attr_key) { using dmlc::any; - std::cout << "Calling InferShape!" << std::endl; if (shape_inputs.size() != 0) { graph.attrs["shape_inputs"] = std::make_shared(std::move(shape_inputs)); } diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index ea550b01f221..8d319c2f745b 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -42,7 +42,6 @@ namespace { if (n->op() == nullptr) return false; std::string op_name = n->op()->name; - std::cout << "Visiting " << op_name << std::endl; if (fused_op_binary_ops.count(op_name)) return true; if (fused_op_unary_ops.count(op_name)) @@ -56,7 +55,6 @@ namespace { op_name) != fused_op_variable_io_ops.end()) return true; - std::cout << "It was not in any list" << std::endl; return false; } @@ -96,7 +94,6 @@ namespace { template Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& subgraph_sets, FCreateNode create_subgraph_node) { - std::cout << "Fusion sets: " << subgraph_sets.size() << std::endl; for (auto subgraph_set : subgraph_sets) { // Create MXNet subgraph Graph subgraph; @@ -137,20 +134,15 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub // move control dependencies between nodes of the subgraph and out of the subgraph // to a dependencies between the subgraph node and the nodes out of the subgraph DFSVisit(subgraph.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { - std::cout << "Visiting node " << node->attrs.name << std::endl; if (subgraph_set.count(node.get())) { - std::cout << "It is in the 
set!" << std::endl; - std::cout << "It has " << node->control_deps.size() << " control deps!" << std::endl; auto it = node->control_deps.begin(); static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); std::vector new_control_deps; while (it != node->control_deps.end()) { if (subgraph_set.count(it->get())) { - std::cout << "Control dep " << it->get()->attrs.name << " in the subgraph!" << std::endl; - ++it; + new_control_deps.push_back(*it); } else { if ((*it)->is_variable() || !is_fusion.get((*it)->op(), false)) { - std::cout << "Control dep " << it->get()->attrs.name << " not variable nor fusion" << std::endl; uint32_t node_id = subgraph_node->control_deps.size(); subgraph_node->control_deps.push_back(*it); auto helper_node = op::MakeNode("_FusedOpOutHelper", @@ -165,17 +157,12 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub node_id)); new_control_deps.push_back(helper_node); } else { - std::cout << "Control dep " << it->get()->attrs.name << " variable or fusion" << std::endl; new_control_deps.push_back(*it); } - ++it; } + ++it; } node->control_deps = new_control_deps; - std::cout << "New control deps size: " << node->control_deps.size() << std::endl; - for (const auto& d : node->control_deps) { - std::cout << "Control dep: " << d->attrs.name << std::endl; - } } }); diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 1dc82933fa2e..54adae0a9f3d 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -83,7 +83,6 @@ class BidirectionalGraph { std::unordered_set all_set(nodes.size()); std::vector separation_sets; for (Node& node : nodes) { - std::cout << "Looking at " << node.nnvmptr->attrs.name << std::endl; if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); std::unordered_set in_graph; @@ -131,13 +130,10 @@ class BidirectionalGraph { std::deque stack(outputs.begin(), outputs.end()); while (!stack.empty()) { Node* vertex = stack.front(); - std::cout << 
"Checking " << vertex->nnvmptr->attrs.name << std::endl; stack.pop_front(); if (!visited.count(vertex)) { - std::cout << "Not visited!" << std::endl; visited.insert(vertex); if (unused_set.count(vertex)) { - std::cout << "Adding to subgraphs!" << std::endl; subgraphs.emplace_back(naive_grow_subgraph(vertex, &unused_set, &incomp_map)); } for (Node* input : vertex->inputs) { @@ -181,16 +177,11 @@ class BidirectionalGraph { std::unordered_set incomp_set; std::deque stack; stack.emplace_back(head); - std::cout << "naive grow subgraph" << std::endl; while (!stack.empty()) { Node* vertex = stack.back(); - std::cout << "Naive sees " << vertex->nnvmptr->attrs.name << std::endl; stack.pop_back(); - std::cout << "Unused: " << unused_set->count(vertex) << std::endl; - std::cout << "Compatible: " << !incomp_set.count(vertex) << std::endl; if (unused_set->count(vertex) && !incomp_set.count(vertex)) { unused_set->erase(vertex); - std::cout << "Put into subgraph!" << std::endl; subgraph.insert(vertex); incomp_set.insert((*incomp_map)[vertex].begin(), (*incomp_map)[vertex].end()); for (Node* input : vertex->inputs) { diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 80a39c07bc42..f1e85c614439 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -26,7 +26,6 @@ namespace mxnet { DMLC_REGISTER_PARAMETER(FusedOpConfig); void FusedOpParamParser(nnvm::NodeAttrs* attrs) { - std::cout << "Parser!" 
<< std::endl; FusedOpConfig param; try { param.Init(attrs->dict); @@ -42,9 +41,7 @@ void FusedOpParamParser(nnvm::NodeAttrs* attrs) { throw dmlc::ParamError(os.str()); } CHECK(!param.symbol_json.empty()); - std::cout << "JSON: " << param.symbol_json << std::endl; attrs->parsed = FusedOpPtr(new FusedOp(attrs, param)); - std::cout << "Empty: " << attrs->parsed.empty() < &req) { if (detail::fused_op_special_ops.find(op_name) != detail::fused_op_special_ops.end()) { const std::vector& op_desc = detail::fused_op_special_ops.at(op_name); std::string fmt = op_desc[0]; - std::cout << "Generating for " << op_name << std::endl; for (size_t j = 1; j < op_desc.size(); ++j) { const std::string& desc = op_desc[j]; - std::cout << "desc: " << desc << std::endl; std::string sub; if (desc[0] == '_') { // Argument int arg_id = std::stoi(desc.substr(1)); - std::cout << "arg_id: " << arg_id << std::endl; sub = variables[{node.inputs[arg_id].node_id, node.inputs[arg_id].index}]; } else { sub = source->attrs.dict.at(desc); } - std::cout << "sub: " << sub << std::endl; size_t pos = fmt.find("%"); CHECK_NE(pos, std::string::npos); fmt.replace(pos, 1, sub); @@ -276,9 +272,6 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, if (!initialized_) { this->GenerateCode(req); LOG(INFO) << code_; - for (const auto &r : req) { - std::cout << r << std::endl; - } std::string aux_code = ""; std::string kernel_params = ""; nnvm::Symbol sym; @@ -399,51 +392,14 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, &(args[0]), 0)); // arguments } -inline void PrintFullGraph(const nnvm::Graph& g) { - const auto& index = g.indexed_graph(); - - for (size_t i = 0; i < index.num_nodes(); ++i) { - const auto& node = index[i]; - std::cout << "Node " << i << std::endl; - const auto* source = node.source; - if (source != nullptr) { - std::cout << source->attrs.name << std::endl; - if (source->is_variable()) { - std::cout << "Variable!" 
<< std::endl; - } - std::cout << "Inputs: " << node.inputs.size() << std::endl; - for (size_t j = 0; j < node.inputs.size(); ++j) { - std::cout << node.inputs[j].node_id << " (" << - index[node.inputs[j].node_id].source->attrs.name << ") " << - node.inputs[j].index << ". Entry id: " << - index.entry_id(node.inputs[j]) << std::endl; - } - std::cout << "Outputs: " << source->num_outputs() << std::endl; - } else { - std::cout << "NULLPTR in source" << std::endl; - } - } - - std::cout << "Graph outputs" << std::endl; - for (const auto& entry : index.outputs()) { - std::cout << entry.node_id << " (" << - index[entry.node_id].source->attrs.name << ") " << - entry.index << ". Entry id: " << - index.entry_id(entry) << std::endl; - } -} - template <> bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { std::vector input_shapes(*in_attrs); - std::cout << "InferShape in FusedOp! " << attrs.name << std::endl; - PrintFullGraph(this->symbol_); this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), std::move(input_shapes), "__shape__"); - std::cout << "End of infershape in FusedOp " << attrs.name << std::endl; const auto& g = this->symbol_.indexed_graph(); const auto& input_nids = g.input_nodes(); From cf88753a69a74753b543df157965dfef28f1a8ec Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 4 Jun 2019 15:27:38 -0700 Subject: [PATCH 017/105] Change the TVM submodule --- .gitmodules | 2 +- 3rdparty/tvm | 2 +- src/operator/fusion/fused_op.cu | 4 ++-- tvm.patch | 21 --------------------- 4 files changed, 4 insertions(+), 25 deletions(-) delete mode 100644 tvm.patch diff --git a/.gitmodules b/.gitmodules index e0ffec11bfd0..19aab0a8452a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,7 +22,7 @@ branch = master [submodule "3rdparty/tvm"] path = 3rdparty/tvm - url = https://github.com/dmlc/tvm + url = https://github.com/ptredak/tvm [submodule "3rdparty/onnx-tensorrt"] path = 3rdparty/onnx-tensorrt url = 
https://github.com/onnx/onnx-tensorrt.git diff --git a/3rdparty/tvm b/3rdparty/tvm index 21935dcbf56a..efb4063d8702 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 21935dcbf56ad3bd66ebff9891a6bc3865b8106d +Subproject commit efb4063d8702d27b969e1e59a13520ec1c08737c diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 99b720122406..6e1aa1e978db 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -567,7 +567,7 @@ FusedOpHelperType(const NodeAttrs& attrs) { NNVM_REGISTER_OP(_FusedOpHelper) .set_num_inputs(0) .set_num_outputs(0) -.set_attr("TIsGhost", true) +.set_attr("TIsGhost", true) .set_attr("TIsFusionHelper", true) .set_attr("FAccessSubgraphShape", FusedOpHelperShape) .set_attr("FAccessSubgraphType", FusedOpHelperType); @@ -592,7 +592,7 @@ FusedOpOutHelperType(const NodeAttrs& attrs) { NNVM_REGISTER_OP(_FusedOpOutHelper) .set_num_inputs(0) .set_num_outputs(0) -.set_attr("TIsGhost", true) +.set_attr("TIsGhost", true) .set_attr("TIsFusionHelper", true) .set_attr("FAccessSubgraphShape", FusedOpOutHelperShape) .set_attr("FAccessSubgraphType", FusedOpOutHelperType); diff --git a/tvm.patch b/tvm.patch deleted file mode 100644 index 77841544b985..000000000000 --- a/tvm.patch +++ /dev/null @@ -1,21 +0,0 @@ -diff --git nnvm/src/core/graph.cc nnvm/src/core/graph.cc -index b8bcae7..af7bb4a 100644 ---- nnvm/src/core/graph.cc -+++ nnvm/src/core/graph.cc -@@ -57,6 +57,8 @@ IndexedGraph::IndexedGraph(const Graph &g) { - - DFSVisit(g.outputs, [this, &inputs_rptr, &control_rptr, &subgraphs] - (const NodePtr& n) { -+ const auto& is_fusion = Op::GetAttr("TIsGhost"); -+ if (!n->is_variable() && is_fusion.get(n->op(), false)) return; - CHECK_LT(nodes_.size(), std::numeric_limits::max()); - uint32_t nid = static_cast(nodes_.size()); - for (const auto &subgraph : n->attrs.subgraphs) -@@ -83,6 +85,7 @@ IndexedGraph::IndexedGraph(const Graph &g) { - inputs_rptr.push_back(input_entries_.size()); - 
// control deps - for (const auto& nptr : n->control_deps) { -+ if (!nptr->is_variable() && is_fusion.get(nptr->op(), false)) continue; - auto it = node2index_.find(nptr.get()); - CHECK(it != node2index_.end() && it->first == nptr.get()); - control_deps_.push_back(it->second); From b861af9c610f24a0ee9085466444ea868238d314 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 4 Jun 2019 15:30:20 -0700 Subject: [PATCH 018/105] More cleaning --- .gitmodules | 2 +- src/operator/fusion/fused_op.cu | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.gitmodules b/.gitmodules index 19aab0a8452a..7f7522a68e67 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,7 +22,7 @@ branch = master [submodule "3rdparty/tvm"] path = 3rdparty/tvm - url = https://github.com/ptredak/tvm + url = https://github.com/ptrendx/tvm [submodule "3rdparty/onnx-tensorrt"] path = 3rdparty/onnx-tensorrt url = https://github.com/onnx/onnx-tensorrt.git diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 6e1aa1e978db..b2b57848ffc9 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -277,9 +277,6 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, nnvm::Symbol sym; sym.outputs = this->symbol_.outputs; const std::vector input_names = sym.ListInputNames(nnvm::Symbol::kAll); - for (const auto& name : input_names) { - LOG(INFO) << name; - } size_t num_params = in_dtypes.size() + out_dtypes.size(); size_t i = 0; for (const auto &type : in_dtypes) { @@ -567,7 +564,7 @@ FusedOpHelperType(const NodeAttrs& attrs) { NNVM_REGISTER_OP(_FusedOpHelper) .set_num_inputs(0) .set_num_outputs(0) -.set_attr("TIsGhost", true) +.set_attr("TIsGhost", true) .set_attr("TIsFusionHelper", true) .set_attr("FAccessSubgraphShape", FusedOpHelperShape) .set_attr("FAccessSubgraphType", FusedOpHelperType); @@ -592,7 +589,7 @@ FusedOpOutHelperType(const NodeAttrs& attrs) { NNVM_REGISTER_OP(_FusedOpOutHelper) .set_num_inputs(0) .set_num_outputs(0) 
-.set_attr("TIsGhost", true) +.set_attr("TIsGhost", true) .set_attr("TIsFusionHelper", true) .set_attr("FAccessSubgraphShape", FusedOpOutHelperShape) .set_attr("FAccessSubgraphType", FusedOpOutHelperType); From d001b5d0ddedcf6e30e49ff347b69000e8c5ca51 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 5 Jun 2019 13:21:42 -0700 Subject: [PATCH 019/105] Making linter happy --- src/executor/exec_pass.h | 1 + src/executor/infer_graph_attr_pass.cc | 19 ++++++++++++++----- src/executor/pointwise_fusion_pass.cc | 9 ++++----- src/executor/simple_partition_pass.h | 14 ++++++++------ src/operator/fusion/fused_op.cu | 17 ++++++----------- src/operator/fusion/fused_op.h | 2 +- 6 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 3a7c04debce7..626557a8f968 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -34,6 +34,7 @@ #include #include #include +#include namespace mxnet { namespace exec { diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 19f75c7b6374..d2a9c18d2291 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -336,8 +336,12 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, current_out_attrs.push_back(rshape[idx.entry_id(dep_node_id, i)]); } } - auto provide = Op::GetAttr(provide_fusion_name).get(inode.source->op(), nullptr); - CHECK(provide != nullptr) << "Encountered Fusion operator that does not implement providing subgraph attr " << provide_fusion_name << "."; + auto provide = + Op::GetAttr(provide_fusion_name).get(inode.source->op(), + nullptr); + CHECK(provide != nullptr) << + "Encountered Fusion operator that does not implement providing subgraph attr " << + provide_fusion_name << "."; provide(inode.source->attrs, in_attrs, out_attrs); } forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, @@ -612,7 +616,8 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } } else { - 
static auto& finfer_fused_shape = Op::GetAttr("FAccessSubgraphShape"); + static auto& finfer_fused_shape = + Op::GetAttr("FAccessSubgraphShape"); auto finfer = finfer_fused_shape.get(fwd_ptr->op(), nullptr); CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << " is marked as Fusion but does not allow accessing attributes"; @@ -693,8 +698,12 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, current_out_attrs.push_back(rshape[idx.entry_id(dep_node_id, i)]); } } - auto provide = Op::GetAttr("FProvideSubgraphShape").get(inode.source->op(), nullptr); - CHECK(provide != nullptr) << "Encountered Fusion operator that does not implement providing subgraph shape."; + auto provide = + Op::GetAttr("FProvideSubgraphShape").get( + inode.source->op(), + nullptr); + CHECK(provide != nullptr) << + "Encountered Fusion operator that does not implement providing subgraph shape."; provide(inode.source->attrs, in_attrs, out_attrs); } forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 8d319c2f745b..0682dba9a213 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -23,12 +23,12 @@ * \brief * \author Clement Fuji Tsang */ -#include #include #include #include #include #include +#include #include "./simple_partition_pass.h" #include "../operator/fusion/fused_op-inl.h" #include "../operator/fusion/fused_op.h" @@ -78,7 +78,6 @@ namespace { } auto params_names = params_oss.str(); params_names.pop_back(); - //node->attrs.dict["subgraph_params_names"] = params_names; node->attrs.dict["symbol_json"] = nnvm::pass::SaveJSON(subgraph); node->attrs.dict["num_inputs"] = std::to_string(inputs_size); node->attrs.dict["num_outputs"] = std::to_string(subgraph.outputs.size()); @@ -86,7 +85,7 @@ namespace { node->op()->attr_parser(&(node->attrs)); return node; } -} +} // namespace /*! 
* \brief Replace a set of nodes by a subgraph node @@ -225,5 +224,5 @@ Graph FusePointwiseBackward(Graph &&g) { return ret; } -} -} +} // namespace exec +} // namespace mxnet diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 54adae0a9f3d..5b555d34ea19 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file simple_partition_pass.h - * \brief + * \brief * \author Clement Fuji Tsang */ #ifndef MXNET_EXECUTOR_SIMPLE_PARTITION_PASS_H_ @@ -30,6 +30,10 @@ #include #include #include +#include +#include +#include +#include #include "exec_pass.h" @@ -215,8 +219,6 @@ using NodeRawPtrSet = std::unordered_set; * \return a map between the node in the main graph and the output index of the subgraph node */ nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_set) { - //std::vector outputs; - //NodeEntrySet _outputs; nnvm::NodeEntryMap outputs; uint32_t count = 0; for (auto& e : g.outputs) { @@ -233,7 +235,6 @@ nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_ } } }); - //outputs.insert(outputs.begin(), _outputs.begin(), _outputs.end()); return outputs; } @@ -345,7 +346,7 @@ Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_set DFSVisit(g.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { for (auto &e : node->control_deps) { if (subgraph_set.count(e.get())) - e = subgraph_node; + e = subgraph_node; } }); DFSVisit(subgraph.outputs, [&subgraph_node, &subgraph_set](const nnvm::NodePtr& node) { @@ -368,7 +369,8 @@ Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_set template std::vector GetCompatibleSubsets(const Graph& g, FCompatible is_compatible) { BidirectionalGraph biG = BidirectionalGraph(g); - std::vector> subsets = biG.get_subsets(is_compatible); + std::vector> subsets = + biG.get_subsets(is_compatible); std::vector nnvm_subsets; 
nnvm_subsets.reserve(subsets.size()); for (auto& subset : subsets) { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index b2b57848ffc9..02b97ea7462e 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -17,9 +17,9 @@ * under the License. */ -#include #include #include +#include #include "./fused_op.h" #include "./fused_op-inl.h" #include "../operator_common.h" @@ -59,7 +59,6 @@ inline std::string mshadowTypeToString(int type) { FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { this->inputs_ = std::vector(config.num_inputs); this->outputs_ = std::vector(config.num_outputs); - //this->symbol_ = nnvm::pass::LoadJSON(config.symbol_json); this->symbol_ = nnvm::Graph(); this->symbol_.outputs = attrs->subgraphs[0]->outputs; this->initialized_ = false; @@ -68,13 +67,8 @@ FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { } -nnvm::Graph FusedOp::GetGraphWithoutControlDeps(nnvm::Graph &old) { - return old; -} - void FusedOp::GenerateCode(const std::vector &req) { - const auto& codegen_graph = GetGraphWithoutControlDeps(this->symbol_); - const auto& g = codegen_graph.indexed_graph(); + const auto& g = this->symbol_.indexed_graph(); std::string code = ""; int temp_name_counter = 0; using NodeEntry = nnvm::IndexedGraph::NodeEntry; @@ -186,7 +180,7 @@ void FusedOp::GenerateCode(const std::vector &req) { const auto& temp_arg = variables[{node.inputs[inp].node_id, node.inputs[inp].index}]; code += var_name + " = add(" + var_name + ", " + temp_arg + ");\n"; } - variables[{i,0}] = var_name; + variables[{i, 0}] = var_name; continue; } @@ -205,7 +199,7 @@ void FusedOp::GenerateCode(const std::vector &req) { code += "const auto " + var_name + " = backward_" + act_type + "(" + lhs + ", " + rhs + ");\n"; - variables[{i,0}] = var_name; + variables[{i, 0}] = var_name; continue; } LOG(FATAL) << "Unrecognized op " + op_name; @@ -540,7 +534,8 @@ NNVM_REGISTER_OP(FusedOp) 
.set_attr("TIsFusion", true) .set_attr("FProvideSubgraphShape", FusedOpProvideShape) .set_attr("FProvideSubgraphType", FusedOpProvideType) -.set_attr("FProvideSubgraphStorageType", FusedOpProvideStorageType) +.set_attr("FProvideSubgraphStorageType", + FusedOpProvideStorageType) .set_attr("FInferShape", FusedOpInferShape) .set_attr("FInferType", FusedOpInferType) .set_attr("FCompute", FusedOpForwardGPU); diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 7c14f1fadecc..d08b9c0fbc68 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -24,6 +24,7 @@ #include #include #include +#include namespace mxnet { @@ -103,7 +104,6 @@ class FusedOp { private: void GenerateCode(const std::vector &req); - nnvm::Graph GetGraphWithoutControlDeps(nnvm::Graph &old); std::vector inputs_; std::vector outputs_; From 48f1b945ab665bc2044774d803bb1a4c9bd610a4 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 5 Jun 2019 13:50:09 -0700 Subject: [PATCH 020/105] Do fusion only if default context is GPU --- src/executor/graph_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 916654282c46..7a48067ba90e 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -983,7 +983,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, // setup gradient nnvm::Graph g = InitFullGraph(symbol, grad_req_types); - if (dmlc::GetEnv("MXNET_USE_FUSION", true)) { + if (dmlc::GetEnv("MXNET_USE_FUSION", true) && default_ctx.dev_mask() == Context::kGPU) { g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); g = FusePointwiseForward(std::move(g)); g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); From 37d4bbf7eb7d7f2c3407907fcb6b5ff63d6a20ae Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 5 Jun 2019 15:03:12 -0700 Subject: [PATCH 021/105] Fixes for tests Add powerscalar and 
rpowerscalar, fix return type of zero and one Cleaning, fixing lint Go back to proper TVM submodule --- .gitmodules | 2 +- 3rdparty/ps-lite | 2 +- src/executor/pointwise_fusion_pass.cc | 3 +- src/executor/simple_partition_pass.h | 2 +- src/operator/fusion/fused_op-inl.h | 44 ++++++++++++++++----------- src/operator/fusion/fused_op.cc | 1 - src/operator/fusion/fused_op.cu | 9 +++--- src/operator/fusion/fused_op.h | 3 -- 8 files changed, 35 insertions(+), 31 deletions(-) diff --git a/.gitmodules b/.gitmodules index 7f7522a68e67..e0ffec11bfd0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,7 +22,7 @@ branch = master [submodule "3rdparty/tvm"] path = 3rdparty/tvm - url = https://github.com/ptrendx/tvm + url = https://github.com/dmlc/tvm [submodule "3rdparty/onnx-tensorrt"] path = 3rdparty/onnx-tensorrt url = https://github.com/onnx/onnx-tensorrt.git diff --git a/3rdparty/ps-lite b/3rdparty/ps-lite index f45e2e78a743..8a763892a973 160000 --- a/3rdparty/ps-lite +++ b/3rdparty/ps-lite @@ -1 +1 @@ -Subproject commit f45e2e78a7430be09f76264d2f4073fb2b1d54a2 +Subproject commit 8a763892a973afc1acd3d4b469d05bb338a83a6e diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 0682dba9a213..3cf0dacfcb35 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -67,7 +67,7 @@ namespace { // the name of the new node will be the concatenation of all the node names in the subgraph DFSVisit(subgraph.outputs, [&name_oss](const nnvm::NodePtr n) { if (n->op() != nullptr) - name_oss << n->attrs.name << "_"; + name_oss << n->op()->name << "_"; }); auto subgraph_name = name_oss.str(); subgraph_name.pop_back(); @@ -78,7 +78,6 @@ namespace { } auto params_names = params_oss.str(); params_names.pop_back(); - node->attrs.dict["symbol_json"] = nnvm::pass::SaveJSON(subgraph); node->attrs.dict["num_inputs"] = std::to_string(inputs_size); node->attrs.dict["num_outputs"] = std::to_string(subgraph.outputs.size()); 
node->attrs.op = Op::Get("FusedOp"); diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 5b555d34ea19..3fac35e319e4 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -253,7 +253,7 @@ std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_s e = entry_map[e]; } else { auto new_node = nnvm::Node::Create(); - new_node->attrs.name = e.node->attrs.name + std::to_string(e.index); + new_node->attrs.name = "input_" + std::to_string(inputs.size()); entry_map.insert({e, nnvm::NodeEntry{new_node, 0, 0}}); inputs.push_back(e); e.node = new_node; diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 594c74cbfd7d..82d2d5807966 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -29,7 +29,7 @@ namespace mxnet { namespace detail { -const std::string fp16_support_string = R"code( +const char fp16_support_string[] = R"code( #define __HALF_TO_US(var) *(reinterpret_cast(&(var))) #define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) #if defined(__cplusplus) @@ -58,7 +58,7 @@ const std::string fp16_support_string = R"code( typedef __half half; )code"; -const std::string type_support_string = R"code( +const char type_support_string[] = R"code( using float32 = float; using float64 = double; using float16 = half; @@ -83,8 +83,8 @@ const std::map fused_op_binary_ops = { {"elemwise_div", "div"}, {"_div" , "div"}, {"_Div" , "div"}, - {"_Power" , "pow"}, - {"_power" , "pow"}, + {"_Power" , "power"}, + {"_power" , "power"}, {"_Maximum" , "max"}, {"_maximum" , "max"}, {"_Minimum" , "min"}, @@ -158,6 +158,10 @@ const std::map> fused_op_special_ops = { {"_div_scalar", {"div(%, %)", "_0", "scalar"}}, {"_DivScalar", {"div(%, %)", "_0", "scalar"}}, {"_rdiv_scalar", {"rdiv(%, %)", "_0", "scalar"}}, + {"_power_scalar", {"power(%, %)", "_0", "scalar"}}, + {"_PowerScalar", {"power(%, %)", "_0", "scalar"}}, + {"_rpower_scalar", 
{"rpow(%, %)", "_0", "scalar"}}, + {"_RPowerScalar", {"rpow(%, %)", "_0", "scalar"}}, {"_RDivScalar", {"rdiv(%, %)", "_0", "scalar"}}, {"Cast", {"cast<%>(%)", "dtype", "_0"}}, {"cast", {"cast<%>(%)", "dtype", "_0"}}, @@ -195,7 +199,8 @@ const std::map> fused_op_special_ops = { {"_backward_div_scalar", {"(% / %)", "_0", "scalar"}}, {"_backward_div_scalar", {"(% / %)", "_0", "scalar"}}, {"_backward_rdiv_scalar", {"(-% * % / (% * %))", "_0", "scalar", "_1", "_1"}}, - {"_backward_hypot_scalar", {"(% * % / hypot(%, %))", "_0", "_1", "_1", "scalar"}} + {"_backward_hypot_scalar", {"(% * % / hypot(%, %))", "_0", "_1", "_1", "scalar"}}, + {"_backward_radians", {"radians(%)", "_0"}} // TODO(ptredak): arange }; @@ -209,13 +214,13 @@ const std::map>> fused_op_mimo {"_backward_div", {{"(% / %)", "_0", "_2"}, {"(-% * % / (% * %))", "_0", "_1", "_2", "_2"}}}, {"_backward_power", {{"(% * % * powf(%, % - 1))", "_0", "_2", "_1", "_2"}, - {"(% * powf(%, %) & logf(%))", "_0", "_1", "_2", "_1"}}}, + {"(% * powf(%, %) * logf(%))", "_0", "_1", "_2", "_1"}}}, {"_backward_power_scalar", {{"(% * % * powf(%, % - 1))", "_0", "scalar", "_1", "scalar"}}}, - {"_backward_rpower_scalar", {{"(% * powf(%, %) & logf(%))", "_0", "scalar", "_2", "scalar"}}}, - {"_backward_maximum", {{"((% > %) ? % : 0)", "_1", "_2", "_0"}, - {"((% > %) ? 0 : %)", "_1", "_2", "_0"}}}, - {"_backward_minimum", {{"((% < %) ? % : 0)", "_1", "_2", "_0"}, - {"((% < %) ? 0 : %)", "_1", "_2", "_0"}}}, + {"_backward_rpower_scalar", {{"(% * % * logf(%))", "_0", "_1", "scalar"}}}, + {"_backward_maximum", {{"((% >= %) ? % : 0)", "_1", "_2", "_0"}, + {"((% >= %) ? 0 : %)", "_1", "_2", "_0"}}}, + {"_backward_minimum", {{"((% <= %) ? % : 0)", "_1", "_2", "_0"}, + {"((% <= %) ? 
0 : %)", "_1", "_2", "_0"}}}, {"_backward_hypot", {{"(% * % / hypot(%, %))", "_0", "_1", "_1", "_2"}, {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} }; @@ -225,7 +230,7 @@ const std::vector fused_op_variable_io_ops = { "_backward_Activation" }; -const std::string fused_op_function_definitions = R"code( +const char fused_op_function_definitions[] = R"code( template struct LoadType { using Type = DType; @@ -298,10 +303,15 @@ inline DType rdiv(const DType a, const DType2 b) { } template -inline DType pow(const DType a, const DType2 b) { +inline DType power(const DType a, const DType2 b) { return powf(a, b); } +template +inline DType rpow(const DType a, const DType2 b) { + return powf(b, a); +} + template inline DType max(const DType a, const DType2 b) { return a > b ? a : b; @@ -625,12 +635,12 @@ inline DType backward_square(const DType val, const DType grad) { } template -inline DType zero(const DType val) { +inline typename LoadType::Type zero(const DType val) { return 0; } template -inline DType one(const DType val) { +inline typename LoadType::Type one(const DType val) { return 1; } @@ -709,12 +719,12 @@ inline DType erfinv(const DType val) { )code"; -const std::string fused_op_kernel_begin = R"code( +const char fused_op_kernel_begin[] = R"code( const int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < N; i+= gridDim.x * blockDim.x) { )code"; -const std::string fused_op_kernel_end = R"code( +const char fused_op_kernel_end[] = R"code( } } )code"; diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index f1e85c614439..c54f75aa01e7 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -40,7 +40,6 @@ void FusedOpParamParser(nnvm::NodeAttrs* attrs) { os << ")"; throw dmlc::ParamError(os.str()); } - CHECK(!param.symbol_json.empty()); attrs->parsed = FusedOpPtr(new FusedOp(attrs, param)); } diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 
02b97ea7462e..23af9870bc5f 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -20,13 +20,13 @@ #include #include #include +#include #include "./fused_op.h" #include "./fused_op-inl.h" #include "../operator_common.h" #include "../elemwise_op_common.h" #include "../../executor/exec_pass.h" #include "../../common/cuda_utils.h" -#include namespace mxnet { @@ -64,7 +64,6 @@ FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { this->initialized_ = false; this->cc_major_ = -1; this->cc_minor_ = -1; - } void FusedOp::GenerateCode(const std::vector &req) { @@ -91,7 +90,7 @@ void FusedOp::GenerateCode(const std::vector &req) { if (source != nullptr) { std::string var_name = "temp" + std::to_string(temp_name_counter++); if (source->is_variable()) { - code += "const auto " + var_name + " = load(input_" + source->attrs.name + ", i);\n"; + code += "const auto " + var_name + " = load(" + source->attrs.name + ", i);\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = var_name; } else { @@ -276,7 +275,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, for (const auto &type : in_dtypes) { std::string type_name = detail::mshadowTypeToString(type); aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - kernel_params += "DType" + std::to_string(i) + "* input_" +input_names[i]; + kernel_params += "DType" + std::to_string(i) + "* " +input_names[i]; ++i; if (i < num_params) { kernel_params += ", "; @@ -292,7 +291,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, kernel_params += ", "; } } - code_ = detail::fp16_support_string + "\n" + + code_ = std::string(detail::fp16_support_string) + "\n" + detail::type_support_string + "\n" + detail::fused_op_function_definitions + "\n" + aux_code + "\n" + diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index d08b9c0fbc68..710f956e2843 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ 
-29,12 +29,9 @@ namespace mxnet { struct FusedOpConfig : public dmlc::Parameter { - std::string symbol_json; int num_inputs; int num_outputs; DMLC_DECLARE_PARAMETER(FusedOpConfig) { - DMLC_DECLARE_FIELD(symbol_json) - .describe("JSON of the replaced symbol."); DMLC_DECLARE_FIELD(num_inputs) .describe("Number of inputs."); DMLC_DECLARE_FIELD(num_outputs) From 616b932d9bb3826f404147672efb283eecdce29f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 6 Jun 2019 15:14:02 -0700 Subject: [PATCH 022/105] Fix the TVM commit --- 3rdparty/tvm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/tvm b/3rdparty/tvm index efb4063d8702..88163ec1abda 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit efb4063d8702d27b969e1e59a13520ec1c08737c +Subproject commit 88163ec1abda8a803d165611bc8eb0d648f753fc From 56303c8a82c1929d5f12b8416e5f3682f4eb9c91 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 6 Jun 2019 15:26:46 -0700 Subject: [PATCH 023/105] Fix lint --- src/executor/simple_partition_pass.h | 2 +- src/operator/fusion/fused_op.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 3fac35e319e4..0d47a5783089 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -153,7 +153,7 @@ class BidirectionalGraph { using PairVec = std::pair, std::vector>; using IncompMap = std::unordered_map>; - template + template void DFS(const std::vector& heads, bool reverse, FVisit fvisit) { std::unordered_set visited; std::vector vec(heads.begin(), heads.end()); diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 23af9870bc5f..beb384da0320 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include "./fused_op.h" #include "./fused_op-inl.h" #include "../operator_common.h" 
From 00e61cfd63bf2890ea64feb6516349748993b88b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 6 Jun 2019 15:30:01 -0700 Subject: [PATCH 024/105] Guard fusion with MXNET_USE_CUDA --- src/executor/graph_executor.cc | 3 +++ src/executor/pointwise_fusion_pass.cc | 5 +++++ src/operator/fusion/fused_op-inl.h | 4 ++++ src/operator/fusion/fused_op.cc | 4 ++++ src/operator/fusion/fused_op.h | 3 +++ 5 files changed, 19 insertions(+) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 7a48067ba90e..752fd83e463c 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -983,12 +983,15 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, // setup gradient nnvm::Graph g = InitFullGraph(symbol, grad_req_types); +#if MXNET_USE_CUDA if (dmlc::GetEnv("MXNET_USE_FUSION", true) && default_ctx.dev_mask() == Context::kGPU) { g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); g = FusePointwiseForward(std::move(g)); g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); g = FusePointwiseBackward(std::move(g)); } +#endif // MXNET_USE_CUDA + // create "device" and "context" attrs for the graph g = AssignContext(g, default_ctx, ctx_map, in_arg_ctxes, diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 3cf0dacfcb35..b011ac7305b4 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -23,6 +23,9 @@ * \brief * \author Clement Fuji Tsang */ + +#if MXNET_USE_CUDA + #include #include #include @@ -225,3 +228,5 @@ Graph FusePointwiseBackward(Graph &&g) { } // namespace exec } // namespace mxnet + +#endif // MXNET_USE_CUDA diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 82d2d5807966..ce121bd5da66 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -20,6 +20,8 @@ #ifndef MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ #define 
MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ +#if MXNET_USE_CUDA + #include #include #include @@ -733,4 +735,6 @@ const char fused_op_kernel_end[] = R"code( } // namespace mxnet +#endif // MXNET_USE_CUDA + #endif // MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index c54f75aa01e7..d8d25f0b3bba 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -17,6 +17,8 @@ * under the License. */ +#if MXNET_USE_CUDA + #include "./fused_op.h" #include "../operator_common.h" #include "../../executor/exec_pass.h" @@ -56,3 +58,5 @@ NNVM_REGISTER_OP(FusedOp) .add_argument("data", "NDArray-or-Symbol[]", "Data"); } // namespace mxnet + +#endif // MXNET_USE_CUDA diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 710f956e2843..37e87a92905e 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -20,6 +20,8 @@ #ifndef MXNET_OPERATOR_FUSION_FUSED_OP_H_ #define MXNET_OPERATOR_FUSION_FUSED_OP_H_ +#if MXNET_USE_CUDA + #include #include #include @@ -136,4 +138,5 @@ using FusedOpHelperParamPtr = std::shared_ptr; } // namespace mxnet +#endif // MXNET_USE_CUDA #endif // MXNET_OPERATOR_FUSION_FUSED_OP_H_ From 204ab301e9a4437e144fbb1c80c1e4fd556da75d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 6 Jun 2019 15:48:32 -0700 Subject: [PATCH 025/105] Fix --- src/executor/pointwise_fusion_pass.cc | 4 ++-- src/operator/fusion/fused_op-inl.h | 4 ++-- src/operator/fusion/fused_op.cc | 4 ++-- src/operator/fusion/fused_op.h | 3 ++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index b011ac7305b4..d1a3cdd4b915 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -24,8 +24,6 @@ * \author Clement Fuji Tsang */ -#if MXNET_USE_CUDA - #include #include #include @@ -37,6 +35,8 @@ #include 
"../operator/fusion/fused_op.h" #include "../operator/operator_common.h" +#if MXNET_USE_CUDA + namespace mxnet { namespace exec { namespace { diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index ce121bd5da66..144a0ef09ff9 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -20,13 +20,13 @@ #ifndef MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ #define MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ -#if MXNET_USE_CUDA - #include #include #include #include "../nn/activation-inl.h" +#if MXNET_USE_CUDA + namespace mxnet { namespace detail { diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index d8d25f0b3bba..ff57f106aed0 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -17,12 +17,12 @@ * under the License. */ -#if MXNET_USE_CUDA - #include "./fused_op.h" #include "../operator_common.h" #include "../../executor/exec_pass.h" +#if MXNET_USE_CUDA + namespace mxnet { DMLC_REGISTER_PARAMETER(FusedOpConfig); diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 37e87a92905e..f8d78b9561f9 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -20,7 +20,6 @@ #ifndef MXNET_OPERATOR_FUSION_FUSED_OP_H_ #define MXNET_OPERATOR_FUSION_FUSED_OP_H_ -#if MXNET_USE_CUDA #include #include @@ -28,6 +27,8 @@ #include #include +#if MXNET_USE_CUDA + namespace mxnet { struct FusedOpConfig : public dmlc::Parameter { From 0e89f8ce67ec7d63af377751cf78ff771f7216bf Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 10 Jun 2019 09:24:33 -0700 Subject: [PATCH 026/105] Fix clang-tidy --- src/executor/infer_graph_attr_pass.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index d2a9c18d2291..9b0c967530e9 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -686,8 +686,8 
@@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, std::vector> in_attrs; std::vector> out_attrs; for (const auto& dep_node : inode.source->control_deps) { - in_attrs.push_back({}); - out_attrs.push_back({}); + in_attrs.emplace_back(); + out_attrs.emplace_back(); auto &current_in_attrs = in_attrs.back(); auto &current_out_attrs = out_attrs.back(); uint32_t dep_node_id = idx.node_id(dep_node.get()); From 73a2a5ca0bcef936880b4aa2b91b5d21d0a6f485 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 12 Jun 2019 09:50:45 -0700 Subject: [PATCH 027/105] Add erf and erfinv backward --- src/operator/fusion/fused_op-inl.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 144a0ef09ff9..21d0f7e11aa2 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -202,7 +202,9 @@ const std::map> fused_op_special_ops = { {"_backward_div_scalar", {"(% / %)", "_0", "scalar"}}, {"_backward_rdiv_scalar", {"(-% * % / (% * %))", "_0", "scalar", "_1", "_1"}}, {"_backward_hypot_scalar", {"(% * % / hypot(%, %))", "_0", "_1", "_1", "scalar"}}, - {"_backward_radians", {"radians(%)", "_0"}} + {"_backward_radians", {"radians(%)", "_0"}}, + {"_backward_erf", {"backward_erf(%, %)", "_1", "_0"}}, + {"_backward_erfinv", {"backward_erfinv(%, %)", "_1", "_0"}} // TODO(ptredak): arange }; @@ -714,11 +716,21 @@ inline DType erf(const DType val) { return erff(val); } +template +inline DType backward_erf(const DType val, const DType grad) { + return 2.0f / sqrt(pi) * exp(-(val*val)) * grad; +} + template inline DType erfinv(const DType val) { return erfinvf(val); } +template +inline DType backward_erfinv(const DType val, const DType grad) { + return 0.5f * sqrt(pi) * exp(val * val) * grad; +} + )code"; const char fused_op_kernel_begin[] = R"code( From 4d0f1c98dce7efca0093214e24d74ad1810f086c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 10 Jun 2019 10:12:31 -0700
Subject: [PATCH 028/105] Gluon support for fusion --- src/imperative/cached_op.cc | 282 +++++++++++++++++++----------- src/imperative/cached_op.h | 1 - src/imperative/imperative_utils.h | 14 ++ src/nnvm/gradient.cc | 4 + src/operator/fusion/fused_op.cu | 9 +- src/operator/fusion/fused_op.h | 1 + 6 files changed, 208 insertions(+), 103 deletions(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 07c7871c6045..ec51bfc34fed 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -34,7 +34,10 @@ constexpr uint32_t kEidNotExist = std::numeric_limits::max(); struct CachedOp::GraphInfo { nnvm::Graph fwd_graph; + nnvm::Graph grad_graph; nnvm::Graph full_graph; + std::vector ograd_entries; + std::unordered_map fwd_input_to_grad_output; std::vector bwd_output_reqs; std::vector bwd_input_eid; }; @@ -45,13 +48,144 @@ struct CachedOp::DynamicRuntime { std::vector op_states; }; +void CreateFullGraph(const nnvm::Symbol& sym, + nnvm::Graph* fwd_graph, + nnvm::Graph* grad_graph, + nnvm::Graph* full_graph, + std::vector* ograd_entries, + std::unordered_map* fwd_input_to_grad_output) { + using namespace nnvm; + static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; + static const auto _copy_op = Op::Get("_copy"); + { + NodeEntryMap dedup_out; + for (const NodeEntry& nodeEntry : sym.outputs) { + if (dedup_out.find(nodeEntry) != dedup_out.end()) { + NodePtr copy_node = Node::Create(); + copy_node->attrs.op = _copy_op; + copy_node->attrs.name = + nodeEntry.node->attrs.name + "_copy" + std::to_string(dedup_out[nodeEntry]++); + copy_node->inputs.emplace_back(nodeEntry); + if (_copy_op->attr_parser != nullptr) { + _copy_op->attr_parser(&(copy_node->attrs)); + } + fwd_graph->outputs.emplace_back(std::move(copy_node)); + } else { + dedup_out.emplace(nodeEntry, 0); + fwd_graph->outputs.push_back(nodeEntry); + } + } + } + + // construct backward graph + { + ograd_entries->reserve(fwd_graph->outputs.size()); + for (size_t 
i = 0; i < fwd_graph->outputs.size(); ++i) + ograd_entries->emplace_back(Node::Create()); + + std::vector xs; + const IndexedGraph& indexed_graph = fwd_graph->indexed_graph(); + for (size_t i = 0; i < indexed_graph.input_nodes().size(); ++i) { + const uint32_t node_id = indexed_graph.input_nodes()[i]; + if (indexed_graph.mutable_input_nodes().count(node_id)) + continue; + (*fwd_input_to_grad_output)[i] = xs.size(); + xs.emplace_back(indexed_graph[node_id].weak_ref.lock()); + } + + CHECK(!xs.empty()) + << "There are no inputs in computation graph that require gradients."; + + *grad_graph = pass::MXGradient( + *fwd_graph, fwd_graph->outputs, xs, *ograd_entries, + exec::AggregateGradient, nullptr, nullptr, + zero_ops, "_copy"); + } + + // construct full graph + { + full_graph->outputs = fwd_graph->outputs; + for (const auto& i : grad_graph->outputs) full_graph->outputs.emplace_back(i); + } +} + +void SetRefCounts(nnvm::Graph* fwd_graph, const nnvm::Graph& full_graph) { + const auto& idx = fwd_graph->indexed_graph(); + CHECK_GE(idx.input_nodes().size(), 1) << "CachedOp requires at least 1 input"; + + std::vector ref_count(idx.num_node_entries(), 0); + for (const auto& i : idx.input_nodes()) ++ref_count[idx.entry_id(i, 0)]; + for (const auto& i : idx.outputs()) ++ref_count[idx.entry_id(i)]; + for (size_t i = 0; i < idx.num_nodes(); ++i) { + for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; + } + + fwd_graph->attrs["forward_ref_count"] = + std::make_shared(std::move(ref_count)); + + size_t num_forward_nodes = idx.num_nodes(); + size_t num_forward_entries = idx.num_node_entries(); + + const auto& full_idx = full_graph.indexed_graph(); + + std::vector temp_ref_count(full_idx.num_node_entries(), 0); + for (size_t i = num_forward_nodes; i < full_idx.num_nodes(); ++i) { + for (const auto& j : full_idx[i].inputs) { + ++temp_ref_count[full_idx.entry_id(j)]; + } + } + + auto full_ref_count = fwd_graph->GetAttr >("forward_ref_count"); + for (size_t i = 0; i < 
num_forward_entries; ++i) full_ref_count.at(i) += temp_ref_count[i]; + fwd_graph->attrs["full_ref_count"] = + std::make_shared(std::move(full_ref_count)); +} + +void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Graph * grad_graph, + const Context& context, size_t num_forward_outputs, const bool inlining) { +#if MXNET_USE_CUDA + if (dmlc::GetEnv("MXNET_USE_FUSION", true) && context.dev_mask() == kGPU && + !inlining) { + if (dmlc::GetEnv("MXNET_DEBUG_PRINT_GRAPH", 0)) { + std::cout << "Before the fusion" << std::endl; + exec::PrintFullGraph(*full_graph); + } + full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); + *full_graph = exec::FusePointwiseForward(std::move(*full_graph)); + full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); + *full_graph = exec::FusePointwiseBackward(std::move(*full_graph)); + if (dmlc::GetEnv("MXNET_DEBUG_PRINT_GRAPH", 0)) { + std::cout << "After the fusion" << std::endl; + exec::PrintFullGraph(*full_graph); + } + } +#endif // MXNET_USE_CUDA + + *fwd_graph = nnvm::Graph(); + fwd_graph->outputs = std::vector(full_graph->outputs.begin(), + full_graph->outputs.begin() + + num_forward_outputs); + *grad_graph = nnvm::Graph(); + grad_graph->outputs = std::vector(full_graph->outputs.begin() + + num_forward_outputs, + full_graph->outputs.end()); + SetRefCounts(fwd_graph, *full_graph); +} + struct CachedOp::CachedOpState { CachedOpState(const Context& context_, const nnvm::Graph& fwd_graph_, - const nnvm::Graph& full_graph_) { + const nnvm::Graph& full_graph_, + const bool inlining_) { context = context_; - info.fwd_graph = fwd_graph_; - info.full_graph = full_graph_; + nnvm::Symbol sym; + sym.outputs = fwd_graph_.outputs; + CreateFullGraph(sym, &info.fwd_graph, &info.grad_graph, + &info.full_graph, &info.ograd_entries, + &info.fwd_input_to_grad_output); + + OptimizeGraph(&info.full_graph, &info.fwd_graph, &info.grad_graph, + context_, fwd_graph_.outputs.size(), 
inlining_); size_t max_nodes = info.full_graph.indexed_graph().num_nodes(); size_t max_entries = info.full_graph.indexed_graph().num_node_entries(); @@ -95,50 +229,23 @@ struct CachedOp::CachedOpState { CachedOp::CachedOp( const nnvm::Symbol& sym, const std::vector >& flags) { - using namespace nnvm; - using namespace imperative; - static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; - static const auto _copy_op = Op::Get("_copy"); config_.Init(flags); if (config_.static_shape) { CHECK(config_.static_alloc) << "static_alloc must be True when static_shape is True"; } - // construct forward graph + auto grad_graph = nnvm::Graph(); + std::unordered_map fwd_input_to_grad_output; + CreateFullGraph(sym, &fwd_graph_, &grad_graph, &full_graph_, + &ograd_entries_, &fwd_input_to_grad_output); + { - NodeEntryMap dedup_out; - for (const NodeEntry& nodeEntry : sym.outputs) { - if (dedup_out.find(nodeEntry) != dedup_out.end()) { - NodePtr copy_node = Node::Create(); - copy_node->attrs.op = _copy_op; - copy_node->attrs.name = - nodeEntry.node->attrs.name + "_copy" + std::to_string(dedup_out[nodeEntry]++); - copy_node->inputs.emplace_back(nodeEntry); - if (_copy_op->attr_parser != nullptr) { - _copy_op->attr_parser(&(copy_node->attrs)); - } - fwd_graph_.outputs.emplace_back(std::move(copy_node)); - } else { - dedup_out.emplace(nodeEntry, 0); - fwd_graph_.outputs.push_back(nodeEntry); - } - } const auto& idx = fwd_graph_.indexed_graph(); - CHECK_GE(idx.input_nodes().size(), 1) << "CachedOp requires at least 1 input"; - - std::vector ref_count(idx.num_node_entries(), 0); - for (const auto& i : idx.input_nodes()) ++ref_count[idx.entry_id(i, 0)]; - for (const auto& i : idx.outputs()) ++ref_count[idx.entry_id(i)]; - for (size_t i = 0; i < idx.num_nodes(); ++i) { - for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; - } - - fwd_graph_.attrs["forward_ref_count"] = - std::make_shared(std::move(ref_count)); - + bwd_output_reqs_ = 
std::vector(grad_graph.outputs.size(), kWriteTo); inlining_ = !config_.static_alloc && (idx.num_nodes() - idx.input_nodes().size()) <= config_.inline_limit; + std::cout << "Inlining: " << inlining_ << std::endl; } // Set params @@ -157,53 +264,9 @@ CachedOp::CachedOp( } } - // construct backward graph + // Set the backward dependency vectors { - ograd_entries_.reserve(fwd_graph_.outputs.size()); - for (size_t i = 0; i < fwd_graph_.outputs.size(); ++i) - ograd_entries_.emplace_back(Node::Create()); - - std::vector xs; - const IndexedGraph& indexed_graph = fwd_graph_.indexed_graph(); - for (size_t i = 0; i < indexed_graph.input_nodes().size(); ++i) { - const uint32_t node_id = indexed_graph.input_nodes()[i]; - if (indexed_graph.mutable_input_nodes().count(node_id)) - continue; - fwd_input_to_grad_output_[i] = xs.size(); - xs.emplace_back(indexed_graph[node_id].weak_ref.lock()); - } - - CHECK(!xs.empty()) - << "There are no inputs in computation graph that require gradients."; - - grad_graph_ = pass::MXGradient( - fwd_graph_, fwd_graph_.outputs, xs, ograd_entries_, - exec::AggregateGradient, nullptr, nullptr, - zero_ops, "_copy"); - } - - // construct full graph - { - size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes(); - size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries(); - - full_graph_.outputs = fwd_graph_.outputs; - bwd_output_reqs_ = std::vector(grad_graph_.outputs.size(), kWriteTo); - for (const auto& i : grad_graph_.outputs) full_graph_.outputs.emplace_back(i); const auto& idx = full_graph_.indexed_graph(); - - std::vector ref_count(idx.num_node_entries(), 0); - for (size_t i = num_forward_nodes; i < idx.num_nodes(); ++i) { - for (const auto& j : idx[i].inputs) { - ++ref_count[idx.entry_id(j)]; - } - } - - auto full_ref_count = fwd_graph_.GetAttr >("forward_ref_count"); - for (size_t i = 0; i < num_forward_entries; ++i) full_ref_count.at(i) += ref_count[i]; - fwd_graph_.attrs["full_ref_count"] = - 
std::make_shared(std::move(full_ref_count)); - size_t num_forward_inputs = num_inputs(); size_t num_forward_outputs = num_outputs(); for (uint32_t i = 0; i < ograd_entries_.size(); ++i) { @@ -221,6 +284,8 @@ CachedOp::CachedOp( bwd_out_dep_.push_back(i); } } + + SetRefCounts(&fwd_graph_, full_graph_); } CachedOp::~CachedOp() { @@ -404,10 +469,10 @@ bool CachedOp::SetBackwardGraph( info->bwd_output_reqs = reqs; info->bwd_input_eid.clear(); g = nnvm::Graph(); - g.outputs = fwd_graph_.outputs; - for (size_t i = 0; i < grad_graph_.outputs.size(); ++i) { + g.outputs = info->fwd_graph.outputs; + for (size_t i = 0; i < info->grad_graph.outputs.size(); ++i) { if (info->bwd_output_reqs[i] == kNullOp) continue; - g.outputs.emplace_back(grad_graph_.outputs[i]); + g.outputs.emplace_back(info->grad_graph.outputs[i]); } g.attrs["context"] = std::make_shared( std::vector(g.indexed_graph().num_nodes(), default_ctx)); @@ -418,12 +483,12 @@ bool CachedOp::SetBackwardGraph( if (info->bwd_input_eid.size() != inputs.size()) { info->bwd_input_eid.clear(); SetBackwardInputEid(bwd_in_dep_, bwd_out_dep_, bwd_ograd_dep_, - ograd_entries_, idx, &info->bwd_input_eid); + info->ograd_entries, idx, &info->bwd_input_eid); CHECK_EQ(inputs.size(), info->bwd_input_eid.size()); } - size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes(); - size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries(); + size_t num_forward_nodes = info->fwd_graph.indexed_graph().num_nodes(); + size_t num_forward_entries = info->fwd_graph.indexed_graph().num_node_entries(); if (!g.attrs.count("backward_ref_count")) { std::vector ref_count(idx.num_node_entries(), 0); @@ -502,7 +567,8 @@ OpStatePtr CachedOp::GetCachedOpState( return i; } } - auto state_ptr = OpStatePtr::Create(ctx, fwd_graph_, full_graph_); + auto state_ptr = OpStatePtr::Create(ctx, fwd_graph_, full_graph_, + inlining_); cached_op_states_[ctx].push_back(state_ptr); return state_ptr; @@ -902,8 +968,10 @@ OpStatePtr 
CachedOp::Forward( CHECK_EQ(inputs.size(), num_inputs()); Context default_ctx = inputs[0]->ctx(); + auto state_ptr = GetCachedOpState(default_ctx); + auto& state = state_ptr.get_state(); - const auto& idx = fwd_graph_.indexed_graph(); + const auto& idx = state.info.fwd_graph.indexed_graph(); for (size_t i = 0; i < inputs.size(); ++i) { CHECK_EQ(inputs[i]->ctx(), default_ctx) << "CachedOp requires all inputs to live on the same context. But " @@ -951,6 +1019,7 @@ void CachedOp::DynamicBackward( const std::vector& inputs, const std::vector& reqs, const std::vector& outputs) { + std::cout << "HAHA" << std::endl; using namespace nnvm; using namespace imperative; @@ -966,14 +1035,16 @@ void CachedOp::DynamicBackward( runtime.info.full_graph = state.info.full_graph; runtime.info.bwd_input_eid = state.info.bwd_input_eid; } + std::cout << "HOHO" << std::endl; nnvm::Graph& g = runtime.info.full_graph; const auto& idx = g.indexed_graph(); auto& buff = runtime.buff; auto& states = runtime.op_states; - size_t num_forward_outputs = fwd_graph_.outputs.size(); - size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes(); - size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries(); + size_t num_forward_outputs = runtime.info.fwd_graph.outputs.size(); + size_t num_forward_nodes = runtime.info.fwd_graph.indexed_graph().num_nodes(); + size_t num_forward_entries = runtime.info.fwd_graph.indexed_graph().num_node_entries(); + std::cout << "HIHI" << std::endl; buff.resize(idx.num_node_entries()); std::vector arrays; arrays.reserve(buff.size()); @@ -994,6 +1065,7 @@ void CachedOp::DynamicBackward( *outputs[i] = arrays[eid]->Detach(); arrays[eid] = outputs[i]; } + std::cout << "HOWHOW" << std::endl; // Allocate NDArrays auto ref_count = g.GetAttr >("backward_ref_count"); @@ -1013,11 +1085,14 @@ void CachedOp::DynamicBackward( } const auto& mem_plan = g.GetAttr("backward_mem_plan"); + std::cout << "before allocatememory" << std::endl; AllocateMemory(g, idx, 
default_ctx, num_forward_entries, idx.num_node_entries(), mem_plan, arrays, &array_reqs); + std::cout << "after allocatememory" << std::endl; const auto& dispatch_modes = g.GetAttr("dispatch_mode"); + std::cout << "Run graph!" << std::endl; RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(), std::move(array_reqs), std::move(ref_count), &states, dispatch_modes, Imperative::Get()->is_recording()); @@ -1068,9 +1143,9 @@ void CachedOp::StaticBackward( if (config_.static_shape) { for (auto i : config_.param_indices) { - const auto iter = fwd_input_to_grad_output_.find(i); - if (iter == fwd_input_to_grad_output_.end()) continue; - auto entry = grad_graph_.outputs[iter->second]; + const auto iter = state.info.fwd_input_to_grad_output.find(i); + if (iter == state.info.fwd_input_to_grad_output.end()) continue; + auto entry = state.info.grad_graph.outputs[iter->second]; if (!idx.exist(entry.node.get())) continue; auto eid = idx.entry_id(entry); if (!arrays[eid]->IsSame(*outputs[iter->second]) || @@ -1085,9 +1160,9 @@ void CachedOp::StaticBackward( } } for (auto i : config_.data_indices) { - const auto iter = fwd_input_to_grad_output_.find(i); - if (iter == fwd_input_to_grad_output_.end()) continue; - auto entry = grad_graph_.outputs[iter->second]; + const auto iter = state.info.fwd_input_to_grad_output.find(i); + if (iter == state.info.fwd_input_to_grad_output.end()) continue; + auto entry = state.info.grad_graph.outputs[iter->second]; if (!idx.exist(entry.node.get())) continue; auto eid = idx.entry_id(entry); state.array_reqs[eid] = reqs[iter->second]; @@ -1097,8 +1172,8 @@ void CachedOp::StaticBackward( arrays[eid] = outputs[iter->second]; } } else { - for (size_t i = 0; i < grad_graph_.outputs.size(); ++i) { - auto entry = grad_graph_.outputs[i]; + for (size_t i = 0; i < state.info.grad_graph.outputs.size(); ++i) { + auto entry = state.info.grad_graph.outputs[i]; if (!idx.exist(entry.node.get())) continue; auto eid = idx.entry_id(entry); 
state.array_reqs[eid] = reqs[i]; @@ -1123,17 +1198,20 @@ void CachedOp::Backward( const std::vector& reqs, const std::vector& outputs) { using namespace imperative; + std::cout << "BACKWARD" << std::endl; CHECK(!Imperative::Get()->is_recording()) << "CachedOp does not support higher order gradients. " << "If you want to do backward with create_graph=True please " << "do not use hybridize."; + std::cout << "Get bulk size" << std::endl; int prev_bulk_size = Engine::Get()->set_bulk_size(config_.backward_bulk_size); try { if (config_.static_alloc) { StaticBackward(retain_graph, state, inputs, reqs, outputs); } else { + std::cout << "Dynamic backward?" << std::endl; DynamicBackward(retain_graph, state, inputs, reqs, outputs); } } catch (const dmlc::Error& e) { @@ -1214,6 +1292,7 @@ void CachedOpBackward(const OpStatePtr& state_ptr, const std::vector& outputs) { using namespace nnvm; using namespace imperative; + std::cout << "CachedOpBackward!" << std::endl; CachedOpActualState &s = state_ptr.get_state(); std::vector in_bufs = inputs; std::vector out_bufs = outputs; @@ -1380,6 +1459,7 @@ NNVM_REGISTER_OP(_CachedOp) .set_attr("FGradient", [](const nnvm::NodePtr& n, const std::vector& ograds) { const CachedOpPtr& op = nnvm::get(n->attrs.parsed); + std::cout << "Called FGradient!" 
<< std::endl; return op->Gradient(n, ograds); }) .set_attr("FListInputNames", diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 14b373edea57..2fe31de2fcd4 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -193,7 +193,6 @@ class CachedOp { CachedOpConfig config_; nnvm::Graph fwd_graph_; - nnvm::Graph grad_graph_; nnvm::Graph full_graph_; bool inlining_; std::vector ograd_entries_; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 5cb805c5abcb..2f7eaf152556 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -683,16 +683,21 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev std::pair entry_range = {0, 0}) { using namespace nnvm; nnvm::Graph& g = *p_g; + std::cout << "Checkandinferstoragetype" << std::endl; bool dev_match = g.attrs.count("dev_mask") && g.GetAttr("dev_mask") == dev_mask; + std::cout << "count(dev_mask) " << g.attrs.count("dev_mask") << std::endl; + std::cout << "dev_match: " << dev_match << std::endl; if (!dev_match) { g.attrs["dev_mask"] = std::make_shared(std::move(dev_mask)); } if (dev_match && use_inputs) { + std::cout << "dev match & use_inputs" << std::endl; if (g.attrs.count("storage_type_inputs") && g.GetAttr("storage_type_inputs") == storage_types) return true; } else if (dev_match && g.attrs.count("storage_type")) { + std::cout << "dev_match && storage_type" << std::endl; const auto& prev_storage_types = g.GetAttr("storage_type"); CHECK_EQ(prev_storage_types.size(), storage_types.size()); bool match = true; @@ -710,9 +715,18 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev g.attrs.erase("dispatch_mode"); g.attrs.erase("storage_type"); g.attrs.erase("storage_type_inputs"); + std::cout << "Node range: " << node_range.first << " " << node_range.second << std::endl; if (node_range.second > node_range.first) { g.attrs["node_range"] = 
std::make_shared(node_range); } + std::cout << g.attrs.size() << std::endl; + for (const auto& attr : g.attrs) { + std::cout << attr.first << std::endl; + } + std::cout << storage_types.size() << std::endl; + for (const auto& st : storage_types) { + std::cout << st << std::endl; + } if (use_inputs) { g = exec::InferStorageType(std::move(g), std::move(storage_types)); } else { diff --git a/src/nnvm/gradient.cc b/src/nnvm/gradient.cc index 586027129a0b..ce7c5cf1528f 100644 --- a/src/nnvm/gradient.cc +++ b/src/nnvm/gradient.cc @@ -83,6 +83,7 @@ struct GradEntry { }; Graph Gradient(Graph src) { + std::cout << "Gradient!" << std::endl; using nnvm::FGradient; using MirrorFun = std::function; using AttrHintFun = std::function; @@ -172,7 +173,10 @@ Graph Gradient(Graph src) { std::vector out_agg_grads; for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { const NodePtr& ptr = *rit; + std::cout << "Node ptr"<< std::endl; + if (ptr->is_variable()) std::cout << "Variable!" << std::endl; if (ptr->is_variable()) continue; + std::cout << "Op: " << ptr->op()->name << std::endl; out_agg_grads.clear(); auto& out_grad_vec = output_grads.at(ptr.get()); for (uint32_t i = 0; i < out_grad_vec.size(); ++i) { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index beb384da0320..b9806a07fd69 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -215,8 +215,10 @@ void FusedOp::GenerateCode(const std::vector &req) { code += "store(" + var + ", i, output" + std::to_string(counter) + ");\n"; } else if (req[counter] == kAddTo) { code += "storeadd(" + var + ", i, output" + std::to_string(counter) + ");\n"; + } else if (req[counter] == kNullOp) { + // NULL req, do not do anything } else { - LOG(FATAL) << "Encountered unexpected req"; + LOG(FATAL) << "Encountered unexpected req."; } ++counter; } @@ -262,6 +264,9 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, this->cc_major_ = cc_major; this->cc_minor_ = 
cc_minor; + initialized_ = initialized_ && (req == saved_reqs_); + saved_reqs_ = req; + if (!initialized_) { this->GenerateCode(req); LOG(INFO) << code_; @@ -386,10 +391,12 @@ template <> bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { + std::cout << "InferShape in FusedOp!" << std::endl; std::vector input_shapes(*in_attrs); this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), std::move(input_shapes), "__shape__"); + std::cout << "END: InferShape in FusedOp!" << std::endl; const auto& g = this->symbol_.indexed_graph(); const auto& input_nids = g.input_nodes(); diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index f8d78b9561f9..4a39672a0048 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -115,6 +115,7 @@ class FusedOp { std::vector> aux_out_shapes; std::vector> aux_in_types; std::vector> aux_out_types; + std::vector saved_reqs_; std::string ptx_; std::string kernel_name_; From 3dddad7d4fcb337ea3cd0c50d624dca022205739 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 13 Jun 2019 15:25:13 -0700 Subject: [PATCH 029/105] Cleaning --- src/imperative/cached_op.cc | 13 ------------- src/imperative/cached_op.h | 2 -- src/imperative/imperative_utils.h | 14 -------------- src/nnvm/gradient.cc | 4 ---- 4 files changed, 33 deletions(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index ec51bfc34fed..e7fae3a02225 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -245,7 +245,6 @@ CachedOp::CachedOp( bwd_output_reqs_ = std::vector(grad_graph.outputs.size(), kWriteTo); inlining_ = !config_.static_alloc && (idx.num_nodes() - idx.input_nodes().size()) <= config_.inline_limit; - std::cout << "Inlining: " << inlining_ << std::endl; } // Set params @@ -1019,7 +1018,6 @@ void CachedOp::DynamicBackward( const std::vector& inputs, const std::vector& reqs, const std::vector& outputs) { - std::cout 
<< "HAHA" << std::endl; using namespace nnvm; using namespace imperative; @@ -1035,7 +1033,6 @@ void CachedOp::DynamicBackward( runtime.info.full_graph = state.info.full_graph; runtime.info.bwd_input_eid = state.info.bwd_input_eid; } - std::cout << "HOHO" << std::endl; nnvm::Graph& g = runtime.info.full_graph; const auto& idx = g.indexed_graph(); auto& buff = runtime.buff; @@ -1044,7 +1041,6 @@ void CachedOp::DynamicBackward( size_t num_forward_outputs = runtime.info.fwd_graph.outputs.size(); size_t num_forward_nodes = runtime.info.fwd_graph.indexed_graph().num_nodes(); size_t num_forward_entries = runtime.info.fwd_graph.indexed_graph().num_node_entries(); - std::cout << "HIHI" << std::endl; buff.resize(idx.num_node_entries()); std::vector arrays; arrays.reserve(buff.size()); @@ -1065,7 +1061,6 @@ void CachedOp::DynamicBackward( *outputs[i] = arrays[eid]->Detach(); arrays[eid] = outputs[i]; } - std::cout << "HOWHOW" << std::endl; // Allocate NDArrays auto ref_count = g.GetAttr >("backward_ref_count"); @@ -1085,14 +1080,11 @@ void CachedOp::DynamicBackward( } const auto& mem_plan = g.GetAttr("backward_mem_plan"); - std::cout << "before allocatememory" << std::endl; AllocateMemory(g, idx, default_ctx, num_forward_entries, idx.num_node_entries(), mem_plan, arrays, &array_reqs); - std::cout << "after allocatememory" << std::endl; const auto& dispatch_modes = g.GetAttr("dispatch_mode"); - std::cout << "Run graph!" << std::endl; RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(), std::move(array_reqs), std::move(ref_count), &states, dispatch_modes, Imperative::Get()->is_recording()); @@ -1198,20 +1190,17 @@ void CachedOp::Backward( const std::vector& reqs, const std::vector& outputs) { using namespace imperative; - std::cout << "BACKWARD" << std::endl; CHECK(!Imperative::Get()->is_recording()) << "CachedOp does not support higher order gradients. 
" << "If you want to do backward with create_graph=True please " << "do not use hybridize."; - std::cout << "Get bulk size" << std::endl; int prev_bulk_size = Engine::Get()->set_bulk_size(config_.backward_bulk_size); try { if (config_.static_alloc) { StaticBackward(retain_graph, state, inputs, reqs, outputs); } else { - std::cout << "Dynamic backward?" << std::endl; DynamicBackward(retain_graph, state, inputs, reqs, outputs); } } catch (const dmlc::Error& e) { @@ -1292,7 +1281,6 @@ void CachedOpBackward(const OpStatePtr& state_ptr, const std::vector& outputs) { using namespace nnvm; using namespace imperative; - std::cout << "CachedOpBackward!" << std::endl; CachedOpActualState &s = state_ptr.get_state(); std::vector in_bufs = inputs; std::vector out_bufs = outputs; @@ -1459,7 +1447,6 @@ NNVM_REGISTER_OP(_CachedOp) .set_attr("FGradient", [](const nnvm::NodePtr& n, const std::vector& ograds) { const CachedOpPtr& op = nnvm::get(n->attrs.parsed); - std::cout << "Called FGradient!" << std::endl; return op->Gradient(n, ograds); }) .set_attr("FListInputNames", diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 2fe31de2fcd4..7cc398d7b1f1 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -197,10 +197,8 @@ class CachedOp { bool inlining_; std::vector ograd_entries_; std::vector bwd_in_dep_, bwd_out_dep_, bwd_ograd_dep_; - std::unordered_map fwd_input_to_grad_output_; std::vector save_inputs_, save_outputs_; std::vector bwd_output_reqs_; - std::mutex mutex_; std::unordered_map > cached_op_states_; }; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 2f7eaf152556..5cb805c5abcb 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -683,21 +683,16 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev std::pair entry_range = {0, 0}) { using namespace nnvm; nnvm::Graph& g = *p_g; - std::cout << "Checkandinferstoragetype" << 
std::endl; bool dev_match = g.attrs.count("dev_mask") && g.GetAttr("dev_mask") == dev_mask; - std::cout << "count(dev_mask) " << g.attrs.count("dev_mask") << std::endl; - std::cout << "dev_match: " << dev_match << std::endl; if (!dev_match) { g.attrs["dev_mask"] = std::make_shared(std::move(dev_mask)); } if (dev_match && use_inputs) { - std::cout << "dev match & use_inputs" << std::endl; if (g.attrs.count("storage_type_inputs") && g.GetAttr("storage_type_inputs") == storage_types) return true; } else if (dev_match && g.attrs.count("storage_type")) { - std::cout << "dev_match && storage_type" << std::endl; const auto& prev_storage_types = g.GetAttr("storage_type"); CHECK_EQ(prev_storage_types.size(), storage_types.size()); bool match = true; @@ -715,18 +710,9 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev g.attrs.erase("dispatch_mode"); g.attrs.erase("storage_type"); g.attrs.erase("storage_type_inputs"); - std::cout << "Node range: " << node_range.first << " " << node_range.second << std::endl; if (node_range.second > node_range.first) { g.attrs["node_range"] = std::make_shared(node_range); } - std::cout << g.attrs.size() << std::endl; - for (const auto& attr : g.attrs) { - std::cout << attr.first << std::endl; - } - std::cout << storage_types.size() << std::endl; - for (const auto& st : storage_types) { - std::cout << st << std::endl; - } if (use_inputs) { g = exec::InferStorageType(std::move(g), std::move(storage_types)); } else { diff --git a/src/nnvm/gradient.cc b/src/nnvm/gradient.cc index ce7c5cf1528f..586027129a0b 100644 --- a/src/nnvm/gradient.cc +++ b/src/nnvm/gradient.cc @@ -83,7 +83,6 @@ struct GradEntry { }; Graph Gradient(Graph src) { - std::cout << "Gradient!" 
<< std::endl; using nnvm::FGradient; using MirrorFun = std::function; using AttrHintFun = std::function; @@ -173,10 +172,7 @@ Graph Gradient(Graph src) { std::vector out_agg_grads; for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { const NodePtr& ptr = *rit; - std::cout << "Node ptr"<< std::endl; - if (ptr->is_variable()) std::cout << "Variable!" << std::endl; if (ptr->is_variable()) continue; - std::cout << "Op: " << ptr->op()->name << std::endl; out_agg_grads.clear(); auto& out_grad_vec = output_grads.at(ptr.get()); for (uint32_t i = 0; i < out_grad_vec.size(); ++i) { From 5067fa6b8a76debb683ac74da63c47b2e8d6eb00 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 13 Jun 2019 15:31:38 -0700 Subject: [PATCH 030/105] Cleaning and allow shape/type change in FusedOp --- src/operator/fusion/fused_op.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index b9806a07fd69..9512aa073c48 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -391,12 +391,12 @@ template <> bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { - std::cout << "InferShape in FusedOp!" << std::endl; + this->symbol_.attrs.erase("shape"); + this->symbol_.attrs.erase("shape_inputs"); std::vector input_shapes(*in_attrs); this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), std::move(input_shapes), "__shape__"); - std::cout << "END: InferShape in FusedOp!" 
<< std::endl; const auto& g = this->symbol_.indexed_graph(); const auto& input_nids = g.input_nodes(); @@ -431,6 +431,8 @@ template <> bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { + this->symbol_.attrs.erase("dtype"); + this->symbol_.attrs.erase("dtype_inputs"); std::vector input_types(*in_attrs); this->symbol_ = mxnet::exec::InferType(std::move(this->symbol_), std::move(input_types), From b27a369364458e9dc616b5b45654e39517934e40 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 13 Jun 2019 16:54:30 -0700 Subject: [PATCH 031/105] Fixing Gluon bugs --- src/imperative/cached_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index e7fae3a02225..eb0d9501beca 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -180,7 +180,7 @@ struct CachedOp::CachedOpState { context = context_; nnvm::Symbol sym; sym.outputs = fwd_graph_.outputs; - CreateFullGraph(sym, &info.fwd_graph, &info.grad_graph, + CreateFullGraph(sym.Copy(), &info.fwd_graph, &info.grad_graph, &info.full_graph, &info.ograd_entries, &info.fwd_input_to_grad_output); From f18847cc7a86786d77fec62f21627999051784ca Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 13 Jun 2019 20:41:21 -0700 Subject: [PATCH 032/105] Fixing after rebase --- src/imperative/cached_op.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index eb0d9501beca..ac1f79730a88 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -146,18 +146,10 @@ void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Grap #if MXNET_USE_CUDA if (dmlc::GetEnv("MXNET_USE_FUSION", true) && context.dev_mask() == kGPU && !inlining) { - if (dmlc::GetEnv("MXNET_DEBUG_PRINT_GRAPH", 0)) { - std::cout << "Before the fusion" << std::endl; - exec::PrintFullGraph(*full_graph); - } 
full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); *full_graph = exec::FusePointwiseForward(std::move(*full_graph)); full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); *full_graph = exec::FusePointwiseBackward(std::move(*full_graph)); - if (dmlc::GetEnv("MXNET_DEBUG_PRINT_GRAPH", 0)) { - std::cout << "After the fusion" << std::endl; - exec::PrintFullGraph(*full_graph); - } } #endif // MXNET_USE_CUDA From 9a05327e2f075fc6d1ceddc68a3dfbbcb1be1248 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 14 Jun 2019 09:37:26 -0700 Subject: [PATCH 033/105] Fixing race condition and guarding against races when using NVRTC --- src/operator/fusion/fused_op.cc | 2 ++ src/operator/fusion/fused_op.cu | 4 ++++ src/operator/fusion/fused_op.h | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index ff57f106aed0..633916d34720 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -27,6 +27,8 @@ namespace mxnet { DMLC_REGISTER_PARAMETER(FusedOpConfig); +std::mutex FusedOp::mutex_; + void FusedOpParamParser(nnvm::NodeAttrs* attrs) { FusedOpConfig param; try { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 9512aa073c48..07bcafb82341 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include "./fused_op.h" #include "./fused_op-inl.h" #include "../operator_common.h" @@ -233,6 +234,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, const std::vector &req, const std::vector &outputs) { using namespace mshadow; + std::lock_guard lock(my_mutex_); CHECK_GE(outputs.size(), 1) << "There needs to be at least 1 output."; std::vector in_dtypes; @@ -305,6 +307,8 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, detail::fused_op_kernel_begin + "\n" + code_ + "\n" + detail::fused_op_kernel_end; + // 
Guard NVRTC calls + std::lock_guard lock_nvrtc(mutex_); nvrtcProgram program; NVRTC_CALL( nvrtcCreateProgram(&program, // prog diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 4a39672a0048..6c490fd2fe7c 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -26,6 +26,7 @@ #include #include #include +#include #if MXNET_USE_CUDA @@ -123,6 +124,9 @@ class FusedOp { CUfunction kernel_; int cc_major_; int cc_minor_; + + static std::mutex mutex_; + std::mutex my_mutex_; }; using FusedOpPtr = std::shared_ptr; From 309f9a70ed39f7d9fcfbc8107c98fbaccb829285 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 14 Jun 2019 09:58:32 -0700 Subject: [PATCH 034/105] Cleaning and renaming FusedOp to _FusedOp --- src/executor/pointwise_fusion_pass.cc | 2 +- src/operator/fusion/fused_op-inl.h | 38 ++--- src/operator/fusion/fused_op.cc | 207 ++++++++++++++++++++++++- src/operator/fusion/fused_op.cu | 209 +------------------------- src/operator/fusion/fused_op.h | 2 - 5 files changed, 224 insertions(+), 234 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index d1a3cdd4b915..7cf26fe964d4 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -83,7 +83,7 @@ namespace { params_names.pop_back(); node->attrs.dict["num_inputs"] = std::to_string(inputs_size); node->attrs.dict["num_outputs"] = std::to_string(subgraph.outputs.size()); - node->attrs.op = Op::Get("FusedOp"); + node->attrs.op = Op::Get("_FusedOp"); node->op()->attr_parser(&(node->attrs)); return node; } diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 21d0f7e11aa2..dfe3e39edcb5 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -23,7 +23,6 @@ #include #include #include -#include "../nn/activation-inl.h" #if MXNET_USE_CUDA @@ -34,27 +33,22 @@ namespace detail { const char 
fp16_support_string[] = R"code( #define __HALF_TO_US(var) *(reinterpret_cast(&(var))) #define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) -#if defined(__cplusplus) - struct __align__(2) __half { - __host__ __device__ __half() { } - protected: - unsigned short __x; - }; - /* All intrinsic functions are only available to nvcc compilers */ - #if defined(__CUDACC__) - /* Definitions of intrinsics */ - __device__ inline __half __float2half(const float f) { - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; - } - __device__ inline float __half2float(const __half h) { - float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); - return val; - } - #endif /* defined(__CUDACC__) */ -#endif /* defined(__cplusplus) */ +struct __align__(2) __half { + __host__ __device__ __half() { } +protected: + unsigned short __x; +}; +/* Definitions of intrinsics */ +__device__ inline __half __float2half(const float f) { + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; +} +__device__ inline float __half2float(const __half h) { + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); + return val; +} #undef __HALF_TO_US #undef __HALF_TO_CUS typedef __half half; diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 633916d34720..34a61654e8b4 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -47,7 +47,157 @@ void FusedOpParamParser(nnvm::NodeAttrs* attrs) { attrs->parsed = FusedOpPtr(new FusedOp(attrs, param)); } -NNVM_REGISTER_OP(FusedOp) +FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { + this->inputs_ = std::vector(config.num_inputs); + this->outputs_ = std::vector(config.num_outputs); + this->symbol_ = nnvm::Graph(); + this->symbol_.outputs = attrs->subgraphs[0]->outputs; + this->initialized_ = false; + this->cc_major_ = -1; + this->cc_minor_ 
= -1; +} + +bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + this->symbol_.attrs.erase("shape"); + this->symbol_.attrs.erase("shape_inputs"); + std::vector input_shapes(*in_attrs); + this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), + std::move(input_shapes), + "__shape__"); + + const auto& g = this->symbol_.indexed_graph(); + const auto& input_nids = g.input_nodes(); + + std::vector out_shapes; + const std::vector shapes = this->symbol_.GetAttr("shape"); + for (auto& e : g.outputs()) { + out_shapes.push_back(shapes[g.entry_id(e)]); + } + CHECK_EQ(out_shapes.size(), out_attrs->size()); + for (size_t i = 0; i < out_attrs->size(); ++i) { + op::shape_assign(&(out_attrs->at(i)), out_shapes[i]); + } + + // assign to in_attrs + for (size_t i = 0; i < in_attrs->size(); ++i) { + const auto eid = g.entry_id(input_nids[i], 0); + SHAPE_ASSIGN_CHECK(*in_attrs, i, shapes[eid]); + } + + bool inferred = true; + for (const auto& attr : *in_attrs) { + inferred = inferred && !op::shape_is_none(attr); + } + for (const auto& attr : *out_attrs) { + inferred = inferred && !op::shape_is_none(attr); + } + return inferred; +} + +bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + this->symbol_.attrs.erase("dtype"); + this->symbol_.attrs.erase("dtype_inputs"); + std::vector input_types(*in_attrs); + this->symbol_ = mxnet::exec::InferType(std::move(this->symbol_), + std::move(input_types), + "__dtype__"); + + const auto& g = this->symbol_.indexed_graph(); + const auto& input_nids = g.input_nodes(); + + std::vector out_types; + const std::vector types = this->symbol_.GetAttr("dtype"); + for (auto& e : g.outputs()) { + out_types.push_back(types[g.entry_id(e)]); + } + CHECK_EQ(out_types.size(), out_attrs->size()); + for (size_t i = 0; i < out_attrs->size(); ++i) { + op::type_assign(&(out_attrs->at(i)), out_types[i]); + } + + // assign to in_attrs + for (size_t i 
= 0; i < in_attrs->size(); ++i) { + const auto eid = g.entry_id(input_nids[i], 0); + TYPE_ASSIGN_CHECK(*in_attrs, i, types[eid]); + } + + bool inferred = true; + for (const auto& attr : *in_attrs) { + inferred = inferred && !op::type_is_none(attr); + } + for (const auto& attr : *out_attrs) { + inferred = inferred && !op::type_is_none(attr); + } + return inferred; +} + +template +std::pair, std::vector> FusedOp::GetAttrs(const std::string& attr_name, + const uint32_t node_id) { + const auto& g = this->symbol_.indexed_graph(); + const std::vector attrs = this->symbol_.GetAttr>(attr_name); + const auto& node = g[node_id]; + std::vector inputs, outputs; + for (const auto& e : node.inputs) { + inputs.emplace_back(attrs[g.entry_id(e)]); + } + outputs.resize(node.source->num_outputs()); + for (size_t i = 0; i < g.num_nodes(); ++i) { + if (i == node_id) continue; + const auto& other_node = g[i]; + for (const auto& e : other_node.inputs) { + if (e.node_id == node_id) { + outputs[e.index] = attrs[g.entry_id(e)]; + } + } + } + for (const auto& e : g.outputs()) { + if (e.node_id == node_id) { + outputs[e.index] = attrs[g.entry_id(e)]; + } + } + + return {inputs, outputs}; +} + +bool FusedOpInferShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + return op->InferShape(attrs, in_attrs, out_attrs); +} + +bool FusedOpInferType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + return op->InferType(attrs, in_attrs, out_attrs); +} + +void FusedOpProvideShape(const nnvm::NodeAttrs& attrs, + const std::vector> &in_attrs, + const std::vector> &out_attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + op->ProvideShape(in_attrs, out_attrs); +} + +void FusedOpProvideType(const nnvm::NodeAttrs& attrs, + const std::vector> &in_attrs, + const std::vector> &out_attrs) { + const FusedOpPtr& op = 
nnvm::get(attrs.parsed); + op->ProvideType(in_attrs, out_attrs); +} + +void FusedOpProvideStorageType(const nnvm::NodeAttrs& attrs, + const std::vector> &in_attrs, + const std::vector> &out_attrs) {} + +NNVM_REGISTER_OP(_FusedOp) +.set_attr("TIsFusion", true) .set_num_inputs([](const NodeAttrs& attrs) { const FusedOpPtr& op = nnvm::get(attrs.parsed); return op->num_inputs(); @@ -56,9 +206,64 @@ NNVM_REGISTER_OP(FusedOp) const FusedOpPtr& op = nnvm::get(attrs.parsed); return op->num_outputs(); }) +.set_attr("FProvideSubgraphShape", FusedOpProvideShape) +.set_attr("FProvideSubgraphType", FusedOpProvideType) +.set_attr("FProvideSubgraphStorageType", + FusedOpProvideStorageType) +.set_attr("FInferShape", FusedOpInferShape) +.set_attr("FInferType", FusedOpInferType) .set_attr_parser(FusedOpParamParser) .add_argument("data", "NDArray-or-Symbol[]", "Data"); +std::pair, std::vector> +FusedOpHelperShape(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAttrs("shape", node_id); +} + +std::pair, std::vector> +FusedOpHelperType(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAttrs("dtype", node_id); +} + +NNVM_REGISTER_OP(_FusedOpHelper) +.set_num_inputs(0) +.set_num_outputs(0) +.set_attr("TIsGhost", true) +.set_attr("TIsFusionHelper", true) +.set_attr("FAccessSubgraphShape", FusedOpHelperShape) +.set_attr("FAccessSubgraphType", FusedOpHelperType); + + +std::pair, std::vector> +FusedOpOutHelperShape(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAuxShape(node_id); +} + +std::pair, std::vector> +FusedOpOutHelperType(const NodeAttrs& attrs) { + const auto& p = nnvm::get(attrs.parsed); + const auto& op = p->op; + const auto& node_id = p->node_id; + return op->GetAuxType(node_id); +} + 
+NNVM_REGISTER_OP(_FusedOpOutHelper) +.set_num_inputs(0) +.set_num_outputs(0) +.set_attr("TIsGhost", true) +.set_attr("TIsFusionHelper", true) +.set_attr("FAccessSubgraphShape", FusedOpOutHelperShape) +.set_attr("FAccessSubgraphType", FusedOpOutHelperType); + } // namespace mxnet #endif // MXNET_USE_CUDA diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 07bcafb82341..c63552fbe4c4 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -57,16 +57,6 @@ inline std::string mshadowTypeToString(int type) { } // namespace detail -FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { - this->inputs_ = std::vector(config.num_inputs); - this->outputs_ = std::vector(config.num_outputs); - this->symbol_ = nnvm::Graph(); - this->symbol_.outputs = attrs->subgraphs[0]->outputs; - this->initialized_ = false; - this->cc_major_ = -1; - this->cc_minor_ = -1; -} - void FusedOp::GenerateCode(const std::vector &req) { const auto& g = this->symbol_.indexed_graph(); std::string code = ""; @@ -391,115 +381,6 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, &(args[0]), 0)); // arguments } -template <> -bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - this->symbol_.attrs.erase("shape"); - this->symbol_.attrs.erase("shape_inputs"); - std::vector input_shapes(*in_attrs); - this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), - std::move(input_shapes), - "__shape__"); - - const auto& g = this->symbol_.indexed_graph(); - const auto& input_nids = g.input_nodes(); - - std::vector out_shapes; - const std::vector shapes = this->symbol_.GetAttr("shape"); - for (auto& e : g.outputs()) { - out_shapes.push_back(shapes[g.entry_id(e)]); - } - CHECK_EQ(out_shapes.size(), out_attrs->size()); - for (size_t i = 0; i < out_attrs->size(); ++i) { - op::shape_assign(&(out_attrs->at(i)), out_shapes[i]); - } - - // assign to in_attrs - for (size_t i 
= 0; i < in_attrs->size(); ++i) { - const auto eid = g.entry_id(input_nids[i], 0); - SHAPE_ASSIGN_CHECK(*in_attrs, i, shapes[eid]); - } - - bool inferred = true; - for (const auto& attr : *in_attrs) { - inferred = inferred && !op::shape_is_none(attr); - } - for (const auto& attr : *out_attrs) { - inferred = inferred && !op::shape_is_none(attr); - } - return inferred; -} - -template <> -bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - this->symbol_.attrs.erase("dtype"); - this->symbol_.attrs.erase("dtype_inputs"); - std::vector input_types(*in_attrs); - this->symbol_ = mxnet::exec::InferType(std::move(this->symbol_), - std::move(input_types), - "__dtype__"); - - const auto& g = this->symbol_.indexed_graph(); - const auto& input_nids = g.input_nodes(); - - std::vector out_types; - const std::vector types = this->symbol_.GetAttr("dtype"); - for (auto& e : g.outputs()) { - out_types.push_back(types[g.entry_id(e)]); - } - CHECK_EQ(out_types.size(), out_attrs->size()); - for (size_t i = 0; i < out_attrs->size(); ++i) { - op::type_assign(&(out_attrs->at(i)), out_types[i]); - } - - // assign to in_attrs - for (size_t i = 0; i < in_attrs->size(); ++i) { - const auto eid = g.entry_id(input_nids[i], 0); - TYPE_ASSIGN_CHECK(*in_attrs, i, types[eid]); - } - - bool inferred = true; - for (const auto& attr : *in_attrs) { - inferred = inferred && !op::type_is_none(attr); - } - for (const auto& attr : *out_attrs) { - inferred = inferred && !op::type_is_none(attr); - } - return inferred; -} - -template -std::pair, std::vector> FusedOp::GetAttrs(const std::string& attr_name, - const uint32_t node_id) { - const auto& g = this->symbol_.indexed_graph(); - const std::vector attrs = this->symbol_.GetAttr>(attr_name); - const auto& node = g[node_id]; - std::vector inputs, outputs; - for (const auto& e : node.inputs) { - inputs.emplace_back(attrs[g.entry_id(e)]); - } - outputs.resize(node.source->num_outputs()); - for (size_t i = 
0; i < g.num_nodes(); ++i) { - if (i == node_id) continue; - const auto& other_node = g[i]; - for (const auto& e : other_node.inputs) { - if (e.node_id == node_id) { - outputs[e.index] = attrs[g.entry_id(e)]; - } - } - } - for (const auto& e : g.outputs()) { - if (e.node_id == node_id) { - outputs[e.index] = attrs[g.entry_id(e)]; - } - } - - return {inputs, outputs}; -} - void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, @@ -509,95 +390,7 @@ void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, op->Forward(attrs, ctx, inputs, req, outputs); } -bool FusedOpInferShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const FusedOpPtr& op = nnvm::get(attrs.parsed); - return op->InferShape(attrs, in_attrs, out_attrs); -} - -bool FusedOpInferType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const FusedOpPtr& op = nnvm::get(attrs.parsed); - return op->InferType(attrs, in_attrs, out_attrs); -} - -void FusedOpProvideShape(const nnvm::NodeAttrs& attrs, - const std::vector> &in_attrs, - const std::vector> &out_attrs) { - const FusedOpPtr& op = nnvm::get(attrs.parsed); - op->ProvideShape(in_attrs, out_attrs); -} - -void FusedOpProvideType(const nnvm::NodeAttrs& attrs, - const std::vector> &in_attrs, - const std::vector> &out_attrs) { - const FusedOpPtr& op = nnvm::get(attrs.parsed); - op->ProvideType(in_attrs, out_attrs); -} - -void FusedOpProvideStorageType(const nnvm::NodeAttrs& attrs, - const std::vector> &in_attrs, - const std::vector> &out_attrs) {} - - -NNVM_REGISTER_OP(FusedOp) -.set_attr("TIsFusion", true) -.set_attr("FProvideSubgraphShape", FusedOpProvideShape) -.set_attr("FProvideSubgraphType", FusedOpProvideType) -.set_attr("FProvideSubgraphStorageType", - FusedOpProvideStorageType) -.set_attr("FInferShape", FusedOpInferShape) -.set_attr("FInferType", FusedOpInferType) +NNVM_REGISTER_OP(_FusedOp) .set_attr("FCompute", FusedOpForwardGPU); 
-std::pair, std::vector> -FusedOpHelperShape(const NodeAttrs& attrs) { - const auto& p = nnvm::get(attrs.parsed); - const auto& op = p->op; - const auto& node_id = p->node_id; - return op->GetAttrs("shape", node_id); -} - -std::pair, std::vector> -FusedOpHelperType(const NodeAttrs& attrs) { - const auto& p = nnvm::get(attrs.parsed); - const auto& op = p->op; - const auto& node_id = p->node_id; - return op->GetAttrs("dtype", node_id); -} - -NNVM_REGISTER_OP(_FusedOpHelper) -.set_num_inputs(0) -.set_num_outputs(0) -.set_attr("TIsGhost", true) -.set_attr("TIsFusionHelper", true) -.set_attr("FAccessSubgraphShape", FusedOpHelperShape) -.set_attr("FAccessSubgraphType", FusedOpHelperType); - - -std::pair, std::vector> -FusedOpOutHelperShape(const NodeAttrs& attrs) { - const auto& p = nnvm::get(attrs.parsed); - const auto& op = p->op; - const auto& node_id = p->node_id; - return op->GetAuxShape(node_id); -} - -std::pair, std::vector> -FusedOpOutHelperType(const NodeAttrs& attrs) { - const auto& p = nnvm::get(attrs.parsed); - const auto& op = p->op; - const auto& node_id = p->node_id; - return op->GetAuxType(node_id); -} - -NNVM_REGISTER_OP(_FusedOpOutHelper) -.set_num_inputs(0) -.set_num_outputs(0) -.set_attr("TIsGhost", true) -.set_attr("TIsFusionHelper", true) -.set_attr("FAccessSubgraphShape", FusedOpOutHelperShape) -.set_attr("FAccessSubgraphType", FusedOpOutHelperType); } // namespace mxnet diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 6c490fd2fe7c..11b993184389 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -68,12 +68,10 @@ class FusedOp { const std::vector &req, const std::vector &outputs); - template bool InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs); - template bool InferType(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs); From 9617b033f730d4e138f83a99b7eb171ed5535a92 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: 
Fri, 14 Jun 2019 15:09:57 -0700 Subject: [PATCH 035/105] Going easy on Windows compiler --- src/operator/fusion/fused_op-inl.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index dfe3e39edcb5..1760fe5765a7 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -31,26 +31,21 @@ namespace mxnet { namespace detail { const char fp16_support_string[] = R"code( -#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) -#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) struct __align__(2) __half { __host__ __device__ __half() { } -protected: unsigned short __x; }; /* Definitions of intrinsics */ __device__ inline __half __float2half(const float f) { __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val.__x) : "f"(f)); return val; } __device__ inline float __half2float(const __half h) { float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); return val; } -#undef __HALF_TO_US -#undef __HALF_TO_CUS typedef __half half; )code"; From de9027be7434f2b7a2dd99ec6a3b70cdb49e5269 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Jun 2019 09:11:03 -0700 Subject: [PATCH 036/105] Disable fusion on Windows for now --- src/executor/graph_executor.cc | 2 +- src/imperative/cached_op.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 411ea6f3a921..9981505f60f0 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -983,7 +983,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, // setup gradient nnvm::Graph g = InitFullGraph(symbol, grad_req_types); -#if MXNET_USE_CUDA +#if MXNET_USE_CUDA && !defined(_WIN32) if (dmlc::GetEnv("MXNET_USE_FUSION", true) && 
default_ctx.dev_mask() == Context::kGPU) { g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); g = FusePointwiseForward(std::move(g)); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index f1fb6ae1e55f..a59866816180 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -143,7 +143,7 @@ void SetRefCounts(nnvm::Graph* fwd_graph, const nnvm::Graph& full_graph) { void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Graph * grad_graph, const Context& context, size_t num_forward_outputs, const bool inlining) { -#if MXNET_USE_CUDA +#if MXNET_USE_CUDA && !defined(_WIN32) if (dmlc::GetEnv("MXNET_USE_FUSION", true) && context.dev_mask() == kGPU && !inlining) { full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); From 3d2d71531eac0943b64182ff88f56e1b18eccfb8 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Jun 2019 13:32:03 -0700 Subject: [PATCH 037/105] Refactor InferAttr and InferShapeAttr --- src/executor/infer_graph_attr_pass.cc | 336 +++++++++++--------------- 1 file changed, 139 insertions(+), 197 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 9b0c967530e9..06834669ca47 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -63,6 +63,133 @@ bool ApplyOpInferAttr(const nnvm::Graph& g, return true; } +template +inline void GetAttrFromForwardNode(const uint32_t nid, + const nnvm::IndexedGraph &idx, + std::vector& rshape, + IsNone fis_none) { + const auto& inode = idx[nid]; + // gradient function, used to get node correspondence. + static auto& fgrad = + Op::GetAttr("FGradient"); + nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; + const nnvm::IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; + // use gradient function to find out the correspondence. 
+ std::vector ograd(fwd_ptr->num_outputs()); + for (size_t i = 0; i < ograd.size(); ++i) { + ograd[i].index = static_cast(i); + } + // input gradient list + const std::vector& igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); + const nnvm::Node* igrad_node = nullptr; + // Input gradient assignement + for (size_t i = 0; i < igrad.size(); ++i) { + if (igrad[i].node->op() == inode.source->op()) { + uint32_t eid = idx.entry_id(nid, igrad[i].index); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; + } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { + // Need to skip empty forward shape, because it may not be + // available now and it is possible to infer the forward + // shape in one of the next a few passes + CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + << "Backward shape inconsistent with the forward shape"; + } + if (igrad_node == nullptr) { + igrad_node = igrad[i].node.get(); + } else { + CHECK(igrad_node == igrad[i].node.get()); + } + } + } + // out grad entries + CHECK(igrad_node != nullptr) + << "Cannot find matching backward op for " << inode.source->attrs.name; + for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { + const nnvm::NodeEntry& e = igrad_node->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; + } + } + } +} + +template +void GetAttrFromFusedNode(uint32_t nid, + const nnvm::IndexedGraph& idx, + std::vector& rshape, + IsNone fis_none, + const std::string& infer_fusion_name) { + const auto& inode = idx[nid]; + nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; + static auto& finfer_fused_shape = + Op::GetAttr(infer_fusion_name); + auto finfer = finfer_fused_shape.get(fwd_ptr->op(), nullptr); + CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << + " is marked as Fusion but does not allow accessing attributes"; + const auto& 
inferred_attrs = finfer(fwd_ptr->attrs); + const auto& input_attrs = inferred_attrs.first; + const auto& output_attrs = inferred_attrs.second; + CHECK(input_attrs.size() == inode.source->op()->num_outputs) << + "Number of outputs of the gradient node " << inode.source->attrs.name << + " does not match the number of inputs of the corresponding forward node"; + // Set the attributes of output gradients + // using attributes of forward node inputs + for (size_t i = 0; i < input_attrs.size(); ++i) { + uint32_t eid = idx.entry_id(nid, i); + if (fis_none(rshape[eid])) { + rshape[eid] = input_attrs[i]; + } else if (!fis_none(input_attrs[i])) { + CHECK_EQ(rshape[eid], input_attrs[i]) + << "Backward shape inconsistent with the forward shape"; + } + } + // Set the attributes of input gradients + // using attributes of forward node outputs + for (size_t i = 0; i < output_attrs.size(); ++i) { + // We assume that the first inputs to the + // backward op are the output gradients + const auto& e = inode.source->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if (fis_none(rshape[eid])) { + rshape[eid] = output_attrs[i]; + } + } + } +} + +template +void ProvideAttrToFusion(const uint32_t nid, + const nnvm::IndexedGraph& idx, + std::vector& rshape, + const std::string& provide_fusion_name) { + const auto& inode = idx[nid]; + std::vector> in_attrs; + std::vector> out_attrs; + for (const auto& dep_node : inode.source->control_deps) { + in_attrs.push_back({}); + out_attrs.push_back({}); + auto &current_in_attrs = in_attrs.back(); + auto &current_out_attrs = out_attrs.back(); + uint32_t dep_node_id = idx.node_id(dep_node.get()); + for (const auto& e : idx[dep_node_id].inputs) { + current_in_attrs.push_back(rshape[idx.entry_id(e)]); + } + for (size_t i = 0; i < dep_node->num_outputs(); ++i) { + current_out_attrs.push_back(rshape[idx.entry_id(dep_node_id, i)]); + } + } + auto provide = + Op::GetAttr(provide_fusion_name).get(inode.source->op(), nullptr); + 
CHECK(provide != nullptr) << + "Encountered Fusion operator that does not implement providing subgraph attr " << + provide_fusion_name << "."; + provide(inode.source->attrs, in_attrs, out_attrs); +} + /*!\brief * This is a duplicate of the InferAttr function in nnvm with minor modification * to support inferring storage type whose function signature is different from @@ -118,9 +245,6 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, Op::GetAttr(infer_name); static auto& is_backward = Op::GetAttr("TIsBackward"); - // gradient function, used to get node correspondence. - static auto& fgrad = - Op::GetAttr("FGradient"); // reshape shape vector AttrVector rshape; // dispatch mode vector @@ -221,80 +345,11 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; - static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); - if (!is_fusion.get(fwd_ptr->op(), false)) { - const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; - // use gradient function to find out the correspondence. 
- std::vector ograd(fwd_ptr->num_outputs()); - for (size_t i = 0; i < ograd.size(); ++i) { - ograd[i].index = static_cast(i); - } - // input gradient list - auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); - const nnvm::Node* igrad_node = nullptr; - // Input gradient assignement - for (size_t i = 0; i < igrad.size(); ++i) { - if (igrad[i].node->op() == inode.source->op()) { - uint32_t eid = idx.entry_id(nid, igrad[i].index); - if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; - } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { - // Need to skip empty forward shape, because it may not be - // available now and it is possible to infer the forward - // shape in one of the next a few passes - CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) - << "Backward shape inconsistent with the forward shape"; - } - if (igrad_node == nullptr) { - igrad_node = igrad[i].node.get(); - } else { - CHECK(igrad_node == igrad[i].node.get()); - } - } - } - // out grad entries - CHECK(igrad_node != nullptr) - << "Cannot find matching backward op for " << inode.source->attrs.name; - for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { - const nnvm::NodeEntry& e = igrad_node->inputs[i]; - if (e.node == nullptr) { - uint32_t eid = idx.entry_id(inode.inputs[i]); - if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; - } - } - } + static auto& is_fusion_helper = Op::GetAttr("TIsFusionHelper"); + if (!is_fusion_helper.get(fwd_ptr->op(), false)) { + GetAttrFromForwardNode(nid, idx, rshape, fis_none); } else { - static auto& finfer_fused_shape = Op::GetAttr(infer_fusion_name); - auto finfer = finfer_fused_shape.get(fwd_ptr->op(), nullptr); - CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << - " is marked as Fusion but does not allow accessing attributes"; - const auto& inferred_attrs = finfer(fwd_ptr->attrs); - const auto& input_attrs = inferred_attrs.first; - const auto& 
output_attrs = inferred_attrs.second; - CHECK(input_attrs.size() == inode.source->op()->num_outputs) << - "Number of outputs of the gradient node " << inode.source->attrs.name << - " does not match the number of inputs of the corresponding forward node"; - for (size_t i = 0; i < input_attrs.size(); ++i) { - uint32_t eid = idx.entry_id(nid, i); - if (fis_none(rshape[eid])) { - rshape[eid] = input_attrs[i]; - } else if (!fis_none(input_attrs[i])) { - CHECK_EQ(rshape[eid], input_attrs[i]) - << "Backward shape inconsistent with the forward shape"; - } - } - for (size_t i = 0; i < output_attrs.size(); ++i) { - // We assume that the first inputs to the - // backward op are the output gradients - const auto& e = inode.source->inputs[i]; - if (e.node == nullptr) { - uint32_t eid = idx.entry_id(inode.inputs[i]); - if (fis_none(rshape[eid])) { - rshape[eid] = output_attrs[i]; - } - } - } + GetAttrFromFusedNode(nid, idx, rshape, fis_none, infer_fusion_name); } } else { DispatchMode* dispatch_mode = nullptr; @@ -321,28 +376,7 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, try { static auto& is_fusion = Op::GetAttr("TIsFusion"); if (is_fusion.get(inode.source->op(), false)) { - std::vector> in_attrs; - std::vector> out_attrs; - for (const auto& dep_node : inode.source->control_deps) { - in_attrs.push_back({}); - out_attrs.push_back({}); - auto &current_in_attrs = in_attrs.back(); - auto &current_out_attrs = out_attrs.back(); - uint32_t dep_node_id = idx.node_id(dep_node.get()); - for (const auto& e : idx[dep_node_id].inputs) { - current_in_attrs.push_back(rshape[idx.entry_id(e)]); - } - for (size_t i = 0; i < dep_node->num_outputs(); ++i) { - current_out_attrs.push_back(rshape[idx.entry_id(dep_node_id, i)]); - } - } - auto provide = - Op::GetAttr(provide_fusion_name).get(inode.source->op(), - nullptr); - CHECK(provide != nullptr) << - "Encountered Fusion operator that does not implement providing subgraph attr " << - provide_fusion_name << "."; - provide(inode.source->attrs, in_attrs, 
out_attrs); + ProvideAttrToFusion(nid, idx, rshape, provide_fusion_name); } forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, nid, &ishape, &oshape, dispatch_mode); @@ -458,9 +492,6 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, Op::GetAttr(infer_name); static auto& is_backward = Op::GetAttr("TIsBackward"); - // gradient function, used to get node correspondence. - static auto& fgrad = - Op::GetAttr("FGradient"); // reshape shape vector AttrVector rshape; // dispatch mode vector @@ -572,81 +603,12 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; - static auto& is_fusion = Op::GetAttr("TIsFusionHelper"); - if (!is_fusion.get(fwd_ptr->op(), false)) { - const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; - // use gradient function to find out the correspondence. - std::vector ograd(fwd_ptr->num_outputs()); - for (size_t i = 0; i < ograd.size(); ++i) { - ograd[i].index = static_cast(i); - } - // input gradient list - auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); - const nnvm::Node* igrad_node = nullptr; - // Input gradient assignement - for (size_t i = 0; i < igrad.size(); ++i) { - if (igrad[i].node->op() == inode.source->op()) { - uint32_t eid = idx.entry_id(nid, igrad[i].index); - if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; - } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { - // Need to skip empty forward shape, because it may not be - // available now and it is possible to infer the forward - // shape in one of the next a few passes - CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) - << "Backward shape inconsistent with the forward shape"; - } - if (igrad_node == nullptr) { - igrad_node = igrad[i].node.get(); - } else { - CHECK(igrad_node == igrad[i].node.get()); - } - } - } - // out grad entries - CHECK(igrad_node != nullptr) - << "Cannot find 
matching backward op for " << inode.source->attrs.name; - for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { - const nnvm::NodeEntry& e = igrad_node->inputs[i]; - if (e.node == nullptr) { - uint32_t eid = idx.entry_id(inode.inputs[i]); - if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; - } - } - } + static auto& is_fusion_helper = Op::GetAttr("TIsFusionHelper"); + if (!is_fusion_helper.get(fwd_ptr->op(), false)) { + GetAttrFromForwardNode(nid, idx, rshape, fis_none); } else { - static auto& finfer_fused_shape = - Op::GetAttr("FAccessSubgraphShape"); - auto finfer = finfer_fused_shape.get(fwd_ptr->op(), nullptr); - CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << - " is marked as Fusion but does not allow accessing attributes"; - const auto& inferred_attrs = finfer(fwd_ptr->attrs); - const auto& input_attrs = inferred_attrs.first; - const auto& output_attrs = inferred_attrs.second; - CHECK(input_attrs.size() == inode.source->op()->num_outputs) << - "Number of outputs of the gradient node " << inode.source->attrs.name << - " does not match the number of inputs of the corresponding forward node"; - for (size_t i = 0; i < input_attrs.size(); ++i) { - uint32_t eid = idx.entry_id(nid, i); - if (fis_none(rshape[eid])) { - rshape[eid] = input_attrs[i]; - } else if (!fis_none(input_attrs[i])) { - CHECK_EQ(rshape[eid], input_attrs[i]) - << "Backward shape inconsistent with the forward shape"; - } - } - for (size_t i = 0; i < output_attrs.size(); ++i) { - // We assume that the first inputs to the - // backward op are the output gradients - const auto& e = inode.source->inputs[i]; - if (e.node == nullptr) { - uint32_t eid = idx.entry_id(inode.inputs[i]); - if (fis_none(rshape[eid])) { - rshape[eid] = output_attrs[i]; - } - } - } + GetAttrFromFusedNode(nid, idx, rshape, fis_none, + "FAccessSubgraphShape"); } } else { DispatchMode* dispatch_mode = nullptr; @@ -683,28 +645,8 @@ nnvm::Graph 
InferShapeAttr(nnvm::Graph &&ret, try { static auto& is_fusion = Op::GetAttr("TIsFusion"); if (is_fusion.get(inode.source->op(), false)) { - std::vector> in_attrs; - std::vector> out_attrs; - for (const auto& dep_node : inode.source->control_deps) { - in_attrs.emplace_back(); - out_attrs.emplace_back(); - auto &current_in_attrs = in_attrs.back(); - auto &current_out_attrs = out_attrs.back(); - uint32_t dep_node_id = idx.node_id(dep_node.get()); - for (const auto& e : idx[dep_node_id].inputs) { - current_in_attrs.push_back(rshape[idx.entry_id(e)]); - } - for (size_t i = 0; i < dep_node->num_outputs(); ++i) { - current_out_attrs.push_back(rshape[idx.entry_id(dep_node_id, i)]); - } - } - auto provide = - Op::GetAttr("FProvideSubgraphShape").get( - inode.source->op(), - nullptr); - CHECK(provide != nullptr) << - "Encountered Fusion operator that does not implement providing subgraph shape."; - provide(inode.source->attrs, in_attrs, out_attrs); + ProvideAttrToFusion(nid, idx, rshape, + "FProvideSubgraphShape"); } forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, nid, &ishape, &oshape, dispatch_mode); From 52216772043077c435ce5747efe7ab4a609f2545 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Sun, 16 Jun 2019 20:55:40 -0700 Subject: [PATCH 038/105] Added slice and half2 support to FusedOp --- src/executor/pointwise_fusion_pass.cc | 2 + src/operator/fusion/fused_op-inl.h | 162 +++++++++++++++++----- src/operator/fusion/fused_op.cu | 186 ++++++++++++++++++++++++-- 3 files changed, 305 insertions(+), 45 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 7cf26fe964d4..dc7736211434 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -53,6 +53,8 @@ namespace { return true; if (fused_op_mimo_ops.count(op_name)) return true; + if (fused_op_slice_ops.count(op_name)) + return true; if (std::find(fused_op_variable_io_ops.begin(), fused_op_variable_io_ops.end(), 
op_name) != diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 1760fe5765a7..5ff22bd75742 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -31,24 +31,29 @@ namespace mxnet { namespace detail { const char fp16_support_string[] = R"code( -struct __align__(2) __half { - __host__ __device__ __half() { } - unsigned short __x; -}; -/* Definitions of intrinsics */ -__device__ inline __half __float2half(const float f) { - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val.__x) : "f"(f)); - return val; -} -__device__ inline float __half2float(const __half h) { - float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); - return val; -} -typedef __half half; +#include )code"; +// const char fp16_support_string[] = R"code( +// struct __align__(2) __half { +// __host__ __device__ __half() { } +// unsigned short __x; +// }; +// /* Definitions of intrinsics */ +// __device__ inline __half __float2half(const float f) { +// __half val; +// asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val.__x) : "f"(f)); +// return val; +// } +// __device__ inline float __half2float(const __half h) { +// float val; +// asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); +// return val; +// } +// typedef __half half; +// )code"; + + const char type_support_string[] = R"code( using float32 = float; using float64 = double; @@ -218,6 +223,10 @@ const std::map>> fused_op_mimo {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} }; +const std::map fused_op_slice_ops = { + {"slice_axis" , ""}, +}; + const std::vector fused_op_variable_io_ops = { "add_n", "_backward_Activation" @@ -235,34 +244,118 @@ struct LoadType { }; template -inline typename LoadType::Type load(const DType * input, int i) { - return input[i]; +inline typename LoadType::Type load(const DType input) { + return input; } template <> -inline float load(const half * input, int i) { - return __half2float(input[i]); +inline float 
load(const half input) { + return __half2float(input); +} + +template +inline DType1 store(const DType2 input) { + return input; +} + +template<> +inline half store(const float input) { + return __float2half(input); +} + + + +template +struct VectorConfig { + static const int N = 1; + using IndexType = DType; +}; + +struct VectorConfig { + static const int N = 2; + using IndexType = __half2; +}; + + +template +union VectorType { + typename VectorConfig::IndexType y; + DType x[VectorConfig::N]; + VectorType () {}; + VectorType (const VectorType& y2) { + y = y2.y; + } + VectorType (const typename VectorConfig::IndexType &y2) { + y = y2; + } +}; + + +template +struct Strides { + int x[ndim]; +}; + +template +inline Strides get_index(const Strides strides, int i) { + int idx = i; + Strides ref_index; + #pragma unroll + for (int dim = 0; dim < ndim; dim++) { + int stride = strides.x[dim]; + ref_index.x[dim] = idx / stride; + idx = idx % stride; + } + return ref_index; } + template -inline void store(const typename LoadType::Type value, int i, DType * output) { - output[i] = value; +inline VectorType load_index(const DType * input, int i) { + const auto* vector_input = reinterpret_cast::IndexType *>(input + i); + VectorType ret = {*vector_input}; + return ret; } -template <> -inline void store(const float value, int i, half * output) { - output[i] = __float2half(value); +template +inline VectorType load_slice(const DType * input, const Strides strides, int axis, int begin, Strides* ref_index) { + int idx[nvec]; + bool consecutive = true; + #pragma unroll + for (int j = 0; j < nvec; j++) { + idx[j] = 0; + #pragma unroll + for (int dim = 0; dim < ndim; dim++) { + idx[j] += ref_index[j].x[dim] * strides.x[dim]; + } + idx[j] += begin * strides.x[axis]; + if (j > 0 && (idx[j] != idx[j-1])) { + consecutive = false; + } + } + if (!consecutive) { + VectorType ret; + #pragma unroll + for (int j = 0; j < nvec; j++) { + ret.x[j] = *(input + idx[j]); + } + return ret; + } + return 
load_index(input, idx[0]); } + + template -inline void storeadd(const typename LoadType::Type value, int i, DType * output) { - output[i] += value; +inline void store_index(const VectorType value, int i, DType * output) { + auto vector_output = reinterpret_cast::IndexType *>(output); + vector_output[i] = value.y; } -template <> -inline void storeadd(const float value, int i, half * output) { - const auto previous = load(output, i); - output[i] = __float2half(value + previous); +template +inline void store_add_index(const VectorType value, int i, DType * output) { + auto vector_output = reinterpret_cast::IndexType *>(output); + vector_output[i] += value.y; } template @@ -725,6 +818,13 @@ inline DType backward_erfinv(const DType val, const DType grad) { const char fused_op_kernel_begin[] = R"code( const int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < N; i+= gridDim.x * blockDim.x) { + Strides ref_index[nvec]; + int offset = i*nvec; + #pragma unroll + for (int j = 0; j < nvec; j++) { + ref_index[j] = get_index(ref_strides, offset + j); + } + )code"; const char fused_op_kernel_end[] = R"code( diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index c63552fbe4c4..50f31f21825e 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -28,11 +28,40 @@ #include "../elemwise_op_common.h" #include "../../executor/exec_pass.h" #include "../../common/cuda_utils.h" +#include namespace mxnet { namespace detail { +std::string FindCUDAIncludePath() { +#if defined(_WIN32) + const std::string delimiter = "\\"; +#else + const std::string delimiter = "/"; +#endif + std::string cuda_include_path; + const char* cuda_path_env = std::getenv("CUDA_PATH"); + if (cuda_path_env != nullptr) { + cuda_include_path += cuda_path_env; + cuda_include_path += delimiter + "include"; + return cuda_include_path; + } + +#if defined(__linux__) + struct stat st; + cuda_include_path = "/usr/local/cuda/include"; + if 
(stat(cuda_include_path.c_str(), &st) == 0) { + return cuda_include_path; + } +#endif + LOG(FATAL) << "Cannot find cuda include path." + << "CUDA_PATH is not set or CUDA is not installed in the default installation path." + << "In other than linux, it is necessary to set CUDA_PATH."; + return cuda_include_path; +} + + inline std::string mshadowTypeToString(int type) { switch (type) { case mshadow::kFloat32: @@ -55,6 +84,29 @@ inline std::string mshadowTypeToString(int type) { return ""; } +inline int mshadowTypeToVectorLength(int type) { + switch (type) { + case mshadow::kFloat32: + return 1; + case mshadow::kFloat64: + return 1; + case mshadow::kFloat16: + return 2; + case mshadow::kUint8: + return 1; + case mshadow::kInt8: + return 1; + case mshadow::kInt32: + return 1; + case mshadow::kInt64: + return 1; + default: + LOG(FATAL) << "Unknown type enum " << type; + } + return 0; +} + + } // namespace detail void FusedOp::GenerateCode(const std::vector &req) { @@ -63,6 +115,7 @@ void FusedOp::GenerateCode(const std::vector &req) { int temp_name_counter = 0; using NodeEntry = nnvm::IndexedGraph::NodeEntry; std::map, std::string> variables; + std::map load_index; std::vector outputs(g.num_nodes()); @@ -75,15 +128,70 @@ void FusedOp::GenerateCode(const std::vector &req) { } } + for (size_t i = 0; i < g.num_nodes(); ++i) { + const auto& node = g[i]; + const auto* source = node.source; + if (source != nullptr) { + if (source->is_variable()) { + load_index[i] = 1; + } else { + std::string op_name = source->op()->name; + if (detail::fused_op_slice_ops.find(op_name) != detail::fused_op_slice_ops.end()) { + load_index[node.inputs[0].node_id] = 0; + } + } + } + } + for (size_t i = 0; i < g.num_nodes(); ++i) { + const auto& node = g[i]; + const auto* source = node.source; + if (source != nullptr) { + if (source->is_variable()) { + if (load_index[i]) { + const auto& var_name = source->attrs.name; + code += "const auto vec_" + var_name + " = load_index(" + var_name + ", 
offset);\n"; + variables[{i, 0}] = var_name; + } + CHECK_EQ(outputs[i], 1); + } else { + std::string op_name = source->op()->name; + if (detail::fused_op_slice_ops.find(op_name) != detail::fused_op_slice_ops.end()) { + int arg_id = node.inputs[0].node_id; + const auto& var_name = g[arg_id].source->attrs.name; + load_index[arg_id] = 0; + std::string begin = source->attrs.dict.at("begin"); + std::string axis = source->attrs.dict.at("axis"); + const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); + code += "const auto " + vec_name + " = load_slice(" + var_name + ", " + var_name + "_strides," + axis + "," + begin + ", &ref_index[0]);\n"; + CHECK_EQ(outputs[i], 1); + variables[{i, 0}] = vec_name; + continue; + } + } + } + } + + int counter = 0; + for (const auto& entry : g.outputs()) { + const auto var_name = "output" + std::to_string(counter); + code += "VectorType vec_output" + std::to_string(counter) + ";\n"; + ++counter; + } + + code += "for (int j = 0; j < nvec; j++ ) {\n"; + + for (size_t i = 0; i < g.num_nodes(); ++i) { const auto& node = g[i]; const auto* source = node.source; if (source != nullptr) { std::string var_name = "temp" + std::to_string(temp_name_counter++); if (source->is_variable()) { - code += "const auto " + var_name + " = load(" + source->attrs.name + ", i);\n"; - CHECK_EQ(outputs[i], 1); - variables[{i, 0}] = var_name; + if (load_index[i]) { + code += "const auto " + var_name + " = load(vec_" + variables[{i, 0}] + ".x[j]);\n"; + CHECK_EQ(outputs[i], 1); + variables[{i, 0}] = var_name; + } } else { std::string op_name = source->op()->name; if (detail::fused_op_binary_ops.find(op_name) != detail::fused_op_binary_ops.end()) { @@ -159,6 +267,13 @@ void FusedOp::GenerateCode(const std::vector &req) { continue; } + if (detail::fused_op_slice_ops.find(op_name) != detail::fused_op_slice_ops.end()) { + code += "const auto " + var_name + " = load(" + variables[{i, 0}] + ".x[j]);\n"; + variables[{i, 0}] = var_name; + continue; + } + + // 
Special cases with variable number // of inputs/outputs, listed in // detail::fused_op_variable_io_ops @@ -199,13 +314,28 @@ void FusedOp::GenerateCode(const std::vector &req) { } } - int counter = 0; + counter = 0; + for (const auto& entry : g.outputs()) { + const std::string& var = variables[{entry.node_id, entry.index}]; + const auto var_name = "output" + std::to_string(counter); + code += "vec_" + var_name + ".x[j] = store("+ var +");\n"; + ++counter; + } + + code += "}\n"; + + counter = 0; + + for (const auto& entry : g.outputs()) { const std::string& var = variables[{entry.node_id, entry.index}]; if (req[counter] == kWriteTo || req[counter] == kWriteInplace) { - code += "store(" + var + ", i, output" + std::to_string(counter) + ");\n"; + const auto var_name = "output" + std::to_string(counter); + code += "store_index(vec_" + var_name + ", i, " + var_name + ");\n"; } else if (req[counter] == kAddTo) { - code += "storeadd(" + var + ", i, output" + std::to_string(counter) + ");\n"; + const auto var_name = "output" + std::to_string(counter); + code += "store_add_index(vec_" + var_name + ", i, " + var_name + ");\n"; + //code += "store_add_index(" + var + ", i, output" + std::to_string(counter) + ");\n"; } else if (req[counter] == kNullOp) { // NULL req, do not do anything } else { @@ -229,6 +359,8 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector in_dtypes; std::vector out_dtypes; + int ndim = outputs[0].ndim(); + size_t nvec = detail::mshadowTypeToVectorLength(outputs[0].type_flag_); size_t counter = 0; for (const auto& blob : inputs) { @@ -261,33 +393,39 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, if (!initialized_) { this->GenerateCode(req); - LOG(INFO) << code_; std::string aux_code = ""; std::string kernel_params = ""; + std::string tensor_params = ""; nnvm::Symbol sym; sym.outputs = this->symbol_.outputs; const std::vector input_names = sym.ListInputNames(nnvm::Symbol::kAll); size_t num_params = in_dtypes.size() + 
out_dtypes.size(); size_t i = 0; + aux_code += "static const int ndim = " + std::to_string(ndim) + ";\n"; + aux_code += "static const int nvec = " + std::to_string(nvec) + ";\n"; + kernel_params += "const Strides ref_strides,"; for (const auto &type : in_dtypes) { std::string type_name = detail::mshadowTypeToString(type); aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - kernel_params += "DType" + std::to_string(i) + "* " +input_names[i]; + tensor_params += " DType" + std::to_string(i) + "* " +input_names[i]; + kernel_params += " const Strides " + input_names[i]+"_strides"; ++i; if (i < num_params) { + tensor_params += ", "; kernel_params += ", "; } } for (const auto &type : out_dtypes) { std::string type_name = detail::mshadowTypeToString(type); aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - kernel_params += "DType" + std::to_string(i) + "* output" + + tensor_params += "DType" + std::to_string(i) + "* output" + std::to_string(i - in_dtypes.size()); ++i; if (i < num_params) { - kernel_params += ", "; + tensor_params += ", "; } } + kernel_params += tensor_params; code_ = std::string(detail::fp16_support_string) + "\n" + detail::type_support_string + "\n" + detail::fused_op_function_definitions + "\n" + @@ -298,6 +436,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, code_ + "\n" + detail::fused_op_kernel_end; // Guard NVRTC calls + LOG(INFO) << code_; std::lock_guard lock_nvrtc(mutex_); nvrtcProgram program; NVRTC_CALL( @@ -310,15 +449,17 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::string gpu_arch = "--gpu-architecture=compute_" + std::to_string(this->cc_major_) + std::to_string(this->cc_minor_); + std::string cuda_include_path = "-I" + detail::FindCUDAIncludePath(); const char *opts[] = {gpu_arch.c_str(), "--std=c++11", - "-default-device"}; + "-default-device", + cuda_include_path.c_str()}; const std::string kernel_name_demangled = "FusedKernel_" + attrs.name; 
NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); nvrtcResult compileResult = nvrtcCompileProgram(program, // prog - 3, // numOptions + 4, // numOptions opts); // options // Obtain compilation log from the program. size_t logSize; @@ -342,7 +483,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, CUdevice cuDevice; CUcontext context; CUmodule module; - CUDA_CALL(cudaGetDevice(&device)) + CUDA_CALL(cudaGetDevice(&device)); CUDA_DRIVER_CALL(cuDeviceGet(&cuDevice, device)); CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cuDevice)); CUDA_DRIVER_CALL(cuModuleLoadData(&module, &ptx_[0])); @@ -354,14 +495,28 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, Stream* s = ctx.get_stream(); auto stream = Stream::GetStream(s); std::vector args; - size_t N = outputs[0].shape_.Size(); + size_t N = (outputs[0].shape_.Size() + nvec - 1)/nvec; args.push_back(&N); + std::vector ref_strides(ndim); + ref_strides[ndim-1] = 1; + for (int i = ndim-2; i >= 0; i--) { + ref_strides[i] = ref_strides[i+1] * outputs[0].shape_[i+1]; + } + args.push_back(ref_strides.data()); + unsigned int num_blocks = (N + FusedOp::NTHREADS - 1) / FusedOp::NTHREADS; std::vector ptrs; + std::vector> strides; for (const auto &data : inputs) { MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { Tensor tensor = data.FlatTo1D(s); ptrs.push_back(tensor.dptr_); + strides.push_back(std::vector(ndim)); + std::vector& tensor_strides = strides.back(); + tensor_strides[ndim-1] = 1; + for (int i = ndim-2; i >= 0; i--) { + tensor_strides[i] = tensor_strides[i+1] * data.shape_[i+1]; + } }); } for (const auto &data : outputs) { @@ -370,6 +525,9 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, ptrs.push_back(tensor.dptr_); }); } + for (auto &tensor_strides : strides) { + args.push_back(tensor_strides.data()); + } for (auto &ptr : ptrs) { args.push_back(reinterpret_cast(&ptr)); } From f3e4f7a401123faa3b8c4933bc83260d7142bf70 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Mon, 
17 Jun 2019 09:28:15 -0700 Subject: [PATCH 039/105] Fix lint errors --- src/operator/fusion/fused_op-inl.h | 1 - src/operator/fusion/fused_op.cu | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 5ff22bd75742..19f8f58433b1 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -52,7 +52,6 @@ const char fp16_support_string[] = R"code( // } // typedef __half half; // )code"; - const char type_support_string[] = R"code( using float32 = float; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 50f31f21825e..24be1c2e5a35 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -17,6 +17,7 @@ * under the License. */ +#include #include #include #include @@ -28,7 +29,6 @@ #include "../elemwise_op_common.h" #include "../../executor/exec_pass.h" #include "../../common/cuda_utils.h" -#include namespace mxnet { @@ -151,7 +151,7 @@ void FusedOp::GenerateCode(const std::vector &req) { const auto& var_name = source->attrs.name; code += "const auto vec_" + var_name + " = load_index(" + var_name + ", offset);\n"; variables[{i, 0}] = var_name; - } + } CHECK_EQ(outputs[i], 1); } else { std::string op_name = source->op()->name; @@ -162,7 +162,9 @@ void FusedOp::GenerateCode(const std::vector &req) { std::string begin = source->attrs.dict.at("begin"); std::string axis = source->attrs.dict.at("axis"); const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); - code += "const auto " + vec_name + " = load_slice(" + var_name + ", " + var_name + "_strides," + axis + "," + begin + ", &ref_index[0]);\n"; + code += "const auto " + vec_name + " = load_slice(" + \ + var_name + ", " + var_name + "_strides," + axis + "," + begin + \ + ", &ref_index[0]);\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = vec_name; continue; @@ -326,7 +328,6 @@ void FusedOp::GenerateCode(const std::vector 
&req) { counter = 0; - for (const auto& entry : g.outputs()) { const std::string& var = variables[{entry.node_id, entry.index}]; if (req[counter] == kWriteTo || req[counter] == kWriteInplace) { @@ -335,7 +336,6 @@ void FusedOp::GenerateCode(const std::vector &req) { } else if (req[counter] == kAddTo) { const auto var_name = "output" + std::to_string(counter); code += "store_add_index(vec_" + var_name + ", i, " + var_name + ");\n"; - //code += "store_add_index(" + var + ", i, output" + std::to_string(counter) + ");\n"; } else if (req[counter] == kNullOp) { // NULL req, do not do anything } else { @@ -436,7 +436,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, code_ + "\n" + detail::fused_op_kernel_end; // Guard NVRTC calls - LOG(INFO) << code_; + // LOG(INFO) << code_; std::lock_guard lock_nvrtc(mutex_); nvrtcProgram program; NVRTC_CALL( From 84822e151193b6363158c2b309d08752d5137ed8 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Tue, 18 Jun 2019 11:34:15 -0700 Subject: [PATCH 040/105] Added multiple types support for vector loading/storing --- src/operator/fusion/fused_op-inl.h | 71 ++++++++++++++++++------------ src/operator/fusion/fused_op.cu | 18 ++++---- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 19f8f58433b1..aedac48cfaee 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -232,6 +232,16 @@ const std::vector fused_op_variable_io_ops = { }; const char fused_op_function_definitions[] = R"code( + +template +struct remove_pointer; + +template +struct remove_pointer +{ + typedef U type; +}; + template struct LoadType { using Type = DType; @@ -253,43 +263,46 @@ inline float load(const half input) { } template -inline DType1 store(const DType2 input) { +inline DType1 store(const DType2 input, DType1* ref) { return input; } template<> -inline half store(const float input) { +inline half store(const float input, 
half* ref) { return __float2half(input); } -template +template struct VectorConfig { - static const int N = 1; - using IndexType = DType; + static_assert(size >= 4, "Error"); + using IndexType = float; }; -struct VectorConfig { - static const int N = 2; - using IndexType = __half2; +template <> +struct VectorConfig<8> { + using IndexType = double; }; +template <> +struct VectorConfig<16> { + using IndexType = double2; +}; -template +template union VectorType { - typename VectorConfig::IndexType y; - DType x[VectorConfig::N]; + typename VectorConfig::IndexType y; + DType x[nvec]; VectorType () {}; - VectorType (const VectorType& y2) { + VectorType (const VectorType& y2) { y = y2.y; } - VectorType (const typename VectorConfig::IndexType &y2) { + VectorType (const decltype(y) &y2) { y = y2; } }; - template struct Strides { int x[ndim]; @@ -309,15 +322,15 @@ inline Strides get_index(const Strides strides, int i) { } -template -inline VectorType load_index(const DType * input, int i) { - const auto* vector_input = reinterpret_cast::IndexType *>(input + i); - VectorType ret = {*vector_input}; +template +inline VectorType load_index(const DType * input, int i) { + const auto* vector_input = reinterpret_cast::IndexType *>(input + i); + VectorType ret = {*vector_input}; return ret; } -template -inline VectorType load_slice(const DType * input, const Strides strides, int axis, int begin, Strides* ref_index) { +template +inline VectorType load_slice(const DType * input, const Strides strides, int axis, int begin, Strides* ref_index) { int idx[nvec]; bool consecutive = true; #pragma unroll @@ -333,27 +346,27 @@ inline VectorType load_slice(const DType * input, const Strides str } } if (!consecutive) { - VectorType ret; + VectorType ret; #pragma unroll for (int j = 0; j < nvec; j++) { ret.x[j] = *(input + idx[j]); } return ret; - } - return load_index(input, idx[0]); + } + return load_index(input, idx[0]); } -template -inline void store_index(const VectorType value, int i, 
DType * output) { - auto vector_output = reinterpret_cast::IndexType *>(output); +template +inline void store_index(const VectorType value, int i, DType * output) { + auto vector_output = reinterpret_cast::IndexType *>(output); vector_output[i] = value.y; } -template -inline void store_add_index(const VectorType value, int i, DType * output) { - auto vector_output = reinterpret_cast::IndexType *>(output); +template +inline void store_add_index(const VectorType value, int i, DType * output) { + auto vector_output = reinterpret_cast::IndexType *>(output); vector_output[i] += value.y; } diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 24be1c2e5a35..3950b1c99c4a 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -93,9 +93,9 @@ inline int mshadowTypeToVectorLength(int type) { case mshadow::kFloat16: return 2; case mshadow::kUint8: - return 1; + return 4; case mshadow::kInt8: - return 1; + return 4; case mshadow::kInt32: return 1; case mshadow::kInt64: @@ -149,7 +149,7 @@ void FusedOp::GenerateCode(const std::vector &req) { if (source->is_variable()) { if (load_index[i]) { const auto& var_name = source->attrs.name; - code += "const auto vec_" + var_name + " = load_index(" + var_name + ", offset);\n"; + code += "const auto vec_" + var_name + " = load_index(" + var_name + ", offset);\n"; variables[{i, 0}] = var_name; } CHECK_EQ(outputs[i], 1); @@ -162,7 +162,7 @@ void FusedOp::GenerateCode(const std::vector &req) { std::string begin = source->attrs.dict.at("begin"); std::string axis = source->attrs.dict.at("axis"); const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); - code += "const auto " + vec_name + " = load_slice(" + \ + code += "const auto " + vec_name + " = load_slice(" + \ var_name + ", " + var_name + "_strides," + axis + "," + begin + \ ", &ref_index[0]);\n"; CHECK_EQ(outputs[i], 1); @@ -176,7 +176,7 @@ void FusedOp::GenerateCode(const std::vector &req) { int counter = 0; for 
(const auto& entry : g.outputs()) { const auto var_name = "output" + std::to_string(counter); - code += "VectorType vec_output" + std::to_string(counter) + ";\n"; + code += "VectorType::type, nvec> vec_output" + std::to_string(counter) + ";\n"; ++counter; } @@ -320,7 +320,8 @@ void FusedOp::GenerateCode(const std::vector &req) { for (const auto& entry : g.outputs()) { const std::string& var = variables[{entry.node_id, entry.index}]; const auto var_name = "output" + std::to_string(counter); - code += "vec_" + var_name + ".x[j] = store("+ var +");\n"; + //code += "vec_" + var_name + ".x[j] = store::type>("+ var +");\n"; + code += "vec_" + var_name + ".x[j] = store("+ var +", " + var_name + ");\n"; ++counter; } @@ -360,13 +361,14 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector in_dtypes; std::vector out_dtypes; int ndim = outputs[0].ndim(); - size_t nvec = detail::mshadowTypeToVectorLength(outputs[0].type_flag_); + int nvec = 1; size_t counter = 0; for (const auto& blob : inputs) { in_dtypes.push_back(blob.type_flag_); initialized_ = initialized_ && (blob.type_flag_ == inputs_[counter].dtype); inputs_[counter].dtype = blob.type_flag_; + nvec = max(nvec, detail::mshadowTypeToVectorLength(blob.type_flag_)); ++counter; } @@ -436,7 +438,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, code_ + "\n" + detail::fused_op_kernel_end; // Guard NVRTC calls - // LOG(INFO) << code_; + LOG(INFO) << code_; std::lock_guard lock_nvrtc(mutex_); nvrtcProgram program; NVRTC_CALL( From 28962587299bc1d591dd4450aac3da56a707a1ff Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Tue, 18 Jun 2019 17:30:31 -0400 Subject: [PATCH 041/105] add slice fusion when it's at the beginning of subgraphs --- src/executor/pointwise_fusion_pass.cc | 43 ++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index dc7736211434..eac27aa23fa9 100644 --- 
a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -54,7 +54,7 @@ namespace { if (fused_op_mimo_ops.count(op_name)) return true; if (fused_op_slice_ops.count(op_name)) - return true; + return false; if (std::find(fused_op_variable_io_ops.begin(), fused_op_variable_io_ops.end(), op_name) != @@ -63,6 +63,16 @@ namespace { return false; } + bool IsInputsOnlyCompatible(nnvm::Node* n) { + using namespace mxnet::detail; + if (n->op() == nullptr) + return false; + std::string op_name = n->op()->name; + if (fused_op_slice_ops.count(op_name)) + return true; + return false; + } + nnvm::NodePtr CreateSubgraphNode(const Graph& subgraph, size_t inputs_size) { nnvm::Symbol subgraph_sym; auto node = nnvm::Node::Create(); @@ -194,6 +204,36 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub return new_graph; } +template +void AddInputsOnlyCompatible(const Graph &g, + std::vector >& subsets, + IsCompatible is_compatible) { + std::unordered_map node2setidx; + size_t subgraphs_fullsize = 0; + for (auto& s : subsets) { + subgraphs_fullsize += s.size(); + } + node2setidx.reserve(subgraphs_fullsize); + for (size_t i = 0; i < subsets.size(); ++i) { + for (auto& n : subsets[i]) { + node2setidx.insert({n, i}); + } + } + std::vector > to_add(subsets.size()); + DFSVisit(g.outputs, [&is_compatible, &node2setidx, &subsets, &to_add](const nnvm::NodePtr& n) { + const auto& it = node2setidx.find(n.get()); + if (it != node2setidx.end()) { + for (auto& e : n->inputs) { + if (is_compatible(e.node.get())) + to_add[it->second].push_back(e.node.get()); + } + } + }); + for (size_t i = 0; i < subsets.size(); ++i) { + subsets[i].insert(to_add[i].begin(), to_add[i].end()); + } +} + Graph FusePointwiseForward(Graph &&g) { Graph ret; g.indexed_graph(); @@ -202,6 +242,7 @@ Graph FusePointwiseForward(Graph &&g) { fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), g.outputs.begin() + num_forward_outputs); auto subsets = GetCompatibleSubsets(fg, 
IsFusionCompatible); + AddInputsOnlyCompatible(fg, subsets, IsInputsOnlyCompatible); g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); ret.outputs = g.outputs; return ret; From eb0151ccc4a667217918df46d0a10ec6b17a20fe Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Tue, 18 Jun 2019 16:44:23 -0700 Subject: [PATCH 042/105] Removed constant ndim assumption in fused op --- src/operator/fusion/fused_op-inl.h | 44 ++++++++++++++---------------- src/operator/fusion/fused_op.cu | 22 ++++++--------- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index aedac48cfaee..7a3437ff24db 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -308,20 +308,6 @@ struct Strides { int x[ndim]; }; -template -inline Strides get_index(const Strides strides, int i) { - int idx = i; - Strides ref_index; - #pragma unroll - for (int dim = 0; dim < ndim; dim++) { - int stride = strides.x[dim]; - ref_index.x[dim] = idx / stride; - idx = idx % stride; - } - return ref_index; -} - - template inline VectorType load_index(const DType * input, int i) { const auto* vector_input = reinterpret_cast::IndexType *>(input + i); @@ -329,19 +315,36 @@ inline VectorType load_index(const DType * input, int i) { return ret; } -template -inline VectorType load_slice(const DType * input, const Strides strides, int axis, int begin, Strides* ref_index) { +template +inline VectorType load_slice(const DType * input, const Strides strides, int begin, int end, int offset) { int idx[nvec]; bool consecutive = true; + + Strides ref_strides; + if (axis > 0) { + int shape = strides.x[axis-1]/strides.x[axis]; + #pragma unroll + for (int dim = 0; dim < axis; dim++) { + ref_strides.x[dim] = (strides.x[dim] / shape) * (end-begin); + } + } + #pragma unroll + for (int dim = axis; dim < ndim; dim++) { + ref_strides.x[dim] = strides.x[dim]; + } + #pragma unroll for (int j = 0; j 
< nvec; j++) { idx[j] = 0; + int ref_idx = offset + j; #pragma unroll for (int dim = 0; dim < ndim; dim++) { - idx[j] += ref_index[j].x[dim] * strides.x[dim]; + int stride = ref_strides.x[dim]; + idx[j] += (ref_idx / stride) * strides.x[dim]; + ref_idx = ref_idx % stride; } idx[j] += begin * strides.x[axis]; - if (j > 0 && (idx[j] != idx[j-1])) { + if (j > 0 && (idx[j] != (idx[j-1] + 1))) { consecutive = false; } } @@ -830,12 +833,7 @@ inline DType backward_erfinv(const DType val, const DType grad) { const char fused_op_kernel_begin[] = R"code( const int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < N; i+= gridDim.x * blockDim.x) { - Strides ref_index[nvec]; int offset = i*nvec; - #pragma unroll - for (int j = 0; j < nvec; j++) { - ref_index[j] = get_index(ref_strides, offset + j); - } )code"; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 3950b1c99c4a..5338f1b56ad8 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -160,11 +160,12 @@ void FusedOp::GenerateCode(const std::vector &req) { const auto& var_name = g[arg_id].source->attrs.name; load_index[arg_id] = 0; std::string begin = source->attrs.dict.at("begin"); + std::string end = source->attrs.dict.at("end"); std::string axis = source->attrs.dict.at("axis"); const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); - code += "const auto " + vec_name + " = load_slice(" + \ - var_name + ", " + var_name + "_strides," + axis + "," + begin + \ - ", &ref_index[0]);\n"; + code += "const auto " + vec_name + " = load_slice(" + \ + var_name + ", " + var_name + "_strides," + begin + \ + "," + end + ", offset);\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = vec_name; continue; @@ -359,6 +360,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, CHECK_GE(outputs.size(), 1) << "There needs to be at least 1 output."; std::vector in_dtypes; + std::vector in_ndims; std::vector out_dtypes; int ndim = outputs[0].ndim(); int 
nvec = 1; @@ -366,6 +368,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, size_t counter = 0; for (const auto& blob : inputs) { in_dtypes.push_back(blob.type_flag_); + in_ndims.push_back(blob.ndim()); initialized_ = initialized_ && (blob.type_flag_ == inputs_[counter].dtype); inputs_[counter].dtype = blob.type_flag_; nvec = max(nvec, detail::mshadowTypeToVectorLength(blob.type_flag_)); @@ -403,14 +406,13 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, const std::vector input_names = sym.ListInputNames(nnvm::Symbol::kAll); size_t num_params = in_dtypes.size() + out_dtypes.size(); size_t i = 0; - aux_code += "static const int ndim = " + std::to_string(ndim) + ";\n"; aux_code += "static const int nvec = " + std::to_string(nvec) + ";\n"; - kernel_params += "const Strides ref_strides,"; for (const auto &type : in_dtypes) { std::string type_name = detail::mshadowTypeToString(type); aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; + aux_code = "static const int ndim" + std::to_string(i) + " = " + std::to_string(in_ndims[i]) + ";\n" + aux_code; tensor_params += " DType" + std::to_string(i) + "* " +input_names[i]; - kernel_params += " const Strides " + input_names[i]+"_strides"; + kernel_params += " const Strides " + input_names[i]+"_strides"; ++i; if (i < num_params) { tensor_params += ", "; @@ -438,7 +440,6 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, code_ + "\n" + detail::fused_op_kernel_end; // Guard NVRTC calls - LOG(INFO) << code_; std::lock_guard lock_nvrtc(mutex_); nvrtcProgram program; NVRTC_CALL( @@ -499,18 +500,13 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector args; size_t N = (outputs[0].shape_.Size() + nvec - 1)/nvec; args.push_back(&N); - std::vector ref_strides(ndim); - ref_strides[ndim-1] = 1; - for (int i = ndim-2; i >= 0; i--) { - ref_strides[i] = ref_strides[i+1] * outputs[0].shape_[i+1]; - } - args.push_back(ref_strides.data()); unsigned int num_blocks = (N + 
FusedOp::NTHREADS - 1) / FusedOp::NTHREADS; std::vector ptrs; std::vector> strides; for (const auto &data : inputs) { MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { + int ndim = data.ndim(); Tensor tensor = data.FlatTo1D(s); ptrs.push_back(tensor.dptr_); strides.push_back(std::vector(ndim)); From 935342fafa006497955f1c67c010d617532429e2 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Tue, 18 Jun 2019 16:54:53 -0700 Subject: [PATCH 043/105] Fix memory alignment issue in slice for FusedOp --- src/operator/fusion/fused_op-inl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 7a3437ff24db..d40a9768b8b1 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -318,7 +318,7 @@ inline VectorType load_index(const DType * input, int i) { template inline VectorType load_slice(const DType * input, const Strides strides, int begin, int end, int offset) { int idx[nvec]; - bool consecutive = true; + bool mem_aligned = true; Strides ref_strides; if (axis > 0) { @@ -345,10 +345,11 @@ inline VectorType load_slice(const DType * input, const Strides 0 && (idx[j] != (idx[j-1] + 1))) { - consecutive = false; + mem_aligned = false; } } - if (!consecutive) { + mem_aligned = mem_aligned && ((idx[0] % nvec) == 0); + if (!mem_aligned) { VectorType ret; #pragma unroll for (int j = 0; j < nvec; j++) { From ffa6c637cda547ae2baaf9c4618dcbe2fcd8ca11 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Tue, 18 Jun 2019 18:37:44 -0700 Subject: [PATCH 044/105] Fixes --- src/operator/fusion/fused_op-inl.h | 3 +++ src/operator/fusion/fused_op.cu | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index d40a9768b8b1..718360f9823d 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -323,6 +323,9 @@ inline 
VectorType load_slice(const DType * input, const Strides ref_strides; if (axis > 0) { int shape = strides.x[axis-1]/strides.x[axis]; + if (begin < 0) begin = shape - begin; + if (end < 0) begin = shape - begin; + if (end > shape) end = shape; #pragma unroll for (int dim = 0; dim < axis; dim++) { ref_strides.x[dim] = (strides.x[dim] / shape) * (end-begin); diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 5338f1b56ad8..e07d6a8e7b1d 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -161,6 +161,9 @@ void FusedOp::GenerateCode(const std::vector &req) { load_index[arg_id] = 0; std::string begin = source->attrs.dict.at("begin"); std::string end = source->attrs.dict.at("end"); + if (end == "None") { + end = "((1<<31)-1)"; + } std::string axis = source->attrs.dict.at("axis"); const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); code += "const auto " + vec_name + " = load_slice(" + \ @@ -380,6 +383,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, out_dtypes.push_back(blob.type_flag_); initialized_ = initialized_ && (blob.type_flag_ == outputs_[counter].dtype); outputs_[counter].dtype = blob.type_flag_; + nvec = max(nvec, detail::mshadowTypeToVectorLength(blob.type_flag_)); ++counter; } @@ -398,6 +402,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, if (!initialized_) { this->GenerateCode(req); + LOG(INFO) << code_; std::string aux_code = ""; std::string kernel_params = ""; std::string tensor_params = ""; @@ -409,10 +414,12 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, aux_code += "static const int nvec = " + std::to_string(nvec) + ";\n"; for (const auto &type : in_dtypes) { std::string type_name = detail::mshadowTypeToString(type); - aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - aux_code = "static const int ndim" + std::to_string(i) + " = " + std::to_string(in_ndims[i]) + ";\n" + aux_code; - tensor_params += " DType" + 
std::to_string(i) + "* " +input_names[i]; - kernel_params += " const Strides " + input_names[i]+"_strides"; + std::string dtype_var = "DType" + std::to_string(i); + std::string dim_var = "ndim" + std::to_string(i); + aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; + aux_code = "static const int " + dim_var + " = " + std::to_string(in_ndims[i]) + ";\n" + aux_code; + tensor_params += dtype_var + "* " +input_names[i]; + kernel_params += " const Strides<" + dim_var + "> " + input_names[i]+"_strides"; ++i; if (i < num_params) { tensor_params += ", "; @@ -421,8 +428,9 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, } for (const auto &type : out_dtypes) { std::string type_name = detail::mshadowTypeToString(type); - aux_code = "using DType" + std::to_string(i) + " = " + type_name + ";\n" + aux_code; - tensor_params += "DType" + std::to_string(i) + "* output" + + std::string dtype_var = "DType" + std::to_string(i); + aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; + tensor_params += dtype_var + "* output" + std::to_string(i - in_dtypes.size()); ++i; if (i < num_params) { From 803fd2a1dd390acef73402482bbf9c634807299f Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Wed, 19 Jun 2019 09:53:24 -0700 Subject: [PATCH 045/105] Fix lint errors --- src/executor/pointwise_fusion_pass.cc | 18 +++++++++--------- src/operator/fusion/fused_op.cu | 10 ++++++---- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index eac27aa23fa9..2bce9346fa64 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -206,21 +206,21 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub template void AddInputsOnlyCompatible(const Graph &g, - std::vector >& subsets, + std::vector >* subsets, IsCompatible is_compatible) { std::unordered_map node2setidx; size_t subgraphs_fullsize = 0; - for (auto& s : 
subsets) { + for (auto& s : *subsets) { subgraphs_fullsize += s.size(); } node2setidx.reserve(subgraphs_fullsize); - for (size_t i = 0; i < subsets.size(); ++i) { - for (auto& n : subsets[i]) { + for (size_t i = 0; i < subsets->size(); ++i) { + for (auto& n : (*subsets)[i]) { node2setidx.insert({n, i}); } } - std::vector > to_add(subsets.size()); - DFSVisit(g.outputs, [&is_compatible, &node2setidx, &subsets, &to_add](const nnvm::NodePtr& n) { + std::vector > to_add(subsets->size()); + DFSVisit(g.outputs, [&is_compatible, &node2setidx, subsets, &to_add](const nnvm::NodePtr& n) { const auto& it = node2setidx.find(n.get()); if (it != node2setidx.end()) { for (auto& e : n->inputs) { @@ -229,8 +229,8 @@ void AddInputsOnlyCompatible(const Graph &g, } } }); - for (size_t i = 0; i < subsets.size(); ++i) { - subsets[i].insert(to_add[i].begin(), to_add[i].end()); + for (size_t i = 0; i < subsets->size(); ++i) { + (*subsets)[i].insert(to_add[i].begin(), to_add[i].end()); } } @@ -242,7 +242,7 @@ Graph FusePointwiseForward(Graph &&g) { fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), g.outputs.begin() + num_forward_outputs); auto subsets = GetCompatibleSubsets(fg, IsFusionCompatible); - AddInputsOnlyCompatible(fg, subsets, IsInputsOnlyCompatible); + AddInputsOnlyCompatible(fg, &subsets, IsInputsOnlyCompatible); g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); ret.outputs = g.outputs; return ret; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index e07d6a8e7b1d..2d0f06ebadf2 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -149,7 +149,8 @@ void FusedOp::GenerateCode(const std::vector &req) { if (source->is_variable()) { if (load_index[i]) { const auto& var_name = source->attrs.name; - code += "const auto vec_" + var_name + " = load_index(" + var_name + ", offset);\n"; + code += "const auto vec_" + var_name + " = load_index(" + \ + var_name + ", offset);\n"; variables[{i, 0}] = 
var_name; } CHECK_EQ(outputs[i], 1); @@ -180,7 +181,8 @@ void FusedOp::GenerateCode(const std::vector &req) { int counter = 0; for (const auto& entry : g.outputs()) { const auto var_name = "output" + std::to_string(counter); - code += "VectorType::type, nvec> vec_output" + std::to_string(counter) + ";\n"; + code += "VectorType::type, nvec> vec_output" + std::to_string(counter) + ";\n"; ++counter; } @@ -324,7 +326,6 @@ void FusedOp::GenerateCode(const std::vector &req) { for (const auto& entry : g.outputs()) { const std::string& var = variables[{entry.node_id, entry.index}]; const auto var_name = "output" + std::to_string(counter); - //code += "vec_" + var_name + ".x[j] = store::type>("+ var +");\n"; code += "vec_" + var_name + ".x[j] = store("+ var +", " + var_name + ");\n"; ++counter; } @@ -417,7 +418,8 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::string dtype_var = "DType" + std::to_string(i); std::string dim_var = "ndim" + std::to_string(i); aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; - aux_code = "static const int " + dim_var + " = " + std::to_string(in_ndims[i]) + ";\n" + aux_code; + aux_code = "static const int " + dim_var + " = " + \ + std::to_string(in_ndims[i]) + ";\n" + aux_code; tensor_params += dtype_var + "* " +input_names[i]; kernel_params += " const Strides<" + dim_var + "> " + input_names[i]+"_strides"; ++i; From 3ed3aefbd8413847d4ffd15f5173069c652376c7 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Jun 2019 13:59:28 -0700 Subject: [PATCH 046/105] Do not include cuda_fp16.h --- src/operator/fusion/fused_op-inl.h | 36 +++++++++++++----------------- src/operator/fusion/fused_op.cu | 34 ++-------------------------- 2 files changed, 18 insertions(+), 52 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 718360f9823d..bd202b880b4e 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -31,28 +31,24 @@ namespace 
mxnet { namespace detail { const char fp16_support_string[] = R"code( -#include +struct __align__(2) __half { + __host__ __device__ __half() { } + unsigned short __x; +}; +/* Definitions of intrinsics */ +__device__ inline __half __float2half(const float f) { + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val.__x) : "f"(f)); + return val; +} +__device__ inline float __half2float(const __half h) { + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); + return val; +} +typedef __half half; )code"; -// const char fp16_support_string[] = R"code( -// struct __align__(2) __half { -// __host__ __device__ __half() { } -// unsigned short __x; -// }; -// /* Definitions of intrinsics */ -// __device__ inline __half __float2half(const float f) { -// __half val; -// asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val.__x) : "f"(f)); -// return val; -// } -// __device__ inline float __half2float(const __half h) { -// float val; -// asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); -// return val; -// } -// typedef __half half; -// )code"; - const char type_support_string[] = R"code( using float32 = float; using float64 = double; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 2d0f06ebadf2..eb41b6703a36 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -34,34 +34,6 @@ namespace mxnet { namespace detail { -std::string FindCUDAIncludePath() { -#if defined(_WIN32) - const std::string delimiter = "\\"; -#else - const std::string delimiter = "/"; -#endif - std::string cuda_include_path; - const char* cuda_path_env = std::getenv("CUDA_PATH"); - if (cuda_path_env != nullptr) { - cuda_include_path += cuda_path_env; - cuda_include_path += delimiter + "include"; - return cuda_include_path; - } - -#if defined(__linux__) - struct stat st; - cuda_include_path = "/usr/local/cuda/include"; - if (stat(cuda_include_path.c_str(), &st) == 0) { - return cuda_include_path; - } -#endif - LOG(FATAL) 
<< "Cannot find cuda include path." - << "CUDA_PATH is not set or CUDA is not installed in the default installation path." - << "In other than linux, it is necessary to set CUDA_PATH."; - return cuda_include_path; -} - - inline std::string mshadowTypeToString(int type) { switch (type) { case mshadow::kFloat32: @@ -462,17 +434,15 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::string gpu_arch = "--gpu-architecture=compute_" + std::to_string(this->cc_major_) + std::to_string(this->cc_minor_); - std::string cuda_include_path = "-I" + detail::FindCUDAIncludePath(); const char *opts[] = {gpu_arch.c_str(), "--std=c++11", - "-default-device", - cuda_include_path.c_str()}; + "-default-device"}; const std::string kernel_name_demangled = "FusedKernel_" + attrs.name; NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); nvrtcResult compileResult = nvrtcCompileProgram(program, // prog - 4, // numOptions + 3, // numOptions opts); // options // Obtain compilation log from the program. 
size_t logSize; From 84c2df5d24a1c820120ecae8b4da09d0829b7853 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Jun 2019 15:04:52 -0700 Subject: [PATCH 047/105] Refactor fused op op lists --- src/executor/pointwise_fusion_pass.cc | 8 +- src/operator/fusion/fused_op-inl.h | 315 +++++++++++++------------- src/operator/fusion/fused_op.cu | 87 ++----- 3 files changed, 184 insertions(+), 226 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 2bce9346fa64..dbbd88176767 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -45,13 +45,7 @@ namespace { if (n->op() == nullptr) return false; std::string op_name = n->op()->name; - if (fused_op_binary_ops.count(op_name)) - return true; - if (fused_op_unary_ops.count(op_name)) - return true; - if (fused_op_special_ops.count(op_name)) - return true; - if (fused_op_mimo_ops.count(op_name)) + if (fused_op_ops_desc.count(op_name)) return true; if (fused_op_slice_ops.count(op_name)) return false; diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index bd202b880b4e..c71c09b26bf5 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -59,163 +59,164 @@ using int32 = int; using int64 = long long; )code"; -const std::map fused_op_binary_ops = { - {"elemwise_add", "add"}, - {"_plus" , "add"}, - {"_Plus" , "add"}, - {"_add" , "add"}, - {"elemwise_sub", "sub"}, - {"_minus" , "sub"}, - {"_Minus" , "sub"}, - {"_sub" , "sub"}, - {"elemwise_mul", "mul"}, - {"_mul" , "mul"}, - {"_Mul" , "mul"}, - {"elemwise_div", "div"}, - {"_div" , "div"}, - {"_Div" , "div"}, - {"_Power" , "power"}, - {"_power" , "power"}, - {"_Maximum" , "max"}, - {"_maximum" , "max"}, - {"_Minimum" , "min"}, - {"_minimum" , "min"} -}; - -const std::map fused_op_unary_ops = { - {"amp_cast" , "identity"}, - {"relu" , "relu"}, - {"sigmoid" , "sigmoid"}, - {"softsign" , "softsign"}, - {"exp" , 
"exp"}, - {"expm1" , "expm1"}, - {"log" , "log"}, - {"log10" , "log10"}, - {"log2" , "log2"}, - {"log1p" , "log1p"}, - {"degrees" , "degrees"}, - {"radians" , "radians"}, - {"sin" , "sin"}, - {"cos" , "cos"}, - {"tan" , "tan"}, - {"arcsin" , "arcsin"}, - {"arccos" , "arccos"}, - {"arccos" , "arccos"}, - {"arctan" , "arctan"}, - {"sinh" , "sinh"}, - {"cosh" , "cosh"}, - {"tanh" , "tanh"}, - {"arcsinh" , "arcsinh"}, - {"arccosh" , "arccosh"}, - {"arctanh" , "arctanh"}, - {"sqrt" , "sqrt"}, - {"rsqrt" , "rsqrt"}, - {"cbrt" , "cbrt"}, - {"rcbrt" , "rcbrt"}, - {"square" , "square"}, - {"squeeze" , "identity"}, - {"zeros_like" , "zero"}, - {"ones_like" , "one"}, - {"flatten" , "identity"}, - {"Reshape" , "identity"}, - {"reshape" , "identity"}, - {"expand_dims" , "identity"}, - {"round" , "round"}, - {"rint" , "rint"}, - {"fix" , "fix"}, - {"floor" , "floor"}, - {"ceil" , "ceil"}, - {"trunc" , "trunc"}, - {"sign" , "sign"}, - {"reciprocal" , "reciprocal"}, - {"abs" , "abs"}, - {"gamma" , "gamma"}, - {"gammaln" , "gammaln"}, - {"erf" , "erf"}, - {"erfinv" , "erfinv"}, - {"_copy" , "identity"}, - {"_identity_with_attr_like_rhs" , "identity"} -}; - -const std::map> fused_op_special_ops = { - {"_plus_scalar", {"add(%, %)", "_0", "scalar"}}, - {"_PlusScalar", {"add(%, %)", "_0", "scalar"}}, - {"_minus_scalar", {"sub(%, %)", "_0", "scalar"}}, - {"_MinusScalar", {"sub(%, %)", "_0", "scalar"}}, - {"_rminus_scalar", {"(-sub(%, %))", "_0", "scalar"}}, - {"_RMinusScalar", {"(-sub(%, %))", "_0", "scalar"}}, - {"_mul_scalar", {"mul(%, %)", "_0", "scalar"}}, - {"_MulScalar", {"mul(%, %)", "_0", "scalar"}}, - {"_div_scalar", {"div(%, %)", "_0", "scalar"}}, - {"_DivScalar", {"div(%, %)", "_0", "scalar"}}, - {"_rdiv_scalar", {"rdiv(%, %)", "_0", "scalar"}}, - {"_power_scalar", {"power(%, %)", "_0", "scalar"}}, - {"_PowerScalar", {"power(%, %)", "_0", "scalar"}}, - {"_rpower_scalar", {"rpow(%, %)", "_0", "scalar"}}, - {"_RPowerScalar", {"rpow(%, %)", "_0", "scalar"}}, - {"_RDivScalar", 
{"rdiv(%, %)", "_0", "scalar"}}, - {"Cast", {"cast<%>(%)", "dtype", "_0"}}, - {"cast", {"cast<%>(%)", "dtype", "_0"}}, - {"Activation", {"%(%)", "act_type", "_0"}}, - {"clip", {"clip(%, %, %)", "_0", "a_min", "a_max"}}, - {"_zeros", {"zero<%>(0)", "dtype"}}, - {"_ones", {"one<%>(0)", "dtype"}}, - {"negative", {"(-%)", "_0"}}, - {"_hypot", {"hypot(%, %)", "_0", "_1"}}, - {"_hypot_scalar", {"hypot(%, %)", "_0", "scalar"}}, - {"_backward_relu", {"backward_relu(%, %)", "_1", "_0"}}, - {"_backward_sigmoid", {"backward_sigmoid(%, %)", "_1", "_0"}}, - {"_backward_expm1", {"backward_expm1(%, %)", "_1", "_0"}}, - {"_backward_log", {"backward_log(%, %)", "_1", "_0"}}, - {"_backward_log10", {"backward_log10(%, %)", "_1", "_0"}}, - {"_backward_log2", {"backward_log2(%, %)", "_1", "_0"}}, - {"_backward_log1p", {"backward_log1p(%, %)", "_1", "_0"}}, - {"_backward_sin", {"backward_sin(%, %)", "_1", "_0"}}, - {"_backward_cos", {"backward_cos(%, %)", "_1", "_0"}}, - {"_backward_tan", {"backward_tan(%, %)", "_1", "_0"}}, - {"_backward_arcsin", {"backward_arcsin(%, %)", "_1", "_0"}}, - {"_backward_arccos", {"backward_arccos(%, %)", "_1", "_0"}}, - {"_backward_arctan", {"backward_arctan(%, %)", "_1", "_0"}}, - {"_backward_sinh", {"backward_sinh(%, %)", "_1", "_0"}}, - {"_backward_cosh", {"backward_cosh(%, %)", "_1", "_0"}}, - {"_backward_tanh", {"backward_tanh(%, %)", "_1", "_0"}}, - {"_backward_arcsinh", {"backward_arcsinh(%, %)", "_1", "_0"}}, - {"_backward_arccosh", {"backward_arccosh(%, %)", "_1", "_0"}}, - {"_backward_arctanh", {"backward_arctanh(%, %)", "_1", "_0"}}, - {"_backward_sqrt", {"backward_sqrt(%, %)", "_1", "_0"}}, - {"_backward_rsqrt", {"backward_rsqrt(%, %)", "_1", "_0"}}, - {"_backward_cbrt", {"backward_cbrt(%, %)", "_1", "_0"}}, - {"_backward_rcbrt", {"backward_rcbrt(%, %)", "_1", "_0"}}, - {"_backward_square", {"backward_square(%, %)", "_1", "_0"}}, - {"_backward_div_scalar", {"(% / %)", "_0", "scalar"}}, - {"_backward_div_scalar", {"(% / %)", "_0", "scalar"}}, - 
{"_backward_rdiv_scalar", {"(-% * % / (% * %))", "_0", "scalar", "_1", "_1"}}, - {"_backward_hypot_scalar", {"(% * % / hypot(%, %))", "_0", "_1", "_1", "scalar"}}, - {"_backward_radians", {"radians(%)", "_0"}}, - {"_backward_erf", {"backward_erf(%, %)", "_1", "_0"}}, - {"_backward_erfinv", {"backward_erfinv(%, %)", "_1", "_0"}} +const std::map>> fused_op_ops_desc = { + {"elemwise_add" , {{"add(%, %)", "_0", "_1"}}}, + {"_plus" , {{"add(%, %)", "_0", "_1"}}}, + {"_Plus" , {{"add(%, %)", "_0", "_1"}}}, + {"_add" , {{"add(%, %)", "_0", "_1"}}}, + {"elemwise_sub" , {{"sub(%, %)", "_0", "_1"}}}, + {"_minus" , {{"sub(%, %)", "_0", "_1"}}}, + {"_Minus" , {{"sub(%, %)", "_0", "_1"}}}, + {"_sub" , {{"sub(%, %)", "_0", "_1"}}}, + {"elemwise_mul" , {{"mul(%, %)", "_0", "_1"}}}, + {"_mul" , {{"mul(%, %)", "_0", "_1"}}}, + {"_Mul" , {{"mul(%, %)", "_0", "_1"}}}, + {"elemwise_div" , {{"div(%, %)", "_0", "_1"}}}, + {"_div" , {{"div(%, %)", "_0", "_1"}}}, + {"_Div" , {{"div(%, %)", "_0", "_1"}}}, + {"_Power" , {{"power(%, %)", "_0", "_1"}}}, + {"_power" , {{"power(%, %)", "_0", "_1"}}}, + {"_Maximum" , {{"max(%, %)", "_0", "_1"}}}, + {"_maximum" , {{"max(%, %)", "_0", "_1"}}}, + {"_Minimum" , {{"min(%, %)", "_0", "_1"}}}, + {"_minimum" , {{"min(%, %)", "_0", "_1"}}}, + {"amp_cast" , {{"identity(%)", "_0"}}}, + {"relu" , {{"relu(%)", "_0"}}}, + {"sigmoid" , {{"sigmoid(%)", "_0"}}}, + {"softsign" , {{"softsign(%)", "_0"}}}, + {"exp" , {{"exp(%)", "_0"}}}, + {"expm1" , {{"expm1(%)", "_0"}}}, + {"log" , {{"log(%)", "_0"}}}, + {"log10" , {{"log10(%)", "_0"}}}, + {"log2" , {{"log2(%)", "_0"}}}, + {"log1p" , {{"log1p(%)", "_0"}}}, + {"degrees" , {{"degrees(%)", "_0"}}}, + {"radians" , {{"radians(%)", "_0"}}}, + {"sin" , {{"sin(%)", "_0"}}}, + {"cos" , {{"cos(%)", "_0"}}}, + {"tan" , {{"tan(%)", "_0"}}}, + {"arcsin" , {{"arcsin(%)", "_0"}}}, + {"arccos" , {{"arccos(%)", "_0"}}}, + {"arccos" , {{"arccos(%)", "_0"}}}, + {"arctan" , {{"arctan(%)", "_0"}}}, + {"sinh" , {{"sinh(%)", "_0"}}}, + 
{"cosh" , {{"cosh(%)", "_0"}}}, + {"tanh" , {{"tanh(%)", "_0"}}}, + {"arcsinh" , {{"arcsinh(%)", "_0"}}}, + {"arccosh" , {{"arccosh(%)", "_0"}}}, + {"arctanh" , {{"arctanh(%)", "_0"}}}, + {"sqrt" , {{"sqrt(%)", "_0"}}}, + {"rsqrt" , {{"rsqrt(%)", "_0"}}}, + {"cbrt" , {{"cbrt(%)", "_0"}}}, + {"rcbrt" , {{"rcbrt(%)", "_0"}}}, + {"square" , {{"square(%)", "_0"}}}, + {"squeeze" , {{"identity(%)", "_0"}}}, + {"zeros_like" , {{"zero(%)", "_0"}}}, + {"ones_like" , {{"one(%)", "_0"}}}, + {"flatten" , {{"identity(%)", "_0"}}}, + {"Reshape" , {{"identity(%)", "_0"}}}, + {"reshape" , {{"identity(%)", "_0"}}}, + {"expand_dims" , {{"identity(%)", "_0"}}}, + {"round" , {{"round(%)", "_0"}}}, + {"rint" , {{"rint(%)", "_0"}}}, + {"fix" , {{"fix(%)", "_0"}}}, + {"floor" , {{"floor(%)", "_0"}}}, + {"ceil" , {{"ceil(%)", "_0"}}}, + {"trunc" , {{"trunc(%)", "_0"}}}, + {"sign" , {{"sign(%)", "_0"}}}, + {"reciprocal" , {{"reciprocal(%)", "_0"}}}, + {"abs" , {{"abs(%)", "_0"}}}, + {"gamma" , {{"gamma(%)", "_0"}}}, + {"gammaln" , {{"gammaln(%)", "_0"}}}, + {"erf" , {{"erf(%)", "_0"}}}, + {"erfinv" , {{"erfinv(%)", "_0"}}}, + {"_copy" , {{"identity(%)", "_0"}}}, + {"_identity_with_attr_like_rhs" , {{"identity(%)", "_0"}}}, + {"_plus_scalar" , {{"add(%, %)", "_0", "scalar"}}}, + {"_PlusScalar" , {{"add(%, %)", "_0", "scalar"}}}, + {"_minus_scalar" , {{"sub(%, %)", "_0", "scalar"}}}, + {"_MinusScalar" , {{"sub(%, %)", "_0", "scalar"}}}, + {"_rminus_scalar" , {{"(-sub(%, %))", "_0", "scalar"}}}, + {"_RMinusScalar" , {{"(-sub(%, %))", "_0", "scalar"}}}, + {"_mul_scalar" , {{"mul(%, %)", "_0", "scalar"}}}, + {"_MulScalar" , {{"mul(%, %)", "_0", "scalar"}}}, + {"_div_scalar" , {{"div(%, %)", "_0", "scalar"}}}, + {"_DivScalar" , {{"div(%, %)", "_0", "scalar"}}}, + {"_rdiv_scalar" , {{"rdiv(%, %)", "_0", "scalar"}}}, + {"_power_scalar" , {{"power(%, %)", "_0", "scalar"}}}, + {"_PowerScalar" , {{"power(%, %)", "_0", "scalar"}}}, + {"_rpower_scalar" , {{"rpow(%, %)", "_0", "scalar"}}}, + 
{"_RPowerScalar" , {{"rpow(%, %)", "_0", "scalar"}}}, + {"_RDivScalar" , {{"rdiv(%, %)", "_0", "scalar"}}}, + {"Cast" , {{"cast<%>(%)", "dtype", "_0"}}}, + {"cast" , {{"cast<%>(%)", "dtype", "_0"}}}, + {"Activation" , {{"%(%)", "act_type", "_0"}}}, + {"clip" , {{"clip(%, %, %)", "_0", "a_min", "a_max"}}}, + {"_zeros" , {{"zero<%>(0)", "dtype"}}}, + {"_ones" , {{"one<%>(0)", "dtype"}}}, + {"negative" , {{"(-%)", "_0"}}}, + {"_hypot" , {{"hypot(%, %)", "_0", "_1"}}}, + {"_hypot_scalar" , {{"hypot(%, %)", "_0", "scalar"}}}, + {"_backward_relu" , {{"backward_relu(%, %)", "_1", "_0"}}}, + {"_backward_sigmoid" , {{"backward_sigmoid(%, %)", "_1", "_0"}}}, + {"_backward_expm1" , {{"backward_expm1(%, %)", "_1", "_0"}}}, + {"_backward_log" , {{"backward_log(%, %)", "_1", "_0"}}}, + {"_backward_log10" , {{"backward_log10(%, %)", "_1", "_0"}}}, + {"_backward_log2" , {{"backward_log2(%, %)", "_1", "_0"}}}, + {"_backward_log1p" , {{"backward_log1p(%, %)", "_1", "_0"}}}, + {"_backward_sin" , {{"backward_sin(%, %)", "_1", "_0"}}}, + {"_backward_cos" , {{"backward_cos(%, %)", "_1", "_0"}}}, + {"_backward_tan" , {{"backward_tan(%, %)", "_1", "_0"}}}, + {"_backward_arcsin" , {{"backward_arcsin(%, %)", "_1", "_0"}}}, + {"_backward_arccos" , {{"backward_arccos(%, %)", "_1", "_0"}}}, + {"_backward_arctan" , {{"backward_arctan(%, %)", "_1", "_0"}}}, + {"_backward_sinh" , {{"backward_sinh(%, %)", "_1", "_0"}}}, + {"_backward_cosh" , {{"backward_cosh(%, %)", "_1", "_0"}}}, + {"_backward_tanh" , {{"backward_tanh(%, %)", "_1", "_0"}}}, + {"_backward_arcsinh" , {{"backward_arcsinh(%, %)", "_1", "_0"}}}, + {"_backward_arccosh" , {{"backward_arccosh(%, %)", "_1", "_0"}}}, + {"_backward_arctanh" , {{"backward_arctanh(%, %)", "_1", "_0"}}}, + {"_backward_sqrt" , {{"backward_sqrt(%, %)", "_1", "_0"}}}, + {"_backward_rsqrt" , {{"backward_rsqrt(%, %)", "_1", "_0"}}}, + {"_backward_cbrt" , {{"backward_cbrt(%, %)", "_1", "_0"}}}, + {"_backward_rcbrt" , {{"backward_rcbrt(%, %)", "_1", "_0"}}}, + 
{"_backward_square" , {{"backward_square(%, %)", "_1", "_0"}}}, + {"_backward_div_scalar" , {{"(% / %)", "_0", "scalar"}}}, + {"_backward_div_scalar" , {{"(% / %)", "_0", "scalar"}}}, + {"_backward_rdiv_scalar" , {{"(-% * % / (% * %))", "_0", "scalar", "_1", "_1"}}}, + {"_backward_hypot_scalar" , {{"(% * % / hypot(%, %))", "_0", "_1", "_1", "scalar"}}}, + {"_backward_radians" , {{"radians(%)", "_0"}}}, + {"_backward_erf" , {{"backward_erf(%, %)", "_1", "_0"}}}, + {"_backward_erfinv" , {{"backward_erfinv(%, %)", "_1", "_0"}}}, + {"_backward_reciprocal" , {{"backward_reciprocal(%, %)", "_1", "_0"}}}, + {"_backward_abs" , {{"(% * sign(%))", "_0", "_1"}}}, + {"_backward_degrees" , {{"degrees(%)", "_0"}}}, + {"_backward_sign" , {{"zero(%)", "_0"}}}, + {"_backward_clip" , {{"backward_clip(%, %, %, %)", "_1", "_0", + "a_min", "a_max"}}}, + {"smooth_l1" , {{"smooth_l1(%, %)", "_0", "scalar"}}}, + {"_backward_smooth_l1" , {{"backward_smooth_l1(%, %, %)", "_1", "scalar", "_0"}}}, // TODO(ptredak): arange -}; - -// Multiple inputs/multiple outputs -const std::map>> fused_op_mimo_ops = { - {"_backward_sub", {{"(%)", "_0"}, - {"(-(%))", "_0"}}}, - {"_backward_mul", {{"(% * %)", "_0", "_2"}, - {"(% * %)", "_0", "_1"}}}, - {"_backward_mul_scalar", {{"(% * %)", "_0", "scalar"}}}, - {"_backward_div", {{"(% / %)", "_0", "_2"}, - {"(-% * % / (% * %))", "_0", "_1", "_2", "_2"}}}, - {"_backward_power", {{"(% * % * powf(%, % - 1))", "_0", "_2", "_1", "_2"}, - {"(% * powf(%, %) * logf(%))", "_0", "_1", "_2", "_1"}}}, - {"_backward_power_scalar", {{"(% * % * powf(%, % - 1))", "_0", "scalar", "_1", "scalar"}}}, - {"_backward_rpower_scalar", {{"(% * % * logf(%))", "_0", "_1", "scalar"}}}, - {"_backward_maximum", {{"((% >= %) ? % : 0)", "_1", "_2", "_0"}, - {"((% >= %) ? 0 : %)", "_1", "_2", "_0"}}}, - {"_backward_minimum", {{"((% <= %) ? % : 0)", "_1", "_2", "_0"}, - {"((% <= %) ? 
0 : %)", "_1", "_2", "_0"}}}, - {"_backward_hypot", {{"(% * % / hypot(%, %))", "_0", "_1", "_1", "_2"}, - {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} + // TODO(ptredak): LeakyRelu + // TODO(ptredak): mod and rmod + {"_backward_sub" , {{"(%)", "_0"}, + {"(-(%))", "_0"}}}, + {"_backward_mul" , {{"(% * %)", "_0", "_2"}, + {"(% * %)", "_0", "_1"}}}, + {"_backward_mul_scalar" , {{"(% * %)", "_0", "scalar"}}}, + {"_backward_div" , {{"(% / %)", "_0", "_2"}, + {"(-% * % / (% * %))", "_0", "_1", "_2", "_2"}}}, + {"_backward_power" , {{"(% * % * powf(%, % - 1))", "_0", "_2", "_1", "_2"}, + {"(% * powf(%, %) * logf(%))", "_0", "_1", "_2", "_1"}}}, + {"_backward_power_scalar" , {{"(% * % * powf(%, % - 1))", "_0", "scalar", "_1", + "scalar"}}}, + {"_backward_rpower_scalar" , {{"(% * % * logf(%))", "_0", "_1", "scalar"}}}, + {"_backward_maximum" , {{"((% >= %) ? % : 0)", "_1", "_2", "_0"}, + {"((% >= %) ? 0 : %)", "_1", "_2", "_0"}}}, + {"_backward_minimum" , {{"((% <= %) ? % : 0)", "_1", "_2", "_0"}, + {"((% <= %) ? 
0 : %)", "_1", "_2", "_0"}}}, + {"_backward_hypot" , {{"(% * % / hypot(%, %))", "_0", "_1", "_1", "_2"}, + {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} }; const std::map fused_op_slice_ops = { @@ -297,7 +298,7 @@ union VectorType { VectorType (const decltype(y) &y2) { y = y2; } -}; +}; template struct Strides { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index eb41b6703a36..4dab23c6042c 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -78,9 +78,30 @@ inline int mshadowTypeToVectorLength(int type) { return 0; } - } // namespace detail +std::string ParseOpDescription(const std::vector& op_desc, + const std::map, std::string>& variables, + const nnvm::IndexedGraph::Node& node) { + const auto* source = node.source; + std::string fmt = op_desc[0]; + for (size_t j = 1; j < op_desc.size(); ++j) { + const std::string& desc = op_desc[j]; + std::string sub; + if (desc[0] == '_') { + // Argument + const int arg_id = std::stoi(desc.substr(1)); + sub = variables.at({node.inputs[arg_id].node_id, node.inputs[arg_id].index}); + } else { + sub = source->attrs.dict.at(desc); + } + size_t pos = fmt.find("%"); + CHECK_NE(pos, std::string::npos); + fmt.replace(pos, 1, sub); + } + return fmt; +} + void FusedOp::GenerateCode(const std::vector &req) { const auto& g = this->symbol_.indexed_graph(); std::string code = ""; @@ -174,72 +195,14 @@ void FusedOp::GenerateCode(const std::vector &req) { } } else { std::string op_name = source->op()->name; - if (detail::fused_op_binary_ops.find(op_name) != detail::fused_op_binary_ops.end()) { - std::string op = detail::fused_op_binary_ops.at(op_name); - const auto& arg1 = variables[{node.inputs[0].node_id, node.inputs[0].index}]; - const auto& arg2 = variables[{node.inputs[1].node_id, node.inputs[1].index}]; - code += "const auto " + var_name + " = " + op + - "(" + arg1 + ", " + arg2 + ");\n"; - CHECK_EQ(outputs[i], 1); - variables[{i, 0}] = var_name; - continue; - } - - 
if (detail::fused_op_unary_ops.find(op_name) != detail::fused_op_unary_ops.end()) { - std::string op = detail::fused_op_unary_ops.at(op_name); - const auto& arg1 = variables[{node.inputs[0].node_id, node.inputs[0].index}]; - code += "const auto " + var_name + " = " + op + - "(" + arg1 + ");\n"; - CHECK_EQ(outputs[i], 1); - variables[{i, 0}] = var_name; - continue; - } - - if (detail::fused_op_special_ops.find(op_name) != detail::fused_op_special_ops.end()) { - const std::vector& op_desc = detail::fused_op_special_ops.at(op_name); - std::string fmt = op_desc[0]; - for (size_t j = 1; j < op_desc.size(); ++j) { - const std::string& desc = op_desc[j]; - std::string sub; - if (desc[0] == '_') { - // Argument - int arg_id = std::stoi(desc.substr(1)); - sub = variables[{node.inputs[arg_id].node_id, node.inputs[arg_id].index}]; - } else { - sub = source->attrs.dict.at(desc); - } - size_t pos = fmt.find("%"); - CHECK_NE(pos, std::string::npos); - fmt.replace(pos, 1, sub); - } - code += "const auto " + var_name + " = " + fmt + ";\n"; - CHECK_EQ(outputs[i], 1); - variables[{i, 0}] = var_name; - continue; - } - - if (detail::fused_op_mimo_ops.find(op_name) != detail::fused_op_mimo_ops.end()) { + if (detail::fused_op_ops_desc.find(op_name) != detail::fused_op_ops_desc.end()) { const std::vector>& op_descs = - detail::fused_op_mimo_ops.at(op_name); + detail::fused_op_ops_desc.at(op_name); CHECK_EQ(outputs[i], op_descs.size()); size_t count = 0; for (const auto& op_desc : op_descs) { var_name = "temp" + std::to_string(temp_name_counter++); - std::string fmt = op_desc[0]; - for (size_t j = 1; j < op_desc.size(); ++j) { - const std::string& desc = op_desc[j]; - std::string sub; - if (desc[0] == '_') { - // Argument - int arg_id = std::stoi(desc.substr(1)); - sub = variables[{node.inputs[arg_id].node_id, node.inputs[arg_id].index}]; - } else { - sub = source->attrs.dict.at(desc); - } - size_t pos = fmt.find("%"); - CHECK_NE(pos, std::string::npos); - fmt.replace(pos, 1, sub); - } + 
const std::string& fmt = ParseOpDescription(op_desc, variables, node); code += "const auto " + var_name + " = " + fmt + ";\n"; variables[{i, count}] = var_name; ++count; From 1d9436595113b9c56c533d2a37870c64f055be67 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Jun 2019 15:17:13 -0700 Subject: [PATCH 048/105] Make linter happy --- src/executor/infer_graph_attr_pass.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 06834669ca47..5b694ab617f8 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -66,8 +66,9 @@ bool ApplyOpInferAttr(const nnvm::Graph& g, template inline void GetAttrFromForwardNode(const uint32_t nid, const nnvm::IndexedGraph &idx, - std::vector& rshape, + std::vector* rshape_ptr, IsNone fis_none) { + std::vector& rshape = *rshape_ptr; const auto& inode = idx[nid]; // gradient function, used to get node correspondence. 
static auto& fgrad = @@ -119,9 +120,10 @@ inline void GetAttrFromForwardNode(const uint32_t nid, template void GetAttrFromFusedNode(uint32_t nid, const nnvm::IndexedGraph& idx, - std::vector& rshape, + std::vector* rshape_ptr, IsNone fis_none, const std::string& infer_fusion_name) { + std::vector& rshape = *rshape_ptr; const auto& inode = idx[nid]; nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; static auto& finfer_fused_shape = @@ -164,7 +166,7 @@ void GetAttrFromFusedNode(uint32_t nid, template void ProvideAttrToFusion(const uint32_t nid, const nnvm::IndexedGraph& idx, - std::vector& rshape, + const std::vector& rshape, const std::string& provide_fusion_name) { const auto& inode = idx[nid]; std::vector> in_attrs; @@ -347,9 +349,9 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, static auto& is_fusion_helper = Op::GetAttr("TIsFusionHelper"); if (!is_fusion_helper.get(fwd_ptr->op(), false)) { - GetAttrFromForwardNode(nid, idx, rshape, fis_none); + GetAttrFromForwardNode(nid, idx, &rshape, fis_none); } else { - GetAttrFromFusedNode(nid, idx, rshape, fis_none, infer_fusion_name); + GetAttrFromFusedNode(nid, idx, &rshape, fis_none, infer_fusion_name); } } else { DispatchMode* dispatch_mode = nullptr; @@ -605,9 +607,9 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, static auto& is_fusion_helper = Op::GetAttr("TIsFusionHelper"); if (!is_fusion_helper.get(fwd_ptr->op(), false)) { - GetAttrFromForwardNode(nid, idx, rshape, fis_none); + GetAttrFromForwardNode(nid, idx, &rshape, fis_none); } else { - GetAttrFromFusedNode(nid, idx, rshape, fis_none, + GetAttrFromFusedNode(nid, idx, &rshape, fis_none, "FAccessSubgraphShape"); } } else { From 844cb9f715f7846fbdabe0d4aa695c79b721ba14 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Jun 2019 15:55:09 -0700 Subject: [PATCH 049/105] Changes from review --- src/operator/fusion/fused_op.cu | 234 +++++++++++++++++--------------- src/operator/fusion/fused_op.h | 9 +- 2 files changed, 132 insertions(+), 111 deletions(-) 
diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 4dab23c6042c..0ad3aae3868a 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -102,7 +102,12 @@ std::string ParseOpDescription(const std::vector& op_desc, return fmt; } -void FusedOp::GenerateCode(const std::vector &req) { +void FusedOp::GenerateCode(const std::vector &req, + const std::vector &in_dtypes, + const std::vector &out_dtypes, + const std::vector &in_ndims, + const int nvec, + const std::string &kernel_name) { const auto& g = this->symbol_.indexed_graph(); std::string code = ""; int temp_name_counter = 0; @@ -171,7 +176,7 @@ void FusedOp::GenerateCode(const std::vector &req) { } } - int counter = 0; + size_t counter = 0; for (const auto& entry : g.outputs()) { const auto var_name = "output" + std::to_string(counter); code += "VectorType &req) { } this->code_ = code; + + // Add boilerplate and type information + LOG(INFO) << code_; + std::string kernel_params = ""; + std::string tensor_params = ""; + nnvm::Symbol sym; + sym.outputs = this->symbol_.outputs; + const std::vector input_names = sym.ListInputNames(nnvm::Symbol::kAll); + size_t num_params = in_dtypes.size() + out_dtypes.size(); + size_t i = 0; + std::string aux_code = "static const int nvec = " + std::to_string(nvec) + ";\n"; + for (const auto &type : in_dtypes) { + std::string type_name = detail::mshadowTypeToString(type); + std::string dtype_var = "DType" + std::to_string(i); + std::string dim_var = "ndim" + std::to_string(i); + aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; + aux_code = "static const int " + dim_var + " = " + \ + std::to_string(in_ndims[i]) + ";\n" + aux_code; + tensor_params += dtype_var + "* " +input_names[i]; + kernel_params += " const Strides<" + dim_var + "> " + input_names[i]+"_strides"; + ++i; + if (i < num_params) { + tensor_params += ", "; + kernel_params += ", "; + } + } + for (const auto &type : out_dtypes) { + std::string 
type_name = detail::mshadowTypeToString(type); + std::string dtype_var = "DType" + std::to_string(i); + aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; + tensor_params += dtype_var + "* output" + + std::to_string(i - in_dtypes.size()); + ++i; + if (i < num_params) { + tensor_params += ", "; + } + } + kernel_params += tensor_params; + code_ = std::string(detail::fp16_support_string) + "\n" + + detail::type_support_string + "\n" + + detail::fused_op_function_definitions + "\n" + + aux_code + "\n" + + "__global__ void FusedKernel_" + kernel_name + + "(size_t N, " + kernel_params + ") {\n" + + detail::fused_op_kernel_begin + "\n" + + code_ + "\n" + + detail::fused_op_kernel_end; +} + +void FusedOp::CompileCode(const std::string &kernel_name) { + // Guard NVRTC calls + std::lock_guard lock_nvrtc(mutex_); + nvrtcProgram program; + NVRTC_CALL( + nvrtcCreateProgram(&program, // prog + &code_[0], // buffer + (kernel_name + "_kernel.cu").c_str(), // name + 0, // numHeaders + NULL, // headers + NULL)); // includeNames + std::string gpu_arch = "--gpu-architecture=compute_" + + std::to_string(this->cc_major_) + + std::to_string(this->cc_minor_); + + const char *opts[] = {gpu_arch.c_str(), + "--std=c++11", + "-default-device"}; + const std::string kernel_name_demangled = "FusedKernel_" + kernel_name; + NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); + + nvrtcResult compileResult = nvrtcCompileProgram(program, // prog + 3, // numOptions + opts); // options + // Obtain compilation log from the program. + size_t logSize; + NVRTC_CALL(nvrtcGetProgramLogSize(program, &logSize)); + std::string log(logSize, '\0'); + NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); + CHECK_EQ(compileResult, NVRTC_SUCCESS) << "NVRTC Compilation failed.\n" << log; + // Obtain PTX from the program. 
+ size_t ptxSize; + NVRTC_CALL(nvrtcGetPTXSize(program, &ptxSize)); + ptx_.reserve(ptxSize); + NVRTC_CALL(nvrtcGetPTX(program, &ptx_[0])); + const char *name; + NVRTC_CALL(nvrtcGetLoweredName(program, + kernel_name_demangled.c_str(), + &name)); + kernel_name_ = name; + // Destroy the program. + NVRTC_CALL(nvrtcDestroyProgram(&program)); + int device; + CUdevice cuDevice; + CUcontext context; + CUmodule module; + CUDA_CALL(cudaGetDevice(&device)); + CUDA_DRIVER_CALL(cuDeviceGet(&cuDevice, device)); + CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cuDevice)); + CUDA_DRIVER_CALL(cuModuleLoadData(&module, &ptx_[0])); + CUDA_DRIVER_CALL(cuModuleGetFunction(&kernel_, + module, + kernel_name_.c_str())); +} + +bool FusedOp::CheckComputeCapability(const OpContext &ctx) { + const int dev_id = ctx.run_ctx.ctx.dev_id; + const int cc_major = ComputeCapabilityMajor(dev_id); + const int cc_minor = ComputeCapabilityMinor(dev_id); + + const bool ret = cc_major == this->cc_major_ && cc_minor == this->cc_minor_; + this->cc_major_ = cc_major; + this->cc_minor_ = cc_minor; + return ret; } template <> @@ -323,119 +441,15 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, ++counter; } - // Get compute capability of the current GPU - int dev_id = ctx.run_ctx.ctx.dev_id; - int cc_major = ComputeCapabilityMajor(dev_id); - int cc_minor = ComputeCapabilityMinor(dev_id); - - initialized_ = initialized_ && cc_major == this->cc_major_; - initialized_ = initialized_ && cc_minor == this->cc_minor_; - this->cc_major_ = cc_major; - this->cc_minor_ = cc_minor; + // Check and save compute capability of the current GPU + if (!CheckComputeCapability(ctx)) initialized_ = false; initialized_ = initialized_ && (req == saved_reqs_); saved_reqs_ = req; if (!initialized_) { - this->GenerateCode(req); - LOG(INFO) << code_; - std::string aux_code = ""; - std::string kernel_params = ""; - std::string tensor_params = ""; - nnvm::Symbol sym; - sym.outputs = this->symbol_.outputs; - const std::vector 
input_names = sym.ListInputNames(nnvm::Symbol::kAll); - size_t num_params = in_dtypes.size() + out_dtypes.size(); - size_t i = 0; - aux_code += "static const int nvec = " + std::to_string(nvec) + ";\n"; - for (const auto &type : in_dtypes) { - std::string type_name = detail::mshadowTypeToString(type); - std::string dtype_var = "DType" + std::to_string(i); - std::string dim_var = "ndim" + std::to_string(i); - aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; - aux_code = "static const int " + dim_var + " = " + \ - std::to_string(in_ndims[i]) + ";\n" + aux_code; - tensor_params += dtype_var + "* " +input_names[i]; - kernel_params += " const Strides<" + dim_var + "> " + input_names[i]+"_strides"; - ++i; - if (i < num_params) { - tensor_params += ", "; - kernel_params += ", "; - } - } - for (const auto &type : out_dtypes) { - std::string type_name = detail::mshadowTypeToString(type); - std::string dtype_var = "DType" + std::to_string(i); - aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; - tensor_params += dtype_var + "* output" + - std::to_string(i - in_dtypes.size()); - ++i; - if (i < num_params) { - tensor_params += ", "; - } - } - kernel_params += tensor_params; - code_ = std::string(detail::fp16_support_string) + "\n" + - detail::type_support_string + "\n" + - detail::fused_op_function_definitions + "\n" + - aux_code + "\n" + - "__global__ void FusedKernel_" + attrs.name + - "(size_t N, " + kernel_params + ") {\n" + - detail::fused_op_kernel_begin + "\n" + - code_ + "\n" + - detail::fused_op_kernel_end; - // Guard NVRTC calls - std::lock_guard lock_nvrtc(mutex_); - nvrtcProgram program; - NVRTC_CALL( - nvrtcCreateProgram(&program, // prog - &code_[0], // buffer - (attrs.name + "_kernel.cu").c_str(), // name - 0, // numHeaders - NULL, // headers - NULL)); // includeNames - std::string gpu_arch = "--gpu-architecture=compute_" + - std::to_string(this->cc_major_) + - std::to_string(this->cc_minor_); - - const char *opts[] = 
{gpu_arch.c_str(), - "--std=c++11", - "-default-device"}; - const std::string kernel_name_demangled = "FusedKernel_" + attrs.name; - NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); - - nvrtcResult compileResult = nvrtcCompileProgram(program, // prog - 3, // numOptions - opts); // options - // Obtain compilation log from the program. - size_t logSize; - NVRTC_CALL(nvrtcGetProgramLogSize(program, &logSize)); - std::string log(logSize, '\0'); - NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); - CHECK_EQ(compileResult, NVRTC_SUCCESS) << "NVRTC Compilation failed.\n" << log; - // Obtain PTX from the program. - size_t ptxSize; - NVRTC_CALL(nvrtcGetPTXSize(program, &ptxSize)); - ptx_.reserve(ptxSize); - NVRTC_CALL(nvrtcGetPTX(program, &ptx_[0])); - const char *name; - NVRTC_CALL(nvrtcGetLoweredName(program, - kernel_name_demangled.c_str(), - &name)); - kernel_name_ = name; - // Destroy the program. - NVRTC_CALL(nvrtcDestroyProgram(&program)); - int device; - CUdevice cuDevice; - CUcontext context; - CUmodule module; - CUDA_CALL(cudaGetDevice(&device)); - CUDA_DRIVER_CALL(cuDeviceGet(&cuDevice, device)); - CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cuDevice)); - CUDA_DRIVER_CALL(cuModuleLoadData(&module, &ptx_[0])); - CUDA_DRIVER_CALL(cuModuleGetFunction(&kernel_, - module, - kernel_name_.c_str())); + this->GenerateCode(req, in_dtypes, out_dtypes, in_ndims, nvec, attrs.name); + this->CompileCode(attrs.name); initialized_ = true; } Stream* s = ctx.get_stream(); diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 11b993184389..3c5bf5cc3cb6 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -102,7 +102,14 @@ class FusedOp { } private: - void GenerateCode(const std::vector &req); + void GenerateCode(const std::vector &req, + const std::vector &in_dtypes, + const std::vector &out_dtypes, + const std::vector &in_ndims, + const int nvec, + const std::string& kernel_name); + 
void CompileCode(const std::string &kernel_name); + bool CheckComputeCapability(const OpContext &ctx); std::vector inputs_; std::vector outputs_; From 204b127e4f0c29fb67b72701529ee20fb5295db1 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Jun 2019 20:17:27 -0700 Subject: [PATCH 050/105] Fixes after rebase --- src/operator/fusion/fused_op-inl.h | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index c71c09b26bf5..06e81c324cdd 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -783,6 +783,15 @@ inline DType clip(const DType val, const float a_min, const float a_max) { return max(min(val, a_max), a_min); } +template +inline DType backward_clip(const DType val, const DType grad, const float a_min, const float a_max) { + if (val > a_max || val < a_min) { + return 0; + } else { + return grad; + } +} + template inline DType sign(const DType val) { if (val < 0) return -1; @@ -794,6 +803,11 @@ inline DType reciprocal(const DType val) { return 1.0f / val; } +template +inline DType backward_reciprocal(const DType val, const DType grad) { + return -grad / (val * val); +} + template inline DType abs(const DType val) { return fabsf(val); @@ -829,6 +843,32 @@ inline DType backward_erfinv(const DType val, const DType grad) { return 0.5f * sqrt(pi) * exp(val * val) * grad; } +template +inline DType smooth_l1(const DType val, const DType scalar) { + const auto bsq = scalar * scalar; + const auto ibsq = 1.0f / bsq; + if (val > ibsq) { + return val - 0.5f * ibsq; + } else if (val < -ibsq) { + return -val - 0.5f * ibsq; + } else { + return 0.5f * val * val * bsq; + } +} + +template +inline DType backward_smooth_l1(const DType val, const DType scalar, const DType grad) { + auto bsq = scalar * scalar; + auto ibsq = 1.0f / bsq; + if (val > ibsq) { + return grad; + } else if (val < -ibsq) { + return -grad; + } else { + return bsq * 
val * grad; + } +} + )code"; const char fused_op_kernel_begin[] = R"code( From 56eb99d2e24cb7425281637eb55e08726d9b5ec1 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Wed, 19 Jun 2019 16:54:00 -0700 Subject: [PATCH 051/105] Expand FusedOp support for slice --- src/operator/fusion/fused_op-inl.h | 60 ++++++++++++----------- src/operator/fusion/fused_op.cu | 77 ++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 54 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 06e81c324cdd..4ea219fe7a03 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -221,6 +221,7 @@ const std::map>> fused_op_ops_ const std::map fused_op_slice_ops = { {"slice_axis" , ""}, + {"slice" , ""}, }; const std::vector fused_op_variable_io_ops = { @@ -230,14 +231,7 @@ const std::vector fused_op_variable_io_ops = { const char fused_op_function_definitions[] = R"code( -template -struct remove_pointer; - -template -struct remove_pointer -{ - typedef U type; -}; +#define INT_MAX (2147483647) template struct LoadType { @@ -301,9 +295,21 @@ union VectorType { }; template -struct Strides { +struct Shape { int x[ndim]; -}; + inline const int& operator [](const int i) const { + return x[i]; + } + inline int& operator [](const int i) { + return x[i]; + } + inline void set(const int def) { + #pragma unroll + for (int i = 0; i < ndim; i++) { + x[i] = def; + } + } + }; template inline VectorType load_index(const DType * input, int i) { @@ -312,25 +318,24 @@ inline VectorType load_index(const DType * input, int i) { return ret; } -template -inline VectorType load_slice(const DType * input, const Strides strides, int begin, int end, int offset) { +template +inline VectorType load_slice(const DType * input, const Shape shape, Shape begin, Shape end, int offset) { int idx[nvec]; bool mem_aligned = true; - Strides ref_strides; - if (axis > 0) { - int shape = strides.x[axis-1]/strides.x[axis]; - if (begin 
< 0) begin = shape - begin; - if (end < 0) begin = shape - begin; - if (end > shape) end = shape; - #pragma unroll - for (int dim = 0; dim < axis; dim++) { - ref_strides.x[dim] = (strides.x[dim] / shape) * (end-begin); - } - } + Shape ref_strides; + Shape strides; + ref_strides[ndim-1] = 1; + strides[ndim-1] = 1; #pragma unroll - for (int dim = axis; dim < ndim; dim++) { - ref_strides.x[dim] = strides.x[dim]; + for (int dim = ndim-1; dim >=0; dim--) { + if (begin[dim] < 0) begin[dim] = shape[dim] - begin[dim]; + if (end[dim] < 0) end[dim] = shape[dim] - end[dim]; + if (end[dim] > shape[dim]) end[dim] = shape[dim]; + if (dim > 0) { + ref_strides[dim-1] = ref_strides[dim] * (end[dim] - begin[dim]); + strides[dim-1] = strides[dim] * shape[dim]; + } } #pragma unroll @@ -339,11 +344,10 @@ inline VectorType load_slice(const DType * input, const Strides 0 && (idx[j] != (idx[j-1] + 1))) { mem_aligned = false; } diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 0ad3aae3868a..e612a138dbd5 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -78,6 +78,14 @@ inline int mshadowTypeToVectorLength(int type) { return 0; } +inline void replaceString(std::string *input, const std::string old, const std::string repl) { + int pos = 0; + while ((pos = input->find(old, pos)) != std::string::npos) { + input->replace(pos, old.size(), repl); + pos += repl.size(); + } +} + } // namespace detail std::string ParseOpDescription(const std::vector& op_desc, @@ -157,17 +165,35 @@ void FusedOp::GenerateCode(const std::vector &req, if (detail::fused_op_slice_ops.find(op_name) != detail::fused_op_slice_ops.end()) { int arg_id = node.inputs[0].node_id; const auto& var_name = g[arg_id].source->attrs.name; + const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); load_index[arg_id] = 0; - std::string begin = source->attrs.dict.at("begin"); - std::string end = source->attrs.dict.at("end"); - if (end == "None") { - end = 
"((1<<31)-1)"; + auto parse_tuple = [](const std::string& input, const std::string def) { + std::string out = input; + detail::replaceString(&out, "(", "{"); + detail::replaceString(&out, ")", "}"); + detail::replaceString(&out, "None", def); + return out; + }; + std::string begin = parse_tuple(source->attrs.dict.at("begin"), "0"); + std::string end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); + if (op_name == "slice") { + // step = parse_tuple(source->attrs.dict.at("step"), "1"); + } else if (op_name == "slice_axis") { + std::string axis = source->attrs.dict.at("axis"); + std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; + std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; + code += "Shape "+ begin_var_name + ";\n"; + code += "Shape "+ end_var_name + ";\n"; + code += begin_var_name + ".set(0);\n"; + code += end_var_name + ".set(INT_MAX);\n"; + code += begin_var_name + "["+axis+"] = " + begin + ";\n"; + code += end_var_name + "["+axis+"] = " + end + ";\n"; + begin = begin_var_name; + end = end_var_name; } - std::string axis = source->attrs.dict.at("axis"); - const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); - code += "const auto " + vec_name + " = load_slice(" + \ - var_name + ", " + var_name + "_strides," + begin + \ - "," + end + ", offset);\n"; + code += "const auto " + vec_name + " = load_slice(" + \ + var_name + ", " + var_name + "_shape," + begin + \ + "," + end + ", offset);\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = vec_name; continue; @@ -178,9 +204,9 @@ void FusedOp::GenerateCode(const std::vector &req, size_t counter = 0; for (const auto& entry : g.outputs()) { - const auto var_name = "output" + std::to_string(counter); - code += "VectorType::type, nvec> vec_output" + std::to_string(counter) + ";\n"; + std::string var_name = "output" + std::to_string(counter); + code += "VectorType vec_" + var_name + ";\n"; ++counter; } @@ -304,13 +330,13 @@ void FusedOp::GenerateCode(const 
std::vector &req, std::string aux_code = "static const int nvec = " + std::to_string(nvec) + ";\n"; for (const auto &type : in_dtypes) { std::string type_name = detail::mshadowTypeToString(type); - std::string dtype_var = "DType" + std::to_string(i); - std::string dim_var = "ndim" + std::to_string(i); + std::string dtype_var = "DType_" + input_names[i]; + std::string dim_var = "ndim_" + input_names[i]; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; aux_code = "static const int " + dim_var + " = " + \ std::to_string(in_ndims[i]) + ";\n" + aux_code; tensor_params += dtype_var + "* " +input_names[i]; - kernel_params += " const Strides<" + dim_var + "> " + input_names[i]+"_strides"; + kernel_params += " const Shape<" + dim_var + "> " + input_names[i]+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; @@ -319,10 +345,10 @@ void FusedOp::GenerateCode(const std::vector &req, } for (const auto &type : out_dtypes) { std::string type_name = detail::mshadowTypeToString(type); - std::string dtype_var = "DType" + std::to_string(i); + std::string out_name = "output" + std::to_string(i - in_dtypes.size()); + std::string dtype_var = "DType_" + out_name; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; - tensor_params += dtype_var + "* output" + - std::to_string(i - in_dtypes.size()); + tensor_params += dtype_var + "* " + out_name; ++i; if (i < num_params) { tensor_params += ", "; @@ -460,17 +486,16 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, unsigned int num_blocks = (N + FusedOp::NTHREADS - 1) / FusedOp::NTHREADS; std::vector ptrs; - std::vector> strides; + std::vector> shapes; for (const auto &data : inputs) { MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { int ndim = data.ndim(); Tensor tensor = data.FlatTo1D(s); ptrs.push_back(tensor.dptr_); - strides.push_back(std::vector(ndim)); - std::vector& tensor_strides = strides.back(); - tensor_strides[ndim-1] = 1; - for (int i = ndim-2; i >= 0; i--) { - 
tensor_strides[i] = tensor_strides[i+1] * data.shape_[i+1]; + shapes.push_back(std::vector(ndim)); + std::vector& tensor_shapes = shapes.back(); + for (int i = ndim-1; i >= 0; i--) { + tensor_shapes[i] = data.shape_[i]; } }); } @@ -480,8 +505,8 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, ptrs.push_back(tensor.dptr_); }); } - for (auto &tensor_strides : strides) { - args.push_back(tensor_strides.data()); + for (auto &tensor_shapes : shapes) { + args.push_back(tensor_shapes.data()); } for (auto &ptr : ptrs) { args.push_back(reinterpret_cast(&ptr)); From e31b5862aa03cb6864ecf24bc8c37a70456b5222 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 20 Jun 2019 09:32:49 -0700 Subject: [PATCH 052/105] Fix for fp16 _zeros and _ones --- src/operator/fusion/fused_op-inl.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 4ea219fe7a03..ff0b9f196bd3 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -152,8 +152,8 @@ const std::map>> fused_op_ops_ {"cast" , {{"cast<%>(%)", "dtype", "_0"}}}, {"Activation" , {{"%(%)", "act_type", "_0"}}}, {"clip" , {{"clip(%, %, %)", "_0", "a_min", "a_max"}}}, - {"_zeros" , {{"zero<%>(0)", "dtype"}}}, - {"_ones" , {{"one<%>(0)", "dtype"}}}, + {"_zeros" , {{"zero<%>()", "dtype"}}}, + {"_ones" , {{"one<%>()", "dtype"}}}, {"negative" , {{"(-%)", "_0"}}}, {"_hypot" , {{"hypot(%, %)", "_0", "_1"}}}, {"_hypot_scalar" , {{"hypot(%, %)", "_0", "scalar"}}}, @@ -745,11 +745,21 @@ inline typename LoadType::Type zero(const DType val) { return 0; } +template +inline typename LoadType::Type zero() { + return 0; +} + template inline typename LoadType::Type one(const DType val) { return 1; } +template +inline typename LoadType::Type one() { + return 1; +} + template inline DType round(const DType val) { return roundf(val); From c611b56658db3ccfe43f07f8a7289088c0c37d5f Mon Sep 17 00:00:00 2001 From: 
Przemek Tredak Date: Thu, 20 Jun 2019 10:23:39 -0700 Subject: [PATCH 053/105] Fix --- src/operator/fusion/fused_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index e612a138dbd5..71f9f73192f9 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -79,7 +79,7 @@ inline int mshadowTypeToVectorLength(int type) { } inline void replaceString(std::string *input, const std::string old, const std::string repl) { - int pos = 0; + size_t pos = 0; while ((pos = input->find(old, pos)) != std::string::npos) { input->replace(pos, old.size(), repl); pos += repl.size(); From d0d0fcf531103c81628aa8ffc6562b736ad15464 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 21 Jun 2019 09:06:41 -0700 Subject: [PATCH 054/105] Moving aux functions to unnamed namespace and detail namespace -> fusion namespace --- src/executor/pointwise_fusion_pass.cc | 16 +++++----- src/operator/fusion/fused_op-inl.h | 16 +++++----- src/operator/fusion/fused_op.cu | 42 +++++++++++++-------------- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index dbbd88176767..ac1bbd7e2a99 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -41,28 +41,28 @@ namespace mxnet { namespace exec { namespace { bool IsFusionCompatible(nnvm::Node* n) { - using namespace mxnet::detail; + using namespace mxnet::fusion; if (n->op() == nullptr) return false; std::string op_name = n->op()->name; - if (fused_op_ops_desc.count(op_name)) + if (ops_desc.count(op_name)) return true; - if (fused_op_slice_ops.count(op_name)) + if (slice_ops.count(op_name)) return false; - if (std::find(fused_op_variable_io_ops.begin(), - fused_op_variable_io_ops.end(), + if (std::find(variable_io_ops.begin(), + variable_io_ops.end(), op_name) != - fused_op_variable_io_ops.end()) + 
variable_io_ops.end()) return true; return false; } bool IsInputsOnlyCompatible(nnvm::Node* n) { - using namespace mxnet::detail; + using namespace mxnet::fusion; if (n->op() == nullptr) return false; std::string op_name = n->op()->name; - if (fused_op_slice_ops.count(op_name)) + if (slice_ops.count(op_name)) return true; return false; } diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index ff0b9f196bd3..b8a24458014b 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -28,7 +28,7 @@ namespace mxnet { -namespace detail { +namespace fusion { const char fp16_support_string[] = R"code( struct __align__(2) __half { @@ -59,7 +59,7 @@ using int32 = int; using int64 = long long; )code"; -const std::map>> fused_op_ops_desc = { +const std::map>> ops_desc = { {"elemwise_add" , {{"add(%, %)", "_0", "_1"}}}, {"_plus" , {{"add(%, %)", "_0", "_1"}}}, {"_Plus" , {{"add(%, %)", "_0", "_1"}}}, @@ -219,17 +219,17 @@ const std::map>> fused_op_ops_ {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} }; -const std::map fused_op_slice_ops = { +const std::map slice_ops = { {"slice_axis" , ""}, {"slice" , ""}, }; -const std::vector fused_op_variable_io_ops = { +const std::vector variable_io_ops = { "add_n", "_backward_Activation" }; -const char fused_op_function_definitions[] = R"code( +const char function_definitions[] = R"code( #define INT_MAX (2147483647) @@ -885,19 +885,19 @@ inline DType backward_smooth_l1(const DType val, const DType scalar, const DType )code"; -const char fused_op_kernel_begin[] = R"code( +const char kernel_begin[] = R"code( const int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < N; i+= gridDim.x * blockDim.x) { int offset = i*nvec; )code"; -const char fused_op_kernel_end[] = R"code( +const char kernel_end[] = R"code( } } )code"; -} // namespace detail +} // namespace fusion } // namespace mxnet diff --git a/src/operator/fusion/fused_op.cu 
b/src/operator/fusion/fused_op.cu index 71f9f73192f9..ecdc44965a93 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -32,7 +32,7 @@ namespace mxnet { -namespace detail { +namespace { inline std::string mshadowTypeToString(int type) { switch (type) { @@ -86,8 +86,6 @@ inline void replaceString(std::string *input, const std::string old, const std:: } } -} // namespace detail - std::string ParseOpDescription(const std::vector& op_desc, const std::map, std::string>& variables, const nnvm::IndexedGraph::Node& node) { @@ -110,6 +108,8 @@ std::string ParseOpDescription(const std::vector& op_desc, return fmt; } +} // namespace + void FusedOp::GenerateCode(const std::vector &req, const std::vector &in_dtypes, const std::vector &out_dtypes, @@ -142,7 +142,7 @@ void FusedOp::GenerateCode(const std::vector &req, load_index[i] = 1; } else { std::string op_name = source->op()->name; - if (detail::fused_op_slice_ops.find(op_name) != detail::fused_op_slice_ops.end()) { + if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { load_index[node.inputs[0].node_id] = 0; } } @@ -162,16 +162,16 @@ void FusedOp::GenerateCode(const std::vector &req, CHECK_EQ(outputs[i], 1); } else { std::string op_name = source->op()->name; - if (detail::fused_op_slice_ops.find(op_name) != detail::fused_op_slice_ops.end()) { + if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { int arg_id = node.inputs[0].node_id; const auto& var_name = g[arg_id].source->attrs.name; const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); load_index[arg_id] = 0; auto parse_tuple = [](const std::string& input, const std::string def) { std::string out = input; - detail::replaceString(&out, "(", "{"); - detail::replaceString(&out, ")", "}"); - detail::replaceString(&out, "None", def); + replaceString(&out, "(", "{"); + replaceString(&out, ")", "}"); + replaceString(&out, "None", def); return out; }; std::string begin = 
parse_tuple(source->attrs.dict.at("begin"), "0"); @@ -226,9 +226,9 @@ void FusedOp::GenerateCode(const std::vector &req, } } else { std::string op_name = source->op()->name; - if (detail::fused_op_ops_desc.find(op_name) != detail::fused_op_ops_desc.end()) { + if (fusion::ops_desc.find(op_name) != fusion::ops_desc.end()) { const std::vector>& op_descs = - detail::fused_op_ops_desc.at(op_name); + fusion::ops_desc.at(op_name); CHECK_EQ(outputs[i], op_descs.size()); size_t count = 0; for (const auto& op_desc : op_descs) { @@ -241,7 +241,7 @@ void FusedOp::GenerateCode(const std::vector &req, continue; } - if (detail::fused_op_slice_ops.find(op_name) != detail::fused_op_slice_ops.end()) { + if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { code += "const auto " + var_name + " = load(" + variables[{i, 0}] + ".x[j]);\n"; variables[{i, 0}] = var_name; continue; @@ -250,7 +250,7 @@ void FusedOp::GenerateCode(const std::vector &req, // Special cases with variable number // of inputs/outputs, listed in - // detail::fused_op_variable_io_ops + // fusion::variable_io_ops if (op_name == "add_n") { CHECK_EQ(outputs[i], 1); const auto& arg = variables[{node.inputs[0].node_id, node.inputs[0].index}]; @@ -329,7 +329,7 @@ void FusedOp::GenerateCode(const std::vector &req, size_t i = 0; std::string aux_code = "static const int nvec = " + std::to_string(nvec) + ";\n"; for (const auto &type : in_dtypes) { - std::string type_name = detail::mshadowTypeToString(type); + std::string type_name = mshadowTypeToString(type); std::string dtype_var = "DType_" + input_names[i]; std::string dim_var = "ndim_" + input_names[i]; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; @@ -344,7 +344,7 @@ void FusedOp::GenerateCode(const std::vector &req, } } for (const auto &type : out_dtypes) { - std::string type_name = detail::mshadowTypeToString(type); + std::string type_name = mshadowTypeToString(type); std::string out_name = "output" + std::to_string(i - 
in_dtypes.size()); std::string dtype_var = "DType_" + out_name; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; @@ -355,15 +355,15 @@ void FusedOp::GenerateCode(const std::vector &req, } } kernel_params += tensor_params; - code_ = std::string(detail::fp16_support_string) + "\n" + - detail::type_support_string + "\n" + - detail::fused_op_function_definitions + "\n" + + code_ = std::string(fusion::fp16_support_string) + "\n" + + fusion::type_support_string + "\n" + + fusion::function_definitions + "\n" + aux_code + "\n" + "__global__ void FusedKernel_" + kernel_name + "(size_t N, " + kernel_params + ") {\n" + - detail::fused_op_kernel_begin + "\n" + + fusion::kernel_begin + "\n" + code_ + "\n" + - detail::fused_op_kernel_end; + fusion::kernel_end; } void FusedOp::CompileCode(const std::string &kernel_name) { @@ -454,7 +454,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, in_ndims.push_back(blob.ndim()); initialized_ = initialized_ && (blob.type_flag_ == inputs_[counter].dtype); inputs_[counter].dtype = blob.type_flag_; - nvec = max(nvec, detail::mshadowTypeToVectorLength(blob.type_flag_)); + nvec = max(nvec, mshadowTypeToVectorLength(blob.type_flag_)); ++counter; } @@ -463,7 +463,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, out_dtypes.push_back(blob.type_flag_); initialized_ = initialized_ && (blob.type_flag_ == outputs_[counter].dtype); outputs_[counter].dtype = blob.type_flag_; - nvec = max(nvec, detail::mshadowTypeToVectorLength(blob.type_flag_)); + nvec = max(nvec, mshadowTypeToVectorLength(blob.type_flag_)); ++counter; } From 7f12eac80f218fefa080b127afe367e3ca3cf989 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 25 Jun 2019 08:58:37 -0700 Subject: [PATCH 055/105] Disabling fusion if it alters topological order of inputs --- src/common/exec_utils.h | 45 ++++++++++++++++++++++++++++++++++ src/executor/graph_executor.cc | 36 ++++++++++++++++++++++++--- src/imperative/cached_op.cc | 40 
+++++++++++++++++++++++++----- src/imperative/imperative.cc | 4 ++- 4 files changed, 114 insertions(+), 11 deletions(-) diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 0551b429f17e..48d999828d96 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -622,6 +622,51 @@ inline nnvm::Graph AssignContext(nnvm::Graph g, return g; } +inline void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables) { + using nnvm::Node; + using nnvm::NodePtr; + using nnvm::NodeEntry; + std::unordered_map old_new; + // use DFSVisit to copy all the nodes + DFSVisit(src.outputs, [&old_new, copy_variables](const NodePtr& node) { + NodePtr np; + if (copy_variables || !node->is_variable()) { + np = Node::Create(); + np->attrs = node->attrs; + } else { + np = node; + } + old_new[node.get()] = std::move(np); + }); + // connect nodes of new graph + for (const auto &kv : old_new) { + for (const NodeEntry& e : kv.first->inputs) { + Node *ptr = e.node.get(); + kv.second->inputs.emplace_back(NodeEntry{old_new[ptr], e.index, e.version}); + } + for (const NodePtr& p : kv.first->control_deps) { + kv.second->control_deps.emplace_back(old_new[p.get()]); + } + } + // set the head + for (const NodeEntry &e : src.outputs) { + (*dst).outputs.emplace_back(NodeEntry{old_new[e.node.get()], e.index, e.version}); + } +} + +inline bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx) { + std::set names; + for (const auto& nid : idx.input_nodes()) { + const std::string &name = idx[nid].source->attrs.name; + if (names.count(name)) { + LOG(WARNING) << "Variable name " << name << " is used more than once!"; + return false; + } + names.insert(name); + } + return true; +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_EXEC_UTILS_H_ diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 9981505f60f0..7dda560e156b 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -26,6 +26,7 @@ 
#include #include #include +#include #include #include "./exec_pass.h" @@ -325,6 +326,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, if (!need_grad_) return g; for (size_t i = 0; i < g.outputs.size(); ++i) { NodeEntry ngrad(nnvm::Node::Create(), 0, 0); + ngrad.node->attrs.name = "_head_grad_" + std::to_string(i); head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i])); head_grad_map_[ngrad.node.get()] = i; } @@ -966,6 +968,7 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping, this); return exec; } + /*! * \brief This function is triggered by both simple_bind * and bind flows. @@ -985,10 +988,35 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, #if MXNET_USE_CUDA && !defined(_WIN32) if (dmlc::GetEnv("MXNET_USE_FUSION", true) && default_ctx.dev_mask() == Context::kGPU) { - g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); - g = FusePointwiseForward(std::move(g)); - g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); - g = FusePointwiseBackward(std::move(g)); + nnvm::Graph unoptimized_graph; + common::CopyGraph(&unoptimized_graph, g, false); + + if (common::CheckForInputNameDuplicates(unoptimized_graph.indexed_graph())) { + g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); + g = FusePointwiseForward(std::move(g)); + g.attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs_); + g = FusePointwiseBackward(std::move(g)); + // Check the topological order of inputs + const auto &original_inputs = unoptimized_graph.indexed_graph().input_nodes(); + const auto &new_inputs = g.indexed_graph().input_nodes(); + if (original_inputs.size() != new_inputs.size()) { + LOG(WARNING) + << "Number of inputs after fusion does not match original number of inputs. " + << "This is most probably a bug. 
Disabling fusion for this run."; + g = unoptimized_graph; + } else { + for (size_t i = 0; i < new_inputs.size(); ++i) { + if (unoptimized_graph.indexed_graph()[original_inputs[i]].source->attrs.name != + g.indexed_graph()[new_inputs[i]].source->attrs.name) { + LOG(WARNING) << "Disabling fusion due to altered topological order of inputs."; + g = unoptimized_graph; + break; + } + } + } + } else { + LOG(WARNING) << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; + } } #endif // MXNET_USE_CUDA diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index a59866816180..38fe4c6d649e 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -80,8 +80,11 @@ void CreateFullGraph(const nnvm::Symbol& sym, // construct backward graph { ograd_entries->reserve(fwd_graph->outputs.size()); - for (size_t i = 0; i < fwd_graph->outputs.size(); ++i) - ograd_entries->emplace_back(Node::Create()); + for (size_t i = 0; i < fwd_graph->outputs.size(); ++i) { + nnvm::NodePtr np = Node::Create(); + np->attrs.name = "_head_grad_" + std::to_string(i); + ograd_entries->emplace_back(np); + } std::vector xs; const IndexedGraph& indexed_graph = fwd_graph->indexed_graph(); @@ -146,10 +149,35 @@ void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Grap #if MXNET_USE_CUDA && !defined(_WIN32) if (dmlc::GetEnv("MXNET_USE_FUSION", true) && context.dev_mask() == kGPU && !inlining) { - full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); - *full_graph = exec::FusePointwiseForward(std::move(*full_graph)); - full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); - *full_graph = exec::FusePointwiseBackward(std::move(*full_graph)); + nnvm::Graph unoptimized_graph; + common::CopyGraph(&unoptimized_graph, *full_graph, false); + + if (common::CheckForInputNameDuplicates(unoptimized_graph.indexed_graph())) { + full_graph->attrs["num_forward_outputs"] = 
std::make_shared(num_forward_outputs); + *full_graph = exec::FusePointwiseForward(std::move(*full_graph)); + full_graph->attrs["num_forward_outputs"] = std::make_shared(num_forward_outputs); + *full_graph = exec::FusePointwiseBackward(std::move(*full_graph)); + // Check the topological order of inputs + const auto &original_inputs = unoptimized_graph.indexed_graph().input_nodes(); + const auto &new_inputs = full_graph->indexed_graph().input_nodes(); + if (original_inputs.size() != new_inputs.size()) { + LOG(WARNING) + << "Number of inputs after fusion does not match original number of inputs. " + << "This is most probably a bug. Disabling fusion for this run."; + *full_graph = unoptimized_graph; + } else { + for (size_t i = 0; i < new_inputs.size(); ++i) { + if (unoptimized_graph.indexed_graph()[original_inputs[i]].source->attrs.name != + full_graph->indexed_graph()[new_inputs[i]].source->attrs.name) { + LOG(WARNING) << "Disabling fusion due to altered topological order of inputs."; + *full_graph = unoptimized_graph; + break; + } + } + } + } else { + LOG(WARNING) << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; + } } #endif // MXNET_USE_CUDA diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index d8fba1c169ec..1cbe18baec56 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -305,7 +305,9 @@ std::vector Imperative::Backward( std::vector ograd_entries; ograd_entries.reserve(ograds.size()); for (size_t i = 0; i < outputs.size(); ++i) { - ograd_entries.emplace_back(NodeEntry{Node::Create(), 0, 0}); + nnvm::NodePtr np = Node::Create(); + np->attrs.name = "_head_grad_" + std::to_string(i); + ograd_entries.emplace_back(NodeEntry{np, 0, 0}); AGInfo& info = AGInfo::Create(ograd_entries.back().node); info.ctx = outputs[i]->ctx(); if (ograds[i] != nullptr) { From 654a358aaf1741648f8ba0439becf34d1708e6fd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 25 Jun 2019 11:27:42 -0700 
Subject: [PATCH 056/105] Print code only when env variable is set --- src/operator/fusion/fused_op.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index ecdc44965a93..299bf13f0e4b 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -319,7 +319,9 @@ void FusedOp::GenerateCode(const std::vector &req, this->code_ = code; // Add boilerplate and type information - LOG(INFO) << code_; + if (dmlc::GetEnv("MXNET_FUSION_VERBOSE", false) { + LOG(INFO) << code_; + } std::string kernel_params = ""; std::string tensor_params = ""; nnvm::Symbol sym; From 32b690ac5517ed4f1c79c63a40a5f635c31b92db Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 25 Jun 2019 11:49:41 -0700 Subject: [PATCH 057/105] Fix --- src/operator/fusion/fused_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 299bf13f0e4b..816436291377 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -319,7 +319,7 @@ void FusedOp::GenerateCode(const std::vector &req, this->code_ = code; // Add boilerplate and type information - if (dmlc::GetEnv("MXNET_FUSION_VERBOSE", false) { + if (dmlc::GetEnv("MXNET_FUSION_VERBOSE", false)) { LOG(INFO) << code_; } std::string kernel_params = ""; From 39bfcf6103ac669fa236fc6f33e1557e9e78645d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 25 Jun 2019 14:42:10 -0700 Subject: [PATCH 058/105] Fix lint and 2 tests that specify the same names for multiple inputs --- src/common/exec_utils.h | 1 + src/executor/graph_executor.cc | 3 ++- src/imperative/cached_op.cc | 3 ++- tests/python/unittest/test_operator.py | 8 ++++---- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 48d999828d96..02b215380f96 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ 
-28,6 +28,7 @@ #include #include #include +#include #include #include #include "../common/utils.h" diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 7dda560e156b..dc440ed71cd7 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1015,7 +1015,8 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, } } } else { - LOG(WARNING) << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; + LOG(WARNING) + << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; } } #endif // MXNET_USE_CUDA diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 38fe4c6d649e..9c8528397ab0 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -176,7 +176,8 @@ void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Grap } } } else { - LOG(WARNING) << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; + LOG(WARNING) + << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; } } #endif // MXNET_USE_CUDA diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d50304712196..43db37b88739 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1226,8 +1226,8 @@ def test_rsqrt_cos_sin(): @with_seed() def test_maximum_minimum(): - data1 = mx.symbol.Variable('data') - data2 = mx.symbol.Variable('data') + data1 = mx.symbol.Variable('data1') + data2 = mx.symbol.Variable('data2') shape = (3, 4) data_tmp1 = np.random.rand(3,4) data_tmp2 = np.random.rand(3,4) @@ -3786,8 +3786,8 @@ def mathematical_core_binary(name, data1_init=2., data2_init=3., grad_init=2.): - data1 = mx.symbol.Variable('data') - data2 = mx.symbol.Variable('data') + data1 = mx.symbol.Variable('data1') + data2 = mx.symbol.Variable('data2') shape = (3, 4) data_tmp1 = np.random.rand(3, 4) data_tmp2 = 
np.random.rand(3, 4) From b109a388ed7274fb4d7faf353e7174b13c943a4d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 26 Jun 2019 11:26:08 -0700 Subject: [PATCH 059/105] Fixes from review and disabling fusion of slice with non-default step --- docs/faq/env_var.md | 25 +++++-- src/common/exec_utils.cc | 79 ++++++++++++++++++++ src/common/exec_utils.h | 61 +++++---------- src/executor/exec_pass.h | 20 +++-- src/executor/node_entry_count.cc | 46 ------------ src/executor/pointwise_fusion_pass.cc | 12 ++- src/operator/fusion/fused_op.cu | 102 +++++++++++++------------- 7 files changed, 189 insertions(+), 156 deletions(-) create mode 100644 src/common/exec_utils.cc delete mode 100644 src/executor/node_entry_count.cc diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index cdd528cd8c8f..bb525231a178 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -192,12 +192,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca * MXNET_PROFILER_AUTOSTART - Values: 0(false) or 1(true) ```(default=0)``` - - Set to 1, MXNet starts the profiler automatically. The profiling result is stored into profile.json in the working directory. + - Set to 1, MXNet starts the profiler automatically. The profiling result is stored into profile.json in the working directory. * MXNET_PROFILER_MODE - Values: 0(false) or 1(true) ```(default=0)``` - - If set to '0', profiler records the events of the symbolic operators. - - If set to '1', profiler records the events of all operators. + - If set to '0', profiler records the events of the symbolic operators. + - If set to '1', profiler records the events of all operators. ## Interface between Python and the C API @@ -233,14 +233,14 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`. * MXNET_CUDA_ALLOW_TENSOR_CORE - 0(false) or 1(true) ```(default=1)``` - - If set to '0', disallows Tensor Core use in CUDA ops. - - If set to '1', allows Tensor Core use in CUDA ops. 
+ - If set to '0', disallows Tensor Core use in CUDA ops. + - If set to '1', allows Tensor Core use in CUDA ops. - This variable can only be set once in a session. * MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION - 0(false) or 1(true) ```(default=0)``` - - If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores - - If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`. + - If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores + - If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`. * MXNET_GLUON_REPO - Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'``` @@ -309,6 +309,17 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`. with float32. - Model accuracies do not necessarily improve with this environment variable turned on. +* MXNET_USE_FUSION + - Values: 0(false) or 1(true) ```(default=1)``` + - If this variable is set, MXNet will try fusing some of the operations (pointwise operations only for now). + - It works in Symbolic execution as well as in Gluon models hybridized with ```static_alloc=True``` option. + - Only applies to MXNet that has been compiled with CUDA (```pip install mxnet-cuXX``` or built from source with ```USE_CUDA=1```) + +* MXNET_FUSION_VERBOSE + - Values: 0(false) or 1(true) ```(default=0)``` + - Only applies to MXNet that has been compiled with CUDA and when ```MXNET_USE_FUSION``` option is enabled. + - If this variable is set, MXNet will print the code for fused operators that it generated. 
+ Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` diff --git a/src/common/exec_utils.cc b/src/common/exec_utils.cc new file mode 100644 index 000000000000..63f5fa633203 --- /dev/null +++ b/src/common/exec_utils.cc @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file exec_utils.cc + * \brief implementation of executor util functions + */ + +#include "exec_utils.h" +#include +#include +#include + +namespace mxnet { +namespace common { + +void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables) { + using nnvm::Node; + using nnvm::NodePtr; + using nnvm::NodeEntry; + std::unordered_map old_new; + // use DFSVisit to copy all the nodes + DFSVisit(src.outputs, [&old_new, copy_variables](const NodePtr& node) { + NodePtr np; + if (copy_variables || !node->is_variable()) { + np = Node::Create(); + np->attrs = node->attrs; + } else { + np = node; + } + old_new[node.get()] = std::move(np); + }); + // connect nodes of new graph + for (const auto &kv : old_new) { + for (const NodeEntry& e : kv.first->inputs) { + Node *ptr = e.node.get(); + kv.second->inputs.emplace_back(NodeEntry{old_new[ptr], e.index, e.version}); + } + for (const NodePtr& p : kv.first->control_deps) { + kv.second->control_deps.emplace_back(old_new[p.get()]); + } + } + // set the head + for (const NodeEntry &e : src.outputs) { + (*dst).outputs.emplace_back(NodeEntry{old_new[e.node.get()], e.index, e.version}); + } +} + +bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx) { + std::unordered_set names; + for (const auto& nid : idx.input_nodes()) { + const std::string &name = idx[nid].source->attrs.name; + if (names.count(name)) { + LOG(WARNING) << "Variable name " << name << " is used more than once!"; + return false; + } + names.insert(name); + } + return true; +} + +} // namespace common +} // namespace mxnet diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 02b215380f96..c0bbbb51ebb3 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include "../common/utils.h" @@ -623,50 +622,24 @@ inline nnvm::Graph AssignContext(nnvm::Graph g, return g; } -inline void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool 
copy_variables) { - using nnvm::Node; - using nnvm::NodePtr; - using nnvm::NodeEntry; - std::unordered_map old_new; - // use DFSVisit to copy all the nodes - DFSVisit(src.outputs, [&old_new, copy_variables](const NodePtr& node) { - NodePtr np; - if (copy_variables || !node->is_variable()) { - np = Node::Create(); - np->attrs = node->attrs; - } else { - np = node; - } - old_new[node.get()] = std::move(np); - }); - // connect nodes of new graph - for (const auto &kv : old_new) { - for (const NodeEntry& e : kv.first->inputs) { - Node *ptr = e.node.get(); - kv.second->inputs.emplace_back(NodeEntry{old_new[ptr], e.index, e.version}); - } - for (const NodePtr& p : kv.first->control_deps) { - kv.second->control_deps.emplace_back(old_new[p.get()]); - } - } - // set the head - for (const NodeEntry &e : src.outputs) { - (*dst).outputs.emplace_back(NodeEntry{old_new[e.node.get()], e.index, e.version}); - } -} +/*! + * \brief Copy the graph, optionally leaving original Variable nodes + * + * \param dst destination graph + * \param src source graph being copied + * \param copy_variables whether to copy or reuse Variable nodes from the + * source graph + */ +void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables); -inline bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx) { - std::set names; - for (const auto& nid : idx.input_nodes()) { - const std::string &name = idx[nid].source->attrs.name; - if (names.count(name)) { - LOG(WARNING) << "Variable name " << name << " is used more than once!"; - return false; - } - names.insert(name); - } - return true; -} +/*!
+ * \brief Check whether graph contains any duplicated names in its inputs + * + * \param idx Indexed graph being checked + * + * \return true if there are no duplicates, false otherwise + */ +bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx); } // namespace common } // namespace mxnet diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 626557a8f968..a47c13183ae4 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -190,14 +190,22 @@ void AttachOpResources(const Graph& g, */ Graph DetectInplaceAddTo(Graph g); -using NodeEntryMapCounter = - std::unordered_map; -/*!\brief - * This is to count how many time each output is used by another node (or the output of the graph) +/*! + * \brief Fuse pointwise operations in the forward pass + * + * \param g input graph (needs to be entire graph, not just forward part) + * + * \return graph with fused pointwise operations in the forward pass */ -NodeEntryMapCounter GetNodeEntryCount(const Graph& g); - Graph FusePointwiseForward(Graph&& g); + +/*! + * \brief Fuse pointwise operations in the backward pass + * + * \param g input graph (needs to be entire graph, not just forward part) + * + * \return graph with fused pointwise operations in the backward pass + */ Graph FusePointwiseBackward(Graph&& g); /*! diff --git a/src/executor/node_entry_count.cc b/src/executor/node_entry_count.cc deleted file mode 100644 index 938b005bb46f..000000000000 --- a/src/executor/node_entry_count.cc +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2019 by Contributors - * \file node_entry_count.cc - * \brief function that count how many times a node entry is used - * \author Clement Fuji Tsang - */ -#include "./exec_pass.h" - -// TODO(cfujitsang): should this be pushed to nnvm repository ? -namespace mxnet { -namespace exec { - -NodeEntryMapCounter GetNodeEntryCount(const nnvm::Graph& g) { - NodeEntryMapCounter outputs; - DFSVisit(g.outputs, [&outputs](const nnvm::NodePtr& node) { - for (auto e : node->inputs) { - outputs[e]++; - } - }); - for (auto e : g.outputs) { - outputs[e]++; - } - return outputs; -} - -} // namespace exec -} // namespace mxnet diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index ac1bbd7e2a99..5ba45a64f1c6 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -62,8 +62,18 @@ namespace { if (n->op() == nullptr) return false; std::string op_name = n->op()->name; - if (slice_ops.count(op_name)) + if (slice_ops.count(op_name)) { + if (op_name == "slice") { + // slice with non-default step attribute is not supported + // currently + if (n->attrs.dict.count("step") && + !(n->attrs.dict.at("step") == "()" || + n->attrs.dict.at("step") == "[]")) { + return false; + } + } return true; + } return false; } diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 816436291377..667dcbf1d898 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -152,53 +152,51 @@ void FusedOp::GenerateCode(const std::vector 
&req, const auto& node = g[i]; const auto* source = node.source; if (source != nullptr) { - if (source->is_variable()) { - if (load_index[i]) { - const auto& var_name = source->attrs.name; - code += "const auto vec_" + var_name + " = load_index(" + \ - var_name + ", offset);\n"; - variables[{i, 0}] = var_name; - } - CHECK_EQ(outputs[i], 1); - } else { - std::string op_name = source->op()->name; - if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { - int arg_id = node.inputs[0].node_id; - const auto& var_name = g[arg_id].source->attrs.name; - const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); - load_index[arg_id] = 0; - auto parse_tuple = [](const std::string& input, const std::string def) { - std::string out = input; - replaceString(&out, "(", "{"); - replaceString(&out, ")", "}"); - replaceString(&out, "None", def); - return out; - }; - std::string begin = parse_tuple(source->attrs.dict.at("begin"), "0"); - std::string end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); - if (op_name == "slice") { - // step = parse_tuple(source->attrs.dict.at("step"), "1"); - } else if (op_name == "slice_axis") { - std::string axis = source->attrs.dict.at("axis"); - std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; - std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; - code += "Shape "+ begin_var_name + ";\n"; - code += "Shape "+ end_var_name + ";\n"; - code += begin_var_name + ".set(0);\n"; - code += end_var_name + ".set(INT_MAX);\n"; - code += begin_var_name + "["+axis+"] = " + begin + ";\n"; - code += end_var_name + "["+axis+"] = " + end + ";\n"; - begin = begin_var_name; - end = end_var_name; - } - code += "const auto " + vec_name + " = load_slice(" + \ - var_name + ", " + var_name + "_shape," + begin + \ - "," + end + ", offset);\n"; - CHECK_EQ(outputs[i], 1); - variables[{i, 0}] = vec_name; - continue; - } + if (source->is_variable()) { + if (load_index[i]) { + const auto& var_name = 
source->attrs.name; + code += "const auto vec_" + var_name + " = load_index(" + + var_name + ", offset);\n"; + variables[{i, 0}] = var_name; + } + CHECK_EQ(outputs[i], 1); + } else { + std::string op_name = source->op()->name; + if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { + int arg_id = node.inputs[0].node_id; + const auto& var_name = g[arg_id].source->attrs.name; + const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); + load_index[arg_id] = 0; + auto parse_tuple = [](const std::string& input, const std::string def) { + std::string out = input; + replaceString(&out, "(", "{"); + replaceString(&out, ")", "}"); + replaceString(&out, "None", def); + return out; + }; + std::string begin = parse_tuple(source->attrs.dict.at("begin"), "0"); + std::string end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); + if (op_name == "slice_axis") { + std::string axis = source->attrs.dict.at("axis"); + std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; + std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; + code += "Shape "+ begin_var_name + ";\n"; + code += "Shape "+ end_var_name + ";\n"; + code += begin_var_name + ".set(0);\n"; + code += end_var_name + ".set(INT_MAX);\n"; + code += begin_var_name + "["+axis+"] = " + begin + ";\n"; + code += end_var_name + "["+axis+"] = " + end + ";\n"; + begin = begin_var_name; + end = end_var_name; + } + code += "const auto " + vec_name + " = load_slice(" + + var_name + ", " + var_name + "_shape," + begin + + "," + end + ", offset);\n"; + CHECK_EQ(outputs[i], 1); + variables[{i, 0}] = vec_name; + continue; } + } } } @@ -450,23 +448,23 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, int ndim = outputs[0].ndim(); int nvec = 1; - size_t counter = 0; - for (const auto& blob : inputs) { + CHECK_EQ(inputs.size(), inputs_.size()); + for (size_t counter = 0; counter < inputs.size(); ++counter) { + const auto& blob = inputs[counter]; 
in_dtypes.push_back(blob.type_flag_); in_ndims.push_back(blob.ndim()); initialized_ = initialized_ && (blob.type_flag_ == inputs_[counter].dtype); inputs_[counter].dtype = blob.type_flag_; nvec = max(nvec, mshadowTypeToVectorLength(blob.type_flag_)); - ++counter; } - counter = 0; - for (const auto& blob : outputs) { + CHECK_EQ(outputs.size(), outputs_.size()); + for (size_t counter = 0; counter < outputs.size(); ++counter) { + const auto& blob = outputs[counter]; out_dtypes.push_back(blob.type_flag_); initialized_ = initialized_ && (blob.type_flag_ == outputs_[counter].dtype); outputs_[counter].dtype = blob.type_flag_; nvec = max(nvec, mshadowTypeToVectorLength(blob.type_flag_)); - ++counter; } // Check and save compute capability of the current GPU From f1a14fde948d35b498272615fc17241be2670390 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 27 Jun 2019 09:48:55 -0700 Subject: [PATCH 060/105] Add amp_cast to fusion, fixes --- src/executor/infer_graph_attr_pass.cc | 2 +- src/executor/pointwise_fusion_pass.cc | 8 +------- src/operator/fusion/fused_op-inl.h | 8 +------- src/operator/fusion/fused_op.cu | 13 +++++++++++++ 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 5b694ab617f8..19625ef3f86c 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -134,7 +134,7 @@ void GetAttrFromFusedNode(uint32_t nid, const auto& inferred_attrs = finfer(fwd_ptr->attrs); const auto& input_attrs = inferred_attrs.first; const auto& output_attrs = inferred_attrs.second; - CHECK(input_attrs.size() == inode.source->op()->num_outputs) << + CHECK(input_attrs.size() == inode.source->num_outputs()) << "Number of outputs of the gradient node " << inode.source->attrs.name << " does not match the number of inputs of the corresponding forward node"; // Set the attributes of output gradients diff --git a/src/executor/pointwise_fusion_pass.cc 
b/src/executor/pointwise_fusion_pass.cc index 5ba45a64f1c6..b9b2e5d81cea 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -82,7 +82,7 @@ namespace { auto node = nnvm::Node::Create(); subgraph_sym.outputs = subgraph.outputs; node->attrs.subgraphs.emplace_back(std::make_shared(subgraph_sym)); - std::ostringstream name_oss, params_oss; + std::ostringstream name_oss; // the name of the new node will be the concatenation of all the node names in the subgraph DFSVisit(subgraph.outputs, [&name_oss](const nnvm::NodePtr n) { if (n->op() != nullptr) @@ -91,12 +91,6 @@ namespace { auto subgraph_name = name_oss.str(); subgraph_name.pop_back(); node->attrs.name = subgraph_name; - // in case the subgraph contains some of the weights - for (auto &e : subgraph_sym.ListInputNames(nnvm::Symbol::kAll)) { - params_oss << e << ";"; - } - auto params_names = params_oss.str(); - params_names.pop_back(); node->attrs.dict["num_inputs"] = std::to_string(inputs_size); node->attrs.dict["num_outputs"] = std::to_string(subgraph.outputs.size()); node->attrs.op = Op::Get("_FusedOp"); diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index b8a24458014b..86b38684b766 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -81,6 +81,7 @@ const std::map>> ops_desc = { {"_Minimum" , {{"min(%, %)", "_0", "_1"}}}, {"_minimum" , {{"min(%, %)", "_0", "_1"}}}, {"amp_cast" , {{"identity(%)", "_0"}}}, + {"_backward_amp_cast" , {{"identity(%)", "_0"}}}, {"relu" , {{"relu(%)", "_0"}}}, {"sigmoid" , {{"sigmoid(%)", "_0"}}}, {"softsign" , {{"softsign(%)", "_0"}}}, @@ -438,13 +439,6 @@ inline typename LoadType::Type cast(const DType val) { return static_cast::Type>(val); } -// TODO(ptredak): this is not exactly identity, needs type inference -// in the middle of the graph to do it right -template -inline DType amp_multicast(const DType val) { - return val; -} - // activations template diff --git 
a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 667dcbf1d898..9bb0897c181f 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -279,6 +279,19 @@ void FusedOp::GenerateCode(const std::vector &req, variables[{i, 0}] = var_name; continue; } + + if (op_name == "amp_multicast" || op_name == "_backward_amp_multicast") { + CHECK_EQ(outputs[i], node.inputs.size()); + for (size_t counter = 0; counter < outputs[i]; ++counter) { + const auto& input = node.inputs[counter]; + var_name = "temp" + std::to_string(temp_name_counter++); + const auto& arg = variables[{input.node_id, input.index}]; + code += "const auto " + var_name + " = " + arg + ";\n"; + variables[{i, counter}] = var_name; + } + continue; + } + LOG(FATAL) << "Unrecognized op " + op_name; } } else { From a72b9807db33c3a45e33bae3ebe016aee6e2ecdd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 2 Jul 2019 12:57:11 -0700 Subject: [PATCH 061/105] Add amp_multicast and its backward to the list of support ops --- src/operator/fusion/fused_op-inl.h | 57 +++++++++++++++++-------- src/operator/fusion/fused_op.cu | 64 +++++++++++++++++++--------- src/operator/fusion/fused_op.h | 1 + src/storage/pooled_storage_manager.h | 4 +- 4 files changed, 86 insertions(+), 40 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 86b38684b766..02278b43ba0c 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -227,7 +227,9 @@ const std::map slice_ops = { const std::vector variable_io_ops = { "add_n", - "_backward_Activation" + "_backward_Activation", + "amp_multicast", + "_backward_amp_multicast" }; const char function_definitions[] = R"code( @@ -268,7 +270,7 @@ inline half store(const float input, half* ref) { template struct VectorConfig { - static_assert(size >= 4, "Error"); + static_assert(size >= 4, "VectorConfig needs to have size of at least 4B"); using IndexType = float; }; @@ 
-282,6 +284,11 @@ struct VectorConfig<16> { using IndexType = double2; }; +template <> +struct VectorConfig<32> { + using IndexType = double4; +}; + template union VectorType { typename VectorConfig::IndexType y; @@ -298,6 +305,7 @@ union VectorType { template struct Shape { int x[ndim]; + size_t size; inline const int& operator [](const int i) const { return x[i]; } @@ -312,15 +320,22 @@ struct Shape { } }; -template -inline VectorType load_index(const DType * input, int i) { - const auto* vector_input = reinterpret_cast::IndexType *>(input + i); - VectorType ret = {*vector_input}; - return ret; +template +inline VectorType load_index(const DType * input, int i, const Shape &shape) { + if (i < shape.size) { + const auto* vector_input = reinterpret_cast< + const typename VectorConfig::IndexType *>( + input + i); + VectorType ret = {*vector_input}; + return ret; + } else { + VectorType ret({0}); + return ret; + } } template -inline VectorType load_slice(const DType * input, const Shape shape, Shape begin, Shape end, int offset) { +inline VectorType load_slice(const DType * input, const Shape& shape, Shape begin, Shape end, int offset) { int idx[nvec]; bool mem_aligned = true; @@ -362,21 +377,29 @@ inline VectorType load_slice(const DType * input, const Shape } return ret; } - return load_index(input, idx[0]); + return load_index(input, idx[0], shape); } -template -inline void store_index(const VectorType value, int i, DType * output) { - auto vector_output = reinterpret_cast::IndexType *>(output); - vector_output[i] = value.y; +template +inline void store_index(const VectorType value, int i, + DType * output, const Shape& shape) { + if (i < shape.size) { + auto vector_output = reinterpret_cast< + typename VectorConfig::IndexType *>(output); + vector_output[i] = value.y; + } } -template -inline void store_add_index(const VectorType value, int i, DType * output) { - auto vector_output = reinterpret_cast::IndexType *>(output); - vector_output[i] += value.y; +template 
+inline void store_add_index(const VectorType value, int i, + DType * output, const Shape& shape) { + if (i < shape.size) { + auto vector_output = reinterpret_cast< + typename VectorConfig::IndexType *>(output); + vector_output[i] += value.y; + } } template diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 9bb0897c181f..80925f5201da 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -108,12 +108,32 @@ std::string ParseOpDescription(const std::vector& op_desc, return fmt; } +void AddPointerAndShape(const TBlob& data, + std::vector *ptrs, + std::vector>* shapes) { + MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { + int ndim = data.ndim(); + Tensor tensor = data.FlatTo1D(s); + ptrs->push_back(tensor.dptr_); + shapes->push_back(std::vector(ndim + 2)); + std::vector& tensor_shapes = shapes->back(); + size_t total_size = 1; + for (int i = ndim-1; i >= 0; i--) { + tensor_shapes[i] = data.shape_[i]; + total_size *= data.shape_[i]; + } + size_t * shape_size_ptr = reinterpret_cast(&tensor_shapes[ndim]); + *shape_size_ptr = total_size; + }); +} + } // namespace void FusedOp::GenerateCode(const std::vector &req, const std::vector &in_dtypes, const std::vector &out_dtypes, const std::vector &in_ndims, + const std::vector &out_ndims, const int nvec, const std::string &kernel_name) { const auto& g = this->symbol_.indexed_graph(); @@ -156,7 +176,7 @@ void FusedOp::GenerateCode(const std::vector &req, if (load_index[i]) { const auto& var_name = source->attrs.name; code += "const auto vec_" + var_name + " = load_index(" + - var_name + ", offset);\n"; + var_name + ", offset, " + var_name + "_shape);\n"; variables[{i, 0}] = var_name; } CHECK_EQ(outputs[i], 1); @@ -315,10 +335,12 @@ void FusedOp::GenerateCode(const std::vector &req, const std::string& var = variables[{entry.node_id, entry.index}]; if (req[counter] == kWriteTo || req[counter] == kWriteInplace) { const auto var_name = "output" + std::to_string(counter); 
- code += "store_index(vec_" + var_name + ", i, " + var_name + ");\n"; + code += "store_index(vec_" + var_name + ", i, " + var_name + ", " + + var_name + "_shape);\n"; } else if (req[counter] == kAddTo) { const auto var_name = "output" + std::to_string(counter); - code += "store_add_index(vec_" + var_name + ", i, " + var_name + ");\n"; + code += "store_add_index(vec_" + var_name + ", i, " + var_name + ", " + + var_name + "_shape);\n"; } else if (req[counter] == kNullOp) { // NULL req, do not do anything } else { @@ -353,19 +375,24 @@ void FusedOp::GenerateCode(const std::vector &req, ++i; if (i < num_params) { tensor_params += ", "; - kernel_params += ", "; } + kernel_params += ", "; } for (const auto &type : out_dtypes) { std::string type_name = mshadowTypeToString(type); std::string out_name = "output" + std::to_string(i - in_dtypes.size()); std::string dtype_var = "DType_" + out_name; + std::string dim_var = "ndim_" + out_name; + aux_code = "static const int " + dim_var + " = " + \ + std::to_string(out_ndims[i - in_dtypes.size()]) + ";\n" + aux_code; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; tensor_params += dtype_var + "* " + out_name; + kernel_params += " const Shape<" + dim_var + "> " + out_name+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; } + kernel_params += ", "; } kernel_params += tensor_params; code_ = std::string(fusion::fp16_support_string) + "\n" + @@ -458,7 +485,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector in_dtypes; std::vector in_ndims; std::vector out_dtypes; - int ndim = outputs[0].ndim(); + std::vector out_ndims; int nvec = 1; CHECK_EQ(inputs.size(), inputs_.size()); @@ -475,6 +502,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, for (size_t counter = 0; counter < outputs.size(); ++counter) { const auto& blob = outputs[counter]; out_dtypes.push_back(blob.type_flag_); + out_ndims.push_back(blob.ndim()); initialized_ = initialized_ && (blob.type_flag_ == 
outputs_[counter].dtype); outputs_[counter].dtype = blob.type_flag_; nvec = max(nvec, mshadowTypeToVectorLength(blob.type_flag_)); @@ -487,37 +515,31 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, saved_reqs_ = req; if (!initialized_) { - this->GenerateCode(req, in_dtypes, out_dtypes, in_ndims, nvec, attrs.name); + this->GenerateCode(req, in_dtypes, out_dtypes, in_ndims, out_ndims, nvec, attrs.name); this->CompileCode(attrs.name); initialized_ = true; } Stream* s = ctx.get_stream(); auto stream = Stream::GetStream(s); std::vector args; - size_t N = (outputs[0].shape_.Size() + nvec - 1)/nvec; + size_t N = 0; + for (const auto& output : outputs) { + N = std::max(N, output.shape_.Size()); + } + N = (N + nvec - 1)/nvec; args.push_back(&N); unsigned int num_blocks = (N + FusedOp::NTHREADS - 1) / FusedOp::NTHREADS; + std::vector ptrs; std::vector> shapes; for (const auto &data : inputs) { - MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { - int ndim = data.ndim(); - Tensor tensor = data.FlatTo1D(s); - ptrs.push_back(tensor.dptr_); - shapes.push_back(std::vector(ndim)); - std::vector& tensor_shapes = shapes.back(); - for (int i = ndim-1; i >= 0; i--) { - tensor_shapes[i] = data.shape_[i]; - } - }); + AddPointerAndShape(data, &ptrs, &shapes); } for (const auto &data : outputs) { - MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { - Tensor tensor = data.FlatTo1D(s); - ptrs.push_back(tensor.dptr_); - }); + AddPointerAndShape(data, &ptrs, &shapes); } + for (auto &tensor_shapes : shapes) { args.push_back(tensor_shapes.data()); } diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 3c5bf5cc3cb6..d133e1819409 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -106,6 +106,7 @@ class FusedOp { const std::vector &in_dtypes, const std::vector &out_dtypes, const std::vector &in_ndims, + const std::vector &out_ndims, const int nvec, const std::string& kernel_name); void CompileCode(const std::string &kernel_name); diff 
--git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index 91eb536ec7bd..b548f0c9ac26 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -106,8 +106,8 @@ class GPUPooledStorageManager final : public StorageManager { } size_t RoundAllocSize(size_t size) { - // Round up small allocs to the page_size_ to consolidate the pool lookups - size = std::max(size, page_size_); + // Round up small allocs to multiple of page_size_ to consolidate the pool lookups + size = RoundToMultiple(size, page_size_); // To ensure proper freeing under some driver variants, make sure // large allocs entirely occupy their slabs, which cannot then be // locked by smaller permanent allocations sharing the slab. From e4e674e2d629af4cd837b62257c977bfd2542bce Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Tue, 2 Jul 2019 14:14:55 -0700 Subject: [PATCH 062/105] Apply wording suggestions from code review Co-Authored-By: Aaron Markham --- src/executor/pointwise_fusion_pass.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index b9b2e5d81cea..0d6fa0acdb3a 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -113,9 +113,9 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub for (auto p : sub_outputs_in_main) { subgraph.outputs[p.second] = p.first; } - // To generate a subgraph an input have to be replaced by data node (no op) - // and it have to be agnostic to the node from which it's an output - // (For exemple even if two inputs are two different outputs from the same node) + // To generate a subgraph an input has to be replaced by data node (no op) + // and it has to be agnostic to the node from which it's an output + // (For example, even if two inputs are two different outputs from the same node) auto inputs = GetSubgraphInputs(subgraph, subgraph_set); 
auto subgraph_node = create_subgraph_node(subgraph, inputs.size()); subgraph_node->inputs = inputs; From 5766481e969b9c12d16e518ad583ab5379f78080 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Tue, 2 Jul 2019 14:15:33 -0700 Subject: [PATCH 063/105] Apply wording suggestions from code review Co-Authored-By: Aaron Markham --- src/executor/simple_partition_pass.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 0d47a5783089..7551e8d2fefb 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -312,9 +312,9 @@ Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_set for (auto p : sub_outputs_in_main) { subgraph.outputs[p.second] = p.first; } - // To generate a subgraph an input have to be replace by data node (no op) - // and it have to be agnostic to the node from which it's an output - // (For exemple even if two inputs are two different outputs from the same node) + // To generate a subgraph an input has to be replaced by data node (no op) + // and it has to be agnostic to the node from which it's an output + // (For example, even if two inputs are two different outputs from the same node) auto inputs = GetSubgraphInputs(subgraph, subgraph_set); auto subgraph_node = create_subgraph_node(subgraph); subgraph_node->inputs = inputs; From 62513e667571d7988d6c0982870fb60be4cc360c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 2 Jul 2019 14:19:02 -0700 Subject: [PATCH 064/105] Make clearer comment --- src/executor/pointwise_fusion_pass.cc | 6 ++++-- src/executor/simple_partition_pass.h | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 0d6fa0acdb3a..1b86fc8d28bb 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -100,7 +100,8 @@ namespace { } // namespace 
/*! - * \brief Replace a set of nodes by a subgraph node + * \brief Replace a set of nodes by a subgraph node. + * This function is used specifically in pointwise fusion. */ template Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& subgraph_sets, @@ -115,7 +116,8 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub } // To generate a subgraph an input has to be replaced by data node (no op) // and it has to be agnostic to the node from which it's an output - // (For example, even if two inputs are two different outputs from the same node) + // (For example, even if two inputs are two different outputs from the same node, + // they need to be replaced by two completely separate data nodes) auto inputs = GetSubgraphInputs(subgraph, subgraph_set); auto subgraph_node = create_subgraph_node(subgraph, inputs.size()); subgraph_node->inputs = inputs; diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 7551e8d2fefb..c5d68bda8498 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -299,7 +299,7 @@ void dispNodesSet(Graph g, NodeRawPtrSet s) { } /*! - * \brief Replace a set of nodes by a subgraph node + * \brief Replace a set of nodes by a subgraph node. 
*/ template Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_sets, @@ -314,7 +314,8 @@ Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_set } // To generate a subgraph an input has to be replaced by data node (no op) // and it has to be agnostic to the node from which it's an output - // (For example, even if two inputs are two different outputs from the same node) + // (For example, even if two inputs are two different outputs from the same node, + // they need to be replaced by two completely separate data nodes) auto inputs = GetSubgraphInputs(subgraph, subgraph_set); auto subgraph_node = create_subgraph_node(subgraph); subgraph_node->inputs = inputs; From dd651d30afde7f8388bfb0285de066015225254c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 2 Jul 2019 14:26:39 -0700 Subject: [PATCH 065/105] Adding punctuation and capitalization to \brief descriptions --- src/common/exec_utils.cc | 2 +- src/common/exec_utils.h | 4 ++-- src/executor/exec_pass.h | 4 ++-- src/executor/pointwise_fusion_pass.cc | 2 +- src/executor/simple_partition_pass.h | 10 +++++----- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/common/exec_utils.cc b/src/common/exec_utils.cc index 63f5fa633203..6782abd8b21f 100644 --- a/src/common/exec_utils.cc +++ b/src/common/exec_utils.cc @@ -19,7 +19,7 @@ /*! * \file exec_utils.cc - * \brief implementation of executor util functions + * \brief Implementation of executor util functions. */ #include "exec_utils.h" diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index c0bbbb51ebb3..332797537e26 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -623,7 +623,7 @@ inline nnvm::Graph AssignContext(nnvm::Graph g, } /*! - * \brief Copy the graph, optionally leaving original Variable nodes + * \brief Copy the graph, optionally leaving original Variable nodes. 
* * \param dst destination graph * \param src source graph being copied @@ -633,7 +633,7 @@ inline nnvm::Graph AssignContext(nnvm::Graph g, void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables); /*! - * \brief Check whether graph contains any duplicated names in its inputs + * \brief Check whether graph contains any duplicated names in its inputs. * * \param idx Indexed graph being checked * diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index a47c13183ae4..186199870666 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -191,7 +191,7 @@ void AttachOpResources(const Graph& g, Graph DetectInplaceAddTo(Graph g); /*! - * \brief Fuse pointwise operations in the forward pass + * \brief Fuse pointwise operations in the forward pass. * * \param g input graph (needs to be entire graph, not just forward part) * @@ -200,7 +200,7 @@ Graph DetectInplaceAddTo(Graph g); Graph FusePointwiseForward(Graph&& g); /*! - * \brief Fuse pointwise operations in the backward pass + * \brief Fuse pointwise operations in the backward pass. * * \param g input graph (needs to be entire graph, not just forward part) * diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 1b86fc8d28bb..92c7ba5dfd9d 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file pointwise_fusion_pass.cc - * \brief + * \brief Pass applying pointwise fusion. * \author Clement Fuji Tsang */ diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index c5d68bda8498..f4c0dc9de130 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file simple_partition_pass.h - * \brief + * \brief Simple pass for partitioning a graph. 
* \author Clement Fuji Tsang */ #ifndef MXNET_EXECUTOR_SIMPLE_PARTITION_PASS_H_ @@ -43,7 +43,7 @@ namespace exec { /*! * \brief Custom graph class, which will contain bi-directional nodes - * we need to compute DFS and reverse DFS for graph partitioning + * we need to compute DFS and reverse DFS for graph partitioning. */ class BidirectionalGraph { public: @@ -215,7 +215,7 @@ using NodeEntrySet = std::unordered_set; /*! - * \brief get the output nodes of the subgraph in the main graph + * \brief Get the output nodes of the subgraph in the main graph. * \return a map between the node in the main graph and the output index of the subgraph node */ nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_set) { @@ -239,7 +239,7 @@ nnvm::NodeEntryMap GetSubgraphOutputs(Graph g, NodeRawPtrSet subgraph_ } /*! - * \brief create new input nodes of the subgraph and plug them + * \brief Create new input nodes of the subgraph and plug them. * \return the inputs of the subgraph node in the main graph */ std::vector GetSubgraphInputs(Graph g, NodeRawPtrSet subgraph_set) { @@ -286,7 +286,7 @@ std::unordered_map GetGraphInputsMap(const Graph& g) { } /*! - * \brief helper function to display what nodes are in a specific subset + * \brief Helper function to display what nodes are in a specific subset. 
*/ void dispNodesSet(Graph g, NodeRawPtrSet s) { DFSVisit(g.outputs, [&s](const nnvm::NodePtr n){ From 7974888dbb4a9291fc91431863a87ca94c17dbb0 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 2 Jul 2019 13:57:23 -0700 Subject: [PATCH 066/105] Fix --- src/operator/fusion/fused_op.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 80925f5201da..1154fa32b99c 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -110,7 +110,9 @@ std::string ParseOpDescription(const std::vector& op_desc, void AddPointerAndShape(const TBlob& data, std::vector *ptrs, - std::vector>* shapes) { + std::vector>* shapes, + mshadow::Stream * s) { + using namespace mshadow; MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { int ndim = data.ndim(); Tensor tensor = data.FlatTo1D(s); @@ -534,10 +536,10 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector ptrs; std::vector> shapes; for (const auto &data : inputs) { - AddPointerAndShape(data, &ptrs, &shapes); + AddPointerAndShape(data, &ptrs, &shapes, s); } for (const auto &data : outputs) { - AddPointerAndShape(data, &ptrs, &shapes); + AddPointerAndShape(data, &ptrs, &shapes, s); } for (auto &tensor_shapes : shapes) { From 2aa89501f7c48398b169819849e23db94a45496a Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 3 Jul 2019 10:27:11 -0700 Subject: [PATCH 067/105] Fix --- src/operator/fusion/fused_op-inl.h | 27 ++++++++++++++++++++++++--- src/operator/fusion/fused_op.cu | 7 +++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 02278b43ba0c..f1d548066cf7 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -46,6 +46,7 @@ __device__ inline float __half2float(const __half h) { asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); return val; } + typedef __half half; 
)code"; @@ -117,6 +118,7 @@ const std::map>> ops_desc = { {"flatten" , {{"identity(%)", "_0"}}}, {"Reshape" , {{"identity(%)", "_0"}}}, {"reshape" , {{"identity(%)", "_0"}}}, + {"_backward_reshape" , {{"identity(%)", "_0"}}}, {"expand_dims" , {{"identity(%)", "_0"}}}, {"round" , {{"round(%)", "_0"}}}, {"rint" , {{"rint(%)", "_0"}}}, @@ -289,6 +291,16 @@ struct VectorConfig<32> { using IndexType = double4; }; +template +inline DType add_elem(const DType& x, const DType& y) { + return x + y; +} + +template <> +inline half add_elem(const half& x, const half& y) { + return __float2half(__half2float(x) + __half2float(y)); +} + template union VectorType { typename VectorConfig::IndexType y; @@ -300,6 +312,13 @@ union VectorType { VectorType (const decltype(y) &y2) { y = y2; } + inline VectorType& operator+=(const VectorType& rhs) { + #pragma unroll + for (int i = 0; i < nvec; ++i) { + x[i] = add_elem(x[i], rhs.x[i]); + } + return *this; + } }; template @@ -385,7 +404,7 @@ inline VectorType load_slice(const DType * input, const Shape template inline void store_index(const VectorType value, int i, DType * output, const Shape& shape) { - if (i < shape.size) { + if (i < (shape.size + nvec - 1) / nvec) { auto vector_output = reinterpret_cast< typename VectorConfig::IndexType *>(output); vector_output[i] = value.y; @@ -395,10 +414,12 @@ inline void store_index(const VectorType value, int i, template inline void store_add_index(const VectorType value, int i, DType * output, const Shape& shape) { - if (i < shape.size) { + if (i < (shape.size + nvec - 1) / nvec) { auto vector_output = reinterpret_cast< typename VectorConfig::IndexType *>(output); - vector_output[i] += value.y; + VectorType ret(vector_output[i]); + ret += value; + vector_output[i] = ret.y; } } diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 1154fa32b99c..da7e2c08a854 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -117,14 +117,17 @@ void 
AddPointerAndShape(const TBlob& data, int ndim = data.ndim(); Tensor tensor = data.FlatTo1D(s); ptrs->push_back(tensor.dptr_); - shapes->push_back(std::vector(ndim + 2)); + // We need alignment to 8 bytes for size_t in the Shape struct + // so if ndim is odd, there will be 4B of padding + const int offset = ndim % 2 == 0 ? 2 : 3; + shapes->push_back(std::vector(ndim + offset)); std::vector& tensor_shapes = shapes->back(); size_t total_size = 1; for (int i = ndim-1; i >= 0; i--) { tensor_shapes[i] = data.shape_[i]; total_size *= data.shape_[i]; } - size_t * shape_size_ptr = reinterpret_cast(&tensor_shapes[ndim]); + size_t * shape_size_ptr = reinterpret_cast(&tensor_shapes[ndim + offset - 2]); *shape_size_ptr = total_size; }); } From a96e778a45258da38015d38e659b3d741cbb268f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 8 Jul 2019 09:30:27 -0700 Subject: [PATCH 068/105] Add backward_cast to fusion --- src/operator/fusion/fused_op-inl.h | 3 ++- src/operator/fusion/fused_op.cu | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index f1d548066cf7..51d9fb09bf73 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -231,7 +231,8 @@ const std::vector variable_io_ops = { "add_n", "_backward_Activation", "amp_multicast", - "_backward_amp_multicast" + "_backward_amp_multicast", + "_backward_cast" }; const char function_definitions[] = R"code( diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index da7e2c08a854..f0048ff7b00c 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -317,6 +317,17 @@ void FusedOp::GenerateCode(const std::vector &req, continue; } + if (op_name == "_backward_cast") { + CHECK_EQ(outputs[i], 1); + const std::vector& types = this->symbol_.GetAttr("dtype"); + const int output_type = types[g.entry_id(i, 0)]; + const auto& arg = 
variables[{node.inputs[0].node_id, node.inputs[0].index}]; + code += "const auto " + var_name + " = cast<" + mshadowTypeToString(output_type) + + ">(" + arg + ");\n"; + variables[{i, 0}] = var_name; + continue; + } + LOG(FATAL) << "Unrecognized op " + op_name; } } else { From 9ea5464d8fd7ae062f2c39c56edb93a962eadbb0 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 8 Jul 2019 13:51:37 -0700 Subject: [PATCH 069/105] Adding unittests for fusion. Fix for erfinv_grad --- src/operator/fusion/fused_op-inl.h | 9 +- src/operator/mshadow_op.h | 2 +- .../tensor/elemwise_unary_op_basic.cc | 2 +- tests/python/gpu/test_fusion.py | 172 ++++++++++++++++++ 4 files changed, 178 insertions(+), 7 deletions(-) create mode 100644 tests/python/gpu/test_fusion.py diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 51d9fb09bf73..aac0cb7b124d 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -99,7 +99,6 @@ const std::map>> ops_desc = { {"tan" , {{"tan(%)", "_0"}}}, {"arcsin" , {{"arcsin(%)", "_0"}}}, {"arccos" , {{"arccos(%)", "_0"}}}, - {"arccos" , {{"arccos(%)", "_0"}}}, {"arctan" , {{"arctan(%)", "_0"}}}, {"sinh" , {{"sinh(%)", "_0"}}}, {"cosh" , {{"cosh(%)", "_0"}}}, @@ -896,8 +895,8 @@ inline DType backward_erfinv(const DType val, const DType grad) { return 0.5f * sqrt(pi) * exp(val * val) * grad; } -template -inline DType smooth_l1(const DType val, const DType scalar) { +template +inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { const auto bsq = scalar * scalar; const auto ibsq = 1.0f / bsq; if (val > ibsq) { @@ -909,8 +908,8 @@ inline DType smooth_l1(const DType val, const DType scalar) { } } -template -inline DType backward_smooth_l1(const DType val, const DType scalar, const DType grad) { +template +inline DType backward_smooth_l1(const DType val, const DType2 scalar, const DType grad) { auto bsq = scalar * scalar; auto ibsq = 1.0f / bsq; if (val > ibsq) { diff --git 
a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index ab53e7733066..130601e41a63 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -168,7 +168,7 @@ struct softrelu : public mxnet_op::tunable { MXNET_UNARY_MATH_OP(softrelu_grad, -math::expm1(-a)); -MXNET_UNARY_MATH_OP(erfinv_grad, 0.5 * math::sqrt(PI) * math::exp(math::sqr(erfinv::Map(a)))); +MXNET_UNARY_MATH_OP(erfinv_grad, 0.5 * math::sqrt(PI) * math::exp(math::sqr(a))); MXNET_UNARY_MATH_OP(erf_grad, 2.0 / math::sqrt(PI) * math::exp(-(a * a))); diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 98dc8dad825f..d17a24399726 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -984,7 +984,7 @@ Example:: )code" ADD_FILELINE) .set_attr("FCompute", UnaryOp::Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_erfinv"}); +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_erfinv"}); MXNET_OPERATOR_REGISTER_BINARY(_backward_erfinv) .set_attr("FCompute", diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py new file mode 100644 index 000000000000..478dd5dd9965 --- /dev/null +++ b/tests/python/gpu/test_fusion.py @@ -0,0 +1,172 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import mxnet as mx +import numpy as np +from mxnet.test_utils import * + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(curr_path, '../unittest')) +from common import with_seed + +def check_fused_symbol(sym, **kwargs): + inputs = sym.list_inputs() + shapes = {inp : kwargs[inp].shape for inp in inputs} + test_sym = mx.sym.Group([mx.sym.identity(s) for s in sym]) + rtol = {'float16' : 1e-2, + 'float32' : 5e-7, + 'float64' : 5e-7, + } + atol = {'float16' : 1e-3, + 'float32' : 1e-7, + 'float64' : 1e-7, + } + for dtype in ['float16', 'float32', 'float64']: + data = {inp : kwargs[inp].astype(dtype) for inp in inputs} + for grad_req in ['write', 'add']: + type_dict = {inp : dtype for inp in inputs} + os.environ["MXNET_USE_FUSION"] = "0" + orig_exec = test_sym.simple_bind(ctx=mx.gpu(0), grad_req=grad_req, type_dict=type_dict, **shapes) + os.environ["MXNET_USE_FUSION"] = "1" + fused_exec = test_sym.simple_bind(ctx=mx.gpu(0), grad_req=grad_req, type_dict=type_dict, **shapes) + fwd_orig = orig_exec.forward(is_train=True, **data) + out_grads = [mx.nd.ones_like(arr) for arr in fwd_orig] + orig_exec.backward(out_grads=out_grads) + fwd_fused = fused_exec.forward(is_train=True, **data) + fused_exec.backward(out_grads=out_grads) + for orig, fused in zip(fwd_orig, fwd_fused): + np.testing.assert_allclose(orig.asnumpy(), fused.asnumpy(), rtol=rtol[dtype], atol=atol[dtype]) + for orig, fused in zip(orig_exec.grad_arrays, fused_exec.grad_arrays): + if orig is None and fused is None: + continue + assert orig is not None + assert fused is not None + print(orig, fused) + np.testing.assert_allclose(orig.asnumpy(), fused.asnumpy(), rtol=rtol[dtype], atol=atol[dtype]) + +def check_unary_ops(): + unary_ops = [ + 'relu', + 'sigmoid', + 'softsign', + 'exp', + 'expm1', + 'log', + 'log10', + 'log2', + 'log1p', + 
'degrees', + 'radians', + 'sin', + 'cos', + 'tan', + 'arcsin', + 'arccos', + 'arctan', + 'sinh', + 'cosh', + 'tanh', + 'arcsinh', + 'arctanh', + 'sqrt', + 'rsqrt', + 'cbrt', + 'rcbrt', + 'square', + 'squeeze', + 'zeros_like', + 'ones_like', + 'flatten', + 'round', + 'rint', + 'fix', + 'floor', + 'ceil', + 'trunc', + 'sign', + 'reciprocal', + 'abs', + 'gamma', + 'gammaln', + 'erf', + 'erfinv', + 'negative', + ] + arr = mx.random.uniform(shape=rand_shape_2d()) + a = mx.sym.Variable('a') + for op_name in unary_ops: + print("Checking fusion of " + op_name) + op = getattr(mx.sym, op_name) + sym = op(a) + check_fused_symbol(sym, a=arr) + + # unary ops requiring special treatment + + # arccosh needs input to be >= 1 + arr2 = arr + 1 + check_fused_symbol(mx.sym.arccosh(a), a=arr2) + + # Activation requires act_type attribute + for act_type in ['relu', 'sigmoid', 'tanh', 'softrelu', 'softsign']: + check_fused_symbol(mx.sym.Activation(a, act_type=act_type), a=arr) + + # Cast requires dtype + for dtype in ['float16', 'float32', 'float64', 'int32']: + check_fused_symbol(mx.sym.Cast(a, dtype=dtype), a=arr) + + # reshape requires shape + check_fused_symbol(mx.sym.reshape(a, shape=(-1,)), a=arr) + + # expand_dims requires axis + check_fused_symbol(mx.sym.expand_dims(a, axis=1), a=arr) + + # clip requires a_min, a_max + check_fused_symbol(mx.sym.clip(a, a_min=0.3, a_max=0.7), a=arr) + + # smooth_l1 requires a scalar + check_fused_symbol(mx.sym.smooth_l1(a, scalar=0.3), a=arr) + +def check_binary_ops(): + a = mx.sym.Variable('a') + b = mx.sym.Variable('b') + shape = rand_shape_2d() + arr1 = mx.random.uniform(shape=shape) + arr2 = mx.random.uniform(shape=shape) + + check_fused_symbol(a+b, a=arr1, b=arr2) + check_fused_symbol(a+3, a=arr1) + check_fused_symbol(a-b, a=arr1, b=arr2) + check_fused_symbol(a-3, a=arr1) + check_fused_symbol(3-a, a=arr1) + check_fused_symbol(a*b, a=arr1, b=arr2) + check_fused_symbol(a*3, a=arr1) + check_fused_symbol(a/b, a=arr1, b=arr2) + 
check_fused_symbol(a/3, a=arr1) + check_fused_symbol(3/a, a=arr1) + check_fused_symbol(a**b, a=arr1, b=arr2) + check_fused_symbol(a**3, a=arr1) + check_fused_symbol(mx.sym.pow(3,a), a=arr1) + check_fused_symbol(mx.sym.maximum(a,b), a=arr1, b=arr2) + check_fused_symbol(mx.sym.minimum(a,b), a=arr1, b=arr2) + check_fused_symbol(mx.sym.hypot(a,b), a=arr1, b=arr2) + check_fused_symbol(mx.sym.hypot(a,3), a=arr1) + +@with_seed() +def test_fusion(): + check_unary_ops() + check_binary_ops() From 6c3a75a9c7f3cf6a662cd006c017fb9ab1980ee3 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 9 Jul 2019 10:03:12 -0700 Subject: [PATCH 070/105] Adding slice ops and add_n to tests --- tests/python/gpu/test_fusion.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 478dd5dd9965..737b6a378a7d 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -16,6 +16,7 @@ # under the License. 
import os +import random import mxnet as mx import numpy as np from mxnet.test_utils import * @@ -29,8 +30,8 @@ def check_fused_symbol(sym, **kwargs): shapes = {inp : kwargs[inp].shape for inp in inputs} test_sym = mx.sym.Group([mx.sym.identity(s) for s in sym]) rtol = {'float16' : 1e-2, - 'float32' : 5e-7, - 'float64' : 5e-7, + 'float32' : 1e-6, + 'float64' : 1e-6, } atol = {'float16' : 1e-3, 'float32' : 1e-7, @@ -56,7 +57,6 @@ def check_fused_symbol(sym, **kwargs): continue assert orig is not None assert fused is not None - print(orig, fused) np.testing.assert_allclose(orig.asnumpy(), fused.asnumpy(), rtol=rtol[dtype], atol=atol[dtype]) def check_unary_ops(): @@ -166,7 +166,30 @@ def check_binary_ops(): check_fused_symbol(mx.sym.hypot(a,b), a=arr1, b=arr2) check_fused_symbol(mx.sym.hypot(a,3), a=arr1) +def check_other_ops(): + a = mx.sym.Variable('a') + b = mx.sym.Variable('b') + c = mx.sym.Variable('c') + shape = rand_shape_2d() + shape = (5,) + shape + arr1 = mx.random.uniform(shape=shape) + arr2 = mx.random.uniform(shape=shape) + arr3 = mx.random.uniform(shape=shape) + + check_fused_symbol(mx.sym.add_n(a,b,c), a=arr1, b=arr2, c=arr3) + + check_fused_symbol(mx.sym.slice_axis(a, axis=0, begin=1, end=4), a=arr1) + + begin = (random.randint(0, shape[0]-2), + random.randint(0, shape[1]-2), + random.randint(0, shape[2]-2)) + end = (random.randint(begin[0]+1, shape[0]-1), + random.randint(begin[1]+1, shape[1]-1), + random.randint(begin[2]+1, shape[2]-1)) + check_fused_symbol(mx.sym.slice(a, begin=begin, end=end), a=arr1) + @with_seed() def test_fusion(): check_unary_ops() check_binary_ops() + check_other_ops() From 6d0eaf3e42a65c36347a0ba0dd189e09bfea213c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 10 Jul 2019 11:19:55 -0700 Subject: [PATCH 071/105] Fixes from review --- docs/faq/env_var.md | 2 +- src/executor/pointwise_fusion_pass.cc | 4 ++++ src/operator/fusion/fused_op.cc | 28 +++++++++++++-------------- src/operator/fusion/fused_op.cu | 6 +++--- 
src/operator/fusion/fused_op.h | 2 +- tests/python/gpu/test_fusion.py | 16 +++++++++------ 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index bb525231a178..bef768caed25 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -313,7 +313,7 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`. - Values: 0(false) or 1(true) ```(default=1)``` - If this variable is set, MXNet will try fusing some of the operations (pointwise operations only for now). - It works in Symbolic execution as well as in Gluon models hybridized with ```static_alloc=True``` option. - - Only applies to MXNet that has been compiled with CUDA (```pip install mxnet-cuXX``` or built from source with ```USE_CUDA=1```) + - Only applies to MXNet that has been compiled with CUDA (```pip install mxnet-cuXX``` or built from source with ```USE_CUDA=1```) and running on GPU. * MXNET_FUSION_VERBOSE - Values: 0(false) or 1(true) ```(default=0)``` diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 92c7ba5dfd9d..381606f920e1 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -204,6 +204,10 @@ Graph ReplaceSubgraphsPointwise(Graph&& g, const std::vector& sub return new_graph; } +/* \brief Add nodes as inputs to the subgraph. This is used for operations + * which are only compatible when they are the first nodes in the + * subgraph. 
+ */ template void AddInputsOnlyCompatible(const Graph &g, std::vector >* subsets, diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 34a61654e8b4..a703e9bc242c 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -50,8 +50,8 @@ void FusedOpParamParser(nnvm::NodeAttrs* attrs) { FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { this->inputs_ = std::vector(config.num_inputs); this->outputs_ = std::vector(config.num_outputs); - this->symbol_ = nnvm::Graph(); - this->symbol_.outputs = attrs->subgraphs[0]->outputs; + this->subgraph_ = nnvm::Graph(); + this->subgraph_.outputs = attrs->subgraphs[0]->outputs; this->initialized_ = false; this->cc_major_ = -1; this->cc_minor_ = -1; @@ -60,18 +60,18 @@ FusedOp::FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config) { bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { - this->symbol_.attrs.erase("shape"); - this->symbol_.attrs.erase("shape_inputs"); + this->subgraph_.attrs.erase("shape"); + this->subgraph_.attrs.erase("shape_inputs"); std::vector input_shapes(*in_attrs); - this->symbol_ = mxnet::exec::InferShape(std::move(this->symbol_), + this->subgraph_ = mxnet::exec::InferShape(std::move(this->subgraph_), std::move(input_shapes), "__shape__"); - const auto& g = this->symbol_.indexed_graph(); + const auto& g = this->subgraph_.indexed_graph(); const auto& input_nids = g.input_nodes(); std::vector out_shapes; - const std::vector shapes = this->symbol_.GetAttr("shape"); + const std::vector shapes = this->subgraph_.GetAttr("shape"); for (auto& e : g.outputs()) { out_shapes.push_back(shapes[g.entry_id(e)]); } @@ -99,18 +99,18 @@ bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { - this->symbol_.attrs.erase("dtype"); - this->symbol_.attrs.erase("dtype_inputs"); + 
this->subgraph_.attrs.erase("dtype"); + this->subgraph_.attrs.erase("dtype_inputs"); std::vector input_types(*in_attrs); - this->symbol_ = mxnet::exec::InferType(std::move(this->symbol_), + this->subgraph_ = mxnet::exec::InferType(std::move(this->subgraph_), std::move(input_types), "__dtype__"); - const auto& g = this->symbol_.indexed_graph(); + const auto& g = this->subgraph_.indexed_graph(); const auto& input_nids = g.input_nodes(); std::vector out_types; - const std::vector types = this->symbol_.GetAttr("dtype"); + const std::vector types = this->subgraph_.GetAttr("dtype"); for (auto& e : g.outputs()) { out_types.push_back(types[g.entry_id(e)]); } @@ -138,8 +138,8 @@ bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, template std::pair, std::vector> FusedOp::GetAttrs(const std::string& attr_name, const uint32_t node_id) { - const auto& g = this->symbol_.indexed_graph(); - const std::vector attrs = this->symbol_.GetAttr>(attr_name); + const auto& g = this->subgraph_.indexed_graph(); + const std::vector attrs = this->subgraph_.GetAttr>(attr_name); const auto& node = g[node_id]; std::vector inputs, outputs; for (const auto& e : node.inputs) { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index f0048ff7b00c..9d8e1e3f07b0 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -141,7 +141,7 @@ void FusedOp::GenerateCode(const std::vector &req, const std::vector &out_ndims, const int nvec, const std::string &kernel_name) { - const auto& g = this->symbol_.indexed_graph(); + const auto& g = this->subgraph_.indexed_graph(); std::string code = ""; int temp_name_counter = 0; using NodeEntry = nnvm::IndexedGraph::NodeEntry; @@ -319,7 +319,7 @@ void FusedOp::GenerateCode(const std::vector &req, if (op_name == "_backward_cast") { CHECK_EQ(outputs[i], 1); - const std::vector& types = this->symbol_.GetAttr("dtype"); + const std::vector& types = this->subgraph_.GetAttr("dtype"); const int output_type = 
types[g.entry_id(i, 0)]; const auto& arg = variables[{node.inputs[0].node_id, node.inputs[0].index}]; code += "const auto " + var_name + " = cast<" + mshadowTypeToString(output_type) + @@ -374,7 +374,7 @@ void FusedOp::GenerateCode(const std::vector &req, std::string kernel_params = ""; std::string tensor_params = ""; nnvm::Symbol sym; - sym.outputs = this->symbol_.outputs; + sym.outputs = this->subgraph_.outputs; const std::vector input_names = sym.ListInputNames(nnvm::Symbol::kAll); size_t num_params = in_dtypes.size() + out_dtypes.size(); size_t i = 0; diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index d133e1819409..8cd90a34db8b 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -116,7 +116,7 @@ class FusedOp { std::vector outputs_; std::string code_; - nnvm::Graph symbol_; + nnvm::Graph subgraph_; std::vector> aux_in_shapes; std::vector> aux_out_shapes; diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 737b6a378a7d..e262b53b1411 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -180,12 +180,12 @@ def check_other_ops(): check_fused_symbol(mx.sym.slice_axis(a, axis=0, begin=1, end=4), a=arr1) - begin = (random.randint(0, shape[0]-2), - random.randint(0, shape[1]-2), - random.randint(0, shape[2]-2)) - end = (random.randint(begin[0]+1, shape[0]-1), - random.randint(begin[1]+1, shape[1]-1), - random.randint(begin[2]+1, shape[2]-1)) + begin = (random.randint(0, shape[0]-1), + random.randint(0, shape[1]-1), + random.randint(0, shape[2]-1)) + end = (random.randint(begin[0]+1, shape[0]), + random.randint(begin[1]+1, shape[1]), + random.randint(begin[2]+1, shape[2])) check_fused_symbol(mx.sym.slice(a, begin=begin, end=end), a=arr1) @with_seed() @@ -193,3 +193,7 @@ def test_fusion(): check_unary_ops() check_binary_ops() check_other_ops() + +if __name__ == '__main__': + import nose + nose.runmodule() From 
70735f2b64c0f8b4170c1c63c43327d2df23f353 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 11 Jul 2019 09:50:17 -0700 Subject: [PATCH 072/105] Setting inplace option --- src/operator/fusion/fused_op.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index a703e9bc242c..6c8fa5c1df91 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -206,6 +206,18 @@ NNVM_REGISTER_OP(_FusedOp) const FusedOpPtr& op = nnvm::get(attrs.parsed); return op->num_outputs(); }) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs) { + const FusedOpPtr& op = nnvm::get(attrs.parsed); + const auto num_inputs = op->num_inputs(); + const auto num_outputs = op->num_outputs(); + std::vector > ret; + for (auto i = 0u; i < num_inputs; ++i) { + for (auto j = 0u; j < num_outputs; ++j) { + ret.emplace_back(i,j); + } + } + return ret; + }) .set_attr("FProvideSubgraphShape", FusedOpProvideShape) .set_attr("FProvideSubgraphType", FusedOpProvideType) .set_attr("FProvideSubgraphStorageType", From 9049086b91d4d30b93396741359d9df50b5eaa61 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 12 Jul 2019 09:59:43 -0700 Subject: [PATCH 073/105] Fix lint --- src/operator/fusion/fused_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 6c8fa5c1df91..120fc003d29e 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -213,7 +213,7 @@ NNVM_REGISTER_OP(_FusedOp) std::vector > ret; for (auto i = 0u; i < num_inputs; ++i) { for (auto j = 0u; j < num_outputs; ++j) { - ret.emplace_back(i,j); + ret.emplace_back(i, j); } } return ret; From 6f56a8b173b5d980ad61f3c4622ef81a1cc1038e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 12 Jul 2019 15:33:06 -0700 Subject: [PATCH 074/105] Storing double in half --- src/operator/fusion/fused_op-inl.h | 6 ++---- 1 file changed, 2 
insertions(+), 4 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index aac0cb7b124d..505c4762edec 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -263,13 +263,11 @@ inline DType1 store(const DType2 input, DType1* ref) { return input; } -template<> -inline half store(const float input, half* ref) { +template +inline half store(const DType input, half* ref) { return __float2half(input); } - - template struct VectorConfig { static_assert(size >= 4, "VectorConfig needs to have size of at least 4B"); From 171c24fbf1bd137f496d4a095360718153561b2e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 19 Jul 2019 09:14:03 -0700 Subject: [PATCH 075/105] Retrigger CI From 26b19ed39a111a07edaaf638cb49b80af3fe6344 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 23 Jul 2019 09:06:41 -0700 Subject: [PATCH 076/105] Slight relaxing of the relative tolerance in the test --- tests/python/gpu/test_fusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index e262b53b1411..2a0315dfd6f0 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -30,8 +30,8 @@ def check_fused_symbol(sym, **kwargs): shapes = {inp : kwargs[inp].shape for inp in inputs} test_sym = mx.sym.Group([mx.sym.identity(s) for s in sym]) rtol = {'float16' : 1e-2, - 'float32' : 1e-6, - 'float64' : 1e-6, + 'float32' : 1.5e-6, + 'float64' : 1.5e-6, } atol = {'float16' : 1e-3, 'float32' : 1e-7, From 912e831f806b434a47661e4b3c564c24ff25d380 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 24 Jul 2019 11:46:28 -0700 Subject: [PATCH 077/105] Move the env variable check to the end --- src/executor/graph_executor.cc | 2 +- src/imperative/cached_op.cc | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 
dc440ed71cd7..f4b53b2c1106 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -987,7 +987,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, nnvm::Graph g = InitFullGraph(symbol, grad_req_types); #if MXNET_USE_CUDA && !defined(_WIN32) - if (dmlc::GetEnv("MXNET_USE_FUSION", true) && default_ctx.dev_mask() == Context::kGPU) { + if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) { nnvm::Graph unoptimized_graph; common::CopyGraph(&unoptimized_graph, g, false); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 855c02db96ca..9ff368dbafab 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -147,8 +147,9 @@ void SetRefCounts(nnvm::Graph* fwd_graph, const nnvm::Graph& full_graph) { void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Graph * grad_graph, const Context& context, size_t num_forward_outputs, const bool inlining) { #if MXNET_USE_CUDA && !defined(_WIN32) - if (dmlc::GetEnv("MXNET_USE_FUSION", true) && context.dev_mask() == kGPU && - !inlining) { + if (context.dev_mask() == kGPU && + !inlining && + dmlc::GetEnv("MXNET_USE_FUSION", true)) { nnvm::Graph unoptimized_graph; common::CopyGraph(&unoptimized_graph, *full_graph, false); From 052576ec40257fe4195ac09c9e6ca8f3e338533e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 25 Jul 2019 15:23:15 -0700 Subject: [PATCH 078/105] Fix a race condition between InferShape and scheduled Forward --- src/operator/fusion/fused_op.cc | 8 ++++ src/operator/fusion/fused_op.cu | 82 ++++++++++++++++++++++++--------- src/operator/fusion/fused_op.h | 27 ++++++++++- 3 files changed, 94 insertions(+), 23 deletions(-) diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 120fc003d29e..e698912a7461 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -93,6 +93,10 @@ bool FusedOp::InferShape(const nnvm::NodeAttrs &attrs, 
for (const auto& attr : *out_attrs) { inferred = inferred && !op::shape_is_none(attr); } + if (inferred) { + std::lock_guard lock(my_mutex_); + intermediate_shapes_.push_back({*in_attrs, *out_attrs, shapes}); + } return inferred; } @@ -132,6 +136,10 @@ bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, for (const auto& attr : *out_attrs) { inferred = inferred && !op::type_is_none(attr); } + if (inferred) { + std::lock_guard lock(my_mutex_); + intermediate_dtypes_.push_back({*in_attrs, *out_attrs, types}); + } return inferred; } diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 9d8e1e3f07b0..9270898a1fe4 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -140,7 +140,9 @@ void FusedOp::GenerateCode(const std::vector &req, const std::vector &in_ndims, const std::vector &out_ndims, const int nvec, - const std::string &kernel_name) { + const std::string &kernel_name, + const std::vector &node_shapes, + const std::vector &node_dtypes) { const auto& g = this->subgraph_.indexed_graph(); std::string code = ""; int temp_name_counter = 0; @@ -319,8 +321,7 @@ void FusedOp::GenerateCode(const std::vector &req, if (op_name == "_backward_cast") { CHECK_EQ(outputs[i], 1); - const std::vector& types = this->subgraph_.GetAttr("dtype"); - const int output_type = types[g.entry_id(i, 0)]; + const int output_type = node_dtypes[g.entry_id(i, 0)]; const auto& arg = variables[{node.inputs[0].node_id, node.inputs[0].index}]; code += "const auto " + var_name + " = cast<" + mshadowTypeToString(output_type) + ">(" + arg + ");\n"; @@ -488,6 +489,56 @@ bool FusedOp::CheckComputeCapability(const OpContext &ctx) { return ret; } +void FusedOp::CheckShapesAndTypes(const std::vector &inputs, + const std::vector &outputs, + std::vector *in_dtypes, + std::vector *in_ndims, + std::vector *out_dtypes, + std::vector *out_ndims, + int *nvec) { + std::vector in_shapes; + std::vector out_shapes; + CHECK_EQ(inputs.size(), 
inputs_.size()); + CHECK_EQ(outputs.size(), outputs_.size()); + + for (size_t counter = 0; counter < inputs.size(); ++counter) { + const auto& blob = inputs[counter]; + in_dtypes->push_back(blob.type_flag_); + in_ndims->push_back(blob.ndim()); + in_shapes.push_back(blob.shape_); + initialized_ = initialized_ && blob.type_flag_ == inputs_[counter].dtype; + inputs_[counter].dtype = blob.type_flag_; + *nvec = max(*nvec, mshadowTypeToVectorLength(blob.type_flag_)); + } + + for (size_t counter = 0; counter < outputs.size(); ++counter) { + const auto& blob = outputs[counter]; + out_dtypes->push_back(blob.type_flag_); + out_ndims->push_back(blob.ndim()); + out_shapes.push_back(blob.shape_); + initialized_ = initialized_ && blob.type_flag_ == outputs_[counter].dtype; + outputs_[counter].dtype = blob.type_flag_; + *nvec = max(*nvec, mshadowTypeToVectorLength(blob.type_flag_)); + } + + for (auto it = intermediate_shapes_.begin(); + it != intermediate_shapes_.end(); + ++it) { + if (it->input_attr == in_shapes && it->output_attr == out_shapes) { + intermediate_shapes_.erase(intermediate_shapes_.begin(), it); + break; + } + } + for (auto it = intermediate_dtypes_.begin(); + it != intermediate_dtypes_.end(); + ++it) { + if (it->input_attr == *in_dtypes && it->output_attr == *out_dtypes) { + intermediate_dtypes_.erase(intermediate_dtypes_.begin(), it); + break; + } + } +} + template <> void FusedOp::Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, @@ -504,25 +555,11 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector out_ndims; int nvec = 1; - CHECK_EQ(inputs.size(), inputs_.size()); - for (size_t counter = 0; counter < inputs.size(); ++counter) { - const auto& blob = inputs[counter]; - in_dtypes.push_back(blob.type_flag_); - in_ndims.push_back(blob.ndim()); - initialized_ = initialized_ && (blob.type_flag_ == inputs_[counter].dtype); - inputs_[counter].dtype = blob.type_flag_; - nvec = max(nvec, mshadowTypeToVectorLength(blob.type_flag_)); - } + 
CheckShapesAndTypes(inputs, outputs, &in_dtypes, &in_ndims, + &out_dtypes, &out_ndims, &nvec); - CHECK_EQ(outputs.size(), outputs_.size()); - for (size_t counter = 0; counter < outputs.size(); ++counter) { - const auto& blob = outputs[counter]; - out_dtypes.push_back(blob.type_flag_); - out_ndims.push_back(blob.ndim()); - initialized_ = initialized_ && (blob.type_flag_ == outputs_[counter].dtype); - outputs_[counter].dtype = blob.type_flag_; - nvec = max(nvec, mshadowTypeToVectorLength(blob.type_flag_)); - } + const auto& node_shapes = intermediate_shapes_[0].internal_attr; + const auto& node_dtypes = intermediate_dtypes_[0].internal_attr; // Check and save compute capability of the current GPU if (!CheckComputeCapability(ctx)) initialized_ = false; @@ -531,7 +568,8 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, saved_reqs_ = req; if (!initialized_) { - this->GenerateCode(req, in_dtypes, out_dtypes, in_ndims, out_ndims, nvec, attrs.name); + this->GenerateCode(req, in_dtypes, out_dtypes, in_ndims, out_ndims, + nvec, attrs.name, node_shapes, node_dtypes); this->CompileCode(attrs.name); initialized_ = true; } diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 8cd90a34db8b..fb1c395ded84 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -108,9 +108,18 @@ class FusedOp { const std::vector &in_ndims, const std::vector &out_ndims, const int nvec, - const std::string& kernel_name); + const std::string& kernel_name, + const std::vector &node_shapes, + const std::vector &node_dtypes); void CompileCode(const std::string &kernel_name); bool CheckComputeCapability(const OpContext &ctx); + void CheckShapesAndTypes(const std::vector &inputs, + const std::vector &outputs, + std::vector *in_dtypes, + std::vector *in_ndims, + std::vector *out_dtypes, + std::vector *out_ndims, + int *nvec); std::vector inputs_; std::vector outputs_; @@ -118,6 +127,22 @@ class FusedOp { std::string code_; nnvm::Graph subgraph_; + 
template + struct IntermediateAttr { + std::vector input_attr; + std::vector output_attr; + std::vector internal_attr; + }; + + // Shapes and types inside the subgraph + // copied here, because a subsequent call + // to InferShape/InferType can overwrite the + // original information stored in subgraph_ + // attributes while the previous iterations + // still need them. + std::vector > intermediate_shapes_; + std::vector > intermediate_dtypes_; + std::vector> aux_in_shapes; std::vector> aux_out_shapes; std::vector> aux_in_types; From 0e1918fbfd6885fe3ecf9ad90643c79c48c10351 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 25 Jul 2019 18:10:35 -0700 Subject: [PATCH 079/105] Fix flakey test_fusion test involving fp32 erfinv op. --- tests/python/gpu/test_fusion.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 2a0315dfd6f0..682a8f2284b8 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -104,13 +104,16 @@ def check_unary_ops(): 'gamma', 'gammaln', 'erf', - 'erfinv', 'negative', ] + + def announce_check(op_name): + print("Checking fusion of " + op_name) + arr = mx.random.uniform(shape=rand_shape_2d()) a = mx.sym.Variable('a') for op_name in unary_ops: - print("Checking fusion of " + op_name) + announce_check(op_name) op = getattr(mx.sym, op_name) sym = op(a) check_fused_symbol(sym, a=arr) @@ -119,26 +122,38 @@ def check_unary_ops(): # arccosh needs input to be >= 1 arr2 = arr + 1 + announce_check('arccosh') check_fused_symbol(mx.sym.arccosh(a), a=arr2) + # erfinv needs -1 < input < 1, but we avoid the limits of this range where the slope nears +inf. 
+ arr2 = (arr - 0.5) * 1.99 + announce_check('erfinv') + check_fused_symbol(mx.sym.erfinv(a), a=arr2) + # Activation requires act_type attribute for act_type in ['relu', 'sigmoid', 'tanh', 'softrelu', 'softsign']: + announce_check("Activation(act_type='{}')".format(act_type)) check_fused_symbol(mx.sym.Activation(a, act_type=act_type), a=arr) # Cast requires dtype for dtype in ['float16', 'float32', 'float64', 'int32']: + announce_check("Cast(dtype='{}')".format(dtype)) check_fused_symbol(mx.sym.Cast(a, dtype=dtype), a=arr) # reshape requires shape + announce_check('reshape') check_fused_symbol(mx.sym.reshape(a, shape=(-1,)), a=arr) # expand_dims requires axis + announce_check('expand_dims') check_fused_symbol(mx.sym.expand_dims(a, axis=1), a=arr) # clip requires a_min, a_max + announce_check('clip') check_fused_symbol(mx.sym.clip(a, a_min=0.3, a_max=0.7), a=arr) # smooth_l1 requires a scalar + announce_check('smooth_l1') check_fused_symbol(mx.sym.smooth_l1(a, scalar=0.3), a=arr) def check_binary_ops(): From 7e1df6a088506158588e90474fe8fd691b14babc Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 29 Jul 2019 15:52:23 -0700 Subject: [PATCH 080/105] Fix from review --- src/operator/fusion/fused_op.cu | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 9270898a1fe4..277f2286ddd7 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -431,9 +431,9 @@ void FusedOp::CompileCode(const std::string &kernel_name) { nvrtcCreateProgram(&program, // prog &code_[0], // buffer (kernel_name + "_kernel.cu").c_str(), // name - 0, // numHeaders + 0, // num headers NULL, // headers - NULL)); // includeNames + NULL)); // include names std::string gpu_arch = "--gpu-architecture=compute_" + std::to_string(this->cc_major_) + std::to_string(this->cc_minor_); @@ -445,18 +445,18 @@ void FusedOp::CompileCode(const std::string &kernel_name) { 
NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); nvrtcResult compileResult = nvrtcCompileProgram(program, // prog - 3, // numOptions + 3, // num options opts); // options // Obtain compilation log from the program. - size_t logSize; - NVRTC_CALL(nvrtcGetProgramLogSize(program, &logSize)); - std::string log(logSize, '\0'); + size_t log_size; + NVRTC_CALL(nvrtcGetProgramLogSize(program, &log_size)); + std::string log(log_size, '\0'); NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); CHECK_EQ(compileResult, NVRTC_SUCCESS) << "NVRTC Compilation failed.\n" << log; // Obtain PTX from the program. - size_t ptxSize; - NVRTC_CALL(nvrtcGetPTXSize(program, &ptxSize)); - ptx_.reserve(ptxSize); + size_t ptx_size; + NVRTC_CALL(nvrtcGetPTXSize(program, &ptx_size)); + ptx_.reserve(ptx_size); NVRTC_CALL(nvrtcGetPTX(program, &ptx_[0])); const char *name; NVRTC_CALL(nvrtcGetLoweredName(program, @@ -466,12 +466,12 @@ void FusedOp::CompileCode(const std::string &kernel_name) { // Destroy the program. 
NVRTC_CALL(nvrtcDestroyProgram(&program)); int device; - CUdevice cuDevice; + CUdevice cu_device; CUcontext context; CUmodule module; CUDA_CALL(cudaGetDevice(&device)); - CUDA_DRIVER_CALL(cuDeviceGet(&cuDevice, device)); - CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cuDevice)); + CUDA_DRIVER_CALL(cuDeviceGet(&cu_device, device)); + CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cu_device)); CUDA_DRIVER_CALL(cuModuleLoadData(&module, &ptx_[0])); CUDA_DRIVER_CALL(cuModuleGetFunction(&kernel_, module, From 7a9273876bb00828f1f5b2afb2de5758a3081502 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Wed, 10 Jul 2019 21:53:43 -0700 Subject: [PATCH 081/105] Added broadcast_like and slice_like to fused op --- src/operator/fusion/fused_op-inl.h | 10 ++-- src/operator/fusion/fused_op.cu | 84 +++++++++++++++++++++--------- src/operator/fusion/fused_op.h | 1 + 3 files changed, 67 insertions(+), 28 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 505c4762edec..b1d86ffff320 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -224,6 +224,8 @@ const std::map>> ops_desc = { const std::map slice_ops = { {"slice_axis" , ""}, {"slice" , ""}, + {"slice_like" , ""}, + {"broadcast_like" , ""}, }; const std::vector variable_io_ops = { @@ -364,7 +366,7 @@ inline VectorType load_slice(const DType * input, const Shape for (int dim = ndim-1; dim >=0; dim--) { if (begin[dim] < 0) begin[dim] = shape[dim] - begin[dim]; if (end[dim] < 0) end[dim] = shape[dim] - end[dim]; - if (end[dim] > shape[dim]) end[dim] = shape[dim]; + if (end[dim] == INT_MAX) end[dim] = shape[dim]; if (dim > 0) { ref_strides[dim-1] = ref_strides[dim] * (end[dim] - begin[dim]); strides[dim-1] = strides[dim] * shape[dim]; @@ -378,7 +380,9 @@ inline VectorType load_slice(const DType * input, const Shape #pragma unroll for (int dim = 0; dim < ndim; dim++) { int stride = ref_strides[dim]; - idx[j] += (ref_idx / stride 
+ begin[dim]) * strides[dim]; + if (shape[dim] > 1) { + idx[j] += (ref_idx / stride + begin[dim]) * strides[dim]; + } ref_idx = ref_idx % stride; } if (j > 0 && (idx[j] != (idx[j-1] + 1))) { @@ -397,8 +401,6 @@ inline VectorType load_slice(const DType * input, const Shape return load_index(input, idx[0], shape); } - - template inline void store_index(const VectorType value, int i, DType * output, const Shape& shape) { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 277f2286ddd7..8f54d42967fd 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -108,27 +108,32 @@ std::string ParseOpDescription(const std::vector& op_desc, return fmt; } +void AddShape(const mxnet::TShape& shape, + std::vector>* shapes) { + // We need alignment to 8 bytes for size_t in the Shape struct + // so if ndim is odd, there will be 4B of padding + int ndim = shape.ndim(); + const int offset = ndim % 2 == 0 ? 2 : 3; + shapes->push_back(std::vector(ndim + offset)); + std::vector& tensor_shapes = shapes->back(); + size_t total_size = 1; + for (int i = ndim-1; i >= 0; i--) { + tensor_shapes[i] = shape[i]; + total_size *= shape[i]; + } + size_t * shape_size_ptr = reinterpret_cast(&tensor_shapes[ndim + offset - 2]); + *shape_size_ptr = total_size; +} + void AddPointerAndShape(const TBlob& data, std::vector *ptrs, std::vector>* shapes, mshadow::Stream * s) { using namespace mshadow; MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { - int ndim = data.ndim(); Tensor tensor = data.FlatTo1D(s); ptrs->push_back(tensor.dptr_); - // We need alignment to 8 bytes for size_t in the Shape struct - // so if ndim is odd, there will be 4B of padding - const int offset = ndim % 2 == 0 ? 
2 : 3; - shapes->push_back(std::vector(ndim + offset)); - std::vector& tensor_shapes = shapes->back(); - size_t total_size = 1; - for (int i = ndim-1; i >= 0; i--) { - tensor_shapes[i] = data.shape_[i]; - total_size *= data.shape_[i]; - } - size_t * shape_size_ptr = reinterpret_cast(&tensor_shapes[ndim + offset - 2]); - *shape_size_ptr = total_size; + AddShape(data.shape_, shapes); }); } @@ -201,20 +206,35 @@ void FusedOp::GenerateCode(const std::vector &req, replaceString(&out, "None", def); return out; }; - std::string begin = parse_tuple(source->attrs.dict.at("begin"), "0"); - std::string end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); - if (op_name == "slice_axis") { - std::string axis = source->attrs.dict.at("axis"); + std::string begin; + std::string end; + if (op_name == "broadcast_like" || op_name == "slice_like") { std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; - std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; code += "Shape "+ begin_var_name + ";\n"; - code += "Shape "+ end_var_name + ";\n"; code += begin_var_name + ".set(0);\n"; - code += end_var_name + ".set(INT_MAX);\n"; - code += begin_var_name + "["+axis+"] = " + begin + ";\n"; - code += end_var_name + "["+axis+"] = " + end + ";\n"; + int like_id = node.inputs[1].node_id; + if (std::find(extra_shape_args_.begin(), extra_shape_args_.end(), like_id) == extra_shape_args_.end()) { + extra_shape_args_.push_back(like_id); + } + std::string end_var_name = "extra_" + std::to_string(like_id) + "_shape"; begin = begin_var_name; end = end_var_name; + } else { + begin = parse_tuple(source->attrs.dict.at("begin"), "0"); + end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); + if (op_name == "slice_axis") { + std::string axis = source->attrs.dict.at("axis"); + std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; + std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; + code += "Shape "+ begin_var_name + 
";\n"; + code += "Shape "+ end_var_name + ";\n"; + code += begin_var_name + ".set(0);\n"; + code += end_var_name + ".set(INT_MAX);\n"; + code += begin_var_name + "["+axis+"] = " + begin + ";\n"; + code += end_var_name + "["+axis+"] = " + end + ";\n"; + begin = begin_var_name; + end = end_var_name; + } } code += "const auto " + vec_name + " = load_slice(" + var_name + ", " + var_name + "_shape," + begin + @@ -380,6 +400,14 @@ void FusedOp::GenerateCode(const std::vector &req, size_t num_params = in_dtypes.size() + out_dtypes.size(); size_t i = 0; std::string aux_code = "static const int nvec = " + std::to_string(nvec) + ";\n"; + + const auto& shapes = this->symbol_.GetAttr("shape"); + for (const auto &shape_id: extra_shape_args_) { + std::string shape_name = "extra_" + std::to_string(shape_id) + "_shape"; + int ndim = shapes[shape_id].ndim(); + kernel_params += " const Shape<" + std::to_string(ndim) + "> " + shape_name; + kernel_params += ", "; + } for (const auto &type : in_dtypes) { std::string type_name = mshadowTypeToString(type); std::string dtype_var = "DType_" + input_names[i]; @@ -388,7 +416,8 @@ void FusedOp::GenerateCode(const std::vector &req, aux_code = "static const int " + dim_var + " = " + \ std::to_string(in_ndims[i]) + ";\n" + aux_code; tensor_params += dtype_var + "* " +input_names[i]; - kernel_params += " const Shape<" + dim_var + "> " + input_names[i]+"_shape"; + //kernel_params += " const Shape<" + dim_var + "> " + input_names[i]+"_shape"; + kernel_params += " const Shape<" + std::to_string(in_ndims[i]) + "> " + input_names[i]+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; @@ -404,7 +433,8 @@ void FusedOp::GenerateCode(const std::vector &req, std::to_string(out_ndims[i - in_dtypes.size()]) + ";\n" + aux_code; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; tensor_params += dtype_var + "* " + out_name; - kernel_params += " const Shape<" + dim_var + "> " + out_name+"_shape"; + //kernel_params += " const Shape<" 
+ dim_var + "> " + out_name+"_shape"; + kernel_params += " const Shape<" + std::to_string(out_ndims[i-in_dtypes.size()]) + "> " + out_name+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; @@ -412,6 +442,7 @@ void FusedOp::GenerateCode(const std::vector &req, kernel_params += ", "; } kernel_params += tensor_params; + code_ = std::string(fusion::fp16_support_string) + "\n" + fusion::type_support_string + "\n" + fusion::function_definitions + "\n" + @@ -587,6 +618,11 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector ptrs; std::vector> shapes; + + const auto& node_shapes = this->symbol_.GetAttr("shape"); + for (const auto &shape_id: extra_shape_args_) { + AddShape(node_shapes[shape_id], &shapes); + } for (const auto &data : inputs) { AddPointerAndShape(data, &ptrs, &shapes, s); } diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index fb1c395ded84..176ad04956a7 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -148,6 +148,7 @@ class FusedOp { std::vector> aux_in_types; std::vector> aux_out_types; std::vector saved_reqs_; + std::vector extra_shape_args_; std::string ptx_; std::string kernel_name_; From a1dee5831cfb25115b976b119cacddac0cc28925 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Wed, 10 Jul 2019 22:10:27 -0700 Subject: [PATCH 082/105] Minor fix and cleanup --- src/operator/fusion/fused_op.cu | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 8f54d42967fd..8023a6196801 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -401,7 +401,7 @@ void FusedOp::GenerateCode(const std::vector &req, size_t i = 0; std::string aux_code = "static const int nvec = " + std::to_string(nvec) + ";\n"; - const auto& shapes = this->symbol_.GetAttr("shape"); + const auto& shapes = this->subgraph_.GetAttr("shape"); for (const auto &shape_id: 
extra_shape_args_) { std::string shape_name = "extra_" + std::to_string(shape_id) + "_shape"; int ndim = shapes[shape_id].ndim(); @@ -412,12 +412,11 @@ void FusedOp::GenerateCode(const std::vector &req, std::string type_name = mshadowTypeToString(type); std::string dtype_var = "DType_" + input_names[i]; std::string dim_var = "ndim_" + input_names[i]; + std::string dim_val = std::to_string(in_ndims[i]); aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; - aux_code = "static const int " + dim_var + " = " + \ - std::to_string(in_ndims[i]) + ";\n" + aux_code; + aux_code = "static const int " + dim_var + " = " + dim_val + ";\n" + aux_code; tensor_params += dtype_var + "* " +input_names[i]; - //kernel_params += " const Shape<" + dim_var + "> " + input_names[i]+"_shape"; - kernel_params += " const Shape<" + std::to_string(in_ndims[i]) + "> " + input_names[i]+"_shape"; + kernel_params += " const Shape<" + dim_val + "> " + input_names[i]+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; @@ -429,12 +428,11 @@ void FusedOp::GenerateCode(const std::vector &req, std::string out_name = "output" + std::to_string(i - in_dtypes.size()); std::string dtype_var = "DType_" + out_name; std::string dim_var = "ndim_" + out_name; - aux_code = "static const int " + dim_var + " = " + \ - std::to_string(out_ndims[i - in_dtypes.size()]) + ";\n" + aux_code; + std::string dim_val = std::to_string(out_ndims[i - in_dtypes.size()]); + aux_code = "static const int " + dim_var + " = " + dim_val + ";\n" + aux_code; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; tensor_params += dtype_var + "* " + out_name; - //kernel_params += " const Shape<" + dim_var + "> " + out_name+"_shape"; - kernel_params += " const Shape<" + std::to_string(out_ndims[i-in_dtypes.size()]) + "> " + out_name+"_shape"; + kernel_params += " const Shape<" + dim_val + "> " + out_name+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; @@ -619,7 +617,7 @@ void 
FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector ptrs; std::vector> shapes; - const auto& node_shapes = this->symbol_.GetAttr("shape"); + const auto& node_shapes = this->subgraph_.GetAttr("shape"); for (const auto &shape_id: extra_shape_args_) { AddShape(node_shapes[shape_id], &shapes); } From 36201fe567cac0c1414b769e099b51a39f54843a Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Thu, 1 Aug 2019 16:33:42 -0700 Subject: [PATCH 083/105] Added negative axis support in slice_axis, temporarily disabled fusion of slice_like and broadcast_like --- src/operator/fusion/fused_op-inl.h | 4 ++-- src/operator/fusion/fused_op.cu | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index b1d86ffff320..69398de7402c 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -224,8 +224,8 @@ const std::map>> ops_desc = { const std::map slice_ops = { {"slice_axis" , ""}, {"slice" , ""}, - {"slice_like" , ""}, - {"broadcast_like" , ""}, + //{"slice_like" , ""}, + //{"broadcast_like" , ""}, }; const std::vector variable_io_ops = { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 8023a6196801..d2b0a870df91 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -208,9 +208,10 @@ void FusedOp::GenerateCode(const std::vector &req, }; std::string begin; std::string end; + std::string ndim_var_name = "ndim_" + var_name; if (op_name == "broadcast_like" || op_name == "slice_like") { std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; - code += "Shape "+ begin_var_name + ";\n"; + code += "Shape<" + var_name + "> "+ begin_var_name + ";\n"; code += begin_var_name + ".set(0);\n"; int like_id = node.inputs[1].node_id; if (std::find(extra_shape_args_.begin(), extra_shape_args_.end(), like_id) == extra_shape_args_.end()) { @@ -224,10 +225,13 @@ void 
FusedOp::GenerateCode(const std::vector &req, end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); if (op_name == "slice_axis") { std::string axis = source->attrs.dict.at("axis"); + if (std::stoi(axis) < 0) { + axis = ndim_var_name + axis; + } std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; - code += "Shape "+ begin_var_name + ";\n"; - code += "Shape "+ end_var_name + ";\n"; + code += "Shape<" + ndim_var_name + "> "+ begin_var_name + ";\n"; + code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; code += begin_var_name + ".set(0);\n"; code += end_var_name + ".set(INT_MAX);\n"; code += begin_var_name + "["+axis+"] = " + begin + ";\n"; From c077e978ab7da81d99472b40fd0884a48cb455bb Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Thu, 1 Aug 2019 18:10:45 -0700 Subject: [PATCH 084/105] Added axes support to slice_like --- src/operator/fusion/fused_op-inl.h | 2 +- src/operator/fusion/fused_op.cu | 48 +++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 69398de7402c..6df15765c9f9 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -224,7 +224,7 @@ const std::map>> ops_desc = { const std::map slice_ops = { {"slice_axis" , ""}, {"slice" , ""}, - //{"slice_like" , ""}, + {"slice_like" , ""}, //{"broadcast_like" , ""}, }; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index d2b0a870df91..a6bd90029b91 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -86,6 +86,25 @@ inline void replaceString(std::string *input, const std::string old, const std:: } } +inline std::vector splitStringToVector(const std::string& input) { + size_t pos_start = 0, pos_end; + const std::string& s = input.substr(1, input.length()-2); + std::vector res; + + 
while ((pos_end = s.find (",", pos_start)) != std::string::npos) { + std::string token = s.substr (pos_start, pos_end - pos_start); + pos_start = pos_end + 1; + if (token.length() > 0) { + res.push_back (token); + } + } + + if (pos_start < s.length()-1) { + res.push_back (s.substr (pos_start)); + } + return res; +} + std::string ParseOpDescription(const std::vector& op_desc, const std::map, std::string>& variables, const nnvm::IndexedGraph::Node& node) { @@ -198,6 +217,7 @@ void FusedOp::GenerateCode(const std::vector &req, int arg_id = node.inputs[0].node_id; const auto& var_name = g[arg_id].source->attrs.name; const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); + std::string ndim_var_name = "ndim_" + var_name; load_index[arg_id] = 0; auto parse_tuple = [](const std::string& input, const std::string def) { std::string out = input; @@ -206,28 +226,42 @@ void FusedOp::GenerateCode(const std::vector &req, replaceString(&out, "None", def); return out; }; + auto parse_axis = [ndim_var_name](const std::string& axis) { + if (std::stoi(axis) < 0) { + return ndim_var_name + axis; + } + return axis; + }; std::string begin; std::string end; - std::string ndim_var_name = "ndim_" + var_name; if (op_name == "broadcast_like" || op_name == "slice_like") { std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; - code += "Shape<" + var_name + "> "+ begin_var_name + ";\n"; + code += "Shape<" + ndim_var_name + "> "+ begin_var_name + ";\n"; code += begin_var_name + ".set(0);\n"; int like_id = node.inputs[1].node_id; if (std::find(extra_shape_args_.begin(), extra_shape_args_.end(), like_id) == extra_shape_args_.end()) { extra_shape_args_.push_back(like_id); } - std::string end_var_name = "extra_" + std::to_string(like_id) + "_shape"; + std::string end_var_name; + std::string extra_var_name = "extra_" + std::to_string(like_id) + "_shape"; + if (source->attrs.dict.count("axes") == 0) { + end_var_name = extra_var_name; + } else { + std::string axes = 
source->attrs.dict.at("axes"); + end_var_name = var_name + "_" + std::to_string(i) + "_end"; + code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; + code += end_var_name + ".set(INT_MAX);\n"; + for (auto axis: splitStringToVector(axes)) { + code += end_var_name + "["+axis+"] = " + extra_var_name + "["+axis+"]" + + ";\n"; + } + } begin = begin_var_name; end = end_var_name; } else { begin = parse_tuple(source->attrs.dict.at("begin"), "0"); end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); if (op_name == "slice_axis") { - std::string axis = source->attrs.dict.at("axis"); - if (std::stoi(axis) < 0) { - axis = ndim_var_name + axis; - } + std::string axis = parse_axis(source->attrs.dict.at("axis")); std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; code += "Shape<" + ndim_var_name + "> "+ begin_var_name + ";\n"; From 3f0bfb41d6ead1401c27814f8fd2592a1e60a82b Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Fri, 2 Aug 2019 16:52:37 -0700 Subject: [PATCH 085/105] Added axis support to broadcast_like --- src/operator/fusion/fused_op-inl.h | 2 +- src/operator/fusion/fused_op.cu | 36 +++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 6df15765c9f9..b1d86ffff320 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -225,7 +225,7 @@ const std::map slice_ops = { {"slice_axis" , ""}, {"slice" , ""}, {"slice_like" , ""}, - //{"broadcast_like" , ""}, + {"broadcast_like" , ""}, }; const std::vector variable_io_ops = { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index a6bd90029b91..4e0fcd7f5438 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -244,15 +244,35 @@ void FusedOp::GenerateCode(const std::vector &req, } std::string 
end_var_name; std::string extra_var_name = "extra_" + std::to_string(like_id) + "_shape"; - if (source->attrs.dict.count("axes") == 0) { - end_var_name = extra_var_name; + if (op_name == "slice_like") { + if (source->attrs.dict.count("axes") == 0) { + end_var_name = extra_var_name; + } else { + std::string axes = source->attrs.dict.at("axes"); + end_var_name = var_name + "_" + std::to_string(i) + "_end"; + code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; + code += end_var_name + ".set(INT_MAX);\n"; + for (auto axis: splitStringToVector(axes)) { + axis = parse_axis(axis); + code += end_var_name + "["+axis+"] = " + extra_var_name + "["+axis+"]" + + ";\n"; + } + } } else { - std::string axes = source->attrs.dict.at("axes"); - end_var_name = var_name + "_" + std::to_string(i) + "_end"; - code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; - code += end_var_name + ".set(INT_MAX);\n"; - for (auto axis: splitStringToVector(axes)) { - code += end_var_name + "["+axis+"] = " + extra_var_name + "["+axis+"]" + + ";\n"; + if (source->attrs.dict.count("lhs_axes") == 0) { + end_var_name = extra_var_name; + } else { + std::string lhs_axes = source->attrs.dict.at("lhs_axes"); + std::string rhs_axes = source->attrs.dict.at("rhs_axes"); + end_var_name = var_name + "_" + std::to_string(i) + "_end"; + code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; + code += end_var_name + ".set(INT_MAX);\n"; + std::vector v_lhs_axes = splitStringToVector(lhs_axes); + std::vector v_rhs_axes = splitStringToVector(rhs_axes); + for (int i = 0; i < v_lhs_axes.size(); i++) { + std::string lhs_axis = parse_axis(v_lhs_axes[i]); + std::string rhs_axis = parse_axis(v_rhs_axes[i]); + code += end_var_name + "["+lhs_axis+"] = " + extra_var_name + "["+rhs_axis+"]" + + ";\n"; + } } } begin = begin_var_name; From 1e20339157d4c61bdfff82c2618d17df290572f2 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Fri, 9 Aug 2019 09:59:35 -0700 Subject: [PATCH 086/105] Add 
fast_load_slice function to fused op code --- src/operator/fusion/fused_op-inl.h | 60 +++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index b1d86ffff320..fa585a1303c4 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -353,6 +353,20 @@ inline VectorType load_index(const DType * input, int i, const Shap } } +template +inline VectorType global_load_index(const DType * input, int i, const Shape &shape) { + if (i < shape.size) { + const auto* vector_input = reinterpret_cast< + const typename VectorConfig::IndexType *>( + input + i); + VectorType ret = {__ldg(vector_input)}; + return ret; + } else { + VectorType ret({0}); + return ret; + } +} + template inline VectorType load_slice(const DType * input, const Shape& shape, Shape begin, Shape end, int offset) { int idx[nvec]; @@ -372,7 +386,6 @@ inline VectorType load_slice(const DType * input, const Shape strides[dim-1] = strides[dim] * shape[dim]; } } - #pragma unroll for (int j = 0; j < nvec; j++) { idx[j] = 0; @@ -385,11 +398,12 @@ inline VectorType load_slice(const DType * input, const Shape } ref_idx = ref_idx % stride; } - if (j > 0 && (idx[j] != (idx[j-1] + 1))) { - mem_aligned = false; + if (j > 0) { + if (mem_aligned) mem_aligned = (idx[j] == (idx[j-1] + 1)); + } else { + if (mem_aligned) mem_aligned = ((idx[0] & (nvec-1)) == 0); } } - mem_aligned = mem_aligned && ((idx[0] % nvec) == 0); if (!mem_aligned) { VectorType ret; #pragma unroll @@ -398,7 +412,43 @@ inline VectorType load_slice(const DType * input, const Shape } return ret; } - return load_index(input, idx[0], shape); + return global_load_index(input, idx[0], shape); + //return load_index(input, idx[0], shape); +} + +template +inline VectorType fast_load_slice(const DType * input, const Shape& shape, Shape begin, Shape end, int offset) { + int idx[nvec]; + + Shape ref_strides; + Shape strides; + 
ref_strides[ndim-1] = 1; + strides[ndim-1] = 1; + #pragma unroll + for (int dim = ndim-1; dim >=0; dim--) { + if (begin[dim] < 0) begin[dim] = shape[dim] - begin[dim]; + if (end[dim] < 0) end[dim] = shape[dim] - end[dim]; + if (end[dim] == INT_MAX) end[dim] = shape[dim]; + if (dim > 0) { + ref_strides[dim-1] = ref_strides[dim] * (end[dim] - begin[dim]); + strides[dim-1] = strides[dim] * shape[dim]; + } + } + #pragma unroll + for (int j = 0; j < nvec; j++) { + idx[j] = 0; + int ref_idx = offset + j; + #pragma unroll + for (int dim = 0; dim < ndim; dim++) { + int stride = ref_strides[dim]; + if (shape[dim] > 1) { + idx[j] += (ref_idx / stride + begin[dim]) * strides[dim]; + } + ref_idx = ref_idx % stride; + } + } + return global_load_index(input, idx[0], shape); + //return load_index(input, idx[0], shape); } template From 13b3076a34112fdb5cfe1ffb504a94eec49c2ab4 Mon Sep 17 00:00:00 2001 From: Chaitanya Talnikar Date: Wed, 14 Aug 2019 11:22:05 -0700 Subject: [PATCH 087/105] Added runtime switch for choosing fast and slow slice kernel --- src/operator/fusion/fused_op-inl.h | 43 +++---- src/operator/fusion/fused_op.cu | 185 +++++++++++++++++++---------- src/operator/fusion/fused_op.h | 21 ++-- 3 files changed, 148 insertions(+), 101 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index fa585a1303c4..b516aac4a191 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -370,7 +370,6 @@ inline VectorType global_load_index(const DType * input, int i, con template inline VectorType load_slice(const DType * input, const Shape& shape, Shape begin, Shape end, int offset) { int idx[nvec]; - bool mem_aligned = true; Shape ref_strides; Shape strides; @@ -398,27 +397,18 @@ inline VectorType load_slice(const DType * input, const Shape } ref_idx = ref_idx % stride; } - if (j > 0) { - if (mem_aligned) mem_aligned = (idx[j] == (idx[j-1] + 1)); - } else { - if (mem_aligned) mem_aligned = ((idx[0] & 
(nvec-1)) == 0); - } } - if (!mem_aligned) { - VectorType ret; - #pragma unroll - for (int j = 0; j < nvec; j++) { - ret.x[j] = *(input + idx[j]); - } - return ret; + VectorType ret; + #pragma unroll + for (int j = 0; j < nvec; j++) { + ret.x[j] = *(input + idx[j]); } - return global_load_index(input, idx[0], shape); - //return load_index(input, idx[0], shape); + return ret; } template inline VectorType fast_load_slice(const DType * input, const Shape& shape, Shape begin, Shape end, int offset) { - int idx[nvec]; + int idx = 0; Shape ref_strides; Shape strides; @@ -434,21 +424,16 @@ inline VectorType fast_load_slice(const DType * input, const Shape< strides[dim-1] = strides[dim] * shape[dim]; } } + int ref_idx = offset; #pragma unroll - for (int j = 0; j < nvec; j++) { - idx[j] = 0; - int ref_idx = offset + j; - #pragma unroll - for (int dim = 0; dim < ndim; dim++) { - int stride = ref_strides[dim]; - if (shape[dim] > 1) { - idx[j] += (ref_idx / stride + begin[dim]) * strides[dim]; - } - ref_idx = ref_idx % stride; - } + for (int dim = 0; dim < ndim; dim++) { + int stride = ref_strides[dim]; + if (shape[dim] > 1) { + idx += (ref_idx / stride + begin[dim]) * strides[dim]; + } + ref_idx = ref_idx % stride; } - return global_load_index(input, idx[0], shape); - //return load_index(input, idx[0], shape); + return global_load_index(input, idx, shape); } template diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 4e0fcd7f5438..ff61b21db091 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -86,21 +86,28 @@ inline void replaceString(std::string *input, const std::string old, const std:: } } -inline std::vector splitStringToVector(const std::string& input) { +inline std::vector splitStringToVector(const std::string& input, const std::string def) { size_t pos_start = 0, pos_end; const std::string& s = input.substr(1, input.length()-2); - std::vector res; + std::vector res; + + auto convert_token = 
[def](std::string token){ + if (token == def) { + return 0; + } + return std::stoi(token); + }; while ((pos_end = s.find (",", pos_start)) != std::string::npos) { std::string token = s.substr (pos_start, pos_end - pos_start); pos_start = pos_end + 1; if (token.length() > 0) { - res.push_back (token); + res.push_back (convert_token(token)); } } - if (pos_start < s.length()-1) { - res.push_back (s.substr (pos_start)); + if (pos_start < s.length()) { + res.push_back (convert_token(s.substr (pos_start))); } return res; } @@ -127,7 +134,7 @@ std::string ParseOpDescription(const std::vector& op_desc, return fmt; } -void AddShape(const mxnet::TShape& shape, +void AddShape(const mxnet::TShape& shape, std::vector>* shapes) { // We need alignment to 8 bytes for size_t in the Shape struct // so if ndim is odd, there will be 4B of padding @@ -158,21 +165,23 @@ void AddPointerAndShape(const TBlob& data, } // namespace -void FusedOp::GenerateCode(const std::vector &req, +void FusedOp::GenerateCode(int kernel_index, const std::vector &req, const std::vector &in_dtypes, const std::vector &out_dtypes, const std::vector &in_ndims, const std::vector &out_ndims, + const mxnet::ShapeVector &node_shapes, + const std::vector &node_dtypes, const int nvec, const std::string &kernel_name, - const std::vector &node_shapes, - const std::vector &node_dtypes) { + std::vector* check_shapes) { const auto& g = this->subgraph_.indexed_graph(); std::string code = ""; int temp_name_counter = 0; using NodeEntry = nnvm::IndexedGraph::NodeEntry; std::map, std::string> variables; std::map load_index; + bool check_shapes_compile = true; std::vector outputs(g.num_nodes()); @@ -215,86 +224,117 @@ void FusedOp::GenerateCode(const std::vector &req, std::string op_name = source->op()->name; if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { int arg_id = node.inputs[0].node_id; + const auto& shape = node_shapes[arg_id]; + const int ndim = shape.ndim(); const auto& var_name = 
g[arg_id].source->attrs.name; const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); - std::string ndim_var_name = "ndim_" + var_name; load_index[arg_id] = 0; auto parse_tuple = [](const std::string& input, const std::string def) { std::string out = input; replaceString(&out, "(", "{"); replaceString(&out, ")", "}"); replaceString(&out, "None", def); + replaceString(&out, " ", ""); return out; }; - auto parse_axis = [ndim_var_name](const std::string& axis) { - if (std::stoi(axis) < 0) { - return ndim_var_name + axis; + auto build_tuple = [ndim](int axis, const std::string str, const std::string def) { + std::string tuple = "{"; + for (int i = 0; i < axis; i++) { + tuple = tuple + def + ","; + } + tuple += str; + for (int i = axis + 1; i < ndim; i++) { + tuple = tuple + "," + def; + } + tuple += "}"; + return tuple; + }; + auto check_tuple = [ndim, nvec](const std::string str) { + std::vector tuple = splitStringToVector(str, "INT_MAX"); + if (tuple[ndim-1] % nvec == 0) { + return true; + } + return false; + }; + auto build_string_axis = [ndim](int axis) { + if (axis < 0) { + axis = ndim + axis; } - return axis; - }; + return std::to_string(axis); + }; + auto build_string_end = [i, ndim, var_name](std::string* code) { + std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; + *code += "Shape<" + std::to_string(ndim) + "> "+ end_var_name + ";\n"; + *code += end_var_name + ".set(INT_MAX);\n"; + return end_var_name; + }; std::string begin; std::string end; if (op_name == "broadcast_like" || op_name == "slice_like") { - std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; - code += "Shape<" + ndim_var_name + "> "+ begin_var_name + ";\n"; - code += begin_var_name + ".set(0);\n"; int like_id = node.inputs[1].node_id; + begin = build_tuple(0, "0", "0"); + std::string end_var_name; + std::string extra_var_name = "extra_" + std::to_string(like_id) + "_shape"; if (std::find(extra_shape_args_.begin(), extra_shape_args_.end(), 
like_id) == extra_shape_args_.end()) { extra_shape_args_.push_back(like_id); } - std::string end_var_name; - std::string extra_var_name = "extra_" + std::to_string(like_id) + "_shape"; if (op_name == "slice_like") { if (source->attrs.dict.count("axes") == 0) { end_var_name = extra_var_name; + if (check_shapes) { + check_shapes->push_back(like_id); + check_shapes->push_back(arg_id); + } } else { std::string axes = source->attrs.dict.at("axes"); - end_var_name = var_name + "_" + std::to_string(i) + "_end"; - code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; - code += end_var_name + ".set(INT_MAX);\n"; - for (auto axis: splitStringToVector(axes)) { - axis = parse_axis(axis); + end_var_name = build_string_end(&code); + for (auto ax: splitStringToVector(axes, "")) { + std::string axis = build_string_axis(ax); code += end_var_name + "["+axis+"] = " + extra_var_name + "["+axis+"]" + + ";\n"; } } } else { if (source->attrs.dict.count("lhs_axes") == 0) { end_var_name = extra_var_name; + if (check_shapes) { + check_shapes->push_back(like_id); + check_shapes->push_back(arg_id); + } } else { std::string lhs_axes = source->attrs.dict.at("lhs_axes"); std::string rhs_axes = source->attrs.dict.at("rhs_axes"); - end_var_name = var_name + "_" + std::to_string(i) + "_end"; - code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; - code += end_var_name + ".set(INT_MAX);\n"; - std::vector v_lhs_axes = splitStringToVector(lhs_axes); - std::vector v_rhs_axes = splitStringToVector(rhs_axes); + end_var_name = build_string_end(&code); + std::vector v_lhs_axes = splitStringToVector(lhs_axes, ""); + std::vector v_rhs_axes = splitStringToVector(rhs_axes, ""); for (int i = 0; i < v_lhs_axes.size(); i++) { - std::string lhs_axis = parse_axis(v_lhs_axes[i]); - std::string rhs_axis = parse_axis(v_rhs_axes[i]); + std::string lhs_axis = build_string_axis(v_lhs_axes[i]); + std::string rhs_axis = build_string_axis(v_rhs_axes[i]); code += end_var_name + "["+lhs_axis+"] = " + 
extra_var_name + "["+rhs_axis+"]" + + ";\n"; } } } - begin = begin_var_name; end = end_var_name; } else { begin = parse_tuple(source->attrs.dict.at("begin"), "0"); end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); if (op_name == "slice_axis") { - std::string axis = parse_axis(source->attrs.dict.at("axis")); - std::string begin_var_name = var_name + "_" + std::to_string(i) + "_begin"; - std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; - code += "Shape<" + ndim_var_name + "> "+ begin_var_name + ";\n"; - code += "Shape<" + ndim_var_name + "> "+ end_var_name + ";\n"; - code += begin_var_name + ".set(0);\n"; - code += end_var_name + ".set(INT_MAX);\n"; - code += begin_var_name + "["+axis+"] = " + begin + ";\n"; - code += end_var_name + "["+axis+"] = " + end + ";\n"; - begin = begin_var_name; - end = end_var_name; + int axis = std::stoi(source->attrs.dict.at("axis")); + begin = build_tuple(axis, begin, "0"); + end = build_tuple(axis, end, "INT_MAX"); } + if (check_shapes) { + if (check_tuple(begin) && check_tuple(end)) { + check_shapes->push_back(arg_id); + } else { + check_shapes_compile = false; + } + } + } + std::string slice_func = "load_slice"; + if (!check_shapes) { + slice_func = "fast_" + slice_func; } - code += "const auto " + vec_name + " = load_slice(" + + code += "const auto " + vec_name + " = " + slice_func + "(" + var_name + ", " + var_name + "_shape," + begin + "," + end + ", offset);\n"; CHECK_EQ(outputs[i], 1); @@ -305,6 +345,10 @@ void FusedOp::GenerateCode(const std::vector &req, } } + if (!check_shapes_compile) { + check_shapes->clear(); + } + size_t counter = 0; for (const auto& entry : g.outputs()) { std::string var_name = "output" + std::to_string(counter); @@ -444,11 +488,11 @@ void FusedOp::GenerateCode(const std::vector &req, ++counter; } - this->code_ = code; + this->code_[kernel_index] = code; // Add boilerplate and type information if (dmlc::GetEnv("MXNET_FUSION_VERBOSE", false)) { - LOG(INFO) << code_; + 
LOG(INFO) << code_[kernel_index]; } std::string kernel_params = ""; std::string tensor_params = ""; @@ -459,10 +503,9 @@ void FusedOp::GenerateCode(const std::vector &req, size_t i = 0; std::string aux_code = "static const int nvec = " + std::to_string(nvec) + ";\n"; - const auto& shapes = this->subgraph_.GetAttr("shape"); for (const auto &shape_id: extra_shape_args_) { std::string shape_name = "extra_" + std::to_string(shape_id) + "_shape"; - int ndim = shapes[shape_id].ndim(); + int ndim = node_shapes[shape_id].ndim(); kernel_params += " const Shape<" + std::to_string(ndim) + "> " + shape_name; kernel_params += ", "; } @@ -499,24 +542,24 @@ void FusedOp::GenerateCode(const std::vector &req, } kernel_params += tensor_params; - code_ = std::string(fusion::fp16_support_string) + "\n" + + code_[kernel_index] = std::string(fusion::fp16_support_string) + "\n" + fusion::type_support_string + "\n" + fusion::function_definitions + "\n" + aux_code + "\n" + "__global__ void FusedKernel_" + kernel_name + "(size_t N, " + kernel_params + ") {\n" + fusion::kernel_begin + "\n" + - code_ + "\n" + + code_[kernel_index] + "\n" + fusion::kernel_end; } -void FusedOp::CompileCode(const std::string &kernel_name) { +void FusedOp::CompileCode(int kernel_index, const std::string &kernel_name) { // Guard NVRTC calls std::lock_guard lock_nvrtc(mutex_); nvrtcProgram program; NVRTC_CALL( nvrtcCreateProgram(&program, // prog - &code_[0], // buffer + &code_[kernel_index][0], // buffer (kernel_name + "_kernel.cu").c_str(), // name 0, // num headers NULL, // headers @@ -543,13 +586,13 @@ void FusedOp::CompileCode(const std::string &kernel_name) { // Obtain PTX from the program. 
size_t ptx_size; NVRTC_CALL(nvrtcGetPTXSize(program, &ptx_size)); - ptx_.reserve(ptx_size); - NVRTC_CALL(nvrtcGetPTX(program, &ptx_[0])); + ptx_[kernel_index].reserve(ptx_size); + NVRTC_CALL(nvrtcGetPTX(program, &ptx_[kernel_index][0])); const char *name; NVRTC_CALL(nvrtcGetLoweredName(program, kernel_name_demangled.c_str(), &name)); - kernel_name_ = name; + kernel_name_[kernel_index] = name; // Destroy the program. NVRTC_CALL(nvrtcDestroyProgram(&program)); int device; @@ -559,10 +602,10 @@ void FusedOp::CompileCode(const std::string &kernel_name) { CUDA_CALL(cudaGetDevice(&device)); CUDA_DRIVER_CALL(cuDeviceGet(&cu_device, device)); CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cu_device)); - CUDA_DRIVER_CALL(cuModuleLoadData(&module, &ptx_[0])); - CUDA_DRIVER_CALL(cuModuleGetFunction(&kernel_, + CUDA_DRIVER_CALL(cuModuleLoadData(&module, &ptx_[kernel_index][0])); + CUDA_DRIVER_CALL(cuModuleGetFunction(&kernel_[kernel_index], module, - kernel_name_.c_str())); + kernel_name_[kernel_index].c_str())); } bool FusedOp::CheckComputeCapability(const OpContext &ctx) { @@ -655,9 +698,14 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, saved_reqs_ = req; if (!initialized_) { - this->GenerateCode(req, in_dtypes, out_dtypes, in_ndims, out_ndims, - nvec, attrs.name, node_shapes, node_dtypes); - this->CompileCode(attrs.name); + this->GenerateCode(0, req, in_dtypes, out_dtypes, in_ndims, out_ndims, + node_shapes, node_dtypes, nvec, attrs.name, &check_shape_args_); + this->CompileCode(0, attrs.name); + if (check_shape_args_.size() > 0) { + this->GenerateCode(1, req, in_dtypes, out_dtypes, in_ndims, out_ndims, + node_shapes, node_dtypes, nvec, attrs.name, NULL); + this->CompileCode(1, attrs.name); + } initialized_ = true; } Stream* s = ctx.get_stream(); @@ -675,7 +723,6 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector ptrs; std::vector> shapes; - const auto& node_shapes = this->subgraph_.GetAttr("shape"); for (const auto &shape_id: extra_shape_args_) 
{ AddShape(node_shapes[shape_id], &shapes); } @@ -692,8 +739,18 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, for (auto &ptr : ptrs) { args.push_back(reinterpret_cast(&ptr)); } + int kernel_index = 0; + if (check_shape_args_.size() > 0) { + kernel_index = 1; + for (const auto &shape_id: check_shape_args_) { + const auto& shape = node_shapes[shape_id]; + if (shape[shape.ndim()-1] % nvec != 0) { + kernel_index = 0; + } + } + } CUDA_DRIVER_CALL( - cuLaunchKernel(kernel_, + cuLaunchKernel(kernel_[kernel_index], num_blocks, 1, 1, // grid dim FusedOp::NTHREADS, 1, 1, // block dim 0, stream, // shared mem and stream diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 176ad04956a7..f2f2ed394985 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -30,6 +30,7 @@ #if MXNET_USE_CUDA + namespace mxnet { struct FusedOpConfig : public dmlc::Parameter { @@ -102,16 +103,19 @@ class FusedOp { } private: - void GenerateCode(const std::vector &req, + void GenerateCode(int kernel_index, + const std::vector &req, const std::vector &in_dtypes, const std::vector &out_dtypes, const std::vector &in_ndims, const std::vector &out_ndims, + const mxnet::ShapeVector &node_shapes, + const std::vector &node_dtypes, const int nvec, const std::string& kernel_name, - const std::vector &node_shapes, - const std::vector &node_dtypes); - void CompileCode(const std::string &kernel_name); + std::vector *check_shapes); + void CompileCode(int kernel_index, + const std::string &kernel_name); bool CheckComputeCapability(const OpContext &ctx); void CheckShapesAndTypes(const std::vector &inputs, const std::vector &outputs, @@ -124,7 +128,7 @@ class FusedOp { std::vector inputs_; std::vector outputs_; - std::string code_; + std::string code_[2]; nnvm::Graph subgraph_; template @@ -149,11 +153,12 @@ class FusedOp { std::vector> aux_out_types; std::vector saved_reqs_; std::vector extra_shape_args_; + std::vector check_shape_args_; - 
std::string ptx_; - std::string kernel_name_; + std::string ptx_[2]; + std::string kernel_name_[2]; + CUfunction kernel_[2]; bool initialized_; - CUfunction kernel_; int cc_major_; int cc_minor_; From e5649e1fb4996b5ec435da566a9b92dce6e8dfc2 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 20 Aug 2019 11:26:02 -0700 Subject: [PATCH 088/105] Fix lint and warning --- src/operator/fusion/fused_op.cu | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index ff61b21db091..dd12e94d933e 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -98,16 +98,16 @@ inline std::vector splitStringToVector(const std::string& input, const std: return std::stoi(token); }; - while ((pos_end = s.find (",", pos_start)) != std::string::npos) { - std::string token = s.substr (pos_start, pos_end - pos_start); + while ((pos_end = s.find(",", pos_start)) != std::string::npos) { + std::string token = s.substr(pos_start, pos_end - pos_start); pos_start = pos_end + 1; if (token.length() > 0) { - res.push_back (convert_token(token)); + res.push_back(convert_token(token)); } } if (pos_start < s.length()) { - res.push_back (convert_token(s.substr (pos_start))); + res.push_back(convert_token(s.substr(pos_start))); } return res; } @@ -275,7 +275,8 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, begin = build_tuple(0, "0", "0"); std::string end_var_name; std::string extra_var_name = "extra_" + std::to_string(like_id) + "_shape"; - if (std::find(extra_shape_args_.begin(), extra_shape_args_.end(), like_id) == extra_shape_args_.end()) { + if (std::find(extra_shape_args_.begin(), extra_shape_args_.end(), like_id) == + extra_shape_args_.end()) { extra_shape_args_.push_back(like_id); } if (op_name == "slice_like") { @@ -288,9 +289,10 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, } else { std::string axes = 
source->attrs.dict.at("axes"); end_var_name = build_string_end(&code); - for (auto ax: splitStringToVector(axes, "")) { + for (auto ax : splitStringToVector(axes, "")) { std::string axis = build_string_axis(ax); - code += end_var_name + "["+axis+"] = " + extra_var_name + "["+axis+"]" + + ";\n"; + code += end_var_name + "["+axis+"] = " + + extra_var_name + "["+axis+"];\n"; } } } else { @@ -306,10 +308,11 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, end_var_name = build_string_end(&code); std::vector v_lhs_axes = splitStringToVector(lhs_axes, ""); std::vector v_rhs_axes = splitStringToVector(rhs_axes, ""); - for (int i = 0; i < v_lhs_axes.size(); i++) { + for (size_t i = 0; i < v_lhs_axes.size(); i++) { std::string lhs_axis = build_string_axis(v_lhs_axes[i]); std::string rhs_axis = build_string_axis(v_rhs_axes[i]); - code += end_var_name + "["+lhs_axis+"] = " + extra_var_name + "["+rhs_axis+"]" + + ";\n"; + code += end_var_name + "["+lhs_axis+"] = " + + extra_var_name + "["+rhs_axis+"];\n"; } } } @@ -503,7 +506,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, size_t i = 0; std::string aux_code = "static const int nvec = " + std::to_string(nvec) + ";\n"; - for (const auto &shape_id: extra_shape_args_) { + for (const auto &shape_id : extra_shape_args_) { std::string shape_name = "extra_" + std::to_string(shape_id) + "_shape"; int ndim = node_shapes[shape_id].ndim(); kernel_params += " const Shape<" + std::to_string(ndim) + "> " + shape_name; @@ -723,7 +726,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, std::vector ptrs; std::vector> shapes; - for (const auto &shape_id: extra_shape_args_) { + for (const auto &shape_id : extra_shape_args_) { AddShape(node_shapes[shape_id], &shapes); } for (const auto &data : inputs) { @@ -742,7 +745,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, int kernel_index = 0; if (check_shape_args_.size() > 0) { kernel_index = 1; - for (const auto &shape_id: check_shape_args_) 
{ + for (const auto &shape_id : check_shape_args_) { const auto& shape = node_shapes[shape_id]; if (shape[shape.ndim()-1] % nvec != 0) { kernel_index = 0; From 868bcf69e485d8ab7c8f60d75a69ab113f72858e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 21 Aug 2019 10:23:49 -0700 Subject: [PATCH 089/105] Going easy on Windows compiler (again) --- src/operator/fusion/fused_op-inl.h | 331 +++++++++++++++-------------- src/operator/fusion/fused_op.cu | 1 + 2 files changed, 168 insertions(+), 164 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index b516aac4a191..0237403da9c4 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -525,42 +525,21 @@ inline DType relu(const DType val) { return val > 0 ? val : 0; } -template -inline DType backward_relu(const DType val, const DType grad) { - return val > 0 ? grad : 0; -} - template inline DType sigmoid(const DType val) { return 1.f/(1 + expf(-val)); } -template -inline DType backward_sigmoid(const DType out, const DType grad) { - return grad * out * (1 - out); -} - template inline DType softrelu(const DType val) { return logf(1 + expf(val)); } -template -inline DType backward_softrelu(const DType val, const DType grad) { - return grad * sigmoid(val); -} - template inline DType softsign(const DType val) { return val / (1 + fabsf(val)); } -template -inline DType backward_softsign(const DType val, const DType grad) { - const DType ap1 = 1 + fabsf(val); - return grad / (ap1 * ap1); -} - // exp and log template @@ -568,61 +547,31 @@ inline DType exp(const DType val) { return expf(val); } -template -inline DType backward_exp(const DType val, const DType grad) { - return grad * expf(val); -} - template inline DType expm1(const DType val) { return expm1f(val); } -template -inline DType backward_expm1(const DType val, const DType grad) { - return grad * expf(val); -} - template inline DType log(const DType val) { return logf(val); } -template 
-inline DType backward_log(const DType val, const DType grad) { - return grad / val; -} - template inline DType log10(const DType val) { return log10f(val); } -template -inline DType backward_log10(const DType val, const DType grad) { - return grad / (val * logf(10)); -} - template inline DType log2(const DType val) { return log2f(val); } -template -inline DType backward_log2(const DType val, const DType grad) { - return grad / (val * logf(2)); -} - template inline DType log1p(const DType val) { return log1pf(val); } -template -inline DType backward_log1p(const DType val, const DType grad) { - return grad / (1 + val); -} - // trigonometric constexpr double pi = 3.14159265358979323846; @@ -642,123 +591,61 @@ inline DType sin(const DType val) { return sinf(val); } -template -inline DType backward_sin(const DType val, const DType grad) { - return grad * cosf(val); -} - template inline DType cos(const DType val) { return cosf(val); } -template -inline DType backward_cos(const DType val, const DType grad) { - return -grad * sinf(val); -} - template inline DType tan(const DType val) { return tanf(val); } -// Uses output from tan -template -inline DType backward_tan(const DType out, const DType grad) { - return grad * (out * out + 1); -} - template inline DType arcsin(const DType val) { return asinf(val); } -template -inline DType backward_arcsin(const DType val, const DType grad) { - return grad / sqrtf(1 - val*val); -} - template inline DType arccos(const DType val) { return acosf(val); } -template -inline DType backward_arccos(const DType val, const DType grad) { - return -grad / sqrtf(1 - val*val); -} - template inline DType arctan(const DType val) { return atanf(val); } -template -inline DType backward_arctan(const DType val, const DType grad) { - return grad / (1 + val*val); -} - template inline DType sinh(const DType val) { return sinhf(val); } -template -inline DType backward_sinh(const DType val, const DType grad) { - return grad * coshf(val); -} - template 
inline DType cosh(const DType val) { return coshf(val); } -template -inline DType backward_cosh(const DType val, const DType grad) { - return grad * sinhf(val); -} - template inline DType tanh(const DType val) { return tanhf(val); } -// Uses tanh output -template -inline DType backward_tanh(const DType out, const DType grad) { - return grad * (1 - out * out); -} - template inline DType arcsinh(const DType val) { return asinhf(val); } -template -inline DType backward_arcsinh(const DType val, const DType grad) { - return grad / sqrtf(val * val + 1); -} - template inline DType arccosh(const DType val) { return acoshf(val); } -template -inline DType backward_arccosh(const DType val, const DType grad) { - return grad / sqrtf(val * val - 1); -} - template inline DType arctanh(const DType val) { return atanhf(val); } -template -inline DType backward_arctanh(const DType val, const DType grad) { - return grad / (1 - val * val); -} - // sqrt template @@ -766,53 +653,26 @@ inline DType sqrt(const DType val) { return sqrtf(val); } -template -inline DType backward_sqrt(const DType out, const DType grad) { - return 0.5 * grad / out; -} - template inline DType rsqrt(const DType val) { return rsqrtf(val); } -template -inline DType backward_rsqrt(const DType val, const DType grad) { - const DType inv = 1 / val; - return -0.5 * grad * sqrtf(inv) * inv; -} - template inline DType cbrt(const DType val) { return cbrtf(val); } -template -inline DType backward_cbrt(const DType out, const DType grad) { - return grad / (3.0f * out * out); -} - template inline DType rcbrt(const DType val) { return rcbrtf(val); } -template -inline DType backward_rcbrt(const DType val, const DType grad) { - const DType inv = 1 / val; - return -1.f/3.f * grad * cbrtf(inv) * inv; -} - template inline DType square(const DType val) { return val * val; } -template -inline DType backward_square(const DType val, const DType grad) { - return 2 * val * grad; -} - template inline typename LoadType::Type zero(const 
DType val) { return 0; @@ -870,15 +730,6 @@ inline DType clip(const DType val, const float a_min, const float a_max) { return max(min(val, a_max), a_min); } -template -inline DType backward_clip(const DType val, const DType grad, const float a_min, const float a_max) { - if (val > a_max || val < a_min) { - return 0; - } else { - return grad; - } -} - template inline DType sign(const DType val) { if (val < 0) return -1; @@ -890,11 +741,6 @@ inline DType reciprocal(const DType val) { return 1.0f / val; } -template -inline DType backward_reciprocal(const DType val, const DType grad) { - return -grad / (val * val); -} - template inline DType abs(const DType val) { return fabsf(val); @@ -915,21 +761,11 @@ inline DType erf(const DType val) { return erff(val); } -template -inline DType backward_erf(const DType val, const DType grad) { - return 2.0f / sqrt(pi) * exp(-(val*val)) * grad; -} - template inline DType erfinv(const DType val) { return erfinvf(val); } -template -inline DType backward_erfinv(const DType val, const DType grad) { - return 0.5f * sqrt(pi) * exp(val * val) * grad; -} - template inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { const auto bsq = scalar * scalar; @@ -943,6 +779,173 @@ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { } } +)code"; + +const char backward_function_definitions[] = R"code( +template +inline DType backward_relu(const DType val, const DType grad) { + return val > 0 ? 
grad : 0; +} + +template +inline DType backward_sigmoid(const DType out, const DType grad) { + return grad * out * (1 - out); +} + +template +inline DType backward_softrelu(const DType val, const DType grad) { + return grad * sigmoid(val); +} + +template +inline DType backward_softsign(const DType val, const DType grad) { + const DType ap1 = 1 + fabsf(val); + return grad / (ap1 * ap1); +} + +template +inline DType backward_exp(const DType val, const DType grad) { + return grad * expf(val); +} + +template +inline DType backward_expm1(const DType val, const DType grad) { + return grad * expf(val); +} + +template +inline DType backward_log(const DType val, const DType grad) { + return grad / val; +} + +template +inline DType backward_log10(const DType val, const DType grad) { + return grad / (val * logf(10)); +} + +template +inline DType backward_log2(const DType val, const DType grad) { + return grad / (val * logf(2)); +} + +template +inline DType backward_log1p(const DType val, const DType grad) { + return grad / (1 + val); +} + +template +inline DType backward_sin(const DType val, const DType grad) { + return grad * cosf(val); +} + +template +inline DType backward_cos(const DType val, const DType grad) { + return -grad * sinf(val); +} + +// Uses output from tan +template +inline DType backward_tan(const DType out, const DType grad) { + return grad * (out * out + 1); +} + +template +inline DType backward_arcsin(const DType val, const DType grad) { + return grad / sqrtf(1 - val*val); +} + +template +inline DType backward_arccos(const DType val, const DType grad) { + return -grad / sqrtf(1 - val*val); +} + +template +inline DType backward_arctan(const DType val, const DType grad) { + return grad / (1 + val*val); +} + +template +inline DType backward_sinh(const DType val, const DType grad) { + return grad * coshf(val); +} + +template +inline DType backward_cosh(const DType val, const DType grad) { + return grad * sinhf(val); +} + +// Uses tanh output +template +inline 
DType backward_tanh(const DType out, const DType grad) { + return grad * (1 - out * out); +} + +template +inline DType backward_arcsinh(const DType val, const DType grad) { + return grad / sqrtf(val * val + 1); +} + +template +inline DType backward_arccosh(const DType val, const DType grad) { + return grad / sqrtf(val * val - 1); +} + +template +inline DType backward_arctanh(const DType val, const DType grad) { + return grad / (1 - val * val); +} + +template +inline DType backward_sqrt(const DType out, const DType grad) { + return 0.5 * grad / out; +} + +template +inline DType backward_rsqrt(const DType val, const DType grad) { + const DType inv = 1 / val; + return -0.5 * grad * sqrtf(inv) * inv; +} + +template +inline DType backward_cbrt(const DType out, const DType grad) { + return grad / (3.0f * out * out); +} + +template +inline DType backward_rcbrt(const DType val, const DType grad) { + const DType inv = 1 / val; + return -1.f/3.f * grad * cbrtf(inv) * inv; +} + +template +inline DType backward_square(const DType val, const DType grad) { + return 2 * val * grad; +} + +template +inline DType backward_clip(const DType val, const DType grad, const float a_min, const float a_max) { + if (val > a_max || val < a_min) { + return 0; + } else { + return grad; + } +} + +template +inline DType backward_reciprocal(const DType val, const DType grad) { + return -grad / (val * val); +} + +template +inline DType backward_erf(const DType val, const DType grad) { + return 2.0f / sqrt(pi) * exp(-(val*val)) * grad; +} + +template +inline DType backward_erfinv(const DType val, const DType grad) { + return 0.5f * sqrt(pi) * exp(val * val) * grad; +} + template inline DType backward_smooth_l1(const DType val, const DType2 scalar, const DType grad) { auto bsq = scalar * scalar; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index dd12e94d933e..01318329a046 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -548,6 
+548,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, code_[kernel_index] = std::string(fusion::fp16_support_string) + "\n" + fusion::type_support_string + "\n" + fusion::function_definitions + "\n" + + fusion::backward_function_definitions + "\n" + aux_code + "\n" + "__global__ void FusedKernel_" + kernel_name + "(size_t N, " + kernel_params + ") {\n" + From 1608d6ae7cdf28052d1d7ae1b706e976b8055dbb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 4 Sep 2019 09:13:01 -0700 Subject: [PATCH 090/105] Fix slice_like --- src/operator/fusion/fused_op-inl.h | 2 +- src/operator/fusion/fused_op.cu | 57 +++++++----------------------- src/operator/fusion/fused_op.h | 6 ++-- tests/python/gpu/test_fusion.py | 11 +++++- 4 files changed, 26 insertions(+), 50 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 0237403da9c4..37bdaf01cb74 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -225,7 +225,7 @@ const std::map slice_ops = { {"slice_axis" , ""}, {"slice" , ""}, {"slice_like" , ""}, - {"broadcast_like" , ""}, + // {"broadcast_like" , ""}, }; const std::vector variable_io_ops = { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 01318329a046..7f6fe3796850 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -174,7 +174,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, const std::vector &node_dtypes, const int nvec, const std::string &kernel_name, - std::vector* check_shapes) { + std::vector* check_shapes) { const auto& g = this->subgraph_.indexed_graph(); std::string code = ""; int temp_name_counter = 0; @@ -223,12 +223,13 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, } else { std::string op_name = source->op()->name; if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { - int arg_id = node.inputs[0].node_id; - const auto& 
shape = node_shapes[arg_id]; + int node_id = node.inputs[0].node_id; + const uint32_t input_entry_id = g.entry_id(node.inputs[0]); + const auto& shape = node_shapes[input_entry_id]; const int ndim = shape.ndim(); - const auto& var_name = g[arg_id].source->attrs.name; + const auto& var_name = g[node_id].source->attrs.name; const auto vec_name = "vec_" + var_name + "_" + std::to_string(i); - load_index[arg_id] = 0; + load_index[node_id] = 0; auto parse_tuple = [](const std::string& input, const std::string def) { std::string out = input; replaceString(&out, "(", "{"); @@ -271,52 +272,18 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, std::string begin; std::string end; if (op_name == "broadcast_like" || op_name == "slice_like") { - int like_id = node.inputs[1].node_id; + uint32_t like_id = g.entry_id(i, 0); begin = build_tuple(0, "0", "0"); - std::string end_var_name; std::string extra_var_name = "extra_" + std::to_string(like_id) + "_shape"; if (std::find(extra_shape_args_.begin(), extra_shape_args_.end(), like_id) == extra_shape_args_.end()) { extra_shape_args_.push_back(like_id); } - if (op_name == "slice_like") { - if (source->attrs.dict.count("axes") == 0) { - end_var_name = extra_var_name; - if (check_shapes) { - check_shapes->push_back(like_id); - check_shapes->push_back(arg_id); - } - } else { - std::string axes = source->attrs.dict.at("axes"); - end_var_name = build_string_end(&code); - for (auto ax : splitStringToVector(axes, "")) { - std::string axis = build_string_axis(ax); - code += end_var_name + "["+axis+"] = " + - extra_var_name + "["+axis+"];\n"; - } - } - } else { - if (source->attrs.dict.count("lhs_axes") == 0) { - end_var_name = extra_var_name; - if (check_shapes) { - check_shapes->push_back(like_id); - check_shapes->push_back(arg_id); - } - } else { - std::string lhs_axes = source->attrs.dict.at("lhs_axes"); - std::string rhs_axes = source->attrs.dict.at("rhs_axes"); - end_var_name = build_string_end(&code); - std::vector 
v_lhs_axes = splitStringToVector(lhs_axes, ""); - std::vector v_rhs_axes = splitStringToVector(rhs_axes, ""); - for (size_t i = 0; i < v_lhs_axes.size(); i++) { - std::string lhs_axis = build_string_axis(v_lhs_axes[i]); - std::string rhs_axis = build_string_axis(v_rhs_axes[i]); - code += end_var_name + "["+lhs_axis+"] = " + - extra_var_name + "["+rhs_axis+"];\n"; - } - } + if (check_shapes) { + check_shapes->push_back(like_id); + check_shapes->push_back(input_entry_id); } - end = end_var_name; + end = extra_var_name; } else { begin = parse_tuple(source->attrs.dict.at("begin"), "0"); end = parse_tuple(source->attrs.dict.at("end"), "INT_MAX"); @@ -327,7 +294,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, } if (check_shapes) { if (check_tuple(begin) && check_tuple(end)) { - check_shapes->push_back(arg_id); + check_shapes->push_back(input_entry_id); } else { check_shapes_compile = false; } diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index f2f2ed394985..81c80d07ad42 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -113,7 +113,7 @@ class FusedOp { const std::vector &node_dtypes, const int nvec, const std::string& kernel_name, - std::vector *check_shapes); + std::vector *check_shapes); void CompileCode(int kernel_index, const std::string &kernel_name); bool CheckComputeCapability(const OpContext &ctx); @@ -152,8 +152,8 @@ class FusedOp { std::vector> aux_in_types; std::vector> aux_out_types; std::vector saved_reqs_; - std::vector extra_shape_args_; - std::vector check_shape_args_; + std::vector extra_shape_args_; + std::vector check_shape_args_; std::string ptx_[2]; std::string kernel_name_[2]; diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 682a8f2284b8..6adf935fb29c 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -28,7 +28,8 @@ def check_fused_symbol(sym, **kwargs): inputs = sym.list_inputs() shapes = 
{inp : kwargs[inp].shape for inp in inputs} - test_sym = mx.sym.Group([mx.sym.identity(s) for s in sym]) + # Double identity so that there is always something to fuse + test_sym = mx.sym.Group([mx.sym.identity(mx.sym.identity(s)) for s in sym]) rtol = {'float16' : 1e-2, 'float32' : 1.5e-6, 'float64' : 1.5e-6, @@ -203,6 +204,14 @@ def check_other_ops(): random.randint(begin[2]+1, shape[2])) check_fused_symbol(mx.sym.slice(a, begin=begin, end=end), a=arr1) + arr1 = mx.random.uniform(shape=(2,3,4,5)) + arr2 = mx.random.uniform(shape=(1,2,3)) + check_fused_symbol(mx.sym.slice_like(a,b, axes=[-2, 0]), a=arr1, b=arr2) + + arr1 = mx.random.uniform(shape=(1,1,2,3)) + arr2 = mx.random.uniform(shape=(2,2,2,3)) + check_fused_symbol(mx.sym.broadcast_like(a, b, lhs_axes=[0], rhs_axes=[0]), a=arr1, b=arr2) + @with_seed() def test_fusion(): check_unary_ops() From 037a5dea5baa5ae9cc27df7d2805eab56904dde6 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 4 Sep 2019 11:28:20 -0700 Subject: [PATCH 091/105] Debug broadcast_like fusion --- src/executor/exec_pass.h | 3 +- src/executor/infer_graph_attr_pass.cc | 65 ++++++++++++++++++--------- src/operator/fusion/fused_op-inl.h | 2 +- src/operator/fusion/fused_op.cc | 36 ++++++++++----- src/operator/fusion/fused_op.h | 51 ++++++++++++++------- 5 files changed, 106 insertions(+), 51 deletions(-) diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 186199870666..dcbf23903cb7 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -40,7 +40,7 @@ namespace mxnet { namespace exec { template -using FAccessSubgraphAttr = std::function, std::vector> +using FAccessSubgraphAttr = std::function, std::vector> (const NodeAttrs& attrs)>; using FAccessSubgraphShape = FAccessSubgraphAttr; @@ -49,6 +49,7 @@ using FAccessSubgraphStorageType = FAccessSubgraphAttr; template using FProvideSubgraphAttr = std::function &nodes, const std::vector> &in_attrs, const std::vector> &out_attrs)>; using FProvideSubgraphShape = 
FProvideSubgraphAttr; diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 19625ef3f86c..5eb6c5eb37f2 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -125,39 +125,60 @@ void GetAttrFromFusedNode(uint32_t nid, const std::string& infer_fusion_name) { std::vector& rshape = *rshape_ptr; const auto& inode = idx[nid]; - nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; + // gradient function, used to get node correspondence. + static auto& fgrad = + Op::GetAttr("FGradient"); + nnvm::NodePtr fused_fwd_ptr = inode.source->control_deps[0]; static auto& finfer_fused_shape = Op::GetAttr(infer_fusion_name); - auto finfer = finfer_fused_shape.get(fwd_ptr->op(), nullptr); - CHECK(finfer != nullptr) << "Operator " << fwd_ptr->attrs.name << + auto finfer = finfer_fused_shape.get(fused_fwd_ptr->op(), nullptr); + CHECK(finfer != nullptr) << "Operator " << fused_fwd_ptr->attrs.name << " is marked as Fusion but does not allow accessing attributes"; - const auto& inferred_attrs = finfer(fwd_ptr->attrs); - const auto& input_attrs = inferred_attrs.first; - const auto& output_attrs = inferred_attrs.second; - CHECK(input_attrs.size() == inode.source->num_outputs()) << - "Number of outputs of the gradient node " << inode.source->attrs.name << - " does not match the number of inputs of the corresponding forward node"; + const auto& inferred_attrs = finfer(fused_fwd_ptr->attrs); + const auto& fwd_ptr = std::get<0>(inferred_attrs); + const auto& input_attrs = std::get<1>(inferred_attrs); + const auto& output_attrs = std::get<2>(inferred_attrs); + + // use gradient function to find out the correspondence. 
+ std::vector ograd(fwd_ptr->num_outputs()); + for (size_t i = 0; i < ograd.size(); ++i) { + ograd[i].index = static_cast(i); + } + // input gradient list + const std::vector& igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); + const nnvm::Node* igrad_node = nullptr; // Set the attributes of output gradients // using attributes of forward node inputs - for (size_t i = 0; i < input_attrs.size(); ++i) { - uint32_t eid = idx.entry_id(nid, i); - if (fis_none(rshape[eid])) { - rshape[eid] = input_attrs[i]; - } else if (!fis_none(input_attrs[i])) { - CHECK_EQ(rshape[eid], input_attrs[i]) - << "Backward shape inconsistent with the forward shape"; + for (size_t i = 0; i < igrad.size(); ++i) { + if (igrad[i].node->op() == inode.source->op()) { + uint32_t eid = idx.entry_id(nid, igrad[i].index); + if (fis_none(rshape[eid])) { + rshape[eid] = input_attrs[i]; + } else if (!fis_none(input_attrs[i])) { + // Need to skip empty forward shape, because it may not be + // available now and it is possible to infer the forward + // shape in one of the next a few passes + CHECK_EQ(rshape[eid], input_attrs[i]) + << "Backward shape inconsistent with the forward shape"; + } + if (igrad_node == nullptr) { + igrad_node = igrad[i].node.get(); + } else { + CHECK(igrad_node == igrad[i].node.get()); + } } } + // Set the attributes of input gradients // using attributes of forward node outputs - for (size_t i = 0; i < output_attrs.size(); ++i) { - // We assume that the first inputs to the - // backward op are the output gradients - const auto& e = inode.source->inputs[i]; + CHECK(igrad_node != nullptr) + << "Cannot find matching backward op for " << inode.source->attrs.name; + for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { + const nnvm::NodeEntry& e = igrad_node->inputs[i]; if (e.node == nullptr) { uint32_t eid = idx.entry_id(inode.inputs[i]); if (fis_none(rshape[eid])) { - rshape[eid] = output_attrs[i]; + rshape[eid] = output_attrs[e.index]; } } } @@ -189,7 +210,7 @@ void 
ProvideAttrToFusion(const uint32_t nid, CHECK(provide != nullptr) << "Encountered Fusion operator that does not implement providing subgraph attr " << provide_fusion_name << "."; - provide(inode.source->attrs, in_attrs, out_attrs); + provide(inode.source->attrs, inode.source->control_deps, in_attrs, out_attrs); } /*!\brief diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 37bdaf01cb74..0237403da9c4 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -225,7 +225,7 @@ const std::map slice_ops = { {"slice_axis" , ""}, {"slice" , ""}, {"slice_like" , ""}, - // {"broadcast_like" , ""}, + {"broadcast_like" , ""}, }; const std::vector variable_io_ops = { diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index e698912a7461..d9473ab0198c 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -144,7 +144,10 @@ bool FusedOp::InferType(const nnvm::NodeAttrs &attrs, } template -std::pair, std::vector> FusedOp::GetAttrs(const std::string& attr_name, +std::tuple, + std::vector> + FusedOp::GetAttrs(const std::string& attr_name, const uint32_t node_id) { const auto& g = this->subgraph_.indexed_graph(); const std::vector attrs = this->subgraph_.GetAttr>(attr_name); @@ -169,7 +172,9 @@ std::pair, std::vector> FusedOp::GetAttrs(const std::str } } - return {inputs, outputs}; + return {node.weak_ref.lock(), + inputs, + outputs}; } bool FusedOpInferShape(const nnvm::NodeAttrs& attrs, @@ -187,20 +192,23 @@ bool FusedOpInferType(const nnvm::NodeAttrs& attrs, } void FusedOpProvideShape(const nnvm::NodeAttrs& attrs, + const std::vector& nodes, const std::vector> &in_attrs, const std::vector> &out_attrs) { const FusedOpPtr& op = nnvm::get(attrs.parsed); - op->ProvideShape(in_attrs, out_attrs); + op->ProvideShape(nodes, in_attrs, out_attrs); } void FusedOpProvideType(const nnvm::NodeAttrs& attrs, + const std::vector& nodes, const std::vector> &in_attrs, 
const std::vector> &out_attrs) { const FusedOpPtr& op = nnvm::get(attrs.parsed); - op->ProvideType(in_attrs, out_attrs); + op->ProvideType(nodes, in_attrs, out_attrs); } void FusedOpProvideStorageType(const nnvm::NodeAttrs& attrs, + const std::vector& nodes, const std::vector> &in_attrs, const std::vector> &out_attrs) {} @@ -219,8 +227,8 @@ NNVM_REGISTER_OP(_FusedOp) const auto num_inputs = op->num_inputs(); const auto num_outputs = op->num_outputs(); std::vector > ret; - for (auto i = 0u; i < num_inputs; ++i) { - for (auto j = 0u; j < num_outputs; ++j) { + for (unsigned int i = 0; i < num_inputs; ++i) { + for (unsigned int j = 0; j < num_outputs; ++j) { ret.emplace_back(i, j); } } @@ -235,7 +243,9 @@ NNVM_REGISTER_OP(_FusedOp) .set_attr_parser(FusedOpParamParser) .add_argument("data", "NDArray-or-Symbol[]", "Data"); -std::pair, std::vector> +std::tuple, + std::vector> FusedOpHelperShape(const NodeAttrs& attrs) { const auto& p = nnvm::get(attrs.parsed); const auto& op = p->op; @@ -243,7 +253,9 @@ FusedOpHelperShape(const NodeAttrs& attrs) { return op->GetAttrs("shape", node_id); } -std::pair, std::vector> +std::tuple, + std::vector> FusedOpHelperType(const NodeAttrs& attrs) { const auto& p = nnvm::get(attrs.parsed); const auto& op = p->op; @@ -260,7 +272,9 @@ NNVM_REGISTER_OP(_FusedOpHelper) .set_attr("FAccessSubgraphType", FusedOpHelperType); -std::pair, std::vector> +std::tuple, + std::vector> FusedOpOutHelperShape(const NodeAttrs& attrs) { const auto& p = nnvm::get(attrs.parsed); const auto& op = p->op; @@ -268,7 +282,9 @@ FusedOpOutHelperShape(const NodeAttrs& attrs) { return op->GetAuxShape(node_id); } -std::pair, std::vector> +std::tuple, + std::vector> FusedOpOutHelperType(const NodeAttrs& attrs) { const auto& p = nnvm::get(attrs.parsed); const auto& op = p->op; diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 81c80d07ad42..748b301e25ea 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ 
-78,28 +78,44 @@ class FusedOp { std::vector *out_attrs); template - std::pair, std::vector> GetAttrs(const std::string& attr_name, - const uint32_t node_id); - - void ProvideShape(const std::vector> &in_attrs, + std::tuple, + std::vector> + GetAttrs(const std::string& attr_name, + const uint32_t node_id); + + void ProvideShape(const std::vector& nodes, + const std::vector> &in_attrs, const std::vector> &out_attrs) { - aux_in_shapes = in_attrs; - aux_out_shapes = out_attrs; + aux_nodes_ = nodes; + aux_in_shapes_ = in_attrs; + aux_out_shapes_ = out_attrs; } - void ProvideType(const std::vector> &in_attrs, + void ProvideType(const std::vector& nodes, + const std::vector> &in_attrs, const std::vector> &out_attrs) { - aux_in_types = in_attrs; - aux_out_types = out_attrs; + aux_nodes_ = nodes; + aux_in_types_ = in_attrs; + aux_out_types_ = out_attrs; } - std::pair, std::vector> + std::tuple, + std::vector> GetAuxShape(const int node_id) const { - return {aux_in_shapes[node_id], aux_out_shapes[node_id]}; + return {aux_nodes_[node_id], + aux_in_shapes_[node_id], + aux_out_shapes_[node_id]}; } - std::pair, std::vector> GetAuxType(const int node_id) const { - return {aux_in_types[node_id], aux_out_types[node_id]}; + std::tuple, + std::vector> + GetAuxType(const int node_id) const { + return {aux_nodes_[node_id], + aux_in_types_[node_id], + aux_out_types_[node_id]}; } private: @@ -147,10 +163,11 @@ class FusedOp { std::vector > intermediate_shapes_; std::vector > intermediate_dtypes_; - std::vector> aux_in_shapes; - std::vector> aux_out_shapes; - std::vector> aux_in_types; - std::vector> aux_out_types; + std::vector aux_nodes_; + std::vector> aux_in_shapes_; + std::vector> aux_out_shapes_; + std::vector> aux_in_types_; + std::vector> aux_out_types_; std::vector saved_reqs_; std::vector extra_shape_args_; std::vector check_shape_args_; From e501bc9811710b87cb1f0b9fcac91a02364c87a3 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 5 Sep 2019 09:37:25 -0700 Subject: 
[PATCH 092/105] Fix lint --- src/executor/exec_pass.h | 5 ++++- src/operator/fusion/fused_op.cc | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index dcbf23903cb7..25a326171510 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -35,12 +35,15 @@ #include #include #include +#include namespace mxnet { namespace exec { template -using FAccessSubgraphAttr = std::function, std::vector> +using FAccessSubgraphAttr = std::function, + std::vector> (const NodeAttrs& attrs)>; using FAccessSubgraphShape = FAccessSubgraphAttr; diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index d9473ab0198c..b29334ff1d44 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -17,6 +17,8 @@ * under the License. */ +#include + #include "./fused_op.h" #include "../operator_common.h" #include "../../executor/exec_pass.h" From e0ca7d07d3fb0472c271b4db996968faf0c96c5e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 10 Sep 2019 08:40:00 -0700 Subject: [PATCH 093/105] Fix lint --- src/operator/fusion/fused_op.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 748b301e25ea..dee1be629f43 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -27,6 +27,7 @@ #include #include #include +#include #if MXNET_USE_CUDA From 8d3dc77e3b3c4669265511d658b9ffa2f5267f16 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 11 Sep 2019 11:12:15 -0700 Subject: [PATCH 094/105] Trigger CI From 786b0718ccdff73144995e2638888df5d7ad8e7c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 11 Sep 2019 11:26:34 -0700 Subject: [PATCH 095/105] Get rid of the initializer list --- src/operator/fusion/fused_op.cc | 6 +++--- src/operator/fusion/fused_op.h | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/operator/fusion/fused_op.cc 
b/src/operator/fusion/fused_op.cc index b29334ff1d44..071215b840a5 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -174,9 +174,9 @@ std::tuple, std::vector> GetAuxShape(const int node_id) const { - return {aux_nodes_[node_id], - aux_in_shapes_[node_id], - aux_out_shapes_[node_id]}; + return std::make_tuple(aux_nodes_[node_id], + aux_in_shapes_[node_id], + aux_out_shapes_[node_id]); } std::tuple, std::vector> GetAuxType(const int node_id) const { - return {aux_nodes_[node_id], - aux_in_types_[node_id], - aux_out_types_[node_id]}; + return std::make_tuple(aux_nodes_[node_id], + aux_in_types_[node_id], + aux_out_types_[node_id]); } private: From 0720f661250535c0b436b269b8727091580a3329 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 16 Sep 2019 10:41:37 -0700 Subject: [PATCH 096/105] Fix backward calls with different gradient type --- src/operator/fusion/fused_op-inl.h | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 0237403da9c4..8897936ed67f 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -782,148 +782,148 @@ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { )code"; const char backward_function_definitions[] = R"code( -template -inline DType backward_relu(const DType val, const DType grad) { +template +inline DTypeGrad backward_relu(const DType val, const DTypeGrad grad) { return val > 0 ? 
grad : 0; } -template -inline DType backward_sigmoid(const DType out, const DType grad) { +template +inline DTypeGrad backward_sigmoid(const DType out, const DTypeGrad grad) { return grad * out * (1 - out); } -template -inline DType backward_softrelu(const DType val, const DType grad) { +template +inline DTypeGrad backward_softrelu(const DType val, const DTypeGrad grad) { return grad * sigmoid(val); } -template -inline DType backward_softsign(const DType val, const DType grad) { +template +inline DTypeGrad backward_softsign(const DType val, const DTypeGrad grad) { const DType ap1 = 1 + fabsf(val); return grad / (ap1 * ap1); } -template -inline DType backward_exp(const DType val, const DType grad) { +template +inline DTypeGrad backward_exp(const DType val, const DTypeGrad grad) { return grad * expf(val); } -template -inline DType backward_expm1(const DType val, const DType grad) { +template +inline DTypeGrad backward_expm1(const DType val, const DTypeGrad grad) { return grad * expf(val); } -template -inline DType backward_log(const DType val, const DType grad) { +template +inline DTypeGrad backward_log(const DType val, const DTypeGrad grad) { return grad / val; } -template -inline DType backward_log10(const DType val, const DType grad) { +template +inline DTypeGrad backward_log10(const DType val, const DTypeGrad grad) { return grad / (val * logf(10)); } -template -inline DType backward_log2(const DType val, const DType grad) { +template +inline DTypeGrad backward_log2(const DType val, const DTypeGrad grad) { return grad / (val * logf(2)); } -template -inline DType backward_log1p(const DType val, const DType grad) { +template +inline DTypeGrad backward_log1p(const DType val, const DTypeGrad grad) { return grad / (1 + val); } -template -inline DType backward_sin(const DType val, const DType grad) { +template +inline DTypeGrad backward_sin(const DType val, const DTypeGrad grad) { return grad * cosf(val); } -template -inline DType backward_cos(const DType val, const 
DType grad) { +template +inline DTypeGrad backward_cos(const DType val, const DTypeGrad grad) { return -grad * sinf(val); } // Uses output from tan -template -inline DType backward_tan(const DType out, const DType grad) { +template +inline DTypeGrad backward_tan(const DType out, const DTypeGrad grad) { return grad * (out * out + 1); } -template -inline DType backward_arcsin(const DType val, const DType grad) { +template +inline DTypeGrad backward_arcsin(const DType val, const DTypeGrad grad) { return grad / sqrtf(1 - val*val); } -template -inline DType backward_arccos(const DType val, const DType grad) { +template +inline DTypeGrad backward_arccos(const DType val, const DTypeGrad grad) { return -grad / sqrtf(1 - val*val); } -template -inline DType backward_arctan(const DType val, const DType grad) { +template +inline DTypeGrad backward_arctan(const DType val, const DTypeGrad grad) { return grad / (1 + val*val); } -template -inline DType backward_sinh(const DType val, const DType grad) { +template +inline DTypeGrad backward_sinh(const DType val, const DTypeGrad grad) { return grad * coshf(val); } -template -inline DType backward_cosh(const DType val, const DType grad) { +template +inline DTypeGrad backward_cosh(const DType val, const DTypeGrad grad) { return grad * sinhf(val); } // Uses tanh output -template -inline DType backward_tanh(const DType out, const DType grad) { +template +inline DTypeGrad backward_tanh(const DType out, const DTypeGrad grad) { return grad * (1 - out * out); } -template -inline DType backward_arcsinh(const DType val, const DType grad) { +template +inline DTypeGrad backward_arcsinh(const DType val, const DTypeGrad grad) { return grad / sqrtf(val * val + 1); } -template -inline DType backward_arccosh(const DType val, const DType grad) { +template +inline DTypeGrad backward_arccosh(const DType val, const DTypeGrad grad) { return grad / sqrtf(val * val - 1); } -template -inline DType backward_arctanh(const DType val, const DType grad) { 
+template +inline DTypeGrad backward_arctanh(const DType val, const DTypeGrad grad) { return grad / (1 - val * val); } -template -inline DType backward_sqrt(const DType out, const DType grad) { +template +inline DTypeGrad backward_sqrt(const DType out, const DTypeGrad grad) { return 0.5 * grad / out; } -template -inline DType backward_rsqrt(const DType val, const DType grad) { +template +inline DTypeGrad backward_rsqrt(const DType val, const DTypeGrad grad) { const DType inv = 1 / val; return -0.5 * grad * sqrtf(inv) * inv; } -template -inline DType backward_cbrt(const DType out, const DType grad) { +template +inline DTypeGrad backward_cbrt(const DType out, const DTypeGrad grad) { return grad / (3.0f * out * out); } -template -inline DType backward_rcbrt(const DType val, const DType grad) { +template +inline DTypeGrad backward_rcbrt(const DType val, const DTypeGrad grad) { const DType inv = 1 / val; return -1.f/3.f * grad * cbrtf(inv) * inv; } -template -inline DType backward_square(const DType val, const DType grad) { +template +inline DTypeGrad backward_square(const DType val, const DTypeGrad grad) { return 2 * val * grad; } -template -inline DType backward_clip(const DType val, const DType grad, const float a_min, const float a_max) { +template +inline DTypeGrad backward_clip(const DType val, const DTypeGrad grad, const float a_min, const float a_max) { if (val > a_max || val < a_min) { return 0; } else { @@ -931,23 +931,23 @@ inline DType backward_clip(const DType val, const DType grad, const float a_min, } } -template -inline DType backward_reciprocal(const DType val, const DType grad) { +template +inline DTypeGrad backward_reciprocal(const DType val, const DTypeGrad grad) { return -grad / (val * val); } -template -inline DType backward_erf(const DType val, const DType grad) { +template +inline DTypeGrad backward_erf(const DType val, const DTypeGrad grad) { return 2.0f / sqrt(pi) * exp(-(val*val)) * grad; } -template -inline DType backward_erfinv(const DType 
val, const DType grad) { +template +inline DTypeGrad backward_erfinv(const DType val, const DTypeGrad grad) { return 0.5f * sqrt(pi) * exp(val * val) * grad; } -template -inline DType backward_smooth_l1(const DType val, const DType2 scalar, const DType grad) { +template +inline DTypeGrad backward_smooth_l1(const DType val, const DType2 scalar, const DTypeGrad grad) { auto bsq = scalar * scalar; auto ibsq = 1.0f / bsq; if (val > ibsq) { From da8bfe37327eb93f27940a8bbfb598482caf4d52 Mon Sep 17 00:00:00 2001 From: cfujitsang Date: Thu, 19 Sep 2019 07:51:17 -0700 Subject: [PATCH 097/105] avoid cycle when adding node specific for inputs of subgraph for pointwise fusion --- src/executor/pointwise_fusion_pass.cc | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 381606f920e1..c4c327be2108 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -30,6 +30,7 @@ #include #include #include +#include #include "./simple_partition_pass.h" #include "../operator/fusion/fused_op-inl.h" #include "../operator/fusion/fused_op.h" @@ -224,7 +225,7 @@ void AddInputsOnlyCompatible(const Graph &g, } } std::vector > to_add(subsets->size()); - DFSVisit(g.outputs, [&is_compatible, &node2setidx, subsets, &to_add](const nnvm::NodePtr& n) { + DFSVisit(g.outputs, [&is_compatible, &node2setidx, &to_add](const nnvm::NodePtr& n) { const auto& it = node2setidx.find(n.get()); if (it != node2setidx.end()) { for (auto& e : n->inputs) { @@ -233,8 +234,28 @@ void AddInputsOnlyCompatible(const Graph &g, } } }); + std::unordered_set added; // to avoid when the node to add is input of two subsets for (size_t i = 0; i < subsets->size(); ++i) { - (*subsets)[i].insert(to_add[i].begin(), to_add[i].end()); + std::vector heads; + for (auto n : subsets->at(i)) { + for (auto e : n->inputs) { + if (!subsets->at(i).count(e.node.get())) + 
heads.push_back(e); + } + } + for (size_t j = 0; j < to_add[i].size(); ++j) { + if (!added.count(to_add[i][j])) { + bool make_cycle = false; + DFSVisit(heads, [&make_cycle, &node=to_add[i][j]](const nnvm::NodePtr& n) { + if (n.get() == node) + make_cycle = true; + }); + if (!make_cycle) { + (*subsets)[i].insert(to_add[i][j]); + added.insert(to_add[i][j]); + } + } + } } } From ed035955dcc38afafdcbff2296ee631470803918 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 20 Sep 2019 10:43:17 -0700 Subject: [PATCH 098/105] Fix lint --- src/executor/pointwise_fusion_pass.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index c4c327be2108..2321ad40aebf 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -234,7 +234,9 @@ void AddInputsOnlyCompatible(const Graph &g, } } }); - std::unordered_set added; // to avoid when the node to add is input of two subsets + + // Avoid duplicating the node that is input of two subsets + std::unordered_set added; for (size_t i = 0; i < subsets->size(); ++i) { std::vector heads; for (auto n : subsets->at(i)) { @@ -246,7 +248,8 @@ void AddInputsOnlyCompatible(const Graph &g, for (size_t j = 0; j < to_add[i].size(); ++j) { if (!added.count(to_add[i][j])) { bool make_cycle = false; - DFSVisit(heads, [&make_cycle, &node=to_add[i][j]](const nnvm::NodePtr& n) { + const auto& node = to_add[i][j]; + DFSVisit(heads, [&make_cycle, &node](const nnvm::NodePtr& n) { if (n.get() == node) make_cycle = true; }); From 69facdc517423acd8d6123162bfbeaaf27d845ff Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 23 Sep 2019 13:27:58 -0700 Subject: [PATCH 099/105] Add namespace to the fusion implementations --- src/operator/fusion/fused_op-inl.h | 281 +++++++++++++++-------------- src/operator/fusion/fused_op.cu | 31 ++-- 2 files changed, 162 insertions(+), 150 deletions(-) diff --git 
a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 8897936ed67f..d3d9ae03bf85 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -61,143 +61,145 @@ using int64 = long long; )code"; const std::map>> ops_desc = { - {"elemwise_add" , {{"add(%, %)", "_0", "_1"}}}, - {"_plus" , {{"add(%, %)", "_0", "_1"}}}, - {"_Plus" , {{"add(%, %)", "_0", "_1"}}}, - {"_add" , {{"add(%, %)", "_0", "_1"}}}, - {"elemwise_sub" , {{"sub(%, %)", "_0", "_1"}}}, - {"_minus" , {{"sub(%, %)", "_0", "_1"}}}, - {"_Minus" , {{"sub(%, %)", "_0", "_1"}}}, - {"_sub" , {{"sub(%, %)", "_0", "_1"}}}, - {"elemwise_mul" , {{"mul(%, %)", "_0", "_1"}}}, - {"_mul" , {{"mul(%, %)", "_0", "_1"}}}, - {"_Mul" , {{"mul(%, %)", "_0", "_1"}}}, - {"elemwise_div" , {{"div(%, %)", "_0", "_1"}}}, - {"_div" , {{"div(%, %)", "_0", "_1"}}}, - {"_Div" , {{"div(%, %)", "_0", "_1"}}}, - {"_Power" , {{"power(%, %)", "_0", "_1"}}}, - {"_power" , {{"power(%, %)", "_0", "_1"}}}, - {"_Maximum" , {{"max(%, %)", "_0", "_1"}}}, - {"_maximum" , {{"max(%, %)", "_0", "_1"}}}, - {"_Minimum" , {{"min(%, %)", "_0", "_1"}}}, - {"_minimum" , {{"min(%, %)", "_0", "_1"}}}, - {"amp_cast" , {{"identity(%)", "_0"}}}, - {"_backward_amp_cast" , {{"identity(%)", "_0"}}}, - {"relu" , {{"relu(%)", "_0"}}}, - {"sigmoid" , {{"sigmoid(%)", "_0"}}}, - {"softsign" , {{"softsign(%)", "_0"}}}, - {"exp" , {{"exp(%)", "_0"}}}, - {"expm1" , {{"expm1(%)", "_0"}}}, - {"log" , {{"log(%)", "_0"}}}, - {"log10" , {{"log10(%)", "_0"}}}, - {"log2" , {{"log2(%)", "_0"}}}, - {"log1p" , {{"log1p(%)", "_0"}}}, - {"degrees" , {{"degrees(%)", "_0"}}}, - {"radians" , {{"radians(%)", "_0"}}}, - {"sin" , {{"sin(%)", "_0"}}}, - {"cos" , {{"cos(%)", "_0"}}}, - {"tan" , {{"tan(%)", "_0"}}}, - {"arcsin" , {{"arcsin(%)", "_0"}}}, - {"arccos" , {{"arccos(%)", "_0"}}}, - {"arctan" , {{"arctan(%)", "_0"}}}, - {"sinh" , {{"sinh(%)", "_0"}}}, - {"cosh" , {{"cosh(%)", "_0"}}}, - {"tanh" , {{"tanh(%)", "_0"}}}, - 
{"arcsinh" , {{"arcsinh(%)", "_0"}}}, - {"arccosh" , {{"arccosh(%)", "_0"}}}, - {"arctanh" , {{"arctanh(%)", "_0"}}}, - {"sqrt" , {{"sqrt(%)", "_0"}}}, - {"rsqrt" , {{"rsqrt(%)", "_0"}}}, - {"cbrt" , {{"cbrt(%)", "_0"}}}, - {"rcbrt" , {{"rcbrt(%)", "_0"}}}, - {"square" , {{"square(%)", "_0"}}}, - {"squeeze" , {{"identity(%)", "_0"}}}, - {"zeros_like" , {{"zero(%)", "_0"}}}, - {"ones_like" , {{"one(%)", "_0"}}}, - {"flatten" , {{"identity(%)", "_0"}}}, - {"Reshape" , {{"identity(%)", "_0"}}}, - {"reshape" , {{"identity(%)", "_0"}}}, - {"_backward_reshape" , {{"identity(%)", "_0"}}}, - {"expand_dims" , {{"identity(%)", "_0"}}}, - {"round" , {{"round(%)", "_0"}}}, - {"rint" , {{"rint(%)", "_0"}}}, - {"fix" , {{"fix(%)", "_0"}}}, - {"floor" , {{"floor(%)", "_0"}}}, - {"ceil" , {{"ceil(%)", "_0"}}}, - {"trunc" , {{"trunc(%)", "_0"}}}, - {"sign" , {{"sign(%)", "_0"}}}, - {"reciprocal" , {{"reciprocal(%)", "_0"}}}, - {"abs" , {{"abs(%)", "_0"}}}, - {"gamma" , {{"gamma(%)", "_0"}}}, - {"gammaln" , {{"gammaln(%)", "_0"}}}, - {"erf" , {{"erf(%)", "_0"}}}, - {"erfinv" , {{"erfinv(%)", "_0"}}}, - {"_copy" , {{"identity(%)", "_0"}}}, - {"_identity_with_attr_like_rhs" , {{"identity(%)", "_0"}}}, - {"_plus_scalar" , {{"add(%, %)", "_0", "scalar"}}}, - {"_PlusScalar" , {{"add(%, %)", "_0", "scalar"}}}, - {"_minus_scalar" , {{"sub(%, %)", "_0", "scalar"}}}, - {"_MinusScalar" , {{"sub(%, %)", "_0", "scalar"}}}, - {"_rminus_scalar" , {{"(-sub(%, %))", "_0", "scalar"}}}, - {"_RMinusScalar" , {{"(-sub(%, %))", "_0", "scalar"}}}, - {"_mul_scalar" , {{"mul(%, %)", "_0", "scalar"}}}, - {"_MulScalar" , {{"mul(%, %)", "_0", "scalar"}}}, - {"_div_scalar" , {{"div(%, %)", "_0", "scalar"}}}, - {"_DivScalar" , {{"div(%, %)", "_0", "scalar"}}}, - {"_rdiv_scalar" , {{"rdiv(%, %)", "_0", "scalar"}}}, - {"_power_scalar" , {{"power(%, %)", "_0", "scalar"}}}, - {"_PowerScalar" , {{"power(%, %)", "_0", "scalar"}}}, - {"_rpower_scalar" , {{"rpow(%, %)", "_0", "scalar"}}}, - {"_RPowerScalar" , 
{{"rpow(%, %)", "_0", "scalar"}}}, - {"_RDivScalar" , {{"rdiv(%, %)", "_0", "scalar"}}}, - {"Cast" , {{"cast<%>(%)", "dtype", "_0"}}}, - {"cast" , {{"cast<%>(%)", "dtype", "_0"}}}, - {"Activation" , {{"%(%)", "act_type", "_0"}}}, - {"clip" , {{"clip(%, %, %)", "_0", "a_min", "a_max"}}}, - {"_zeros" , {{"zero<%>()", "dtype"}}}, - {"_ones" , {{"one<%>()", "dtype"}}}, + {"elemwise_add" , {{"op::add(%, %)", "_0", "_1"}}}, + {"_plus" , {{"op::add(%, %)", "_0", "_1"}}}, + {"_Plus" , {{"op::add(%, %)", "_0", "_1"}}}, + {"_add" , {{"op::add(%, %)", "_0", "_1"}}}, + {"elemwise_sub" , {{"op::sub(%, %)", "_0", "_1"}}}, + {"_minus" , {{"op::sub(%, %)", "_0", "_1"}}}, + {"_Minus" , {{"op::sub(%, %)", "_0", "_1"}}}, + {"_sub" , {{"op::sub(%, %)", "_0", "_1"}}}, + {"elemwise_mul" , {{"op::mul(%, %)", "_0", "_1"}}}, + {"_mul" , {{"op::mul(%, %)", "_0", "_1"}}}, + {"_Mul" , {{"op::mul(%, %)", "_0", "_1"}}}, + {"elemwise_div" , {{"op::div(%, %)", "_0", "_1"}}}, + {"_div" , {{"op::div(%, %)", "_0", "_1"}}}, + {"_Div" , {{"op::div(%, %)", "_0", "_1"}}}, + {"_Power" , {{"op::power(%, %)", "_0", "_1"}}}, + {"_power" , {{"op::power(%, %)", "_0", "_1"}}}, + {"_Maximum" , {{"op::max(%, %)", "_0", "_1"}}}, + {"_maximum" , {{"op::max(%, %)", "_0", "_1"}}}, + {"_Minimum" , {{"op::min(%, %)", "_0", "_1"}}}, + {"_minimum" , {{"op::min(%, %)", "_0", "_1"}}}, + {"amp_cast" , {{"op::identity(%)", "_0"}}}, + {"_backward_amp_cast" , {{"op::identity(%)", "_0"}}}, + {"relu" , {{"op::relu(%)", "_0"}}}, + {"sigmoid" , {{"op::sigmoid(%)", "_0"}}}, + {"softsign" , {{"op::softsign(%)", "_0"}}}, + {"exp" , {{"op::exp(%)", "_0"}}}, + {"expm1" , {{"op::expm1(%)", "_0"}}}, + {"log" , {{"op::log(%)", "_0"}}}, + {"log10" , {{"op::log10(%)", "_0"}}}, + {"log2" , {{"op::log2(%)", "_0"}}}, + {"log1p" , {{"op::log1p(%)", "_0"}}}, + {"degrees" , {{"op::degrees(%)", "_0"}}}, + {"radians" , {{"op::radians(%)", "_0"}}}, + {"sin" , {{"op::sin(%)", "_0"}}}, + {"cos" , {{"op::cos(%)", "_0"}}}, + {"tan" , {{"op::tan(%)", 
"_0"}}}, + {"arcsin" , {{"op::arcsin(%)", "_0"}}}, + {"arccos" , {{"op::arccos(%)", "_0"}}}, + {"arctan" , {{"op::arctan(%)", "_0"}}}, + {"sinh" , {{"op::sinh(%)", "_0"}}}, + {"cosh" , {{"op::cosh(%)", "_0"}}}, + {"tanh" , {{"op::tanh(%)", "_0"}}}, + {"arcsinh" , {{"op::arcsinh(%)", "_0"}}}, + {"arccosh" , {{"op::arccosh(%)", "_0"}}}, + {"arctanh" , {{"op::arctanh(%)", "_0"}}}, + {"sqrt" , {{"op::sqrt(%)", "_0"}}}, + {"rsqrt" , {{"op::rsqrt(%)", "_0"}}}, + {"cbrt" , {{"op::cbrt(%)", "_0"}}}, + {"rcbrt" , {{"op::rcbrt(%)", "_0"}}}, + {"square" , {{"op::square(%)", "_0"}}}, + {"squeeze" , {{"op::identity(%)", "_0"}}}, + {"zeros_like" , {{"op::zero(%)", "_0"}}}, + {"ones_like" , {{"op::one(%)", "_0"}}}, + {"flatten" , {{"op::identity(%)", "_0"}}}, + {"Reshape" , {{"op::identity(%)", "_0"}}}, + {"reshape" , {{"op::identity(%)", "_0"}}}, + {"_backward_reshape" , {{"op::identity(%)", "_0"}}}, + {"expand_dims" , {{"op::identity(%)", "_0"}}}, + {"round" , {{"op::round(%)", "_0"}}}, + {"rint" , {{"op::rint(%)", "_0"}}}, + {"fix" , {{"op::fix(%)", "_0"}}}, + {"floor" , {{"op::floor(%)", "_0"}}}, + {"ceil" , {{"op::ceil(%)", "_0"}}}, + {"trunc" , {{"op::trunc(%)", "_0"}}}, + {"sign" , {{"op::sign(%)", "_0"}}}, + {"reciprocal" , {{"op::reciprocal(%)", "_0"}}}, + {"abs" , {{"op::abs(%)", "_0"}}}, + {"gamma" , {{"op::gamma(%)", "_0"}}}, + {"gammaln" , {{"op::gammaln(%)", "_0"}}}, + {"erf" , {{"op::erf(%)", "_0"}}}, + {"erfinv" , {{"op::erfinv(%)", "_0"}}}, + {"_copy" , {{"op::identity(%)", "_0"}}}, + {"_identity_with_attr_like_rhs" , {{"op::identity(%)", "_0"}}}, + {"_plus_scalar" , {{"op::add(%, %)", "_0", "scalar"}}}, + {"_PlusScalar" , {{"op::add(%, %)", "_0", "scalar"}}}, + {"_minus_scalar" , {{"op::sub(%, %)", "_0", "scalar"}}}, + {"_MinusScalar" , {{"op::sub(%, %)", "_0", "scalar"}}}, + {"_rminus_scalar" , {{"(-op::sub(%, %))", "_0", "scalar"}}}, + {"_RMinusScalar" , {{"(-op::sub(%, %))", "_0", "scalar"}}}, + {"_mul_scalar" , {{"op::mul(%, %)", "_0", "scalar"}}}, + 
{"_MulScalar" , {{"op::mul(%, %)", "_0", "scalar"}}}, + {"_div_scalar" , {{"op::div(%, %)", "_0", "scalar"}}}, + {"_DivScalar" , {{"op::div(%, %)", "_0", "scalar"}}}, + {"_rdiv_scalar" , {{"op::rdiv(%, %)", "_0", "scalar"}}}, + {"_power_scalar" , {{"op::power(%, %)", "_0", "scalar"}}}, + {"_PowerScalar" , {{"op::power(%, %)", "_0", "scalar"}}}, + {"_rpower_scalar" , {{"op::rpow(%, %)", "_0", "scalar"}}}, + {"_RPowerScalar" , {{"op::rpow(%, %)", "_0", "scalar"}}}, + {"_RDivScalar" , {{"op::rdiv(%, %)", "_0", "scalar"}}}, + {"Cast" , {{"op::cast<%>(%)", "dtype", "_0"}}}, + {"cast" , {{"op::cast<%>(%)", "dtype", "_0"}}}, + {"Activation" , {{"op::%(%)", "act_type", "_0"}}}, + {"clip" , {{"op::clip(%, %, %)", "_0", "a_min", "a_max"}}}, + {"_zeros" , {{"op::zero<%>()", "dtype"}}}, + {"_ones" , {{"op::one<%>()", "dtype"}}}, {"negative" , {{"(-%)", "_0"}}}, - {"_hypot" , {{"hypot(%, %)", "_0", "_1"}}}, - {"_hypot_scalar" , {{"hypot(%, %)", "_0", "scalar"}}}, - {"_backward_relu" , {{"backward_relu(%, %)", "_1", "_0"}}}, - {"_backward_sigmoid" , {{"backward_sigmoid(%, %)", "_1", "_0"}}}, - {"_backward_expm1" , {{"backward_expm1(%, %)", "_1", "_0"}}}, - {"_backward_log" , {{"backward_log(%, %)", "_1", "_0"}}}, - {"_backward_log10" , {{"backward_log10(%, %)", "_1", "_0"}}}, - {"_backward_log2" , {{"backward_log2(%, %)", "_1", "_0"}}}, - {"_backward_log1p" , {{"backward_log1p(%, %)", "_1", "_0"}}}, - {"_backward_sin" , {{"backward_sin(%, %)", "_1", "_0"}}}, - {"_backward_cos" , {{"backward_cos(%, %)", "_1", "_0"}}}, - {"_backward_tan" , {{"backward_tan(%, %)", "_1", "_0"}}}, - {"_backward_arcsin" , {{"backward_arcsin(%, %)", "_1", "_0"}}}, - {"_backward_arccos" , {{"backward_arccos(%, %)", "_1", "_0"}}}, - {"_backward_arctan" , {{"backward_arctan(%, %)", "_1", "_0"}}}, - {"_backward_sinh" , {{"backward_sinh(%, %)", "_1", "_0"}}}, - {"_backward_cosh" , {{"backward_cosh(%, %)", "_1", "_0"}}}, - {"_backward_tanh" , {{"backward_tanh(%, %)", "_1", "_0"}}}, - {"_backward_arcsinh" , 
{{"backward_arcsinh(%, %)", "_1", "_0"}}}, - {"_backward_arccosh" , {{"backward_arccosh(%, %)", "_1", "_0"}}}, - {"_backward_arctanh" , {{"backward_arctanh(%, %)", "_1", "_0"}}}, - {"_backward_sqrt" , {{"backward_sqrt(%, %)", "_1", "_0"}}}, - {"_backward_rsqrt" , {{"backward_rsqrt(%, %)", "_1", "_0"}}}, - {"_backward_cbrt" , {{"backward_cbrt(%, %)", "_1", "_0"}}}, - {"_backward_rcbrt" , {{"backward_rcbrt(%, %)", "_1", "_0"}}}, - {"_backward_square" , {{"backward_square(%, %)", "_1", "_0"}}}, + {"_hypot" , {{"op::hypot(%, %)", "_0", "_1"}}}, + {"_hypot_scalar" , {{"op::hypot(%, %)", "_0", "scalar"}}}, + {"_backward_relu" , {{"op::backward_relu(%, %)", "_1", "_0"}}}, + {"_backward_sigmoid" , {{"op::backward_sigmoid(%, %)", "_1", "_0"}}}, + {"_backward_expm1" , {{"op::backward_expm1(%, %)", "_1", "_0"}}}, + {"_backward_log" , {{"op::backward_log(%, %)", "_1", "_0"}}}, + {"_backward_log10" , {{"op::backward_log10(%, %)", "_1", "_0"}}}, + {"_backward_log2" , {{"op::backward_log2(%, %)", "_1", "_0"}}}, + {"_backward_log1p" , {{"op::backward_log1p(%, %)", "_1", "_0"}}}, + {"_backward_sin" , {{"op::backward_sin(%, %)", "_1", "_0"}}}, + {"_backward_cos" , {{"op::backward_cos(%, %)", "_1", "_0"}}}, + {"_backward_tan" , {{"op::backward_tan(%, %)", "_1", "_0"}}}, + {"_backward_arcsin" , {{"op::backward_arcsin(%, %)", "_1", "_0"}}}, + {"_backward_arccos" , {{"op::backward_arccos(%, %)", "_1", "_0"}}}, + {"_backward_arctan" , {{"op::backward_arctan(%, %)", "_1", "_0"}}}, + {"_backward_sinh" , {{"op::backward_sinh(%, %)", "_1", "_0"}}}, + {"_backward_cosh" , {{"op::backward_cosh(%, %)", "_1", "_0"}}}, + {"_backward_tanh" , {{"op::backward_tanh(%, %)", "_1", "_0"}}}, + {"_backward_arcsinh" , {{"op::backward_arcsinh(%, %)", "_1", "_0"}}}, + {"_backward_arccosh" , {{"op::backward_arccosh(%, %)", "_1", "_0"}}}, + {"_backward_arctanh" , {{"op::backward_arctanh(%, %)", "_1", "_0"}}}, + {"_backward_sqrt" , {{"op::backward_sqrt(%, %)", "_1", "_0"}}}, + {"_backward_rsqrt" , 
{{"op::backward_rsqrt(%, %)", "_1", "_0"}}}, + {"_backward_cbrt" , {{"op::backward_cbrt(%, %)", "_1", "_0"}}}, + {"_backward_rcbrt" , {{"op::backward_rcbrt(%, %)", "_1", "_0"}}}, + {"_backward_square" , {{"op::backward_square(%, %)", "_1", "_0"}}}, {"_backward_div_scalar" , {{"(% / %)", "_0", "scalar"}}}, {"_backward_div_scalar" , {{"(% / %)", "_0", "scalar"}}}, {"_backward_rdiv_scalar" , {{"(-% * % / (% * %))", "_0", "scalar", "_1", "_1"}}}, - {"_backward_hypot_scalar" , {{"(% * % / hypot(%, %))", "_0", "_1", "_1", "scalar"}}}, - {"_backward_radians" , {{"radians(%)", "_0"}}}, - {"_backward_erf" , {{"backward_erf(%, %)", "_1", "_0"}}}, - {"_backward_erfinv" , {{"backward_erfinv(%, %)", "_1", "_0"}}}, - {"_backward_reciprocal" , {{"backward_reciprocal(%, %)", "_1", "_0"}}}, - {"_backward_abs" , {{"(% * sign(%))", "_0", "_1"}}}, - {"_backward_degrees" , {{"degrees(%)", "_0"}}}, - {"_backward_sign" , {{"zero(%)", "_0"}}}, - {"_backward_clip" , {{"backward_clip(%, %, %, %)", "_1", "_0", - "a_min", "a_max"}}}, - {"smooth_l1" , {{"smooth_l1(%, %)", "_0", "scalar"}}}, - {"_backward_smooth_l1" , {{"backward_smooth_l1(%, %, %)", "_1", "scalar", "_0"}}}, + {"_backward_hypot_scalar" , {{"(% * % / op::hypot(%, %))", + "_0", "_1", "_1", "scalar"}}}, + {"_backward_radians" , {{"op::radians(%)", "_0"}}}, + {"_backward_erf" , {{"op::backward_erf(%, %)", "_1", "_0"}}}, + {"_backward_erfinv" , {{"op::backward_erfinv(%, %)", "_1", "_0"}}}, + {"_backward_reciprocal" , {{"op::backward_reciprocal(%, %)", "_1", "_0"}}}, + {"_backward_abs" , {{"(% * op::sign(%))", "_0", "_1"}}}, + {"_backward_degrees" , {{"op::degrees(%)", "_0"}}}, + {"_backward_sign" , {{"op::zero(%)", "_0"}}}, + {"_backward_clip" , {{"op::backward_clip(%, %, %, %)", "_1", "_0", + "a_min", "a_max"}}}, + {"smooth_l1" , {{"op::smooth_l1(%, %)", "_0", "scalar"}}}, + {"_backward_smooth_l1" , {{"op::backward_smooth_l1(%, %, %)", + "_1", "scalar", "_0"}}}, // TODO(ptredak): arange // TODO(ptredak): LeakyRelu // TODO(ptredak): 
mod and rmod @@ -217,8 +219,8 @@ const std::map>> ops_desc = { {"((% >= %) ? 0 : %)", "_1", "_2", "_0"}}}, {"_backward_minimum" , {{"((% <= %) ? % : 0)", "_1", "_2", "_0"}, {"((% <= %) ? 0 : %)", "_1", "_2", "_0"}}}, - {"_backward_hypot" , {{"(% * % / hypot(%, %))", "_0", "_1", "_1", "_2"}, - {"(% * % / hypot(%, %))", "_0", "_2", "_1", "_2"}}} + {"_backward_hypot" , {{"(% * % / op::hypot(%, %))", "_0", "_1", "_1", "_2"}, + {"(% * % / op::hypot(%, %))", "_0", "_2", "_1", "_2"}}} }; const std::map slice_ops = { @@ -240,6 +242,8 @@ const char function_definitions[] = R"code( #define INT_MAX (2147483647) +namespace op { + template struct LoadType { using Type = DType; @@ -779,9 +783,14 @@ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { } } +} // namespace op + )code"; const char backward_function_definitions[] = R"code( + +namespace op { + template inline DTypeGrad backward_relu(const DType val, const DTypeGrad grad) { return val > 0 ? grad : 0; @@ -959,6 +968,8 @@ inline DTypeGrad backward_smooth_l1(const DType val, const DType2 scalar, const } } +} // namespace op + )code"; const char kernel_begin[] = R"code( diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 7f6fe3796850..60cb211f3704 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -215,7 +215,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, if (source->is_variable()) { if (load_index[i]) { const auto& var_name = source->attrs.name; - code += "const auto vec_" + var_name + " = load_index(" + + code += "const auto vec_" + var_name + " = op::load_index(" + var_name + ", offset, " + var_name + "_shape);\n"; variables[{i, 0}] = var_name; } @@ -265,7 +265,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, }; auto build_string_end = [i, ndim, var_name](std::string* code) { std::string end_var_name = var_name + "_" + std::to_string(i) + "_end"; - *code += "Shape<" + std::to_string(ndim) + 
"> "+ end_var_name + ";\n"; + *code += "op::Shape<" + std::to_string(ndim) + "> "+ end_var_name + ";\n"; *code += end_var_name + ".set(INT_MAX);\n"; return end_var_name; }; @@ -304,7 +304,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, if (!check_shapes) { slice_func = "fast_" + slice_func; } - code += "const auto " + vec_name + " = " + slice_func + "(" + + code += "const auto " + vec_name + " = op::" + slice_func + "(" + var_name + ", " + var_name + "_shape," + begin + "," + end + ", offset);\n"; CHECK_EQ(outputs[i], 1); @@ -322,7 +322,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, size_t counter = 0; for (const auto& entry : g.outputs()) { std::string var_name = "output" + std::to_string(counter); - code += "VectorType vec_" + var_name + ";\n"; ++counter; } @@ -337,7 +337,8 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, std::string var_name = "temp" + std::to_string(temp_name_counter++); if (source->is_variable()) { if (load_index[i]) { - code += "const auto " + var_name + " = load(vec_" + variables[{i, 0}] + ".x[j]);\n"; + code += "const auto " + var_name + " = op::load(vec_" + + variables[{i, 0}] + ".x[j]);\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = var_name; } @@ -359,7 +360,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, } if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { - code += "const auto " + var_name + " = load(" + variables[{i, 0}] + ".x[j]);\n"; + code += "const auto " + var_name + " = op::load(" + variables[{i, 0}] + ".x[j]);\n"; variables[{i, 0}] = var_name; continue; } @@ -374,7 +375,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, code += "auto " + var_name + " = " + arg + ";\n"; for (size_t inp = 1; inp < node.inputs.size(); ++inp) { const auto& temp_arg = variables[{node.inputs[inp].node_id, node.inputs[inp].index}]; - code += var_name + " = add(" + var_name + ", " + temp_arg + ");\n"; + code += 
var_name + " = op::add(" + var_name + ", " + temp_arg + ");\n"; } variables[{i, 0}] = var_name; continue; @@ -392,7 +393,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, } else { lhs = variables[{node.inputs[2].node_id, node.inputs[2].index}]; } - code += "const auto " + var_name + " = backward_" + act_type + + code += "const auto " + var_name + " = op::backward_" + act_type + "(" + lhs + ", " + rhs + ");\n"; variables[{i, 0}] = var_name; @@ -415,7 +416,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, CHECK_EQ(outputs[i], 1); const int output_type = node_dtypes[g.entry_id(i, 0)]; const auto& arg = variables[{node.inputs[0].node_id, node.inputs[0].index}]; - code += "const auto " + var_name + " = cast<" + mshadowTypeToString(output_type) + + code += "const auto " + var_name + " = op::cast<" + mshadowTypeToString(output_type) + ">(" + arg + ");\n"; variables[{i, 0}] = var_name; continue; @@ -432,7 +433,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, for (const auto& entry : g.outputs()) { const std::string& var = variables[{entry.node_id, entry.index}]; const auto var_name = "output" + std::to_string(counter); - code += "vec_" + var_name + ".x[j] = store("+ var +", " + var_name + ");\n"; + code += "vec_" + var_name + ".x[j] = op::store("+ var +", " + var_name + ");\n"; ++counter; } @@ -444,11 +445,11 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, const std::string& var = variables[{entry.node_id, entry.index}]; if (req[counter] == kWriteTo || req[counter] == kWriteInplace) { const auto var_name = "output" + std::to_string(counter); - code += "store_index(vec_" + var_name + ", i, " + var_name + ", " + + code += "op::store_index(vec_" + var_name + ", i, " + var_name + ", " + var_name + "_shape);\n"; } else if (req[counter] == kAddTo) { const auto var_name = "output" + std::to_string(counter); - code += "store_add_index(vec_" + var_name + ", i, " + var_name + ", " + + 
code += "op::store_add_index(vec_" + var_name + ", i, " + var_name + ", " + var_name + "_shape);\n"; } else if (req[counter] == kNullOp) { // NULL req, do not do anything @@ -476,7 +477,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, for (const auto &shape_id : extra_shape_args_) { std::string shape_name = "extra_" + std::to_string(shape_id) + "_shape"; int ndim = node_shapes[shape_id].ndim(); - kernel_params += " const Shape<" + std::to_string(ndim) + "> " + shape_name; + kernel_params += " const op::Shape<" + std::to_string(ndim) + "> " + shape_name; kernel_params += ", "; } for (const auto &type : in_dtypes) { @@ -487,7 +488,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; aux_code = "static const int " + dim_var + " = " + dim_val + ";\n" + aux_code; tensor_params += dtype_var + "* " +input_names[i]; - kernel_params += " const Shape<" + dim_val + "> " + input_names[i]+"_shape"; + kernel_params += " const op::Shape<" + dim_val + "> " + input_names[i]+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; @@ -503,7 +504,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, aux_code = "static const int " + dim_var + " = " + dim_val + ";\n" + aux_code; aux_code = "using " + dtype_var + " = " + type_name + ";\n" + aux_code; tensor_params += dtype_var + "* " + out_name; - kernel_params += " const Shape<" + dim_val + "> " + out_name+"_shape"; + kernel_params += " const op::Shape<" + dim_val + "> " + out_name+"_shape"; ++i; if (i < num_params) { tensor_params += ", "; From e26770b886da4a245e2a2ba06d2383f2df0968b0 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 21 Oct 2019 14:35:30 -0700 Subject: [PATCH 100/105] Set launch bounds on the fused kernel --- src/operator/fusion/fused_op.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 
60cb211f3704..0abfaa7fd5a9 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -518,6 +518,7 @@ void FusedOp::GenerateCode(int kernel_index, const std::vector &req, fusion::function_definitions + "\n" + fusion::backward_function_definitions + "\n" + aux_code + "\n" + + "__launch_bounds__(" + std::to_string(FusedOp::NTHREADS) + ")\n" + "__global__ void FusedKernel_" + kernel_name + "(size_t N, " + kernel_params + ") {\n" + fusion::kernel_begin + "\n" + From 80e36ba8c2da19308b5bcbbafe83cb6fc1133612 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 21 Oct 2019 16:40:55 -0700 Subject: [PATCH 101/105] Fix NumPy tests --- src/operator/fusion/fused_op-inl.h | 7 ++++++- src/operator/fusion/fused_op.cu | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index d3d9ae03bf85..737d4d756603 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -341,7 +341,12 @@ struct Shape { x[i] = def; } } - }; +}; + +template <> +struct Shape<0> { + size_t size; +}; template inline VectorType load_index(const DType * input, int i, const Shape &shape) { diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 0abfaa7fd5a9..f6df38bac247 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -555,7 +555,8 @@ void FusedOp::CompileCode(int kernel_index, const std::string &kernel_name) { NVRTC_CALL(nvrtcGetProgramLogSize(program, &log_size)); std::string log(log_size, '\0'); NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); - CHECK_EQ(compileResult, NVRTC_SUCCESS) << "NVRTC Compilation failed.\n" << log; + CHECK_EQ(compileResult, NVRTC_SUCCESS) + << "NVRTC Compilation failed. Please set environment variable MXNET_USE_FUSION to 0.\n" << log; // Obtain PTX from the program. 
size_t ptx_size; NVRTC_CALL(nvrtcGetPTXSize(program, &ptx_size)); From 36e5ce8ac55d5becf84d564fc89f175c3da9a972 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 22 Oct 2019 16:08:45 -0700 Subject: [PATCH 102/105] Test showcasing an issue fixed in PR #16553 --- tests/python/unittest/test_gluon.py | 41 +++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index f1d0cc7ac274..d2fe8ceb00e7 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -3079,6 +3079,47 @@ def forward(self, x): shape = (np.random.randint(1, 10), np.random.randint(1, 10), 1) block(mx.nd.ones(shape)) +@with_seed() +def test_reqs_switching_training_inference(): + class Foo(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Foo, self).__init__(**kwargs) + + def hybrid_forward(self, F, x): + y = 2 * x + return F.sqrt(x) + F.sqrt(y) + + f = Foo() + f.hybridize(static_alloc=True) + x = mx.nd.ones(shape=(10,10)) + x.attach_grad() + x2 = mx.nd.ones(shape=x.shape) * 2 + x2.attach_grad() + + # Call first in training mode + with mx.autograd.record(): + y = f(x) + y.backward() + + grad1 = x.grad.asnumpy() + + # Compute the gradient with some other input + with mx.autograd.record(): + y = f(x2) + y.backward() + + # Call inference mode + y = f(x) + + # Call training mode again + with mx.autograd.record(): + y = f(x) + y.backward() + + grad2 = x.grad.asnumpy() + + mx.test_utils.assert_almost_equal(grad1, grad2) + if __name__ == '__main__': import nose nose.runmodule() From f77fe5b4346902e4d74e1342acd91e7223f1acd5 Mon Sep 17 00:00:00 2001 From: moisesh Date: Tue, 22 Oct 2019 17:28:44 -0700 Subject: [PATCH 103/105] Cast scalars to FP32 and perform (a*1.0/b) instead of (a/b) Fix lint errors Fix lint --- src/operator/fusion/fused_op-inl.h | 55 +++++++++++++++--------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git
a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 737d4d756603..3085bfd1dc07 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -134,22 +134,22 @@ const std::map>> ops_desc = { {"erfinv" , {{"op::erfinv(%)", "_0"}}}, {"_copy" , {{"op::identity(%)", "_0"}}}, {"_identity_with_attr_like_rhs" , {{"op::identity(%)", "_0"}}}, - {"_plus_scalar" , {{"op::add(%, %)", "_0", "scalar"}}}, - {"_PlusScalar" , {{"op::add(%, %)", "_0", "scalar"}}}, - {"_minus_scalar" , {{"op::sub(%, %)", "_0", "scalar"}}}, - {"_MinusScalar" , {{"op::sub(%, %)", "_0", "scalar"}}}, - {"_rminus_scalar" , {{"(-op::sub(%, %))", "_0", "scalar"}}}, - {"_RMinusScalar" , {{"(-op::sub(%, %))", "_0", "scalar"}}}, - {"_mul_scalar" , {{"op::mul(%, %)", "_0", "scalar"}}}, - {"_MulScalar" , {{"op::mul(%, %)", "_0", "scalar"}}}, - {"_div_scalar" , {{"op::div(%, %)", "_0", "scalar"}}}, - {"_DivScalar" , {{"op::div(%, %)", "_0", "scalar"}}}, - {"_rdiv_scalar" , {{"op::rdiv(%, %)", "_0", "scalar"}}}, - {"_power_scalar" , {{"op::power(%, %)", "_0", "scalar"}}}, - {"_PowerScalar" , {{"op::power(%, %)", "_0", "scalar"}}}, - {"_rpower_scalar" , {{"op::rpow(%, %)", "_0", "scalar"}}}, - {"_RPowerScalar" , {{"op::rpow(%, %)", "_0", "scalar"}}}, - {"_RDivScalar" , {{"op::rdiv(%, %)", "_0", "scalar"}}}, + {"_plus_scalar" , {{"op::add(%, float(%))", "_0", "scalar"}}}, + {"_PlusScalar" , {{"op::add(%, float(%))", "_0", "scalar"}}}, + {"_minus_scalar" , {{"op::sub(%, float(%))", "_0", "scalar"}}}, + {"_MinusScalar" , {{"op::sub(%, float(%))", "_0", "scalar"}}}, + {"_rminus_scalar" , {{"(-op::sub(%, float(%)))", "_0", "scalar"}}}, + {"_RMinusScalar" , {{"(-op::sub(%, float(%)))", "_0", "scalar"}}}, + {"_mul_scalar" , {{"op::mul(%, float(%))", "_0", "scalar"}}}, + {"_MulScalar" , {{"op::mul(%, float(%))", "_0", "scalar"}}}, + {"_div_scalar" , {{"op::mul(%, 1.0f/float(%))", "_0", "scalar"}}}, + {"_DivScalar" , {{"op::mul(%, 1.0f/float(%))", "_0", "scalar"}}}, + 
{"_rdiv_scalar" , {{"op::rdiv(%, float(%))", "_0", "scalar"}}}, + {"_power_scalar" , {{"op::power(%, float(%))", "_0", "scalar"}}}, + {"_PowerScalar" , {{"op::power(%, float(%))", "_0", "scalar"}}}, + {"_rpower_scalar" , {{"op::rpow(%, float(%))", "_0", "scalar"}}}, + {"_RPowerScalar" , {{"op::rpow(%, float(%))", "_0", "scalar"}}}, + {"_RDivScalar" , {{"op::rdiv(%, float(%))", "_0", "scalar"}}}, {"Cast" , {{"op::cast<%>(%)", "dtype", "_0"}}}, {"cast" , {{"op::cast<%>(%)", "dtype", "_0"}}}, {"Activation" , {{"op::%(%)", "act_type", "_0"}}}, @@ -158,7 +158,7 @@ const std::map>> ops_desc = { {"_ones" , {{"op::one<%>()", "dtype"}}}, {"negative" , {{"(-%)", "_0"}}}, {"_hypot" , {{"op::hypot(%, %)", "_0", "_1"}}}, - {"_hypot_scalar" , {{"op::hypot(%, %)", "_0", "scalar"}}}, + {"_hypot_scalar" , {{"op::hypot(%, float(%))", "_0", "scalar"}}}, {"_backward_relu" , {{"op::backward_relu(%, %)", "_1", "_0"}}}, {"_backward_sigmoid" , {{"op::backward_sigmoid(%, %)", "_1", "_0"}}}, {"_backward_expm1" , {{"op::backward_expm1(%, %)", "_1", "_0"}}}, @@ -183,10 +183,11 @@ const std::map>> ops_desc = { {"_backward_cbrt" , {{"op::backward_cbrt(%, %)", "_1", "_0"}}}, {"_backward_rcbrt" , {{"op::backward_rcbrt(%, %)", "_1", "_0"}}}, {"_backward_square" , {{"op::backward_square(%, %)", "_1", "_0"}}}, - {"_backward_div_scalar" , {{"(% / %)", "_0", "scalar"}}}, - {"_backward_div_scalar" , {{"(% / %)", "_0", "scalar"}}}, - {"_backward_rdiv_scalar" , {{"(-% * % / (% * %))", "_0", "scalar", "_1", "_1"}}}, - {"_backward_hypot_scalar" , {{"(% * % / op::hypot(%, %))", + {"_backward_div_scalar" , {{"(% * 1.0f/float(%))", "_0", "scalar"}}}, + {"_backward_div_scalar" , {{"(% * 1.0f/float(%))", "_0", "scalar"}}}, + {"_backward_rdiv_scalar" , {{"(-% * float(%) / (% * %))", "_0", + "scalar", "_1", "_1"}}}, + {"_backward_hypot_scalar" , {{"(% * % / op::hypot(%, float(%)))", "_0", "_1", "_1", "scalar"}}}, {"_backward_radians" , {{"op::radians(%)", "_0"}}}, {"_backward_erf" , {{"op::backward_erf(%, %)", 
"_1", "_0"}}}, @@ -197,8 +198,8 @@ const std::map>> ops_desc = { {"_backward_sign" , {{"op::zero(%)", "_0"}}}, {"_backward_clip" , {{"op::backward_clip(%, %, %, %)", "_1", "_0", "a_min", "a_max"}}}, - {"smooth_l1" , {{"op::smooth_l1(%, %)", "_0", "scalar"}}}, - {"_backward_smooth_l1" , {{"op::backward_smooth_l1(%, %, %)", + {"smooth_l1" , {{"op::smooth_l1(%, float(%))", "_0", "scalar"}}}, + {"_backward_smooth_l1" , {{"op::backward_smooth_l1(%, float(%), %)", "_1", "scalar", "_0"}}}, // TODO(ptredak): arange // TODO(ptredak): LeakyRelu @@ -207,14 +208,14 @@ const std::map>> ops_desc = { {"(-(%))", "_0"}}}, {"_backward_mul" , {{"(% * %)", "_0", "_2"}, {"(% * %)", "_0", "_1"}}}, - {"_backward_mul_scalar" , {{"(% * %)", "_0", "scalar"}}}, + {"_backward_mul_scalar" , {{"(% * float(%))", "_0", "scalar"}}}, {"_backward_div" , {{"(% / %)", "_0", "_2"}, {"(-% * % / (% * %))", "_0", "_1", "_2", "_2"}}}, {"_backward_power" , {{"(% * % * powf(%, % - 1))", "_0", "_2", "_1", "_2"}, {"(% * powf(%, %) * logf(%))", "_0", "_1", "_2", "_1"}}}, - {"_backward_power_scalar" , {{"(% * % * powf(%, % - 1))", "_0", "scalar", "_1", - "scalar"}}}, - {"_backward_rpower_scalar" , {{"(% * % * logf(%))", "_0", "_1", "scalar"}}}, + {"_backward_power_scalar" , {{"(% * float(%) * powf(%, float(%) - 1))", + "_0", "scalar", "_1", "scalar"}}}, + {"_backward_rpower_scalar" , {{"(% * % * logf(float(%)))", "_0", "_1", "scalar"}}}, {"_backward_maximum" , {{"((% >= %) ? % : 0)", "_1", "_2", "_0"}, {"((% >= %) ? 0 : %)", "_1", "_2", "_0"}}}, {"_backward_minimum" , {{"((% <= %) ? 
% : 0)", "_1", "_2", "_0"}, From 76aa154fdf47ca32b4100fa55b9e0fde606fb734 Mon Sep 17 00:00:00 2001 From: Clement Fuji Tsang Date: Mon, 28 Oct 2019 09:54:50 -0700 Subject: [PATCH 104/105] Fix a bug in cycle detection for inputs only op in pointwise fusion --- src/executor/pointwise_fusion_pass.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 2321ad40aebf..c6e2405cb2a4 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -249,7 +249,12 @@ void AddInputsOnlyCompatible(const Graph &g, if (!added.count(to_add[i][j])) { bool make_cycle = false; const auto& node = to_add[i][j]; - DFSVisit(heads, [&make_cycle, &node](const nnvm::NodePtr& n) { + std::vector _heads; + std::copy_if(heads.begin(), heads.end(), std::back_inserter(_heads), + [&node](const nnvm::NodeEntry& n) { + return n.node.get() != node; + }); + DFSVisit(_heads, [&make_cycle, &node](const nnvm::NodePtr& n) { if (n.get() == node) make_cycle = true; }); From 3d1b5afb60c6c81aa176983f2ae2743ec5cbcf07 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 31 Oct 2019 10:06:48 -0700 Subject: [PATCH 105/105] Add comments to simple_partition_pass.h file --- src/executor/infer_graph_attr_pass.cc | 2 +- src/executor/simple_partition_pass.h | 57 ++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 5eb6c5eb37f2..80e4084c478e 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -69,7 +69,7 @@ inline void GetAttrFromForwardNode(const uint32_t nid, std::vector* rshape_ptr, IsNone fis_none) { std::vector& rshape = *rshape_ptr; - const auto& inode = idx[nid]; + const nnvm::IndexedGraph::Node& inode = idx[nid]; // gradient function, used to get node correspondence. 
static auto& fgrad = Op::GetAttr("FGradient"); diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index f4c0dc9de130..5b26a4523c13 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -42,8 +42,10 @@ namespace exec { /*! - * \brief Custom graph class, which will contain bi-directional nodes - * we need to compute DFS and reverse DFS for graph partitioning. + * \brief Custom graph class, which contains bi-directional nodes + * required for traversing in both directions (from outputs to inputs + * and vice versa). It is a non-owning layer on top of NNVM graph, since + * NNVM graph enables traversing only in 1 direction (from outputs to inputs). */ class BidirectionalGraph { public: @@ -59,12 +61,17 @@ class BidirectionalGraph { nodes.reserve(num_nodes); nnvm2nid.reserve(num_nodes); outputs.reserve(idx.outputs().size()); + // Create all the nodes in a new graph from + // nodes in the NNVM graph and store them + // in nodes array DFSVisit(g.outputs, [this](const nnvm::NodePtr& n) { Node new_node; new_node.nnvmptr = n.get(); nnvm2nid[n.get()] = static_cast(nodes.size()); nodes.emplace_back(std::move(new_node)); }); + // Create all connections between nodes in + // the graph (both directions) for (const auto& it : nnvm2nid) { nnvm::Node* nnvmnode = it.first; uint32_t nid = it.second; @@ -74,18 +81,33 @@ class BidirectionalGraph { nodes[nid].inputs.emplace_back(&nodes[input_nid]); } } + // Create output connections from the graph for (auto& e : g.outputs) { uint32_t nid = nnvm2nid[e.node.get()]; outputs.emplace_back(&nodes[nid]); } } + /* \brief Get all subsets of nodes, where: + * - graph constructed from nodes in each subset is a connected graph + * - every node fulfills a predicate is_compatible + * - if nodes u and v are part of a subset, then for each path between + * u and v in the original directed graph, all nodes on those paths + * are also part of the subset + * \param is_compatible 
A function taking nnvm::Node* and returning bool + * which identifies which nodes should be included in + * subsets. + */ template std::vector> get_subsets(FCompatible is_compatible) { std::vector> subgraphs; std::unordered_set incomp_set; std::unordered_set all_set(nodes.size()); std::vector separation_sets; + // Check each node for compatibility + // and, if it is incompatible, mark nodes + // on each side of it as not possible to be + // in the same subset for (Node& node : nodes) { if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); @@ -112,6 +134,8 @@ class BidirectionalGraph { for (Node* n : incomp_set) { comp_set.erase(n); } + // For each node construct the map of nodes that cannot be in + // the same subset for (Node* n : comp_set) { for (PairSet p : separation_sets) { if (p.first.count(n)) { @@ -132,6 +156,7 @@ class BidirectionalGraph { } std::unordered_set visited; std::deque stack(outputs.begin(), outputs.end()); + // Create subsets while (!stack.empty()) { Node* vertex = stack.front(); stack.pop_front(); @@ -153,6 +178,13 @@ class BidirectionalGraph { using PairVec = std::pair, std::vector>; using IncompMap = std::unordered_map>; + /* \brief Traverse the graph using DFS in either direction. + * \param heads Starting nodes for the DFS algorithm. + * \param reverse If true, DFS will traverse the graph from + * outputs to inputs. Otherwise, it will + * traverse the graph from inputs to outputs. + * \param fvisit Function to call on each visited node. + */ template void DFS(const std::vector& heads, bool reverse, FVisit fvisit) { std::unordered_set visited; @@ -174,6 +206,15 @@ class BidirectionalGraph { } } + /* \brief Get the connected subgraph that contains the head node + * and only previously unused nodes, according to the rules + * from the incompatibility map. + * \param head Node which needs to be part of the returned subgraph. + * \param unused_set Only nodes from this set will be considered when + * adding to the growing subgraph. 
+ * \param incomp_map Map containing data on which nodes are incompatible + * to be in the same subgraph. + */ std::unordered_set naive_grow_subgraph(Node* head, std::unordered_set* unused_set, IncompMap* incomp_map) { @@ -188,6 +229,7 @@ class BidirectionalGraph { unused_set->erase(vertex); subgraph.insert(vertex); incomp_set.insert((*incomp_map)[vertex].begin(), (*incomp_map)[vertex].end()); + // Traverse the graph in both directions for (Node* input : vertex->inputs) { if (unused_set->count(input) && !incomp_set.count(input)) { stack.emplace_back(input); @@ -367,6 +409,17 @@ Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_set return new_graph; } +/* \brief Get all subsets of nodes, where: + * - graph constructed from nodes in each subset is a connected graph + * - every node fulfills a predicate is_compatible + * - if nodes u and v are part of a subset, then for each path between + * u and v in the original directed graph, all nodes on those paths + * are also part of the subset + * \param g NNVM graph + * \param is_compatible A function taking nnvm::Node* and returning bool + * which identifies which nodes should be included in + * subsets. + */ template std::vector GetCompatibleSubsets(const Graph& g, FCompatible is_compatible) { BidirectionalGraph biG = BidirectionalGraph(g);