Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Speedup SequenceMask on GPU #14445

Merged
merged 1 commit into from
Mar 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 17 additions & 62 deletions src/operator/sequence_mask-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,70 +65,24 @@ struct SequenceMaskParam : public dmlc::Parameter<SequenceMaskParam> {
}
};

// (seqlen, batch, rest) case
template <int req>
struct SequenceMask0Kernel {
  /*!
   * \brief Mask one batch element: for every sequence position
   *        s >= idx[b], write `value` into all `restsize` trailing elements.
   * \param b          batch index (one kernel instance per batch element)
   * \param in         pointer to the (seqlen, batch, rest) data tensor
   * \param idx        per-batch sequence lengths
   * \param max_s_len  size of the sequence axis
   * \param batch_size number of batch elements
   * \param restsize   product of the trailing dimensions
   * \param value      value written to masked positions
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int b, DType *in, const IType *idx,
                                  index_t max_s_len, index_t batch_size,
                                  index_t restsize, DType value) {
    // Cast directly to index_t: the previous static_cast<int> could
    // truncate long sequence lengths when index_t is 64-bit.
    const index_t seqpos = static_cast<index_t>(idx[b]);
#pragma unroll
    for (index_t s = seqpos; s < max_s_len; ++s) {
      const index_t incr = (s * batch_size * restsize) + (b * restsize);
#pragma unroll
      for (index_t r = 0; r < restsize; ++r)
        KERNEL_ASSIGN(in[incr + r], req, value);
    }
  }
};

// (batch, seqlen, rest) case
template <int req>
struct SequenceMask1Kernel {
  /*!
   * \brief Mask one batch element: for every sequence position
   *        s >= idx[b], write `value` into all `restsize` trailing elements.
   * \param b          batch index (one kernel instance per batch element)
   * \param in         pointer to the (batch, seqlen, rest) data tensor
   * \param idx        per-batch sequence lengths
   * \param max_s_len  size of the sequence axis
   * \param batch_size number of batch elements (unused in this layout)
   * \param restsize   product of the trailing dimensions
   * \param value      value written to masked positions
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int b, DType *in, const IType *idx,
                                  index_t max_s_len, index_t batch_size,
                                  index_t restsize, DType value) {
    // Cast directly to index_t: the previous static_cast<int> could
    // truncate long sequence lengths when index_t is 64-bit.
    const index_t seqpos = static_cast<index_t>(idx[b]);
#pragma unroll
    for (index_t s = seqpos; s < max_s_len; ++s) {
      const index_t incr = (b * max_s_len * restsize) + (s * restsize);
#pragma unroll
      for (index_t r = 0; r < restsize; ++r)
        KERNEL_ASSIGN(in[incr + r], req, value);
    }
  }
};
// CPU implementation of the sequence mask (defined in sequence_mask.cc):
// writes `val` into `data` at every position whose index along the
// sequence axis is >= the per-batch length in `indices`. `axis` selects
// the layout: 1 -> (batch, seqlen, rest), otherwise (seqlen, batch, rest).
template<typename DType, typename IType>
void SequenceMaskExec(const mshadow::Tensor<cpu, 3, DType> &data,
                      const mshadow::Tensor<cpu, 1, IType> &indices,
                      const OpReqType req, mshadow::Stream<cpu> *const s,
                      int axis, DType val);
#ifdef __CUDACC__
template<typename DType, typename IType>
void SequenceMaskExec(const mshadow::Tensor<gpu, 3, DType> &data,
const mshadow::Tensor<gpu, 1, IType> &indices,
const OpReqType req, mshadow::Stream<gpu> *const s,
int axis, DType val);
#endif

template <typename xpu, typename DType, typename IType>
class SequenceMaskOp : public Operator {
public:
explicit SequenceMaskOp(SequenceMaskParam p) { this->param_ = p; }

void sequence_mask(const mshadow::Tensor<xpu, 3, DType> &data,
const mshadow::Tensor<xpu, 1, IType> &indices,
const OpReqType req, mshadow::Stream<xpu> *const s,
DType val) {
using namespace mshadow;
using namespace mshadow::expr;

index_t batch = indices.size(0);
index_t max_seq_len = data.size(param_.axis);
index_t restsize = data.size(2);

MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
if (param_.axis == 1)
mxnet_op::Kernel<SequenceMask1Kernel<req_type>, xpu>::Launch(
s, batch, data.dptr_, indices.dptr_, max_seq_len, batch, restsize,
val);
else
mxnet_op::Kernel<SequenceMask0Kernel<req_type>, xpu>::Launch(
s, batch, data.dptr_, indices.dptr_, max_seq_len, batch, restsize,
val);
});
}

virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data,
Expand All @@ -155,8 +109,8 @@ class SequenceMaskOp : public Operator {
if (param_.use_sequence_length) {
Tensor<xpu, 1, IType> indices =
in_data[seq_mask::kSequenceLength].get<xpu, 1, IType>(s);
sequence_mask(out, indices, req[seq_mask::kOut], s,
static_cast<DType>(param_.value));
SequenceMaskExec<DType, IType>(out, indices, req[seq_mask::kOut], s,
param_.axis, static_cast<DType>(param_.value));
}
}

Expand Down Expand Up @@ -198,11 +152,12 @@ class SequenceMaskOp : public Operator {
s3, s);
out_g_temp = F<mshadow_op::identity>(out_g);
out_g = out_g_temp;
sequence_mask(out_g, indices, kWriteInplace, s, DType(0.));
SequenceMaskExec<DType, IType>(out_g, indices, kWriteInplace, s, param_.axis, DType(0.));
Assign(data_g, kAddTo, F<mshadow_op::identity>(out_g));
} else {
Assign(data_g, req[seq_mask::kData], F<mshadow_op::identity>(out_g));
sequence_mask(data_g, indices, req[seq_mask::kData], s, DType(0.));
SequenceMaskExec<DType, IType>(
data_g, indices, req[seq_mask::kData], s, param_.axis, DType(0.));
}
}
}
Expand Down
64 changes: 64 additions & 0 deletions src/operator/sequence_mask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,70 @@

namespace mxnet {
namespace op {

// (seqlen, batch, rest) case
template <int req>
struct SequenceMask0CPUKernel {
  /*!
   * \brief Mask one batch element: for every sequence position
   *        s >= idx[batch], write `value` into all `restsize` trailing
   *        elements.
   * \param batch      batch index (one kernel instance per batch element)
   * \param in         pointer to the (seqlen, batch, rest) data tensor
   * \param idx        per-batch sequence lengths
   * \param max_s_len  size of the sequence axis
   * \param batch_size number of batch elements
   * \param restsize   product of the trailing dimensions
   * \param value      value written to masked positions
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int batch, DType *in, const IType *idx,
                                  index_t max_s_len, index_t batch_size,
                                  index_t restsize, DType value) {
    // Cast directly to index_t: the previous static_cast<int> could
    // truncate long sequence lengths when index_t is 64-bit.
    const index_t seqpos = static_cast<index_t>(idx[batch]);
#pragma unroll
    for (index_t s = seqpos; s < max_s_len; ++s) {
      const index_t incr = (s * batch_size * restsize) + (batch * restsize);
#pragma unroll
      for (index_t r = 0; r < restsize; ++r)
        KERNEL_ASSIGN(in[incr + r], req, value);
    }
  }
};

// (batch, seqlen, rest) case
template <int req>
struct SequenceMask1CPUKernel {
  /*!
   * \brief Mask one batch element: for every sequence position
   *        s >= idx[batch], write `value` into all `restsize` trailing
   *        elements.
   * \param batch      batch index (one kernel instance per batch element)
   * \param in         pointer to the (batch, seqlen, rest) data tensor
   * \param idx        per-batch sequence lengths
   * \param max_s_len  size of the sequence axis
   * \param batch_size number of batch elements (unused in this layout)
   * \param restsize   product of the trailing dimensions
   * \param value      value written to masked positions
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int batch, DType *in, const IType *idx,
                                  index_t max_s_len, index_t batch_size,
                                  index_t restsize, DType value) {
    // Cast directly to index_t: the previous static_cast<int> could
    // truncate long sequence lengths when index_t is 64-bit.
    const index_t seqpos = static_cast<index_t>(idx[batch]);
#pragma unroll
    for (index_t s = seqpos; s < max_s_len; ++s) {
      const index_t incr = (batch * max_s_len * restsize) + (s * restsize);
#pragma unroll
      for (index_t r = 0; r < restsize; ++r)
        KERNEL_ASSIGN(in[incr + r], req, value);
    }
  }
};

// CPU dispatch for the sequence mask: one kernel instance per batch
// element writes `val` into every masked position of `data`. `axis`
// selects the layout (1 -> (batch, seqlen, rest), else (seqlen, batch,
// rest)); `req` is forwarded as the assignment request.
template<typename DType, typename IType>
void SequenceMaskExec(
    const mshadow::Tensor<cpu, 3, DType> &data,
    const mshadow::Tensor<cpu, 1, IType> &indices,
    const OpReqType req, mshadow::Stream<cpu> *const s,
    int axis, DType val) {
  using namespace mshadow;
  using namespace mshadow::expr;
  using namespace mxnet_op;

  const index_t n_batch = indices.size(0);
  const index_t seq_len_max = data.size(axis);
  const index_t rest_size = data.size(2);

  MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
    if (axis != 1) {
      // (seqlen, batch, rest) layout
      Kernel<SequenceMask0CPUKernel<req_type>, cpu>::Launch(
          s, n_batch, data.dptr_, indices.dptr_, seq_len_max, n_batch,
          rest_size, val);
    } else {
      // (batch, seqlen, rest) layout
      Kernel<SequenceMask1CPUKernel<req_type>, cpu>::Launch(
          s, n_batch, data.dptr_, indices.dptr_, seq_len_max, n_batch,
          rest_size, val);
    }
  });
}

template <>
Operator *CreateOp<cpu>(SequenceMaskParam param, int dtype, int itype) {
Operator *op = nullptr;
Expand Down
59 changes: 59 additions & 0 deletions src/operator/sequence_mask.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,65 @@
namespace mxnet {
namespace op {

// (seqlen, batch, rest) case
template <int req>
struct SequenceMask0GPUKernel {
  /*!
   * \brief One kernel instance per tensor element: mask element `i` when
   *        its position along the sequence axis is at or beyond the
   *        batch's sequence length.
   * \param i          flat element index into the (seqlen, batch, rest)
   *                   tensor
   * \param in         pointer to the data tensor
   * \param idx        per-batch sequence lengths
   * \param max_s_len  size of the sequence axis (unused in this layout)
   * \param batch_size number of batch elements
   * \param restsize   product of the trailing dimensions
   * \param value      value written to masked positions
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int i, DType *in, const IType *idx,
                                  index_t max_s_len, index_t batch_size,
                                  index_t restsize, DType value) {
    // Recover the (seq, batch) coordinates from the flat index i.
    const index_t batch = i / restsize % batch_size;
    // Cast directly to index_t: the previous static_cast<int> could
    // truncate long sequence lengths when index_t is 64-bit.
    const index_t seqpos = static_cast<index_t>(idx[batch]);
    const index_t seq = i / restsize / batch_size;
    if (seq >= seqpos) {
      KERNEL_ASSIGN(in[i], req, value);
    }
  }
};

// (batch, seqlen, rest) case
template <int req>
struct SequenceMask1GPUKernel {
  /*!
   * \brief One kernel instance per tensor element: mask element `i` when
   *        its position along the sequence axis is at or beyond the
   *        batch's sequence length.
   * \param i          flat element index into the (batch, seqlen, rest)
   *                   tensor
   * \param in         pointer to the data tensor
   * \param idx        per-batch sequence lengths
   * \param max_s_len  size of the sequence axis
   * \param batch_size number of batch elements (unused in this layout)
   * \param restsize   product of the trailing dimensions
   * \param value      value written to masked positions
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int i, DType *in, const IType *idx,
                                  index_t max_s_len, index_t batch_size,
                                  index_t restsize, DType value) {
    // Recover the (batch, seq) coordinates from the flat index i.
    const index_t batch = i / restsize / max_s_len;
    // Cast directly to index_t: the previous static_cast<int> could
    // truncate long sequence lengths when index_t is 64-bit.
    const index_t seqpos = static_cast<index_t>(idx[batch]);
    const index_t seq = i / restsize % max_s_len;
    if (seq >= seqpos) {
      KERNEL_ASSIGN(in[i], req, value);
    }
  }
};

// GPU dispatch for the sequence mask: launches one kernel instance per
// tensor element (each recovers its (batch, seq) coordinates from the
// flat index), writing `val` into every masked position of `data`.
// `axis` selects the layout (1 -> (batch, seqlen, rest), else
// (seqlen, batch, rest)); `req` is forwarded as the assignment request.
template<typename DType, typename IType>
void SequenceMaskExec(
    const mshadow::Tensor<gpu, 3, DType> &data,
    const mshadow::Tensor<gpu, 1, IType> &indices,
    const OpReqType req, mshadow::Stream<gpu> *const s,
    int axis, DType val) {
  using namespace mshadow;
  using namespace mshadow::expr;
  using namespace mxnet_op;

  const index_t n_batch = indices.size(0);
  const index_t seq_len_max = data.size(axis);
  const index_t rest_size = data.size(2);
  const index_t n_elem = data.shape_.Size();

  MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
    if (axis != 1) {
      // (seqlen, batch, rest) layout
      Kernel<SequenceMask0GPUKernel<req_type>, gpu>::Launch(
          s, n_elem, data.dptr_, indices.dptr_, seq_len_max, n_batch,
          rest_size, val);
    } else {
      // (batch, seqlen, rest) layout
      Kernel<SequenceMask1GPUKernel<req_type>, gpu>::Launch(
          s, n_elem, data.dptr_, indices.dptr_, seq_len_max, n_batch,
          rest_size, val);
    }
  });
}

template <> Operator *CreateOp<gpu>(SequenceMaskParam param, int dtype, int itype) {
Operator *op = NULL;
MSHADOW_TYPE_SWITCH(dtype, DType, {
Expand Down