Adding integer index support to Sequence* family of operators.
Adding the ability to use int32 arrays, or any castable-to-int type, as
the sequence_length array to SequenceMask, SequenceLast, and
SequenceReverse. Previously these operators all required sequence_length
to be the same data type as the input array.

See MXNet Jira ticket here:
  https://issues.apache.org/jira/browse/MXNET-1295

See also GitHub issues here:
   apache#12649
   dmlc/gluon-nlp#346
stephenrawls committed Jan 15, 2019
1 parent 22e2ef8 commit 63c5b1e
Showing 9 changed files with 86 additions and 61 deletions.
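
The change itself is mechanical: the kernels read one sequence_length entry
per batch item and immediately cast it to int, which is why any
castable-to-int element type works. A minimal standalone sketch of that
pattern (simplified from the kernels below, not the actual MXNet code):

#include <cstdint>
#include <cstdio>

// Simplified stand-in for SequenceLastKernel::Map: DType is the data type,
// IType the sequence_length type; the only use of idx is a cast to int.
template <typename DType, typename IType>
void last_step(int batch, DType *out, const DType *in, const IType *idx,
               int batch_stride) {
  const int seqpos = static_cast<int>(idx[batch]) - 1;  // same cast the kernels use
  out[batch] = in[seqpos * batch_stride + batch];
}

int main() {
  // data laid out (seq_len, batch) = (3, 2): in[s * 2 + b]
  float in[6] = {1, 2, 3, 4, 5, 6}, out[2];
  int32_t len[2] = {2, 3};  // int32 lengths, previously forced to match the data dtype
  for (int b = 0; b < 2; ++b) last_step(b, out, in, len, 2);
  std::printf("%g %g\n", out[0], out[1]);  // prints: 3 6
}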
30 changes: 14 additions & 16 deletions src/operator/sequence_last-inl.h
@@ -65,9 +65,9 @@ struct SequenceLastParam : public dmlc::Parameter<SequenceLastParam> {

template <int req>
struct SequenceLastKernel {
-template <typename DType>
+template <typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in,
-const DType *idx, int offset1, int offset2,
+const IType *idx, int offset1, int offset2,
mshadow::Shape<2> oshape) {
const auto opos = mxnet_op::unravel(i, oshape);
const int seqpos = static_cast<int>(idx[opos[0]]) - 1;
@@ -77,9 +77,9 @@ struct SequenceLastKernel {
};

struct SequenceLastGradKernel {
-template <typename DType>
+template <typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, DType *in_grad, const DType *out_grad,
-const DType *idx, int offset1, int offset2,
+const IType *idx, int offset1, int offset2,
mshadow::Shape<2> oshape) {
const auto opos = mxnet_op::unravel(i, oshape);
const int seqpos = static_cast<int>(idx[opos[0]]) - 1;
@@ -88,14 +88,14 @@ struct SequenceLastGradKernel {
}
};

-template <typename xpu, typename DType>
+template <typename xpu, typename DType, typename IType>
class SequenceLastOp : public Operator {
public:
explicit SequenceLastOp(SequenceLastParam p) { this->param_ = p; }

void sequence_last(const mshadow::Tensor<xpu, 3, DType> &data,
const mshadow::Tensor<xpu, 2, DType> &out,
-const mshadow::Tensor<xpu, 1, DType> &indices,
+const mshadow::Tensor<xpu, 1, IType> &indices,
const OpReqType req, mshadow::Stream<xpu> *const s) {
using namespace mshadow;
using namespace mshadow::expr;
@@ -115,7 +115,7 @@ class SequenceLastOp : public Operator {

void sequence_last_grad(const mshadow::Tensor<xpu, 3, DType> &in_grad,
const mshadow::Tensor<xpu, 2, DType> &out_grad,
-const mshadow::Tensor<xpu, 1, DType> &indices,
+const mshadow::Tensor<xpu, 1, IType> &indices,
mshadow::Stream<xpu> *const s) {
using namespace mshadow;
using namespace mshadow::expr;
@@ -163,11 +163,11 @@ class SequenceLastOp : public Operator {
Tensor<xpu, 2, DType> out =
out_data[seq_last::kOut].get_with_shape<xpu, 2, DType>(
Shape2(batch, rest_size), s);
-Tensor<xpu, 1, DType> indices =
+Tensor<xpu, 1, IType> indices =
param_.use_sequence_length
-? in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s)
+? in_data[seq_last::kSequenceLength].get<xpu, 1, IType>(s)
: ctx.requested[seq_last::kTempSpace]
-.get_space_typed<xpu, 1, DType>(Shape1(batch), s);
+.get_space_typed<xpu, 1, IType>(Shape1(batch), s);
if (!param_.use_sequence_length) indices = max_seq_len;

sequence_last(data, out, indices, req[seq_last::kOut], s);
@@ -206,11 +206,11 @@ class SequenceLastOp : public Operator {
Tensor<xpu, 2, DType> output_grad =
out_grad[seq_last::kOut].get_with_shape<xpu, 2, DType>(
Shape2(batch, rest_size), s);
-Tensor<xpu, 1, DType> indices =
+Tensor<xpu, 1, IType> indices =
param_.use_sequence_length
-? in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s)
+? in_data[seq_last::kSequenceLength].get<xpu, 1, IType>(s)
: ctx.requested[seq_last::kTempSpace]
-.get_space_typed<xpu, 1, DType>(Shape1(batch), s);
+.get_space_typed<xpu, 1, IType>(Shape1(batch), s);

if (req[seq_last::kData] == kWriteTo) data_grad = 0.0f;
sequence_last_grad(data_grad, output_grad, indices, s);
@@ -221,7 +221,7 @@
}; // class SequenceLastOp

template <typename xpu>
-Operator *CreateOp(SequenceLastParam param, int dtype);
+Operator *CreateOp(SequenceLastParam param, int dtype, int itype);

#if DMLC_USE_CXX11
class SequenceLastProp : public OperatorProperty {
@@ -281,8 +281,6 @@ class SequenceLastProp : public OperatorProperty {
for (size_t i = 0; i < in_type->size(); ++i) {
if ((*in_type)[i] == -1) {
(*in_type)[i] = dtype;
-} else {
-UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
}
}
out_type->clear();
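
Note the InferType hunk at the end of this file: removing UNIFORM_TYPE_CHECK
is what actually admits a second dtype, since the old check forced every
input (including sequence_length) to match the data's dtype. The surviving
loop only fills in unknown types. A minimal sketch of the relaxed rule
(simplified, not the full InferType):

#include <cstddef>
#include <vector>

// Inputs left unspecified (-1) inherit the data dtype; inputs with a known
// dtype (e.g. an int32 sequence_length) are now left alone instead of
// being checked against the data dtype.
void relax_infer_type(std::vector<int> *in_type) {
  const int dtype = (*in_type)[0];
  for (std::size_t i = 0; i < in_type->size(); ++i)
    if ((*in_type)[i] == -1) (*in_type)[i] = dtype;
}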
15 changes: 11 additions & 4 deletions src/operator/sequence_last.cc
@@ -28,18 +28,25 @@
namespace mxnet {
namespace op {
template <>
-Operator *CreateOp<cpu>(SequenceLastParam param, int dtype) {
+Operator *CreateOp<cpu>(SequenceLastParam param, int dtype, int itype) {
Operator *op = nullptr;
-MSHADOW_TYPE_SWITCH(dtype, DType,
-                    { op = new SequenceLastOp<cpu, DType>(param); })
+MSHADOW_TYPE_SWITCH(dtype, DType, {
+  MSHADOW_TYPE_SWITCH(itype, IType, {
+    op = new SequenceLastOp<cpu, DType, IType>(param);
+  });
+});
return op;
}

// DO_BIND_DISPATCH comes from operator_common.h
Operator *SequenceLastProp::CreateOperatorEx(Context ctx,
std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
-DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+if (in_type->size() >= 2 && (*in_type)[1] != -1)
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[1]);
+
+// sequence_length not passed in, so fall back to using input array dtype for second input argument
+DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[0]);
}

DMLC_REGISTER_PARAMETER(SequenceLastParam);
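
One subtlety in CreateOperatorEx above: DO_BIND_DISPATCH (from
operator_common.h) expands to a return statement, so the guarded call
handles the case where sequence_length's dtype is known, and the second
call is only reached as the fallback. A plain-C++ sketch of that control
flow (hypothetical names, not the macro itself):

#include <cstdio>
#include <utility>
#include <vector>

// Stand-in for the dispatch: returns the (dtype, itype) pair that would be
// bound. The fallback reuses the data dtype, so models that never pass a
// typed sequence_length keep working unchanged.
std::pair<int, int> pick_types(const std::vector<int> &in_type) {
  if (in_type.size() >= 2 && in_type[1] != -1)
    return {in_type[0], in_type[1]};  // sequence_length dtype known
  return {in_type[0], in_type[0]};    // fall back to the data dtype
}

int main() {
  auto a = pick_types({0, 4});   // e.g. float32 data, int32 lengths -> (0, 4)
  auto b = pick_types({0, -1});  // length dtype unknown            -> (0, 0)
  std::printf("(%d,%d) (%d,%d)\n", a.first, a.second, b.first, b.second);
}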
9 changes: 6 additions & 3 deletions src/operator/sequence_last.cu
@@ -28,10 +28,13 @@

namespace mxnet {
namespace op {
-template <> Operator *CreateOp<gpu>(SequenceLastParam param, int dtype) {
+template <> Operator *CreateOp<gpu>(SequenceLastParam param, int dtype, int itype) {
Operator *op = NULL;
-MSHADOW_TYPE_SWITCH(dtype, DType,
-                    { op = new SequenceLastOp<gpu, DType>(param); })
+MSHADOW_TYPE_SWITCH(dtype, DType, {
+  MSHADOW_TYPE_SWITCH(itype, IType, {
+    op = new SequenceLastOp<gpu, DType, IType>(param);
+  });
+});
return op;
}

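
The nested MSHADOW_TYPE_SWITCH in the CPU and GPU CreateOp turns two runtime
type ids into compile-time template parameters, so the operator is
instantiated for every (DType, IType) combination; the cost is quadratic
growth in generated code. A self-contained sketch of the same double
dispatch (type ids and names are illustrative, not mshadow's):

#include <cstdint>
#include <stdexcept>

struct OpBase { virtual ~OpBase() = default; };
template <typename DType, typename IType> struct SeqOp : OpBase {};

// Inner switch: runtime itype -> compile-time IType, with DType fixed.
template <typename DType>
OpBase *dispatch_itype(int itype) {
  switch (itype) {
    case 0: return new SeqOp<DType, float>();
    case 4: return new SeqOp<DType, std::int32_t>();
    default: throw std::invalid_argument("unsupported itype");
  }
}

// Outer switch: runtime dtype -> compile-time DType, then dispatch itype.
OpBase *create_op(int dtype, int itype) {
  switch (dtype) {
    case 0: return dispatch_itype<float>(itype);
    case 4: return dispatch_itype<std::int32_t>(itype);
    default: throw std::invalid_argument("unsupported dtype");
  }
}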
24 changes: 11 additions & 13 deletions src/operator/sequence_mask-inl.h
@@ -68,8 +68,8 @@ struct SequenceMaskParam : public dmlc::Parameter<SequenceMaskParam> {
// (seqlen, batch, rest) case
template <int req>
struct SequenceMask0Kernel {
-template <typename DType>
-MSHADOW_XINLINE static void Map(int b, DType *in, const DType *idx,
+template <typename DType, typename IType>
+MSHADOW_XINLINE static void Map(int b, DType *in, const IType *idx,
index_t max_s_len, index_t batch_size,
index_t restsize, DType value) {
const index_t seqpos = static_cast<int>(idx[b]);
@@ -86,8 +86,8 @@
// (batch, seqlen, rest) case
template <int req>
struct SequenceMask1Kernel {
-template <typename DType>
-MSHADOW_XINLINE static void Map(int b, DType *in, const DType *idx,
+template <typename DType, typename IType>
+MSHADOW_XINLINE static void Map(int b, DType *in, const IType *idx,
index_t max_s_len, index_t batch_size,
index_t restsize, DType value) {
const index_t seqpos = static_cast<int>(idx[b]);
@@ -101,13 +101,13 @@
}
};

-template <typename xpu, typename DType>
+template <typename xpu, typename DType, typename IType>
class SequenceMaskOp : public Operator {
public:
explicit SequenceMaskOp(SequenceMaskParam p) { this->param_ = p; }

void sequence_mask(const mshadow::Tensor<xpu, 3, DType> &data,
-const mshadow::Tensor<xpu, 1, DType> &indices,
+const mshadow::Tensor<xpu, 1, IType> &indices,
const OpReqType req, mshadow::Stream<xpu> *const s,
DType val) {
using namespace mshadow;
@@ -153,8 +153,8 @@
// Actual implementation of masking
Assign(out, req[seq_mask::kOut], F<mshadow_op::identity>(data));
if (param_.use_sequence_length) {
-Tensor<xpu, 1, DType> indices =
-    in_data[seq_mask::kSequenceLength].get<xpu, 1, DType>(s);
+Tensor<xpu, 1, IType> indices =
+    in_data[seq_mask::kSequenceLength].get<xpu, 1, IType>(s);
sequence_mask(out, indices, req[seq_mask::kOut], s,
static_cast<DType>(param_.value));
}
@@ -190,8 +190,8 @@
if (!param_.use_sequence_length) {
Assign(data_g, req[seq_mask::kData], F<mshadow_op::identity>(out_g));
} else {
-Tensor<xpu, 1, DType> indices =
-    in_data[seq_mask::kSequenceLength].get<xpu, 1, DType>(s);
+Tensor<xpu, 1, IType> indices =
+    in_data[seq_mask::kSequenceLength].get<xpu, 1, IType>(s);
if (req[seq_mask::kData] == kAddTo) {
Tensor<xpu, 3, DType> out_g_temp =
ctx.requested[seq_mask::kTempSpace].get_space_typed<xpu, 3, DType>(
@@ -212,7 +212,7 @@
}; // class SequenceMaskOp

template <typename xpu>
-Operator *CreateOp(SequenceMaskParam param, int dtype);
+Operator *CreateOp(SequenceMaskParam param, int dtype, int itype);

#if DMLC_USE_CXX11
class SequenceMaskProp : public OperatorProperty {
@@ -270,8 +270,6 @@ class SequenceMaskProp : public OperatorProperty {
for (size_t i = 0; i < in_type->size(); ++i) {
if ((*in_type)[i] == -1) {
(*in_type)[i] = dtype;
-} else {
-UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
}
}
out_type->clear();
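
For reference, the rule the two masking kernels implement: within each batch
item, every time step at or beyond that item's sequence length is
overwritten with value. A simplified CPU sketch for the (seqlen, batch,
rest) layout of SequenceMask0Kernel (illustrative, not the kernel itself):

#include <cstdint>

// data is laid out as data[(s * batch_size + b) * restsize + r]; positions
// with s >= seq_len[b] are masked. IType may be int32 after this commit.
template <typename DType, typename IType>
void mask_seq0(DType *data, const IType *seq_len, int max_s_len,
               int batch_size, int restsize, DType value) {
  for (int b = 0; b < batch_size; ++b) {
    const int len = static_cast<int>(seq_len[b]);
    for (int s = len; s < max_s_len; ++s)
      for (int r = 0; r < restsize; ++r)
        data[(s * batch_size + b) * restsize + r] = value;
  }
}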
16 changes: 12 additions & 4 deletions src/operator/sequence_mask.cc
@@ -28,18 +28,26 @@
namespace mxnet {
namespace op {
template <>
-Operator *CreateOp<cpu>(SequenceMaskParam param, int dtype) {
+Operator *CreateOp<cpu>(SequenceMaskParam param, int dtype, int itype) {
Operator *op = nullptr;
-MSHADOW_TYPE_SWITCH(dtype, DType,
-                    { op = new SequenceMaskOp<cpu, DType>(param); })
+MSHADOW_TYPE_SWITCH(dtype, DType, {
+  MSHADOW_TYPE_SWITCH(itype, IType, {
+    op = new SequenceMaskOp<cpu, DType, IType>(param);
+  });
+});
return op;
}

// DO_BIND_DISPATCH comes from operator_common.h
Operator *SequenceMaskProp::CreateOperatorEx(Context ctx,
std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
-DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+if (in_type->size() >= 2 && (*in_type)[1] != -1) {
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[1]);
+}
+
+// sequence_length not passed in, so fall back to using input array dtype for second input argument
+DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[0]);
}

DMLC_REGISTER_PARAMETER(SequenceMaskParam);
9 changes: 6 additions & 3 deletions src/operator/sequence_mask.cu
@@ -29,10 +29,13 @@
namespace mxnet {
namespace op {

-template <> Operator *CreateOp<gpu>(SequenceMaskParam param, int dtype) {
+template <> Operator *CreateOp<gpu>(SequenceMaskParam param, int dtype, int itype) {
Operator *op = NULL;
-MSHADOW_TYPE_SWITCH(dtype, DType,
-                    { op = new SequenceMaskOp<gpu, DType>(param); })
+MSHADOW_TYPE_SWITCH(dtype, DType, {
+  MSHADOW_TYPE_SWITCH(itype, IType, {
+    op = new SequenceMaskOp<gpu, DType, IType>(param);
+  });
+});
return op;
}

20 changes: 9 additions & 11 deletions src/operator/sequence_reverse-inl.h
@@ -65,14 +65,14 @@ struct SequenceReverseParam : public dmlc::Parameter<SequenceReverseParam> {
};

struct ReverseKernel {
-template <typename DType>
+template <typename DType, typename IType>
MSHADOW_XINLINE static void Map(const int i, DType *const out_data,
const DType *const in_data,
const OpReqType req,
const index_t max_seq_len,
const index_t batch_size,
const index_t other_dim, const index_t numel,
-const DType *const indices) {
+const IType *const indices) {
for (index_t batch = 0; batch < batch_size; ++batch) {
const index_t num_seq =
indices ? static_cast<index_t>(indices[batch]) : max_seq_len;
@@ -102,13 +102,13 @@
}
};

-template <typename xpu, typename DType>
+template <typename xpu, typename DType, typename IType>
class SequenceReverseOp : public Operator {
public:
explicit SequenceReverseOp(SequenceReverseParam p) { this->param_ = p; }
void sequence_reverse(const mshadow::Tensor<xpu, 3, DType> &data,
const mshadow::Tensor<xpu, 3, DType> &out,
-const OpReqType req, const DType *const indices,
+const OpReqType req, const IType *const indices,
mshadow::Stream<xpu> *const s) {
using namespace mshadow;
using namespace mshadow::expr;
@@ -145,9 +145,9 @@
Tensor<xpu, 3, DType> out =
out_data[seq_reverse::kOut].get_with_shape<xpu, 3, DType>(s3, s);

-const DType *const indices =
+const IType *const indices =
param_.use_sequence_length
-? in_data[seq_reverse::kSequenceLength].dptr<DType>()
+? in_data[seq_reverse::kSequenceLength].dptr<IType>()
: nullptr;

sequence_reverse(data, out, req[seq_reverse::kOut], indices, s);
@@ -179,9 +179,9 @@
Tensor<xpu, 3, DType> output_grad =
out_grad[seq_reverse::kOut].get_with_shape<xpu, 3, DType>(s3, s);

-const DType *const indices =
+const IType *const indices =
param_.use_sequence_length
-? in_data[seq_reverse::kSequenceLength].dptr<DType>()
+? in_data[seq_reverse::kSequenceLength].dptr<IType>()
: nullptr;

sequence_reverse(output_grad, data_grad, req[seq_reverse::kData], indices,
@@ -193,7 +193,7 @@
}; // class SequenceReverseOp

template <typename xpu>
-Operator *CreateOp(SequenceReverseParam param, int dtype);
+Operator *CreateOp(SequenceReverseParam param, int dtype, int itype);

#if DMLC_USE_CXX11
class SequenceReverseProp : public OperatorProperty {
@@ -249,8 +249,6 @@
for (size_t i = 0; i < in_type->size(); ++i) {
if ((*in_type)[i] == -1) {
(*in_type)[i] = dtype;
-} else {
-UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
}
}
out_type->clear();
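
ReverseKernel's contract, for reference: within each batch item the first
num_seq time steps are emitted in reverse order and any steps past num_seq
are copied through unchanged, where num_seq is the (possibly integer)
per-item length, or max_seq_len when indices is null. A simplified CPU
sketch (illustrative only, not the kernel itself):

#include <cstdint>

// out and in are laid out as x[(s * batch_size + b) * other_dim + o].
template <typename DType, typename IType>
void reverse_seq(DType *out, const DType *in, const IType *seq_len,
                 int max_seq_len, int batch_size, int other_dim) {
  for (int b = 0; b < batch_size; ++b) {
    const int num_seq =
        seq_len ? static_cast<int>(seq_len[b]) : max_seq_len;  // null => full reverse
    for (int s = 0; s < max_seq_len; ++s) {
      const int src = (s < num_seq) ? (num_seq - 1 - s) : s;   // reverse prefix, copy tail
      for (int o = 0; o < other_dim; ++o)
        out[(s * batch_size + b) * other_dim + o] =
            in[(src * batch_size + b) * other_dim + o];
    }
  }
}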
16 changes: 12 additions & 4 deletions src/operator/sequence_reverse.cc
@@ -28,18 +28,26 @@
namespace mxnet {
namespace op {
template <>
-Operator *CreateOp<cpu>(SequenceReverseParam param, int dtype) {
+Operator *CreateOp<cpu>(SequenceReverseParam param, int dtype, int itype) {
Operator *op = nullptr;
-MSHADOW_TYPE_SWITCH(dtype, DType,
-                    { op = new SequenceReverseOp<cpu, DType>(param); })
+MSHADOW_TYPE_SWITCH(dtype, DType, {
+  MSHADOW_TYPE_SWITCH(itype, IType, {
+    op = new SequenceReverseOp<cpu, DType, IType>(param);
+  });
+});
return op;
}

// DO_BIND_DISPATCH comes from operator_common.h
Operator *SequenceReverseProp::CreateOperatorEx(
Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
-DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+
+if (in_type->size() >= 2 && (*in_type)[1] != -1)
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[1]);
+
+// sequence_length not passed in, so fall back to using input array dtype for second input argument
+DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[0]);
}

DMLC_REGISTER_PARAMETER(SequenceReverseParam);
8 changes: 5 additions & 3 deletions src/operator/sequence_reverse.cu
@@ -28,11 +28,13 @@

namespace mxnet {
namespace op {
-template <> Operator *CreateOp<gpu>(SequenceReverseParam param, int dtype) {
+template <> Operator *CreateOp<gpu>(SequenceReverseParam param, int dtype, int itype) {
Operator *op = nullptr;
MSHADOW_TYPE_SWITCH(dtype, DType, {
-  op = new SequenceReverseOp<gpu, DType>(param);
-})
+  MSHADOW_TYPE_SWITCH(itype, IType, {
+    op = new SequenceReverseOp<gpu, DType, IType>(param);
+  });
+});
return op;
}

