Add deframe op and stft/istft api. (PaddlePaddle#23)
* Add frame api

* Add deframe op and kernels.

* Add stft and istft apis.

* Add deframe api. Update stft and istft apis.

* Fix bug in frame_from_librosa function when input dims >= 3

* Rename deframe to overlap_add.

* Update istft.

* Update after code review.
KPatr1ck authored Sep 10, 2021
1 parent fcd9069 commit f9e3309
Showing 10 changed files with 1,088 additions and 204 deletions.
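Before the per-file diffs, a minimal usage sketch of what this commit adds (the commit message notes that `deframe` was renamed to `overlap_add`). The module path `paddle.signal` and the defaults follow the API as eventually released; both are assumptions and may not match this exact commit:

```python
import paddle

x = paddle.randn([8, 16000])  # (batch, seq_length)

# frame slices overlapping windows out of the last axis.
frames = paddle.signal.frame(x, frame_length=512, hop_length=160)
# frames: (8, 512, n_frames) -- one sliding window per column.

# overlap_add is the adjoint of frame: overlapping windows are summed back.
y = paddle.signal.overlap_add(frames, hop_length=160)

# stft/istft are built on top of frame/overlap_add.
spec = paddle.signal.stft(x, n_fft=512, hop_length=160)      # complex spectrogram
x_rec = paddle.signal.istft(spec, n_fft=512, hop_length=160) # reconstruction
```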
11 changes: 9 additions & 2 deletions paddle/fluid/operators/frame_op.cc
@@ -32,6 +32,11 @@ class FrameOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X");
const int x_rank = x_dims.size();

PADDLE_ENFORCE_GE(
x_rank, 1, platform::errors::InvalidArgument(
"Input(X) of FrameOp should be a tensor which contains "
"at least 1 dimension, but got rank %s.",
x_rank));
PADDLE_ENFORCE_GT(hop_length, 0,
platform::errors::InvalidArgument(
"Attribute(hop_length) of FrameOp should be greater "
@@ -111,7 +116,7 @@ class FrameOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Frame Operator.
Frame op slices frames from input sequence $X$.
Frame op converts time sequences into frames.
)DOC");
}
@@ -174,7 +179,9 @@ REGISTER_OP_CPU_KERNEL(
paddle::platform::complex<double>>);

REGISTER_OP_CPU_KERNEL(
frame_grad, ops::FrameGradKernel<paddle::platform::CPUDeviceContext, float>,
frame_grad, ops::FrameGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::FrameGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::FrameGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::FrameGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::FrameGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
3 changes: 2 additions & 1 deletion paddle/fluid/operators/frame_op.cu
@@ -29,7 +29,8 @@ REGISTER_OP_CUDA_KERNEL(
paddle::platform::complex<double>>);

REGISTER_OP_CUDA_KERNEL(
frame_grad,
frame_grad, ops::FrameGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::FrameGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FrameGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::FrameGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::FrameGradKernel<paddle::platform::CUDADeviceContext,
179 changes: 7 additions & 172 deletions paddle/fluid/operators/frame_op.h
@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/seq2col.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
@@ -27,170 +28,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;

template <typename T>
struct DataMappingFunctor {
DataMappingFunctor(const T* x, T* out, size_t seq_length, size_t frame_length,
size_t n_frames, size_t hop_length)
: x_(x),
out_(out),
seq_length_(seq_length),
frame_length_(frame_length),
n_frames_(n_frames),
hop_length_(hop_length) {}

/*
Convert sequences to frames.
1. Dimension information:
Sequences Frames
(N, seq_length) -> (N, frame_length, n_frames)
2. Mapping from `i` to `src_idx` and `trg_idx` can be derived from:
a. Notation
- `i` stands for the flattened index of a bunch of frames.
- `src_idx` and `trg_idx` are the 1D indices of seqs and frames
respectively.
b. Sample idx
```cpp
sample_idx = i / (n_frames_ * frame_length_);
```
c. Maps `i` to `f` and `n`.
```cpp
f = i % (n_frames_ * frame_length_) / n_frames_;
n = i % (n_frames_ * frame_length_) % n_frames_;
```
d. Replace `sample_idx`, `f` and `n` in the following equations:
```cpp
src_idx = sample_idx * seq_length_ + n * hop_length_ + f;
trg_idx = sample_idx * n_frames_ * frame_length_ + f * n_frames_ + n;
out_[trg_idx] = x_[src_idx];
```
e. The fully substituted result appears in the function body below (see also the sketch after this struct).
*/
HOSTDEVICE void operator()(size_t i) const {
size_t src_idx;
size_t trg_idx;
src_idx = i / (n_frames_ * frame_length_) * seq_length_ +
i % (n_frames_ * frame_length_) % n_frames_ * hop_length_ +
i % (n_frames_ * frame_length_) / n_frames_;
trg_idx = i / (n_frames_ * frame_length_) * n_frames_ * frame_length_ +
i % (n_frames_ * frame_length_) / n_frames_ * n_frames_ +
i % (n_frames_ * frame_length_) % n_frames_;
out_[trg_idx] = x_[src_idx];
}

const T* x_;
T* out_;
size_t seq_length_;
size_t frame_length_;
size_t n_frames_;
size_t hop_length_;
};
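The index mapping documented above is easy to sanity-check outside C++. A pure-Python sketch of the same arithmetic (illustration only; `seq2frames` is a made-up name):

```python
def seq2frames(x, seq_length, frame_length, n_frames, hop_length):
    # Replicates the DataMappingFunctor mapping on a flat list:
    # (N * seq_length,) -> (N * frame_length * n_frames,)
    out = [0] * (len(x) // seq_length * frame_length * n_frames)
    for i in range(len(out)):
        sample_idx = i // (n_frames * frame_length)
        f = i % (n_frames * frame_length) // n_frames  # offset within a frame
        n = i % (n_frames * frame_length) % n_frames   # frame index
        src_idx = sample_idx * seq_length + n * hop_length + f
        trg_idx = sample_idx * n_frames * frame_length + f * n_frames + n
        out[trg_idx] = x[src_idx]
    return out

# seq_length=6, frame_length=3, hop_length=2 gives frames [0,1,2] and [2,3,4],
# laid out as (frame_length, n_frames):
print(seq2frames([0, 1, 2, 3, 4, 5], 6, 3, 2, 2))  # -> [0, 2, 1, 3, 2, 4]
```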

template <typename T>
struct DataMappingGradFunctor {
DataMappingGradFunctor(const T* d_out, T* d_x, size_t seq_length,
size_t frame_length, size_t n_frames,
size_t hop_length)
: d_out_(d_out),
d_x_(d_x),
seq_length_(seq_length),
frame_length_(frame_length),
n_frames_(n_frames),
hop_length_(hop_length) {}

/*
Accumulate output gradient d_out to d_x.
1. Dimension information:
d_out d_x
(N, frame_length, n_frames) -> (N, seq_length)
2. Using a sliding window to find source indices from `d_out` according to
`i`:
a. Notation
- `i` stands for the flattened index of `d_x`.
- `seq_i` stands for a relative index of a `d_x` sample.
- `left`: Starting index of a frame window.
- `right`: Ending index of a frame window.
b. Sample idx
```cpp
sample_idx = i / seq_length_;
```
c. Slides a window with length of `frame_length` to find `f` and `n`.
- `n`: The frame index (0 <= n < n_frames_), incremented on each hop.
- `f`: The offset within a frame (0 <= f < frame_length_), relative to
the left edge of the sliding window.
d. Accumulate all grads from d_out (see the sketch after this struct).
```cpp
d_x_[i] +=
d_out_[sample_idx * frame_length_ * n_frames_ + f * n_frames_ + n];
```
*/
HOSTDEVICE void operator()(size_t i) const {
size_t sample_idx = i / seq_length_;
size_t seq_i = i % seq_length_;

// Sliding window
d_x_[i] = 0;  // Init d_x_[i] to 0; all grads from d_out_ are
              // accumulated in the while loop below.

size_t n = get_start_frame_idx(seq_i);
size_t f;
size_t left = n * hop_length_;
size_t right = left + frame_length_ - 1;

while (left <= seq_i && right < seq_length_) {
f = seq_i - left;
d_x_[i] +=
d_out_[sample_idx * frame_length_ * n_frames_ + f * n_frames_ + n];
// Next frame.
left += hop_length_;
right += hop_length_;
n += 1;
}
}

/*
Calculate the minimum frame index `n` satisfying the inequality:
seq_i <= right
==> seq_i <= left + frame_length - 1
==> seq_i <= hop_length_ * n + frame_length_ - 1
==> n >= (seq_i + 1 - frame_length_) / hop_length_
i.e. n = max(0, ceil((seq_i + 1 - frame_length_) / hop_length_)),
which the integer arithmetic below computes.
*/
HOSTDEVICE size_t get_start_frame_idx(size_t seq_i) const {
int64_t tmp = seq_i + 1 - frame_length_;
if (tmp > 0) {
size_t n = tmp / hop_length_;
if (tmp % hop_length_ == 0) {
return n;
} else {
return n + 1;
}
} else {
return 0;
}
}

const T* d_out_;
T* d_x_;
size_t seq_length_;
size_t frame_length_;
size_t n_frames_;
size_t hop_length_;
};
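And the backward mapping, mirroring the sliding-window accumulation and the `get_start_frame_idx` derivation above (again pure Python, illustration only; `col2seq_grad` is a made-up name):

```python
import math

def col2seq_grad(d_out, seq_length, frame_length, n_frames, hop_length):
    # For each position seq_i of d_x, sum the gradient from every frame
    # slot that position was copied into during the forward pass.
    d_x = [0.0] * seq_length
    for seq_i in range(seq_length):
        # First frame whose window covers seq_i (derivation above):
        n = max(0, math.ceil((seq_i + 1 - frame_length) / hop_length))
        left = n * hop_length
        while left <= seq_i and left + frame_length - 1 < seq_length:
            f = seq_i - left                       # offset inside this frame
            d_x[seq_i] += d_out[f * n_frames + n]  # (frame_length, n_frames) layout
            left += hop_length
            n += 1
    return d_x

# With an all-ones upstream gradient and frames [0,1,2], [2,3,4], position 2
# is covered by two frames and position 5 by none:
print(col2seq_grad([1.0] * 6, 6, 3, 2, 2))  # -> [1.0, 1.0, 2.0, 1.0, 1.0, 0.0]
```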

template <typename DeviceContext, typename T>
struct FrameFunctor {
void operator()(const DeviceContext& dev_ctx, const Tensor* input,
@@ -203,12 +40,12 @@ struct FrameFunctor {

platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
if (!is_grad) {
DataMappingFunctor<T> functor(input_data, output_data, seq_length,
frame_length, n_frames, hop_length);
math::Seq2ColFunctor<T> functor(input_data, output_data, seq_length,
frame_length, n_frames, hop_length);
for_range(functor);
} else {
DataMappingGradFunctor<T> functor(input_data, output_data, seq_length,
frame_length, n_frames, hop_length);
math::Col2SeqFunctor<T> functor(input_data, output_data, seq_length,
frame_length, n_frames, hop_length);
for_range(functor);
}
}
@@ -385,10 +222,8 @@ class FrameGradKernel : public framework::OpKernel<T> {
falls into Case 2. Finally, it restores the dims of `d_x` tensor.
*/
void Compute(const framework::ExecutionContext& ctx) const {
const framework::Tensor* d_out =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor* d_x =
ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
d_x->mutable_data<T>(ctx.GetPlace());
const size_t d_out_rank = d_out->dims().size();
const size_t d_x_rank = d_x->dims().size();
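The doc comment above (cut off by the diff) describes collapsing higher-rank inputs down to the 2-D case and restoring the dims of `d_x` afterwards, which matches the commit-message fix for inputs with dims >= 3. A sketch of that collapse-and-restore strategy in numpy terms, as an assumption about the approach rather than the kernel's literal code:

```python
import numpy as np

def frame_nd(x, frame_length, hop_length):
    # Collapse all leading dims into one batch dim, frame, then restore.
    lead, seq_length = x.shape[:-1], x.shape[-1]
    n_frames = 1 + (seq_length - frame_length) // hop_length
    x2d = x.reshape(-1, seq_length)  # (batch, seq_length)
    out = np.stack([x2d[:, n * hop_length:n * hop_length + frame_length]
                    for n in range(n_frames)], axis=-1)
    return out.reshape(*lead, frame_length, n_frames)  # restore leading dims
```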