danpovey · pegahgh · Feb 16, 2016 · Feb 17, 2016 · danpovey · Feb 18, 2016
diff --git a/src/feat/Makefile b/src/feat/Makefile
@@ -9,7 +9,7 @@ TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \
          resample-test online-feature-test sinusoid-detection-test \
          signal-test
 
-OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \
+OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o signal-distort.o \
            feature-spectrogram.o mel-computations.o wave-reader.o \
            pitch-functions.o resample.o online-feature.o sinusoid-detection.o \
            signal.o

diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc
@@ -0,0 +1,85 @@
+// feat/signal-distort.cc
+
+// Copyright 2016 Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#include "feat/signal-distort.h"
+
+namespace kaldi {
+
+// Intended to randomly disturb the input signal using a band-pass filter with
+// no zeros.  TODO: not implemented yet; perturb_egs is currently left untouched.
+void PerturbXvectorSignal::ComputeAndApplyRandDistortion(
+    const MatrixBase<BaseFloat> &input_egs,
+    Matrix<BaseFloat> *perturb_egs) {
+  // Plan: generate the response |H(w)| from a nonzero random sequence, smooth
+  // it with a moving-average window of small size, and, for simplicity,
+  // assume a zero-phase response so that H(w) = |H(w)|.
+  int32 fft_size = 512;  // number of samples in the response.
+  Vector<BaseFloat> response(fft_size);
+}
+
+// Stretches the time axis of the input egs without preserving pitch, i.e. it
+// changes the speed and duration of the signal (and hence its pitch).
+// With m = input_egs.NumCols() and n = perturb_egs->NumCols(), each output
+// row y is resampled from the corresponding input row x as
+//   y[i] = x((1 + stretch) * ((m - n) / 2 + i))   for i = 0, ..., n - 1,
+// where stretch is drawn uniformly from [-max_stretch, max_stretch] in steps
+// of 0.001 and max_stretch = opts_.max_time_stretch.  ArbitraryResample does
+// the interpolation.  The input must be long enough that every sampled
+// position lies inside it, i.e. m > n * (1 + max_stretch) / (1 - max_stretch).
+// perturb_egs must already be sized to the desired output dimensions.
+void PerturbXvectorSignal::TimeStretch(const MatrixBase<BaseFloat> &input_egs,
+                                       Matrix<BaseFloat> *perturb_egs) {
+  const int32 input_dim = input_egs.NumCols(), dim = perturb_egs->NumCols();
+  // Nominal sampling rate: sample points below are given as index / samp_freq.
+  const BaseFloat samp_freq = 2000;
+  const BaseFloat max_stretch = opts_.max_time_stretch;
+  // The length check below divides by (1 - max_stretch), so require a valid
+  // range up front instead of silently accepting a negative or infinite bound.
+  KALDI_ASSERT(max_stretch >= 0.0 && max_stretch < 1.0);
+  KALDI_ASSERT(input_dim > dim * ((1.0 + max_stretch) / (1.0 - max_stretch)));
+  // Generate a random stretch value in [-max_stretch, max_stretch].
+  const int32 max_stretch_int = static_cast<int32>(max_stretch * 1000);
+  const BaseFloat stretch =
+    static_cast<BaseFloat>(RandInt(-max_stretch_int, max_stretch_int) / 1000.0);
+  // Compare against zero directly: unqualified abs() may resolve to the
+  // integer overload, which truncates |stretch| < 1 to 0 and would disable
+  // the resampling branch entirely.
+  if (stretch != 0.0) {
+    const int32 num_zeros = 4;  // sinc zeros that the filter window extends to.
+    // Low-pass cutoff at 95% of the Nyquist frequency.
+    const BaseFloat filter_cutoff_hz = samp_freq * 0.475;
+    Vector<BaseFloat> samp_points_secs(dim);
+    for (int32 i = 0; i < dim; i++)
+      samp_points_secs(i) = static_cast<BaseFloat>(
+        ((1.0 + stretch) * (0.5 * (input_dim - dim) + i)) / samp_freq);
+    ArbitraryResample time_resample(input_dim, samp_freq, filter_cutoff_hz,
+                                    samp_points_secs, num_zeros);
+    time_resample.Resample(input_egs, perturb_egs);
+  } else {
+    // No stretch: copy the centered window of width dim from the input.
+    const int32 offset = (input_dim - dim) / 2;
+    perturb_egs->CopyFromMat(
+        input_egs.Range(0, input_egs.NumRows(), offset, dim));
+  }
+}
+
+
+} // end of namespace kaldi
diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h
@@ -0,0 +1,80 @@
+// feat/signal-distort.h
+
+// Copyright 2016 Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_SIGNAL_DISTORT_H_
+#define KALDI_SIGNAL_DISTORT_H_
+
+#include <cassert>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include "base/kaldi-error.h"
+#include "matrix/matrix-lib.h"
+#include "util/common-utils.h"
+
+#include "feat/resample.h"
+#include "matrix/matrix-functions.h"
+#include "cudamatrix/cu-matrix.h"
+
+namespace kaldi {
+
+// Options for perturbing the signals in egs; registered via Register().
+struct XvectorPerturbOptions {
+  BaseFloat max_shift;         // --max-shift
+  BaseFloat max_time_stretch;  // --max-speed-perturb
+  int32 frame_dim;             // --frame-dim
+  bool negation;               // --negation
+  bool rand_distort;           // --rand_distort
+  std::string noise_egs;       // --noise-egs; empty unless supplied.
+  XvectorPerturbOptions(): max_shift(0.2),
+                           max_time_stretch(0.2),
+                           frame_dim(80),
+                           negation(false),
+                           rand_distort(false) { }
+  void Register(OptionsItf *opts) {
+    opts->Register("max-shift", &max_shift, "Maximum random shift relative "
+                   "to frame length applied to egs.");
+    opts->Register("max-speed-perturb", &max_time_stretch,
+                   "Max speed perturbation applied on egs.");
+    opts->Register("frame-dim", &frame_dim,
+                   "The number of samples in input frame as product of frame_length by samp_freq.");
+    opts->Register("negation", &negation, "If true, the input value is negated randomly.");
+    opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added.");
+    opts->Register("rand_distort", &rand_distort, "If true, the signal is slightly changed "
+                   "using some designed FIR filter with no zeros.");
+  }
+};
+
+// Applies signal perturbations to egs, configured by XvectorPerturbOptions.
+class PerturbXvectorSignal {
+ public:
+  // Stores a copy of opts, so the caller's object need not outlive this one.
+  PerturbXvectorSignal(const XvectorPerturbOptions &opts): opts_(opts) { }
+
+  void ComputeAndApplyRandDistortion(const MatrixBase<BaseFloat> &input_egs,
+                                     Matrix<BaseFloat> *perturb_egs);
+  // Writes a time-stretched version of input_egs into perturb_egs.
+  void TimeStretch(const MatrixBase<BaseFloat> &input_egs,
+                   Matrix<BaseFloat> *perturb_egs);
+ private:
+  XvectorPerturbOptions opts_;
+};
+
+} // end of namespace kaldi
+#endif // KALDI_SIGNAL_DISTORT_H_
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
@@ -6,7 +6,8 @@ include ../kaldi.mk
 LDFLAGS += $(CUDA_LDFLAGS)
 LDLIBS += $(CUDA_LDLIBS)
 
-BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
+BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-fold-egs \
+   nnet3-xvector-signal-perturb-egs nnet3-subset-egs \
    nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \
    nnet3-compute-from-egs nnet3-train nnet3-am-init nnet3-am-train-transitions \
    nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob \
@@ -24,7 +25,7 @@ TESTFILES =
 ADDLIBS = ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \
          ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a  \
          ../transform/kaldi-transform.a ../tree/kaldi-tree.a \
-         ../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \
+         ../thread/kaldi-thread.a ../feat/kaldi-feat.a ../cudamatrix/kaldi-cudamatrix.a \
          ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \
          ../util/kaldi-util.a ../base/kaldi-base.a
 

diff --git a/src/nnet3bin/nnet3-fold-egs.cc b/src/nnet3bin/nnet3-fold-egs.cc
@@ -0,0 +1,89 @@
+// nnet3bin/nnet3-fold-egs.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//                2016  Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-example-utils.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Combine examples for neural network training and supports multiple rspecifiers, in which case it will reads the inputs \n"
+        "round-robin and writes to the output"
+        "\n"
+        "Usage:  nnet3-fold-egs [options] <egs-rspecifier1> [<egs-rspecifier2> ...] <egs-wspecifier>\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-fold-egs ark:1.egs ark:2.egs ark,t:text.egs\n"
+        "or:\n"
+        "nnet3-fold-egs ark:train.egs ark:1.egs ark:2.egs\n";
+
+    ParseOptions po(usage);
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    // All arguments but the last are inputs; the last one is the output.
+    int32 num_inputs = po.NumArgs() - 1;
+    std::vector<SequentialNnetExampleReader*> example_readers(num_inputs);
+    for (int32 i = 0; i < num_inputs; i++)
+      example_readers[i] = new SequentialNnetExampleReader(po.GetArg(i+1));
+
+    std::string examples_wspecifier(po.GetArg(num_inputs+1));
+    NnetExampleWriter example_writer(examples_wspecifier);
+    int64 num_read = 0, num_written = 0;  // totals over all inputs.
+
+    // Interleave the inputs round-robin until the first input is exhausted.
+    while (!example_readers[0]->Done()) {
+      for (int32 reader = 0; reader < num_inputs; reader++) {
+        if (!example_readers[reader]->Done()) {
+          // Write the current example before advancing; calling Next() first
+          // skips each input's first example and reads Key()/Value() past the end.
+          example_writer.Write(example_readers[reader]->Key(),
+                               example_readers[reader]->Value());
+          num_read++;
+          num_written++;
+          example_readers[reader]->Next();
+        }
+      }
+    }
+    for (int32 i = 0; i < num_inputs; i++)
+      delete example_readers[i];
+
+    KALDI_LOG << "Read " << num_read << " neural-network training examples "
+              << "from " << num_inputs << " inputs, wrote "
+              << num_written;
+
+    return (num_written == 0 ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}