diff --git a/src/feat/Makefile b/src/feat/Makefile
index 1b2afc40942..3e2a8981706 100644
--- a/src/feat/Makefile
+++ b/src/feat/Makefile
@@ -9,7 +9,7 @@ TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \
          resample-test online-feature-test sinusoid-detection-test \
          signal-test
 
-OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \
+OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o signal-distort.o \
            feature-spectrogram.o mel-computations.o wave-reader.o \
            pitch-functions.o resample.o online-feature.o sinusoid-detection.o \
            signal.o
diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc
new file mode 100644
index 00000000000..1eff3f5844a
--- /dev/null
+++ b/src/feat/signal-distort.cc
@@ -0,0 +1,99 @@
+// feat/signal-distort.cc
+
+// Copyright 2016 Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#include "feat/signal-distort.h"
+
+namespace kaldi {
+
+// Intended to randomly disturb the input signal using a band-pass filter with
+// no zeros.  The plan is to generate an impulse response |H(w)| from a nonzero
+// random sequence, smooth it with a short moving-average window and, for
+// simplicity, assume a zero-phase response so that H(w) = |H(w)|.
+// TODO: the filter is not implemented yet; for now the input is copied to the
+// output unchanged (the output is resized if necessary).
+void PerturbXvectorSignal::ComputeAndApplyRandDistortion(
+    const MatrixBase<BaseFloat> &input_egs,
+    Matrix<BaseFloat> *perturb_egs) {
+  KALDI_ASSERT(perturb_egs != NULL);
+  *perturb_egs = input_egs;
+}
+
+// Stretches the time axis of every row of input_egs by a random factor without
+// fixing the pitch, so both the speed and the duration of the signal change.
+//
+// With m = input_egs.NumCols() and n = perturb_egs->NumCols(), each output row
+// y is computed from the matching input row x as
+//   y[t] = x((1 + stretch) * ((m - n) / 2 + t))   for t = 0, ..., n - 1,
+// where stretch is drawn with RandInt, in steps of 0.001, from
+// [-max_time_stretch, max_time_stretch].  ArbitraryResample evaluates x at the
+// resulting fractional positions.  If stretch is exactly zero, the central n
+// columns of the input are copied unchanged.
+//
+// *perturb_egs must already have its final size: as many rows as input_egs and
+// n columns with m > n * (1 + max_time_stretch) / (1 - max_time_stretch), so
+// that even the largest stretch only reads positions inside the input.
+void PerturbXvectorSignal::TimeStretch(const MatrixBase<BaseFloat> &input_egs,
+                                       Matrix<BaseFloat> *perturb_egs) {
+  KALDI_ASSERT(perturb_egs != NULL);
+  const int32 num_rows = input_egs.NumRows(),
+      input_dim = input_egs.NumCols(),
+      dim = perturb_egs->NumCols();
+  const BaseFloat max_stretch = opts_.max_time_stretch;
+  KALDI_ASSERT(perturb_egs->NumRows() == num_rows);
+  // For max_stretch >= 1 the bound below would divide by zero or flip sign.
+  KALDI_ASSERT(max_stretch >= 0.0 && max_stretch < 1.0);
+  // We stretch the middle part of the example, so the input has to be longer
+  // than the output:  s * (m + n) / 2 < m  for  s = 1 + max_stretch.
+  KALDI_ASSERT(input_dim > dim * ((1.0 + max_stretch) / (1.0 - max_stretch)));
+
+  // Generate a random stretch value between -max_stretch and max_stretch.
+  const int32 max_stretch_int = static_cast<int32>(max_stretch * 1000);
+  const BaseFloat stretch = static_cast<BaseFloat>(
+      RandInt(-max_stretch_int, max_stretch_int) / 1000.0);
+
+  // Compare against zero directly: an unqualified abs() may resolve to the
+  // integer overload, which truncates every |stretch| < 1 to 0.
+  if (stretch != 0.0) {
+    // NOTE(review): samp_freq appears to act only as a time unit here (sample
+    // points are index / samp_freq, the cutoff is proportional) - confirm.
+    const BaseFloat samp_freq = 2000,
+        filter_cutoff_hz = samp_freq * 0.475;  // lowpass at 95% of the Nyquist.
+    // Number of zeros of the sinc function that the window extends out to.
+    const int32 num_zeros = 4;
+    Vector<BaseFloat> samp_points_secs(dim);
+    for (int32 i = 0; i < dim; i++)
+      samp_points_secs(i) = static_cast<BaseFloat>(
+          ((1.0 + stretch) * (0.5 * (input_dim - dim) + i)) / samp_freq);
+
+    ArbitraryResample time_resample(input_dim, samp_freq,
+                                    filter_cutoff_hz,
+                                    samp_points_secs,
+                                    num_zeros);
+    time_resample.Resample(input_egs, perturb_egs);
+  } else {
+    // No stretch: keep the central window of the input.
+    const int32 offset = (input_dim - dim) / 2;
+    perturb_egs->CopyFromMat(input_egs.Range(0, num_rows, offset, dim));
+  }
+}
+
+
+}  // end of namespace kaldi
diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h
new file mode 100644
index 00000000000..78699e4d176
--- /dev/null
+++ b/src/feat/signal-distort.h
@@ -0,0 +1,80 @@
+// feat/signal-distort.h
+
+// Copyright 2016 Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_SIGNAL_DISTORT_H_
+#define KALDI_SIGNAL_DISTORT_H_
+
+#include <cassert>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include "base/kaldi-error.h"
+#include "matrix/matrix-lib.h"
+#include "util/common-utils.h"
+#include "feat/resample.h"
+#include "matrix/matrix-functions.h"
+#include "cudamatrix/cu-matrix.h"
+
+namespace kaldi {
+
+// Options selecting which perturbations are applied to the signal in egs.
+struct XvectorPerturbOptions {
+  BaseFloat max_shift;         // max random shift, relative to frame_dim.
+  BaseFloat max_time_stretch;  // max relative speed change; 0 disables it.
+  int32 frame_dim;             // number of samples in one input frame.
+  bool negation;               // if true, the signal is randomly negated.
+  bool rand_distort;           // if true, a random FIR distortion is applied.
+  std::string noise_egs;       // additive-noise egs; empty means no noise.
+  XvectorPerturbOptions(): max_shift(0.2), max_time_stretch(0.2),
+                           frame_dim(80), negation(false),
+                           rand_distort(false) { }
+  void Register(OptionsItf *opts) {
+    opts->Register("max-shift", &max_shift, "Maximum random shift relative "
+                   "to frame length applied to egs.");
+    opts->Register("max-speed-perturb", &max_time_stretch,
+                   "Max speed perturbation applied on egs.");
+    opts->Register("frame-dim", &frame_dim,
+                   "The number of samples in input frame as product of "
+                   "frame_length by samp_freq.");
+    opts->Register("negation", &negation, "If true, the input value is negated randomly.");
+    opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added.");
+    opts->Register("rand-distort", &rand_distort,
+                   "If true, the signal is slightly changed "
+                   "using some designed FIR filter with no zeros.");
+  }
+};
+
+// Applies the signal perturbations configured by XvectorPerturbOptions.
+class PerturbXvectorSignal {
+ public:
+  explicit PerturbXvectorSignal(const XvectorPerturbOptions &opts):
+      opts_(opts) { }
+  // Meant to randomly filter input_egs into perturb_egs; see the .cc file.
+  void ComputeAndApplyRandDistortion(const MatrixBase<BaseFloat> &input_egs,
+                                     Matrix<BaseFloat> *perturb_egs);
+  // Randomly stretches the time axis of input_egs into pre-sized perturb_egs.
+  void TimeStretch(const MatrixBase<BaseFloat> &input_egs,
+                   Matrix<BaseFloat> *perturb_egs);
+ private:
+  XvectorPerturbOptions opts_;
+};
+
+}  // end of namespace kaldi
+#endif  // KALDI_SIGNAL_DISTORT_H_
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 0a57c17fad0..9442e0d030f 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -6,7 +6,8 @@ include ../kaldi.mk
 LDFLAGS += $(CUDA_LDFLAGS)
 LDLIBS +=
$(CUDA_LDLIBS)
 
-BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
+BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-fold-egs \
+   nnet3-xvector-signal-perturb-egs nnet3-subset-egs \
    nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \
    nnet3-compute-from-egs nnet3-train nnet3-am-init nnet3-am-train-transitions \
    nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob \
@@ -24,7 +25,7 @@ TESTFILES =
 ADDLIBS = ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \
          ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
          ../transform/kaldi-transform.a ../tree/kaldi-tree.a \
-         ../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \
+         ../thread/kaldi-thread.a ../feat/kaldi-feat.a ../cudamatrix/kaldi-cudamatrix.a \
          ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \
          ../util/kaldi-util.a ../base/kaldi-base.a
 
diff --git a/src/nnet3bin/nnet3-fold-egs.cc b/src/nnet3bin/nnet3-fold-egs.cc
new file mode 100644
index 00000000000..4237efdafaf
--- /dev/null
+++ b/src/nnet3bin/nnet3-fold-egs.cc
@@ -0,0 +1,97 @@
+// nnet3bin/nnet3-fold-egs.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//                2016  Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-example-utils.h"
+
+// Interleaves the examples of several egs archives into one output archive.
+// The inputs are visited round-robin, one example per input per pass, until
+// every input is exhausted.  Returns 0 on success, 1 if nothing was written
+// and -1 if an exception was thrown.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Combine examples for neural network training.  Supports multiple\n"
+        "rspecifiers, in which case it reads the inputs round-robin and\n"
+        "writes them to the output.\n"
+        "\n"
+        "Usage:  nnet3-fold-egs [options] <egs-rspecifier1> "
+        "[<egs-rspecifier2> ...] <egs-wspecifier>\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-fold-egs ark:1.egs ark:2.egs ark,t:text.egs\n"
+        "or:\n"
+        "nnet3-fold-egs ark:train.egs ark:1.egs ark:2.egs\n";
+
+    ParseOptions po(usage);
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    // All but the last argument are inputs; the last one is the output.
+    const int32 num_inputs = po.NumArgs() - 1;
+    std::vector<SequentialNnetExampleReader*> example_readers(num_inputs);
+    for (int32 i = 0; i < num_inputs; i++)
+      example_readers[i] = new SequentialNnetExampleReader(po.GetArg(i + 1));
+
+    NnetExampleWriter example_writer(po.GetArg(num_inputs + 1));
+    int64 num_read = 0, num_written = 0;
+
+    // Each pass takes at most one example from every input that still has
+    // data.  The current example is written *before* the reader is advanced,
+    // so the first example of each input is not skipped and Key()/Value() are
+    // never called on a reader that is already Done().
+    bool any_left = true;
+    while (any_left) {
+      any_left = false;
+      for (int32 i = 0; i < num_inputs; i++) {
+        SequentialNnetExampleReader *reader = example_readers[i];
+        if (reader->Done()) continue;
+        example_writer.Write(reader->Key(), reader->Value());
+        num_read++;
+        num_written++;
+        reader->Next();
+        any_left = true;
+      }
+    }
+    for (int32 i = 0; i < num_inputs; i++)
+      delete example_readers[i];
+
+    KALDI_LOG << "Read " << num_read << " neural-network training examples "
+              << "from " << num_inputs << " inputs, wrote "
+              << num_written;
+
+    return (num_written == 0 ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/nnet3bin/nnet3-xvector-signal-perturb-egs.cc b/src/nnet3bin/nnet3-xvector-signal-perturb-egs.cc
new file mode 100644
index 00000000000..c2f4078ea69
--- /dev/null
+++ b/src/nnet3bin/nnet3-xvector-signal-perturb-egs.cc
@@ -0,0 +1,187 @@
+// nnet3bin/nnet3-xvector-signal-perturb-egs.cc
+
+// Copyright 2016 Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/signal-distort.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-example-utils.h"
+namespace kaldi {
+namespace nnet3 {
+
+// Applies the perturbations enabled in opts to input_egs and stores the
+// result in *perturb_egs, which is resized as needed.  The steps are, in order:
+//  1. random shift: drops opts.frame_dim columns, starting the kept window at
+//     a random offset in [0, max_shift * frame_dim];
+//  2. random distortion (ComputeAndApplyRandDistortion), if rand_distort;
+//  3. additive noise: a random sub-block of *noise_egs, if noise_egs != NULL;
+//  4. random time stretch (TimeStretch); the output keeps the largest number
+//     of columns n with m > n * (1 + max) / (1 - max), m being the number of
+//     columns before stretching, which is what TimeStretch requires;
+//  5. negation: with probability 1/2 the whole signal is multiplied by -1.
+// The number of rows is never changed.
+void ApplyPerturbation(const XvectorPerturbOptions &opts,
+                       const Matrix<BaseFloat> &input_egs,
+                       const Matrix<BaseFloat> *noise_egs,
+                       Matrix<BaseFloat> *perturb_egs) {
+  KALDI_ASSERT(perturb_egs != NULL);
+  PerturbXvectorSignal perturb_xvector(opts);
+
+  // 'signal' holds the running result; each enabled step rewrites it.
+  Matrix<BaseFloat> signal;
+  if (opts.max_shift != 0.0) {
+    // The shifted window must fit inside the input for every possible shift.
+    KALDI_ASSERT(opts.max_shift > 0.0 && opts.max_shift <= 1.0);
+    const int32 eg_dim = input_egs.NumCols() - opts.frame_dim;
+    KALDI_ASSERT(opts.frame_dim > 0 && eg_dim > 0);
+    const int32 max_shift_int =
+        static_cast<int32>(opts.max_shift * opts.frame_dim);
+    const int32 shift = RandInt(0, max_shift_int);
+    // Assignment (unlike CopyFromMat) resizes 'signal' to the narrower window.
+    signal = input_egs.Range(0, input_egs.NumRows(), shift, eg_dim);
+  } else {
+    signal = input_egs;
+  }
+
+  if (opts.rand_distort) {
+    // Randomly generate a zero-phase FIR filter with no zeros.
+    // In future, we can select a truncated part of a room impulse response
+    // and convolve it with the signal.
+    Matrix<BaseFloat> distorted(signal);
+    perturb_xvector.ComputeAndApplyRandDistortion(signal, &distorted);
+    signal.Swap(&distorted);
+  }
+
+  if (noise_egs != NULL) {
+    // Add a random block of the noise, of the same size as the signal.  The
+    // noise egs should be shuffled before they are passed to this binary.
+    KALDI_ASSERT(noise_egs->NumRows() >= signal.NumRows() &&
+                 noise_egs->NumCols() >= signal.NumCols());
+    const int32 start_row =
+        RandInt(0, noise_egs->NumRows() - signal.NumRows());
+    const int32 start_col =
+        RandInt(0, noise_egs->NumCols() - signal.NumCols());
+    signal.AddMat(1.0, noise_egs->Range(start_row, signal.NumRows(),
+                                        start_col, signal.NumCols()));
+  }
+
+  if (opts.max_time_stretch != 0.0) {
+    KALDI_ASSERT(opts.max_time_stretch > 0.0 && opts.max_time_stretch < 1.0);
+    // Same expression as the size check inside TimeStretch.
+    const double ratio =
+        (1.0 + opts.max_time_stretch) / (1.0 - opts.max_time_stretch);
+    int32 out_dim = static_cast<int32>(signal.NumCols() / ratio);
+    while (out_dim > 0 && !(signal.NumCols() > out_dim * ratio))
+      out_dim--;
+    KALDI_ASSERT(out_dim > 0);
+    Matrix<BaseFloat> stretched(signal.NumRows(), out_dim);
+    perturb_xvector.TimeStretch(signal, &stretched);
+    signal.Swap(&stretched);
+  }
+
+  // Flip the polarity of the whole signal for about half of the examples.
+  if (opts.negation && RandInt(0, 1) == 1)
+    signal.Scale(-1.0);
+
+  perturb_egs->Swap(&signal);
+}
+
+}  // end of namespace nnet3
+}  // end of namespace kaldi
+
+// Reads egs, perturbs the features of the first NnetIo of each example with
+// ApplyPerturbation and writes the result under the same key.  Returns 0 on
+// success, 1 if nothing was written and -1 if an exception was thrown.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Corrupts the examples supplied via input pipe with different type of distortions\n"
+        "such as additive noise, negation, random time shifts or random distortion.\n"
+        "Usage: nnet3-xvector-signal-perturb-egs [options...] <egs-rspecifier> <egs-wspecifier>\n"
+        "e.g.\n"
+        "nnet3-xvector-signal-perturb-egs --noise-egs=noise.egs\n"
+        "--max-shift=0.2 --max-speed-perturb=0.1 --negation=true\n"
+        "ark:input.egs ark:distorted.egs\n";
+    ParseOptions po(usage);
+
+    XvectorPerturbOptions perturb_opts;
+    perturb_opts.Register(&po);
+
+    po.Read(argc, argv);
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+
+    SequentialNnetExampleReader example_reader(examples_rspecifier);
+    NnetExampleWriter example_writer(examples_wspecifier);
+
+    int64 num_read = 0, num_written = 0;
+
+    // The additive noise is taken from the first example of --noise-egs.  It
+    // is stored by value; noise_ptr stays NULL when no noise was requested.
+    Matrix<BaseFloat> noise_mat;
+    const Matrix<BaseFloat> *noise_ptr = NULL;
+    if (!perturb_opts.noise_egs.empty()) {
+      SequentialNnetExampleReader noise_reader(perturb_opts.noise_egs);
+      if (noise_reader.Done())
+        KALDI_ERR << "No examples found in " << perturb_opts.noise_egs;
+      const NnetExample &noise_eg = noise_reader.Value();
+      KALDI_ASSERT(!noise_eg.io.empty());
+      // GetMatrix sizes its output; CopyToMat needs it sized beforehand.
+      noise_eg.io[0].features.GetMatrix(&noise_mat);
+      noise_ptr = &noise_mat;
+    }
+
+    for (; !example_reader.Done(); example_reader.Next(), num_read++) {
+      const std::string key = example_reader.Key();
+      const NnetExample &input_eg = example_reader.Value();
+      KALDI_ASSERT(!input_eg.io.empty());
+      Matrix<BaseFloat> input_eg_mat, perturb_eg_mat;
+      input_eg.io[0].features.GetMatrix(&input_eg_mat);
+      ApplyPerturbation(perturb_opts, input_eg_mat, noise_ptr, &perturb_eg_mat);
+
+      // Build the output on the stack (nothing to leak).  Only the features
+      // of the first NnetIo are replaced; its name and indexes, and any other
+      // NnetIo of the example, are carried over unchanged.
+      NnetExample perturb_eg;
+      perturb_eg.io.resize(input_eg.io.size());
+      perturb_eg.io[0].name = input_eg.io[0].name;
+      perturb_eg.io[0].indexes = input_eg.io[0].indexes;
+      perturb_eg.io[0].features.SwapFullMatrix(&perturb_eg_mat);
+      for (size_t i = 1; i < input_eg.io.size(); i++)
+        perturb_eg.io[i] = input_eg.io[i];
+      example_writer.Write(key, perturb_eg);
+      num_written++;
+    }
+    KALDI_LOG << "Read " << num_read << " examples, wrote " << num_written;
+    return (num_written == 0 ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}