Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/feat/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \
resample-test online-feature-test sinusoid-detection-test \
signal-test

OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \
OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o signal-distort.o \
feature-spectrogram.o mel-computations.o wave-reader.o \
pitch-functions.o resample.o online-feature.o sinusoid-detection.o \
signal.o
Expand Down
85 changes: 85 additions & 0 deletions src/feat/signal-distort.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// feat/signal-distort.cc

// Copyright 2016 Pegah Ghahremani

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.



#include "feat/signal-distort.h"

namespace kaldi {

// randomly disturb the input signal using a band-pass filter with no zeros.
void PerturbXvectorSignal::ComputeAndApplyRandDistortion(const MatrixBase<BaseFloat> &input_egs,
Matrix<BaseFloat> *perturb_egs) {
// Generate impluse response |H(w)| using nonzero random sequence and smooth them
// using moving-average window with small window size.
// For simplicity, assume zero-phase response and H(w) = |H(w)|.
// num_fft_samp = 512
int32 num_fft_samp = 512;
Vector<BaseFloat> im_response(num_fft_samp);

}

// Stretches the time axis for input egs without fixing the pitch value.
// It changes the speed and duration of the input signal without fixing pitch.
// The output y w.r.t input x is going to be y(t - offset) = x(stretch * (t - offset)),
// where offset is the time index which the signal is stretches along that and the input
// and output are the same for t = offset.
// ArbitraryResample class is used to generate resampled output for different time-stretches.
// The output y is the stretched form of the input, x, and stretch value is randomely generated
// between [1 - max_stretch, 1 + max_stretch].
// y[(m - n + 2 * t)/2] = x[(1 + stretch) * (m - n + 2 * t)/2] for t = 0,..,n
void PerturbXvectorSignal::TimeStretch(const MatrixBase<BaseFloat> &input_egs,
Matrix<BaseFloat> *perturb_egs) {
Matrix<BaseFloat> in_mat(input_egs),
out_mat(perturb_egs->NumRows(), perturb_egs->NumCols());
int32 input_dim = input_egs.NumCols(),
dim = perturb_egs->NumCols();
Vector<BaseFloat> samp_points_secs(dim);
BaseFloat samp_freq = 2000,
max_stretch = opts_.max_time_stretch;
// we stretch the middle part of the example and the input should be expanded
// by extra frame to be larger than the output length => s * (m+n)/2 < m.
// y((m - n + 2 * t)/2) = x(s * (m - n + 2 * t)/2) for t = 0,..,n
// where m = dim(x) and n = dim(y).
KALDI_ASSERT(input_dim > dim * ((1.0 + max_stretch) / (1.0 - max_stretch)));
// Generate random stretch value between -max_stretch, max_stretch.
int32 max_stretch_int = static_cast<int32>(max_stretch * 1000);
BaseFloat stretch = static_cast<BaseFloat>(RandInt(-max_stretch_int, max_stretch_int) / 1000.0);
if (abs(stretch) > 0) {
int32 num_zeros = 4; // Number of zeros of the sinc function that the window extends out to.
BaseFloat filter_cutoff_hz = samp_freq * 0.475; // lowpass frequency that's lower than 95% of
// the Nyquist.
for (int32 i = 0; i < dim; i++)
samp_points_secs(i) = static_cast<BaseFloat>(((1.0 + stretch) *
(0.5 * (input_dim - dim) + i))/ samp_freq);

ArbitraryResample time_resample(input_dim, samp_freq,
filter_cutoff_hz,
samp_points_secs,
num_zeros);
time_resample.Resample(in_mat, &out_mat);
} else {
int32 offset = static_cast<BaseFloat>(0.5 * (input_egs.NumCols() - perturb_egs->NumCols()));
out_mat.CopyFromMat(input_egs.Range(0, input_egs.NumRows(), offset, perturb_egs->NumCols()));
}
perturb_egs->CopyFromMat(out_mat);
}


} // end of namespace kaldi
80 changes: 80 additions & 0 deletions src/feat/signal-distort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// feat/signal-distort.h

// Copyright 2016 Pegah Ghahremani

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_SIGNAL_DISTORT_H_
#define KALDI_SIGNAL_DISTORT_H_

#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>

#include "base/kaldi-error.h"
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"

#include "feat/resample.h"
#include "matrix/matrix-functions.h"
#include "cudamatrix/cu-matrix.h"

namespace kaldi {

// options class for distorting signals in egs
// Options class for distorting signals in egs.
struct XvectorPerturbOptions {
  // Maximum random shift, relative to the frame length, applied to egs.
  BaseFloat max_shift;
  // Maximum speed perturbation (time stretch) applied to egs; registered on
  // the command line as --max-speed-perturb.
  BaseFloat max_time_stretch;
  // Number of samples in an input frame (frame_length times samp_freq).
  int32 frame_dim;
  // If true, the input value is negated randomly.
  bool negation;
  // If true, the signal is slightly changed by a designed FIR filter with no
  // zeros.
  bool rand_distort;
  // If supplied, additive noise is added; empty by default.
  std::string noise_egs;

  XvectorPerturbOptions(): max_shift(0.2),
                           max_time_stretch(0.2),
                           frame_dim(80),
                           negation(false),
                           rand_distort(false) { }

  // Registers all options with 'opts'.  Note that adjacent string literals
  // are concatenated verbatim, so each continued help string must carry its
  // own separating space.
  void Register(OptionsItf *opts) {
    opts->Register("max-shift", &max_shift, "Maximum random shift relative "
                   "to frame length applied to egs.");
    opts->Register("max-speed-perturb", &max_time_stretch,
                   "Max speed perturbation applied on egs.");
    opts->Register("frame-dim", &frame_dim,
                   "The number of samples in input frame as product of frame_length by samp_freq.");
    opts->Register("negation", &negation, "If true, the input value is negated randomly.");
    opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added.");
    // Option names use '-' rather than '_', as requested in code review and
    // consistent with the other options above.
    opts->Register("rand-distort", &rand_distort, "If true, the signal is slightly changes "
                   "using some designed FIR filter with no zeros.");
  }
};

class PerturbXvectorSignal {
public:
PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { };

void ComputeAndApplyRandDistortion(const MatrixBase<BaseFloat> &input_egs,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be cleaner to have this operate on vectors rather than matrices, and have separate code you call before and after that converts between the matrix and vector representations (and checks that the Indexes are in the correct order, etc.)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added two separate functions for vectorization and unvectorization. However we need to apply filters separately on each rows, so why do we really need vectorization???

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the egs are dumped, it may not be the case that each time-sequence has
just one 't' value. We may dump them while breaking up the signal into
short pieces of e.g. 10ms or 5ms or even less. So you need to convert
these back into a continuous time-sequence before applying any filter.

Dan

On Thu, Feb 18, 2016 at 12:51 PM, pegahgh notifications@github.com wrote:

In src/feat/signal-distort.h
#7 (comment):

  • opts->Register("max-speed-perturb", &max_time_stretch,
  •               "Max speed perturbation applied on egs.");
    
  • opts->Register("frame-dim", &frame_dim,
  •               "The numebr of samples in input frame as product of frame_length by samp_freq.");
    
  • opts->Register("negation", &negation, "If true, the input value is negated randomly.");
  • opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added.");
  • opts->Register("rand_distort", &rand_distort, "If true, the signal is slightly changes"
  •               "using some designed FIR filter with no zeros.");
    
  • }
    +};

+class PerturbXvectorSignal {

  • public:
  • PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { };
  • void ComputeAndApplyRandDistortion(const MatrixBase &input_egs,

I added two separate functions for vectorization and unvectorization.
However we need to apply filters separately on each rows, so why do we
really need vectorization???


Reply to this email directly or view it on GitHub
https://github.com/danpovey/kaldi/pull/7/files#r53352458.

Matrix<BaseFloat> *perturb_egs);

void TimeStretch(const MatrixBase<BaseFloat> &input_egs,
Matrix<BaseFloat> *perturb_egs);

private:
XvectorPerturbOptions opts_;
};

} // end of namespace kaldi
#endif // KALDI_SIGNAL_DISTORT_H_
5 changes: 3 additions & 2 deletions src/nnet3bin/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ include ../kaldi.mk
LDFLAGS += $(CUDA_LDFLAGS)
LDLIBS += $(CUDA_LDLIBS)

BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-fold-egs \
nnet3-xvector-signal-perturb-egs nnet3-subset-egs \
nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \
nnet3-compute-from-egs nnet3-train nnet3-am-init nnet3-am-train-transitions \
nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob \
Expand All @@ -24,7 +25,7 @@ TESTFILES =
ADDLIBS = ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \
../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
../transform/kaldi-transform.a ../tree/kaldi-tree.a \
../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \
../thread/kaldi-thread.a ../feat/kaldi-feat.a ../cudamatrix/kaldi-cudamatrix.a \
../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \
../util/kaldi-util.a ../base/kaldi-base.a

Expand Down
89 changes: 89 additions & 0 deletions src/nnet3bin/nnet3-fold-egs.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// nnet3bin/nnet3-fold-egs.cc

// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey)
// 2016 Pegah Ghahremani

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet3/nnet-example.h"
#include "nnet3/nnet-example-utils.h"

int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet3;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;

const char *usage =
"Combine examples for neural network training and supports multiple rspecifiers, in which case it will reads the inputs \n"
"round-robin and writes to the output"
"\n"
"Usage: nnet3-fold-egs [options] <egs-rspecifier1> [<egs-rspecifier2> ...] <egs-wspecifier>\n"
"\n"
"e.g.\n"
"nnet3-fold-egs ark:1.egs ark:2.egs ark,t:text.egs\n"
"or:\n"
"nnet3-fold-egs ark:train.egs ark:1.egs ark:2.egs\n";

ParseOptions po(usage);
po.Read(argc, argv);

if (po.NumArgs() < 2) {
po.PrintUsage();
exit(1);
}

//
int32 num_inputs = po.NumArgs() - 1;
std::vector<SequentialNnetExampleReader*> example_readers(num_inputs);
for (int32 i = 0; i < num_inputs; i++)
example_readers[i] = new SequentialNnetExampleReader(po.GetArg(i+1));

std::string examples_wspecifier(po.GetArg(num_inputs+1));
NnetExampleWriter example_writer(examples_wspecifier);
int64 num_written = 0;
std::vector<int64> num_read(num_inputs);

//for (; !example_readers[0]->Done(); tot_num_read++) {
while (!example_readers[0]->Done()) {
for (int32 reader = 0; reader < num_inputs; reader++) {
if (!example_readers[reader]->Done()) {
example_readers[reader]->Next();
num_read[reader]++;
std::string key = example_readers[reader]->Key();
const NnetExample &eg = example_readers[reader]->Value();
example_writer.Write(key, eg);
num_written++;
}
}
}
for (int32 i = 0; i < num_inputs; i++)
delete example_readers[i];

KALDI_LOG << "Read " << num_read[0] << "neural-network training examples "
<< "from " << num_inputs << " inputs, wrote "
<< num_written;

return (num_written == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}
Loading