diff --git a/src/feat/feature-common-inl.h b/src/feat/feature-common-inl.h index b9c5794a629..26127a4dc4d 100644 --- a/src/feat/feature-common-inl.h +++ b/src/feat/feature-common-inl.h @@ -33,26 +33,26 @@ void OfflineFeatureTpl::ComputeFeatures( Matrix *output) { KALDI_ASSERT(output != NULL); BaseFloat new_sample_freq = computer_.GetFrameOptions().samp_freq; - if (sample_freq == new_sample_freq) + if (sample_freq == new_sample_freq) { Compute(wave, vtln_warp, output); - else { - if (new_sample_freq < sample_freq) { - if (! computer_.GetFrameOptions().allow_downsample) + } else { + if (new_sample_freq < sample_freq && + ! computer_.GetFrameOptions().allow_downsample) KALDI_ERR << "Waveform and config sample Frequency mismatch: " << sample_freq << " .vs " << new_sample_freq - << " ( use --allow_downsample=true option to allow " + << " (use --allow-downsample=true to allow " << " downsampling the waveform)."; - - // Downsample the waveform. - Vector downsampled_wave(wave); - DownsampleWaveForm(sample_freq, wave, - new_sample_freq, &downsampled_wave); - Compute(downsampled_wave, vtln_warp, output); - } else - KALDI_ERR << "New sample Frequency " << new_sample_freq - << " is larger than waveform original sampling frequency " - << sample_freq; - + else if (new_sample_freq > sample_freq && + ! computer_.GetFrameOptions().allow_upsample) + KALDI_ERR << "Waveform and config sample Frequency mismatch: " + << sample_freq << " .vs " << new_sample_freq + << " (use --allow-upsample=true option to allow " + << " upsampling the waveform)."; + // Resample the waveform. + Vector resampled_wave(wave); + ResampleWaveform(sample_freq, wave, + new_sample_freq, &resampled_wave); + Compute(resampled_wave, vtln_warp, output); } } diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h index c249414259c..e911055368f 100644 --- a/src/feat/feature-window.h +++ b/src/feat/feature-window.h @@ -40,14 +40,15 @@ struct FrameExtractionOptions { BaseFloat preemph_coeff; // Preemphasis coefficient. bool remove_dc_offset; // Subtract mean of wave before FFT. std::string window_type; // e.g. Hamming window - bool round_to_power_of_two; - BaseFloat blackman_coeff; - bool snip_edges; - bool allow_downsample; // May be "hamming", "rectangular", "povey", "hanning", "blackman" // "povey" is a window I made to be similar to Hamming but to go to zero at the // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) // I just don't think the Hamming window makes sense as a windowing function. + bool round_to_power_of_two; + BaseFloat blackman_coeff; + bool snip_edges; + bool allow_downsample; + bool allow_upsample; FrameExtractionOptions(): samp_freq(16000), frame_shift_ms(10.0), @@ -59,7 +60,8 @@ struct FrameExtractionOptions { round_to_power_of_two(true), blackman_coeff(0.42), snip_edges(true), - allow_downsample(false) { } + allow_downsample(false), + allow_upsample(false) { } void Register(OptionsItf *opts) { opts->Register("sample-frequency", &samp_freq, @@ -90,6 +92,9 @@ struct FrameExtractionOptions { opts->Register("allow-downsample", &allow_downsample, "If true, allow the input waveform to have a higher frequency than " "the specified --sample-frequency (and we'll downsample)."); + opts->Register("allow-upsample", &allow_upsample, + "If true, allow the input waveform to have a lower frequency than " + "the specified --sample-frequency (and we'll upsample)."); } int32 WindowShift() const { return static_cast(samp_freq * 0.001 * frame_shift_ms); diff --git a/src/feat/resample.cc b/src/feat/resample.cc index 518685d85c8..11f4c62bf1c 100644 --- a/src/feat/resample.cc +++ b/src/feat/resample.cc @@ -302,7 +302,7 @@ void ArbitraryResample::Resample(const VectorBase &input, VectorBase *output) const { KALDI_ASSERT(input.Dim() == num_samples_in_ && output->Dim() == weights_.size()); - + int32 output_dim = output->Dim(); for (int32 i = 0; i < output_dim; i++) { SubVector input_part(input, first_index_[i], weights_[i].Dim()); @@ -365,13 +365,13 @@ BaseFloat ArbitraryResample::FilterFunc(BaseFloat t) const { return filter * window; } -void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, - BaseFloat new_freq, Vector *new_wave) { - KALDI_ASSERT(new_freq < orig_freq); - BaseFloat lowpass_cutoff = 0.99 * 0.5 * new_freq; +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + BaseFloat min_freq = std::min(orig_freq, new_freq); + BaseFloat lowpass_cutoff = 0.99 * 0.5 * min_freq; int32 lowpass_filter_width = 6; - LinearResample signal_downsampler(orig_freq, new_freq, - lowpass_cutoff, lowpass_filter_width); - signal_downsampler.Resample(wave, true, new_wave); + LinearResample resampler(orig_freq, new_freq, + lowpass_cutoff, lowpass_filter_width); + resampler.Resample(wave, true, new_wave); } } // namespace kaldi diff --git a/src/feat/resample.h b/src/feat/resample.h index cc3e5064863..ecac2ba7566 100644 --- a/src/feat/resample.h +++ b/src/feat/resample.h @@ -40,7 +40,7 @@ namespace kaldi { /** \file[resample.h] - + This header contains declarations of classes for resampling signals. The normal cases of resampling a signal are upsampling and downsampling (increasing and decreasing the sample rate of a signal, respectively), @@ -51,7 +51,7 @@ namespace kaldi { The input signal is always evenly spaced, say sampled with frequency S, and we assume the original signal was band-limited to S/2 or lower. The n'th input sample x_n (with n = 0, 1, ...) is interpreted as the original - signal's value at time n/S. + signal's value at time n/S. For resampling, it is convenient to view the input signal as a continuous function x(t) of t, where each sample x_n becomes a delta function @@ -73,14 +73,14 @@ namespace kaldi { means we window the sinc function out to its first zero on the left and right, w = 2 means the second zero, and so on; we normally choose w to be at least two. We call this num_zeros, not w, in the code. - + Convolving the signal x(t) with this windowed filter h(t) = f(t)g(t) and evaluating the resulting signal s(t) at an arbitrary time t is easy: we have \f[ s(t) = 1/S \sum_n x_n h(t - n/S) \f]. (note: the sign of t - n/S might be wrong, but it doesn't matter as the filter and window are symmetric). This is true for arbitrary values of t. What the class ArbitraryResample does - is to allow you to evaluate the signal for specified values of t. + is to allow you to evaluate the signal for specified values of t. */ @@ -90,7 +90,7 @@ namespace kaldi { don't have to be linearly spaced. The low-pass filter cutoff "filter_cutoff_hz" should be less than half the sample rate; "num_zeros" should probably be at least two preferably more; higher numbers give - sharper filters but will be less efficient. + sharper filters but will be less efficient. */ class ArbitraryResample { public: @@ -115,7 +115,7 @@ class ArbitraryResample { /// This version of the Resample function processes just /// one vector. void Resample(const VectorBase &input, - VectorBase *output) const; + VectorBase *output) const; private: void SetIndexes(const Vector &sample_points); @@ -248,20 +248,35 @@ class LinearResample { ///< previously seen input signal. }; -/// Downsample a waveform. This is a convenience wrapper for the -/// class 'LinearResample'. -/// The low-pass filter cutoff used in 'LinearResample' is 0.99 of half of the -/// new_freq and num_zeros is 6. -/// The downsampling results is also checked wit sox resampling toolkit. -/// Sox design is inspired by Laurent De Soras' paper, -/// https://ccrma.stanford.edu/~jos/resample/Implementation.html -/// It designs low pass filter using pass-band, stop-band, Nyquist freq -/// and stop-band attenuation. -/// e.g. The mainlob for Hanning window is 4pi/M, where the main-lobe width is -/// equal to (pass-band-freq - stop-band-freq). -/// Also the cutoff frequency is equal to (pass-band-freq - stop-band-freq). -void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, - BaseFloat new_freq, Vector *new_wave); +/** + Downsample or upsample a waveform. This is a convenience wrapper for the + class 'LinearResample'. + The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist, + where the Nyquist is half of the minimum of (orig_freq, new_freq). The + resampling is done with a symmetric FIR filter with N_z (number of zeros) + as 6. + + We compared the downsampling results with those from the sox resampling + toolkit. + Sox's design is inspired by Laurent De Soras' paper, + https://ccrma.stanford.edu/~jos/resample/Implementation.html + + Note: we expect that while orig_freq and new_freq are of type BaseFloat, they + are actually required to have exact integer values (like 16000 or 8000) with + a ratio between them that can be expressed as a rational number with + reasonably small integer factors. +*/ +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave); + + +/// This function is deprecated. It is provided for backward compatibility, to avoid +/// breaking older code. +inline void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + ResampleWaveform(orig_freq, wave, new_freq, new_wave); +} + /// @} End of "addtogroup feat" } // namespace kaldi