PaddlePaddle · zh794390558 · Jun 2, 2022 · Jun 1, 2022 · Jun 1, 2022 · Jun 1, 2022
diff --git a/speechx/examples/custom_asr/run.sh b/speechx/examples/custom_asr/run.sh
@@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   recognizer_test_main \
     --wav_rspecifier=scp:$wav_scp \
     --cmvn_file=$cmvn \
-    --streaming_chunk=30 \
     --use_fbank=true \
     --model_path=$model_dir/avg_10.jit.pdmodel \
     --param_path=$model_dir/avg_10.jit.pdiparams \

diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
@@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
         --cmvn_file=$cmvn \
-        --streaming_chunk=0.36
     echo "feature make have finished!!!"
 fi
 
@@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_1.jit.pdmodel \
-        --streaming_chunk=30 \
         --param_path=$model_dir/avg_1.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \

diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_5.jit.pdmodel \
-        --streaming_chunk=30 \
         --use_fbank=true \
         --param_path=$model_dir/avg_5.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \

diff --git a/speechx/examples/ds2_ol/websocket/websocket_client.sh b/speechx/examples/ds2_ol/websocket/websocket_client.sh
@@ -32,4 +32,4 @@ export GLOG_logtostderr=1
 
 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
+    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5
diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@@ -62,7 +62,6 @@ fi
 websocket_server_main \
     --cmvn_file=$cmvn \
     --model_path=$model_dir/avg_1.jit.pdmodel \
-    --streaming_chunk=0.1 \
     --param_path=$model_dir/avg_1.jit.pdiparams \
     --word_symbol_table=$wfst/words.txt \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \

diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
@@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");
 DEFINE_string(cmvn_file, "", "read cmvn");
-DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
 // feature sliding window
 DEFINE_int32(receptive_field_length,
              7,
@@ -62,7 +61,6 @@ namespace ppspeech {
 FeaturePipelineOptions InitFeaturePipelineOptions() {
     FeaturePipelineOptions opts;
     opts.cmvn_file = FLAGS_cmvn_file;
-    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
     kaldi::FrameExtractionOptions frame_opts;
     frame_opts.dither = 0.0;
     frame_opts.frame_shift_ms = 10;
@@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
         opts.to_float32 = false;
         frame_opts.window_type = "povey";
         frame_opts.frame_length_ms = 25;
-        opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-        opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
+        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+        opts.fbank_opts.frame_opts = frame_opts;
     } else {
         opts.to_float32 = true;
         frame_opts.remove_dc_offset = false;

diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc
@@ -19,6 +19,7 @@
 
 DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
 DEFINE_string(result_wspecifier, "", "test result wspecifier");
+DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(sample_rate, 16000, "sample rate");
 
 int main(int argc, char* argv[]) {
@@ -96,4 +97,4 @@ int main(int argc, char* argv[]) {
     KALDI_LOG << " cost:" << elapsed << " s";
     KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
     KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
-}
+}
diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h
@@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {
 
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
 
-    // the audio dim is 1, one sample
-    virtual size_t Dim() const { return 1; }
+    // A per-sample dim of 1 is of no use to callers, so Dim() returns
+    // size_ (the cached samples) instead.
+    virtual size_t Dim() const { return size_; }
 
     virtual void SetFinished() {
         std::lock_guard<std::mutex> lock(mutex_);

diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
     std::unique_ptr<ppspeech::FrontendInterface> data_source(
         new ppspeech::AudioCache(3600 * 1600, false));
 
-    ppspeech::FbankOptions opt;
-    opt.fbank_opts.frame_opts.frame_length_ms = 25;
-    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
-    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-    opt.fbank_opts.frame_opts.dither = 0.0;
+    kaldi::FbankOptions opt;
+    opt.frame_opts.frame_length_ms = 25;
+    opt.frame_opts.frame_shift_ms = 10;
+    opt.mel_opts.num_bins = FLAGS_num_bins;
+    opt.frame_opts.dither = 0.0;
 
     std::unique_ptr<ppspeech::FrontendInterface> fbank(
         new ppspeech::Fbank(opt, std::move(data_source)));

diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
     opt.frame_opts.dither = 0.0;
     opt.frame_opts.remove_dc_offset = false;
     opt.frame_opts.window_type = "hanning";

diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 #include "frontend/audio/fbank.h"
 #include "kaldi/base/kaldi-math.h"
 #include "kaldi/feat/feature-common.h"
@@ -29,95 +28,51 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
-// todo refactor later:(SmileGoat)
-
-Fbank::Fbank(const FbankOptions& opts,
-             std::unique_ptr<FrontendInterface> base_extractor)
+// Keeps a copy of the options and builds the wrapped kaldi::FbankComputer
+// from the same options.
+FbankComputer::FbankComputer(const Options& opts)
     : opts_(opts),
-      computer_(opts.fbank_opts),
-      window_function_(opts.fbank_opts.frame_opts) {
-    base_extractor_ = std::move(base_extractor);
-    chunk_sample_size_ = static_cast<int32>(
-        opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
-}
+    computer_(opts) {}
 
-void Fbank::Accept(const VectorBase<BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
+// Per-frame feature dimension: num_bins mel bins, plus one extra slot when
+// use_energy is set.
+int32 FbankComputer::Dim() const {
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
 }
 
-bool Fbank::Read(Vector<BaseFloat>* feats) {
-    Vector<BaseFloat> wav(chunk_sample_size_);
-    bool flag = base_extractor_->Read(&wav);
-    if (flag == false || wav.Dim() == 0) return false;
-
-    // append remaned waves
-    int32 wav_len = wav.Dim();
-    int32 left_len = remained_wav_.Dim();
-    Vector<BaseFloat> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
-
-    // compute speech feature
-    Compute(waves, feats);
-
-    // cache remaned waves
-    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
-    int32 frame_shift = frame_opts.WindowShift();
-    int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-    return true;
+// True only when both use_energy and raw_energy are set in the options.
+bool FbankComputer::NeedRawLogEnergy() {
+    return opts_.use_energy && opts_.raw_energy; 
 }
 
-// Compute spectrogram feat
-bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
-    const kaldi::FrameExtractionOptions& frame_opts =
-        computer_.GetFrameOptions();
-    int32 num_samples = waves.Dim();
-    int32 frame_length = frame_opts.WindowSize();
-    int32 sample_rate = frame_opts.samp_freq;
-    if (num_samples < frame_length) {
-        return true;
-    }
-
-    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * Dim());
-
-    Vector<BaseFloat> window;
-    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (int32 frame = 0; frame < num_frames; frame++) {
-        BaseFloat raw_log_energy = 0.0;
-        kaldi::ExtractWindow(0,
-                             waves,
-                             frame,
-                             frame_opts,
-                             window_function_,
-                             &window,
-                             need_raw_log_energy ? &raw_log_energy : NULL);
-
-
-        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
-        // note: this online feature-extraction code does not support VTLN.
-        RealFft(&window, true);
-        kaldi::ComputePowerSpectrum(&window);
-        const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
-        SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
-        if (!opts_.fbank_opts.use_power) {
-            power_spectrum.ApplyPow(0.5);
-        }
-        int32 mel_offset =
-            ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
-                                                                           : 0);
-        SubVector<BaseFloat> mel_energies(
-            this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
-        mel_bank.Compute(power_spectrum, &mel_energies);
-        mel_energies.ApplyFloor(1e-07);
-        mel_energies.ApplyLog();
-        SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
-        output_row.CopyFromVec(this_feature);
+// Computes the log mel-filterbank energies of one frame.
+//   window: samples of one frame; overwritten in place by the FFT and
+//           power-spectrum steps. No window function is applied here.
+//   feat:   output vector; it is not resized here, so it must already
+//           cover [mel_offset, mel_offset + num_bins). Only that range
+//           is written.
+// NOTE(review): with use_energy set, Dim() counts one extra slot that
+// nothing in this function writes -- confirm the caller fills it.
+// Always returns true.
+bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
+    RealFft(window, true);
+    kaldi::ComputePowerSpectrum(window);
+    // NOTE(review): 1.0 is presumably the VTLN warp factor (no warping).
+    const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
+    // Only the first Dim() / 2 + 1 entries of *window are used as spectrum.
+    SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
+    if (!opts_.use_power) {
+        // Take the square root of the power spectrum.
+        power_spectrum.ApplyPow(0.5);
     }
+    // Skip the leading slot when use_energy is set without htk_compat.
+    int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+    SubVector<BaseFloat> mel_energies(
+        *feat, mel_offset, opts_.mel_opts.num_bins);
+    mel_bank.Compute(power_spectrum, &mel_energies);
+    // Clamp to 1e-07 before taking the log.
+    mel_energies.ApplyFloor(1e-07);
+    mel_energies.ApplyLog();
     return true;
 }
 

diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h
@@ -15,63 +15,50 @@
 #pragma once
 
 #include "base/common.h"
+#include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-fbank.h"
 #include "kaldi/feat/feature-mfcc.h"
 #include "kaldi/matrix/kaldi-vector.h"
 
 namespace ppspeech {
 
-struct FbankOptions {
-    kaldi::FbankOptions fbank_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
-
-    FbankOptions() : streaming_chunk(0.1), fbank_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk",
-                       &streaming_chunk,
-                       "streaming chunk size, default: 0.1 sec");
-        fbank_opts.Register(opts);
-    }
-};
-
-
-class Fbank : public FrontendInterface {
+// Per-frame fbank computation built on top of kaldi::FbankComputer. It
+// provides the Options typedef and Dim() that StreamingFeatureTpl reads
+// from its template argument.
+class FbankComputer {
   public:
-    explicit Fbank(const FbankOptions& opts,
-                   std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+    typedef kaldi::FbankOptions Options;
+    explicit FbankComputer(const Options& opts);
 
-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return computer_.Dim(); }
-
-    virtual void SetFinished() { base_extractor_->SetFinished(); }
+    // Mutable access to the frame-extraction options stored in opts_.
+    // NOTE(review): computer_ is built separately from the options, so
+    // changes made through this reference may not reach it -- confirm.
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
+        return opts_.frame_opts;
+    }
 
-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+    // Computes the feature of one frame from `window` into `feat`; the
+    // definition is in fbank.cc.
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);
+    // Dimension of a single frame's feature.
+    int32 Dim() const;
 
-    virtual void Reset() {
-        base_extractor_->Reset();
-        remained_wav_.Resize(0);
-    }
+    // True when use_energy and raw_energy are both set (see fbank.cc).
+    bool NeedRawLogEnergy();
 
   private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                 kaldi::Vector<kaldi::BaseFloat>* feats);
+    // Local copy of the options passed to the constructor.
+    Options opts_;
 
-    FbankOptions opts_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
-
-    kaldi::FeatureWindowFunction window_function_;
+    // Wrapped Kaldi computer; Compute() takes its mel banks from here.
     kaldi::FbankComputer computer_;
-    // features_ is the Mfcc or Plp or Fbank features that we have already
-    // computed.
-    kaldi::Vector<kaldi::BaseFloat> features_;
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    kaldi::int32 chunk_sample_size_;
-
-    DISALLOW_COPY_AND_ASSIGN(Fbank);
+    DISALLOW_COPY_AND_ASSIGN(FbankComputer);
 };
 
+// Streaming fbank front end: the generic streaming wrapper instantiated
+// with FbankComputer.
+typedef StreamingFeatureTpl<FbankComputer> Fbank;
+
 }  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_common.h b/speechx/speechx/frontend/audio/feature_common.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "frontend/audio/frontend_itf.h"
+#include "kaldi/feat/feature-window.h"
+
+namespace ppspeech {
+
+// Streaming front end that wraps a per-frame feature computer F.
+//
+// As used in this header, F must provide a nested F::Options type and a
+// Dim() method. The constructor, Accept(), Read() and Compute() are only
+// declared here; their definitions are expected in feature_common_inl.h,
+// which is included at the end of this header.
+template <class F>
+class StreamingFeatureTpl : public FrontendInterface {
+  public:
+    typedef typename F::Options Options;
+    StreamingFeatureTpl(const Options& opts,
+                        std::unique_ptr<FrontendInterface> base_extractor);
+
+    // Declared here; see feature_common_inl.h for the definitions.
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // Dimension of a single frame's feature, as reported by the computer.
+    virtual size_t Dim() const { return computer_.Dim(); }
+
+    // Both finished-state calls are forwarded to the upstream extractor.
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    // Resets the upstream extractor and empties remained_wav_.
+    virtual void Reset() {
+        base_extractor_->Reset();
+        remained_wav_.Resize(0);
+    }
+
+  private:
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+                 kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    Options opts_;
+    // Upstream stage that SetFinished(), IsFinished() and Reset() forward to.
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    kaldi::FeatureWindowFunction window_function_;
+    // NOTE(review): presumably samples left over between Read() calls --
+    // confirm against feature_common_inl.h.
+    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
+    F computer_;
+};
+
+}  // namespace ppspeech
+
+#include "frontend/audio/feature_common_inl.h"