Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Speechx] add nnet prob cache && make 2 thread decode work #2769

Merged
merged 9 commits into from
Dec 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion speechx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ option(USE_PROFILING "enable c++ profling" OFF)
option(WITH_TESTING "unit test" ON)

option(USING_U2 "compile u2 model." ON)
option(USING_DS2 "compile with ds2 model." ON)
option(USING_DS2 "compile with ds2 model." OFF)

option(USING_GPU "u2 compute on GPU." OFF)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "fst/symbol-table.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/nnet_producer.h"
#include "nnet/u2_nnet.h"

DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
Expand All @@ -39,7 +40,7 @@ using kaldi::BaseFloat;
using kaldi::Matrix;
using std::vector;

// test ds2 online decoder by feeding speech feature
// test u2 online decoder by feeding speech feature
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
Expand Down Expand Up @@ -69,8 +70,10 @@ int main(int argc, char* argv[]) {
// decodeable
std::shared_ptr<ppspeech::DataCache> raw_data =
std::make_shared<ppspeech::DataCache>();
std::shared_ptr<ppspeech::NnetProducer> nnet_producer =
std::make_shared<ppspeech::NnetProducer>(nnet, raw_data);
std::shared_ptr<ppspeech::Decodable> decodable =
std::make_shared<ppspeech::Decodable>(nnet, raw_data);
std::make_shared<ppspeech::Decodable>(nnet_producer);

// decoder
ppspeech::CTCBeamSearchOptions opts;
Expand Down Expand Up @@ -114,9 +117,9 @@ int main(int argc, char* argv[]) {
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
}
if (this_chunk_size < receptive_field_length) {
LOG(WARNING)
<< "utt: " << utt << " skip last " << this_chunk_size
<< " frames, expect is " << receptive_field_length;
LOG(WARNING) << "utt: " << utt << " skip last "
<< this_chunk_size << " frames, expect is "
<< receptive_field_length;
break;
}

Expand Down
22 changes: 11 additions & 11 deletions speechx/speechx/asr/nnet/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
set(srcs decodable.cc)
set(srcs decodable.cc nnet_producer.cc)

if(USING_DS2)
list(APPEND srcs ds2_nnet.cc)
Expand Down Expand Up @@ -27,13 +27,13 @@ if(USING_DS2)
endif()

# test bin
if(USING_U2)
set(bin_name u2_nnet_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)

target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
endif()
#if(USING_U2)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

为什么注释掉了?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

api一些使用暂时还没有改完全,基本框架通了,只验证了recognizer_main, 等整体改完,在验证这几个_main.

# set(bin_name u2_nnet_main)
# add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
# target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
# target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)

# target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
# target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
# target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
#endif()
88 changes: 23 additions & 65 deletions speechx/speechx/asr/nnet/decodable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,16 @@ using kaldi::Matrix;
using kaldi::Vector;
using std::vector;

Decodable::Decodable(const std::shared_ptr<NnetBase>& nnet,
const std::shared_ptr<FrontendInterface>& frontend,
Decodable::Decodable(const std::shared_ptr<NnetProducer>& nnet_producer,
kaldi::BaseFloat acoustic_scale)
: frontend_(frontend),
nnet_(nnet),
: nnet_producer_(nnet_producer),
frame_offset_(0),
frames_ready_(0),
acoustic_scale_(acoustic_scale) {}

// for debug
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
nnet_out_cache_ = likelihood;
frames_ready_ += likelihood.NumRows();
nnet_producer_->Acceptlikelihood(likelihood);
}


Expand All @@ -43,7 +40,7 @@ int32 Decodable::NumFramesReady() const { return frames_ready_; }

// frame idx is from 0 to frame_ready_ -1;
bool Decodable::IsLastFrame(int32 frame) {
bool flag = EnsureFrameHaveComputed(frame);
EnsureFrameHaveComputed(frame);
return frame >= frames_ready_;
}

Expand All @@ -64,32 +61,10 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) {

bool Decodable::AdvanceChunk() {
kaldi::Timer timer;
// read feats
Vector<BaseFloat> features;
if (frontend_ == NULL || frontend_->Read(&features) == false) {
// no feat or frontend_ not init.
VLOG(3) << "decodable exit;";
return false;
}
CHECK_GE(frontend_->Dim(), 0);
VLOG(1) << "AdvanceChunk feat cost: " << timer.Elapsed() << " sec.";
VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats.";

// forward feats
NnetOut out;
nnet_->FeedForward(features, frontend_->Dim(), &out);
int32& vocab_dim = out.vocab_dim;
Vector<BaseFloat>& logprobs = out.logprobs;

VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim
<< " decoder frames.";
// cache nnet outupts
nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim);
nnet_out_cache_.CopyRowsFromVec(logprobs);

// update state, decoding frame.
bool flag = nnet_producer_->Read(&framelikelihood_);
if (flag == false) return false;
frame_offset_ = frames_ready_;
frames_ready_ += nnet_out_cache_.NumRows();
frames_ready_ += 1;
VLOG(1) << "AdvanceChunk feat + forward cost: " << timer.Elapsed()
<< " sec.";
return true;
Expand All @@ -101,17 +76,17 @@ bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
return false;
}

int nrows = nnet_out_cache_.NumRows();
CHECK(nrows == (frames_ready_ - frame_offset_));
if (nrows <= 0) {
if (framelikelihood_.empty()) {
LOG(WARNING) << "No new nnet out in cache.";
return false;
}

logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols());
logprobs->CopyRowsFromMat(nnet_out_cache_);

*vocab_dim = nnet_out_cache_.NumCols();
size_t dim = framelikelihood_.size();
logprobs->Resize(framelikelihood_.size());
std::memcpy(logprobs->Data(),
framelikelihood_.data(),
dim * sizeof(kaldi::BaseFloat));
*vocab_dim = framelikelihood_.size();
return true;
}

Expand All @@ -122,19 +97,8 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
return false;
}

int nrows = nnet_out_cache_.NumRows();
CHECK(nrows == (frames_ready_ - frame_offset_));
int vocab_size = nnet_out_cache_.NumCols();
likelihood->resize(vocab_size);

for (int32 idx = 0; idx < vocab_size; ++idx) {
(*likelihood)[idx] =
nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;

VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " "
<< nnet_out_cache_.NumRows()
<< " logprob: " << nnet_out_cache_(frame - frame_offset_, idx);
}
CHECK_EQ(1, (frames_ready_ - frame_offset_));
*likelihood = framelikelihood_;
return true;
}

Expand All @@ -143,37 +107,31 @@ BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) {
return false;
}

CHECK_LE(index, nnet_out_cache_.NumCols());
CHECK_LE(index, framelikelihood_.size());
CHECK_LE(frame, frames_ready_);

// the nnet output is prob ranther than log prob
// the index - 1, because the ilabel
BaseFloat logprob = 0.0;
int32 frame_idx = frame - frame_offset_;
BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index));
if (nnet_->IsLogProb()) {
logprob = nnet_out;
} else {
logprob = std::log(nnet_out + std::numeric_limits<float>::epsilon());
}
CHECK(!std::isnan(logprob) && !std::isinf(logprob));
CHECK_EQ(frame_idx, 0);
logprob = framelikelihood_[TokenId2NnetId(index)];
return acoustic_scale_ * logprob;
}

void Decodable::Reset() {
if (frontend_ != nullptr) frontend_->Reset();
if (nnet_ != nullptr) nnet_->Reset();
if (nnet_producer_ != nullptr) nnet_producer_->Reset();
frame_offset_ = 0;
frames_ready_ = 0;
nnet_out_cache_.Resize(0, 0);
framelikelihood_.clear();
}

void Decodable::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) {
kaldi::Timer timer;
nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
nnet_producer_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
VLOG(1) << "Attention Rescoring cost: " << timer.Elapsed() << " sec.";
}

} // namespace ppspeech
} // namespace ppspeech
16 changes: 5 additions & 11 deletions speechx/speechx/asr/nnet/decodable.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,18 @@
// limitations under the License.

#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/decoder/decodable-itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "nnet/nnet_producer.h"

namespace ppspeech {

struct DecodableOpts;

class Decodable : public kaldi::DecodableInterface {
public:
explicit Decodable(const std::shared_ptr<NnetBase>& nnet,
const std::shared_ptr<FrontendInterface>& frontend,
explicit Decodable(const std::shared_ptr<NnetProducer>& nnet_producer,
kaldi::BaseFloat acoustic_scale = 1.0);

// void Init(DecodableOpts config);
Expand Down Expand Up @@ -57,23 +56,17 @@ class Decodable : public kaldi::DecodableInterface {

void Reset();

bool IsInputFinished() const { return frontend_->IsFinished(); }
bool IsInputFinished() const { return nnet_producer_->IsFinished(); }

bool EnsureFrameHaveComputed(int32 frame);

int32 TokenId2NnetId(int32 token_id);

std::shared_ptr<NnetBase> Nnet() { return nnet_; }

// for offline test
void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);

private:
std::shared_ptr<FrontendInterface> frontend_;
std::shared_ptr<NnetBase> nnet_;

// nnet outputs' cache
kaldi::Matrix<kaldi::BaseFloat> nnet_out_cache_;
std::shared_ptr<NnetProducer> nnet_producer_;

// the frame is nnet prob frame rather than audio feature frame
// nnet frame subsample the feature frame
Expand All @@ -85,6 +78,7 @@ class Decodable : public kaldi::DecodableInterface {
// so use subsampled_frame
int32 current_log_post_subsampled_offset_;
int32 num_chunk_computed_;
std::vector<kaldi::BaseFloat> framelikelihood_;

kaldi::BaseFloat acoustic_scale_;
};
Expand Down
84 changes: 84 additions & 0 deletions speechx/speechx/asr/nnet/nnet_producer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "nnet/nnet_producer.h"

namespace ppspeech {

using kaldi::Vector;
using kaldi::BaseFloat;

NnetProducer::NnetProducer(std::shared_ptr<NnetBase> nnet,
std::shared_ptr<FrontendInterface> frontend)
: nnet_(nnet), frontend_(frontend) {}

void NnetProducer::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
frontend_->Accept(inputs);
bool result = false;
do {
result = Compute();
} while (result);
}

void NnetProducer::Acceptlikelihood(
const kaldi::Matrix<BaseFloat>& likelihood) {
std::vector<BaseFloat> prob;
prob.resize(likelihood.NumCols());
for (size_t idx = 0; idx < likelihood.NumRows(); ++idx) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

此处是否可以使用 memcpy相关函数 替换 两层 for循环;担心有效率问题

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

不可以,Matrix内存非连续,不过后续会去掉kaldi::matrix这种,所以出现这种for的赋值了。

for (size_t col = 0; col < likelihood.NumCols(); ++col) {
prob[col] = likelihood(idx, col);
cache_.push_back(prob);
}
}
}

bool NnetProducer::Read(std::vector<kaldi::BaseFloat>* nnet_prob) {
bool flag = cache_.pop(nnet_prob);
return flag;
}

bool NnetProducer::Compute() {
Vector<BaseFloat> features;
if (frontend_ == NULL || frontend_->Read(&features) == false) {
// no feat or frontend_ not init.
VLOG(3) << "no feat avalible";
return false;
}
CHECK_GE(frontend_->Dim(), 0);
VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats.";

NnetOut out;
nnet_->FeedForward(features, frontend_->Dim(), &out);
int32& vocab_dim = out.vocab_dim;
Vector<BaseFloat>& logprobs = out.logprobs;
size_t nframes = logprobs.Dim() / vocab_dim;
VLOG(2) << "Forward out " << nframes << " decoder frames.";
std::vector<BaseFloat> logprob(vocab_dim);
// remove later.
for (size_t idx = 0; idx < nframes; ++idx) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

和上面类似的问题

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

todo remove later,feature 统一后,vector, Vector 等数据holder会统一处理。

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

for (size_t prob_idx = 0; prob_idx < vocab_dim; ++prob_idx) {
logprob[prob_idx] = logprobs(idx * vocab_dim + prob_idx);
}
cache_.push_back(logprob);
}
return true;
}

void NnetProducer::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) {
nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
}

} // namespace ppspeech
Loading