From 40095d3026bef08904acf831d69fff2a7ed72d8e Mon Sep 17 00:00:00 2001 From: YangZhou Date: Wed, 14 Dec 2022 15:44:03 +0800 Subject: [PATCH 1/6] fix openfst download error --- speechx/cmake/openfst.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/cmake/openfst.cmake b/speechx/cmake/openfst.cmake index 9acf530a195..bb4512fc7c4 100644 --- a/speechx/cmake/openfst.cmake +++ b/speechx/cmake/openfst.cmake @@ -4,7 +4,7 @@ set(openfst_SOURCE_DIR ${fc_patch}/openfst-src) set(openfst_BINARY_DIR ${fc_patch}/openfst-build) ExternalProject_Add(openfst - URL https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip + URL https://paddleaudio.bj.bcebos.com/build/openfst_1.7.2.zip URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6 PREFIX ${openfst_PREFIX_DIR} SOURCE_DIR ${openfst_SOURCE_DIR} @@ -17,4 +17,4 @@ ExternalProject_Add(openfst BUILD_COMMAND make -j 4 ) link_directories(${openfst_PREFIX_DIR}/lib) -include_directories(${openfst_PREFIX_DIR}/include) \ No newline at end of file +include_directories(${openfst_PREFIX_DIR}/include) From f880229c25d04b2c75021a14f55f6a50cc4a657b Mon Sep 17 00:00:00 2001 From: YangZhou Date: Wed, 14 Dec 2022 15:51:03 +0800 Subject: [PATCH 2/6] add acknowledgments of openfst --- speechx/cmake/openfst.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/speechx/cmake/openfst.cmake b/speechx/cmake/openfst.cmake index bb4512fc7c4..07c33a74794 100644 --- a/speechx/cmake/openfst.cmake +++ b/speechx/cmake/openfst.cmake @@ -3,6 +3,13 @@ set(openfst_PREFIX_DIR ${fc_patch}/openfst) set(openfst_SOURCE_DIR ${fc_patch}/openfst-src) set(openfst_BINARY_DIR ${fc_patch}/openfst-build) +# openfst Acknowledgments: +#Cyril Allauzen, Michael Riley, Johan Schalkwyk, Wojciech Skut and Mehryar Mohri, +#"OpenFst: A General and Efficient Weighted Finite-State Transducer Library", +#Proceedings of the Ninth International Conference on Implementation and +#Application of Automata, (CIAA 2007), 
volume 4783 of Lecture Notes in +#Computer Science, pages 11-23. Springer, 2007. http://www.openfst.org. + ExternalProject_Add(openfst URL https://paddleaudio.bj.bcebos.com/build/openfst_1.7.2.zip URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6 From a2b5eb19c8513e06bffdaca14d00ca0d8c2eadd2 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Fri, 16 Dec 2022 11:21:06 +0800 Subject: [PATCH 3/6] refactor directory --- speechx/requirement.txt | 1 - speechx/speechx/CMakeLists.txt | 51 +- speechx/speechx/asr/CMakeLists.txt | 11 + .../speechx/{ => asr}/decoder/CMakeLists.txt | 0 speechx/speechx/{ => asr}/decoder/common.h | 0 .../decoder/ctc_beam_search_decoder.cc | 0 .../decoder/ctc_beam_search_decoder.h | 0 .../decoder/ctc_beam_search_decoder_main.cc | 0 .../{ => asr}/decoder/ctc_beam_search_opt.h | 0 .../asr/decoder/ctc_decoders/.gitignore | 9 + .../decoder/ctc_decoders/COPYING.APACHE2.0 | 201 ++++++ .../asr/decoder/ctc_decoders/COPYING.LESSER.3 | 165 +++++ .../speechx/asr/decoder/ctc_decoders/LICENSE | 8 + .../asr/decoder/ctc_decoders/__init__.py | 13 + .../ctc_decoders/ctc_beam_search_decoder.cpp | 607 ++++++++++++++++++ .../ctc_decoders/ctc_beam_search_decoder.h | 175 +++++ .../ctc_decoders/ctc_greedy_decoder.cpp | 61 ++ .../decoder/ctc_decoders/ctc_greedy_decoder.h | 35 + .../decoder/ctc_decoders/decoder_utils.cpp | 193 ++++++ .../asr/decoder/ctc_decoders/decoder_utils.h | 111 ++++ .../asr/decoder/ctc_decoders/decoders.i | 33 + .../asr/decoder/ctc_decoders/path_trie.cpp | 164 +++++ .../asr/decoder/ctc_decoders/path_trie.h | 82 +++ .../asr/decoder/ctc_decoders/scorer.cpp | 232 +++++++ .../speechx/asr/decoder/ctc_decoders/scorer.h | 114 ++++ .../speechx/asr/decoder/ctc_decoders/setup.py | 138 ++++ .../speechx/asr/decoder/ctc_decoders/setup.sh | 24 + .../decoder/ctc_prefix_beam_search_decoder.cc | 2 +- .../decoder/ctc_prefix_beam_search_decoder.h | 0 .../ctc_prefix_beam_search_decoder_main.cc | 0 .../decoder/ctc_prefix_beam_search_score.h | 0 
.../{ => asr}/decoder/ctc_tlg_decoder.cc | 0 .../{ => asr}/decoder/ctc_tlg_decoder.h | 0 .../{ => asr}/decoder/ctc_tlg_decoder_main.cc | 0 .../speechx/{ => asr}/decoder/decoder_itf.h | 0 .../decoder/nnet_logprob_decoder_main.cc | 0 speechx/speechx/{ => asr}/decoder/param.h | 0 speechx/speechx/{ => asr}/nnet/CMakeLists.txt | 0 speechx/speechx/{ => asr}/nnet/decodable.cc | 0 speechx/speechx/{ => asr}/nnet/decodable.h | 0 speechx/speechx/{ => asr}/nnet/ds2_nnet.cc | 0 speechx/speechx/{ => asr}/nnet/ds2_nnet.h | 0 .../speechx/{ => asr}/nnet/ds2_nnet_main.cc | 0 speechx/speechx/{ => asr}/nnet/nnet_itf.h | 0 speechx/speechx/{ => asr}/nnet/u2_nnet.cc | 0 speechx/speechx/{ => asr}/nnet/u2_nnet.h | 0 .../speechx/{ => asr}/nnet/u2_nnet_main.cc | 0 .../{ => asr}/recognizer/CMakeLists.txt | 0 .../{ => asr}/recognizer/recognizer.cc | 0 .../speechx/{ => asr}/recognizer/recognizer.h | 0 .../{ => asr}/recognizer/recognizer_main.cc | 0 .../{ => asr}/recognizer/u2_recognizer.cc | 0 .../{ => asr}/recognizer/u2_recognizer.h | 0 .../recognizer/u2_recognizer_main.cc | 0 .../{protocol => asr/server}/CMakeLists.txt | 0 .../server}/websocket/CMakeLists.txt | 0 .../server}/websocket/websocket_client.cc | 0 .../server}/websocket/websocket_client.h | 0 .../websocket/websocket_client_main.cc | 0 .../server}/websocket/websocket_server.cc | 0 .../server}/websocket/websocket_server.h | 0 .../websocket/websocket_server_main.cc | 0 speechx/speechx/common/CMakeLists.txt | 16 + .../speechx/{ => common}/base/basic_types.h | 0 speechx/speechx/{ => common}/base/common.h | 0 speechx/speechx/{ => common}/base/flags.h | 0 speechx/speechx/{ => common}/base/log.h | 0 speechx/speechx/{ => common}/base/macros.h | 0 .../speechx/{ => common}/base/thread_pool.h | 0 .../{ => common}/frontend/CMakeLists.txt | 0 .../frontend/audio/CMakeLists.txt | 0 .../{ => common}/frontend/audio/assembler.cc | 0 .../{ => common}/frontend/audio/assembler.h | 0 .../frontend/audio/audio_cache.cc | 0 .../{ => 
common}/frontend/audio/audio_cache.h | 0 .../{ => common}/frontend/audio/cmvn.cc | 0 .../{ => common}/frontend/audio/cmvn.h | 0 .../frontend/audio/cmvn_json2kaldi_main.cc | 0 .../frontend/audio/compute_fbank_main.cc | 0 .../audio/compute_linear_spectrogram_main.cc | 0 .../{ => common}/frontend/audio/data_cache.h | 0 .../{ => common}/frontend/audio/db_norm.cc | 0 .../{ => common}/frontend/audio/db_norm.h | 0 .../{ => common}/frontend/audio/fbank.cc | 0 .../{ => common}/frontend/audio/fbank.h | 0 .../frontend/audio/feature_cache.cc | 0 .../frontend/audio/feature_cache.h | 0 .../frontend/audio/feature_common.h | 0 .../frontend/audio/feature_common_inl.h | 0 .../frontend/audio/feature_pipeline.cc | 0 .../frontend/audio/feature_pipeline.h | 0 .../frontend/audio/frontend_itf.h | 0 .../frontend/audio/linear_spectrogram.cc | 0 .../frontend/audio/linear_spectrogram.h | 0 .../{ => common}/frontend/audio/mfcc.cc | 0 .../{ => common}/frontend/audio/mfcc.h | 0 .../{ => common}/frontend/audio/normalizer.h | 0 .../speechx/{ => common}/utils/CMakeLists.txt | 0 .../speechx/{ => common}/utils/file_utils.cc | 0 .../speechx/{ => common}/utils/file_utils.h | 0 speechx/speechx/{ => common}/utils/math.cc | 0 speechx/speechx/{ => common}/utils/math.h | 0 speechx/speechx/decoder/ctc_decoders | 1 - speechx/speechx/frontend/text/CMakeLists.txt | 0 speechx/speechx/kaldi/CMakeLists.txt | 5 +- speechx/speechx/third_party/CMakeLists.txt | 0 speechx/speechx/third_party/README.md | 4 - 107 files changed, 2403 insertions(+), 53 deletions(-) delete mode 100644 speechx/requirement.txt create mode 100644 speechx/speechx/asr/CMakeLists.txt rename speechx/speechx/{ => asr}/decoder/CMakeLists.txt (100%) rename speechx/speechx/{ => asr}/decoder/common.h (100%) rename speechx/speechx/{ => asr}/decoder/ctc_beam_search_decoder.cc (100%) rename speechx/speechx/{ => asr}/decoder/ctc_beam_search_decoder.h (100%) rename speechx/speechx/{ => asr}/decoder/ctc_beam_search_decoder_main.cc (100%) rename 
speechx/speechx/{ => asr}/decoder/ctc_beam_search_opt.h (100%) create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/.gitignore create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/LICENSE create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/__init__.py create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.cpp create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.h create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.cpp create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.h create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.cpp create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.h create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/decoders.i create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/path_trie.cpp create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/path_trie.h create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/scorer.cpp create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/scorer.h create mode 100644 speechx/speechx/asr/decoder/ctc_decoders/setup.py create mode 100755 speechx/speechx/asr/decoder/ctc_decoders/setup.sh rename speechx/speechx/{ => asr}/decoder/ctc_prefix_beam_search_decoder.cc (99%) rename speechx/speechx/{ => asr}/decoder/ctc_prefix_beam_search_decoder.h (100%) rename speechx/speechx/{ => asr}/decoder/ctc_prefix_beam_search_decoder_main.cc (100%) rename speechx/speechx/{ => asr}/decoder/ctc_prefix_beam_search_score.h (100%) rename speechx/speechx/{ => asr}/decoder/ctc_tlg_decoder.cc (100%) rename speechx/speechx/{ => asr}/decoder/ctc_tlg_decoder.h (100%) rename speechx/speechx/{ => asr}/decoder/ctc_tlg_decoder_main.cc (100%) rename speechx/speechx/{ => 
asr}/decoder/decoder_itf.h (100%) rename speechx/speechx/{ => asr}/decoder/nnet_logprob_decoder_main.cc (100%) rename speechx/speechx/{ => asr}/decoder/param.h (100%) rename speechx/speechx/{ => asr}/nnet/CMakeLists.txt (100%) rename speechx/speechx/{ => asr}/nnet/decodable.cc (100%) rename speechx/speechx/{ => asr}/nnet/decodable.h (100%) rename speechx/speechx/{ => asr}/nnet/ds2_nnet.cc (100%) rename speechx/speechx/{ => asr}/nnet/ds2_nnet.h (100%) rename speechx/speechx/{ => asr}/nnet/ds2_nnet_main.cc (100%) rename speechx/speechx/{ => asr}/nnet/nnet_itf.h (100%) rename speechx/speechx/{ => asr}/nnet/u2_nnet.cc (100%) rename speechx/speechx/{ => asr}/nnet/u2_nnet.h (100%) rename speechx/speechx/{ => asr}/nnet/u2_nnet_main.cc (100%) rename speechx/speechx/{ => asr}/recognizer/CMakeLists.txt (100%) rename speechx/speechx/{ => asr}/recognizer/recognizer.cc (100%) rename speechx/speechx/{ => asr}/recognizer/recognizer.h (100%) rename speechx/speechx/{ => asr}/recognizer/recognizer_main.cc (100%) rename speechx/speechx/{ => asr}/recognizer/u2_recognizer.cc (100%) rename speechx/speechx/{ => asr}/recognizer/u2_recognizer.h (100%) rename speechx/speechx/{ => asr}/recognizer/u2_recognizer_main.cc (100%) rename speechx/speechx/{protocol => asr/server}/CMakeLists.txt (100%) rename speechx/speechx/{protocol => asr/server}/websocket/CMakeLists.txt (100%) rename speechx/speechx/{protocol => asr/server}/websocket/websocket_client.cc (100%) rename speechx/speechx/{protocol => asr/server}/websocket/websocket_client.h (100%) rename speechx/speechx/{protocol => asr/server}/websocket/websocket_client_main.cc (100%) rename speechx/speechx/{protocol => asr/server}/websocket/websocket_server.cc (100%) rename speechx/speechx/{protocol => asr/server}/websocket/websocket_server.h (100%) rename speechx/speechx/{protocol => asr/server}/websocket/websocket_server_main.cc (100%) create mode 100644 speechx/speechx/common/CMakeLists.txt rename speechx/speechx/{ => common}/base/basic_types.h 
(100%) rename speechx/speechx/{ => common}/base/common.h (100%) rename speechx/speechx/{ => common}/base/flags.h (100%) rename speechx/speechx/{ => common}/base/log.h (100%) rename speechx/speechx/{ => common}/base/macros.h (100%) rename speechx/speechx/{ => common}/base/thread_pool.h (100%) rename speechx/speechx/{ => common}/frontend/CMakeLists.txt (100%) rename speechx/speechx/{ => common}/frontend/audio/CMakeLists.txt (100%) rename speechx/speechx/{ => common}/frontend/audio/assembler.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/assembler.h (100%) rename speechx/speechx/{ => common}/frontend/audio/audio_cache.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/audio_cache.h (100%) rename speechx/speechx/{ => common}/frontend/audio/cmvn.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/cmvn.h (100%) rename speechx/speechx/{ => common}/frontend/audio/cmvn_json2kaldi_main.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/compute_fbank_main.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/compute_linear_spectrogram_main.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/data_cache.h (100%) rename speechx/speechx/{ => common}/frontend/audio/db_norm.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/db_norm.h (100%) rename speechx/speechx/{ => common}/frontend/audio/fbank.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/fbank.h (100%) rename speechx/speechx/{ => common}/frontend/audio/feature_cache.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/feature_cache.h (100%) rename speechx/speechx/{ => common}/frontend/audio/feature_common.h (100%) rename speechx/speechx/{ => common}/frontend/audio/feature_common_inl.h (100%) rename speechx/speechx/{ => common}/frontend/audio/feature_pipeline.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/feature_pipeline.h (100%) rename speechx/speechx/{ => common}/frontend/audio/frontend_itf.h (100%) rename 
speechx/speechx/{ => common}/frontend/audio/linear_spectrogram.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/linear_spectrogram.h (100%) rename speechx/speechx/{ => common}/frontend/audio/mfcc.cc (100%) rename speechx/speechx/{ => common}/frontend/audio/mfcc.h (100%) rename speechx/speechx/{ => common}/frontend/audio/normalizer.h (100%) rename speechx/speechx/{ => common}/utils/CMakeLists.txt (100%) rename speechx/speechx/{ => common}/utils/file_utils.cc (100%) rename speechx/speechx/{ => common}/utils/file_utils.h (100%) rename speechx/speechx/{ => common}/utils/math.cc (100%) rename speechx/speechx/{ => common}/utils/math.h (100%) delete mode 120000 speechx/speechx/decoder/ctc_decoders delete mode 100644 speechx/speechx/frontend/text/CMakeLists.txt delete mode 100644 speechx/speechx/third_party/CMakeLists.txt delete mode 100644 speechx/speechx/third_party/README.md diff --git a/speechx/requirement.txt b/speechx/requirement.txt deleted file mode 100644 index 6a6db09603f..00000000000 --- a/speechx/requirement.txt +++ /dev/null @@ -1 +0,0 @@ -paddlepaddle>=2.4rc diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt index 60c183472ba..b522e158c81 100644 --- a/speechx/speechx/CMakeLists.txt +++ b/speechx/speechx/CMakeLists.txt @@ -2,50 +2,11 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(speechx LANGUAGES CXX) -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/kaldi -) -add_subdirectory(kaldi) - -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/utils -) -add_subdirectory(utils) - -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/frontend -) -add_subdirectory(frontend) - -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/nnet -) -add_subdirectory(nnet) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/kaldi) 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/common) -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/decoder -) -add_subdirectory(decoder) - -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/recognizer -) -add_subdirectory(recognizer) - -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/protocol -) -add_subdirectory(protocol) - -include_directories( -${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/codelab -) +add_subdirectory(asr) +add_subdirectory(common) +add_subdirectory(kaldi) add_subdirectory(codelab) diff --git a/speechx/speechx/asr/CMakeLists.txt b/speechx/speechx/asr/CMakeLists.txt new file mode 100644 index 00000000000..ff4cdecbe38 --- /dev/null +++ b/speechx/speechx/asr/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(ASR LANGUAGES CXX) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/server) + +add_subdirectory(decoder) +add_subdirectory(recognizer) +add_subdirectory(nnet) +add_subdirectory(server) diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/asr/decoder/CMakeLists.txt similarity index 100% rename from speechx/speechx/decoder/CMakeLists.txt rename to speechx/speechx/asr/decoder/CMakeLists.txt diff --git a/speechx/speechx/decoder/common.h b/speechx/speechx/asr/decoder/common.h similarity index 100% rename from speechx/speechx/decoder/common.h rename to speechx/speechx/asr/decoder/common.h diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/asr/decoder/ctc_beam_search_decoder.cc similarity index 100% rename from speechx/speechx/decoder/ctc_beam_search_decoder.cc rename to speechx/speechx/asr/decoder/ctc_beam_search_decoder.cc diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/asr/decoder/ctc_beam_search_decoder.h similarity index 100% rename from 
speechx/speechx/decoder/ctc_beam_search_decoder.h rename to speechx/speechx/asr/decoder/ctc_beam_search_decoder.h diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc b/speechx/speechx/asr/decoder/ctc_beam_search_decoder_main.cc similarity index 100% rename from speechx/speechx/decoder/ctc_beam_search_decoder_main.cc rename to speechx/speechx/asr/decoder/ctc_beam_search_decoder_main.cc diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/asr/decoder/ctc_beam_search_opt.h similarity index 100% rename from speechx/speechx/decoder/ctc_beam_search_opt.h rename to speechx/speechx/asr/decoder/ctc_beam_search_opt.h diff --git a/speechx/speechx/asr/decoder/ctc_decoders/.gitignore b/speechx/speechx/asr/decoder/ctc_decoders/.gitignore new file mode 100644 index 00000000000..0b1046ae8a4 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/.gitignore @@ -0,0 +1,9 @@ +ThreadPool/ +build/ +dist/ +kenlm/ +openfst-1.6.3/ +openfst-1.6.3.tar.gz +swig_decoders.egg-info/ +decoders_wrap.cxx +swig_decoders.py diff --git a/speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 b/speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 new file mode 100644 index 00000000000..261eeb9e9f8 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 b/speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 new file mode 100644 index 00000000000..cca7fc278f5 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. 
+ + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. 
You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. 
A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. 
+ + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/speechx/speechx/asr/decoder/ctc_decoders/LICENSE b/speechx/speechx/asr/decoder/ctc_decoders/LICENSE new file mode 100644 index 00000000000..ad947f8d756 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/LICENSE @@ -0,0 +1,8 @@ +Most of the code here is licensed under the Apache License 2.0. +There are exceptions that have their own licenses, listed below. + +scorer.h and scorer.cpp are under the LGPL license. +The two files include the header files from KenLM project. + +For the rest: +The default license of paddlespeech-ctcdecoders is Apache License 2.0. diff --git a/speechx/speechx/asr/decoder/ctc_decoders/__init__.py b/speechx/speechx/asr/decoder/ctc_decoders/__init__.py new file mode 100644 index 00000000000..185a92b8d94 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.cpp b/speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.cpp new file mode 100644 index 00000000000..ebea5c222a3 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.cpp @@ -0,0 +1,607 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ctc_beam_search_decoder.h" + +#include +#include +#include +#include +#include +#include + +#include "ThreadPool.h" +#include "fst/fstlib.h" + +#include "decoder_utils.h" +#include "path_trie.h" + +using FSTMATCH = fst::SortedMatcher; + + +std::vector> ctc_beam_search_decoding( + const std::vector> &probs_seq, + const std::vector &vocabulary, + size_t beam_size, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id) { + // dimension check + size_t num_time_steps = probs_seq.size(); + for (size_t i = 0; i < num_time_steps; ++i) { + VALID_CHECK_EQ(probs_seq[i].size(), + // vocabulary.size() + 1, + vocabulary.size(), + "The shape of probs_seq does not match with " + "the shape of the vocabulary"); + } + + + // assign space id + auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE); + int space_id = it - vocabulary.begin(); + // if no space in vocabulary + if ((size_t)space_id >= vocabulary.size()) { + space_id = -2; + } + // init prefixes' root + PathTrie root; + root.score = root.log_prob_b_prev = 0.0; + std::vector prefixes; + prefixes.push_back(&root); + + if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { + auto fst_dict = + static_cast(ext_scorer->dictionary); + fst::StdVectorFst *dict_ptr = fst_dict->Copy(true); + root.set_dictionary(dict_ptr); + auto matcher = std::make_shared(*dict_ptr, fst::MATCH_INPUT); + root.set_matcher(matcher); + } + + // prefix search over time + for (size_t time_step = 0; time_step < num_time_steps; ++time_step) { + auto &prob = probs_seq[time_step]; + + float min_cutoff = -NUM_FLT_INF; + bool full_beam = false; + if (ext_scorer != nullptr) { + size_t num_prefixes = std::min(prefixes.size(), beam_size); + std::sort(prefixes.begin(), + prefixes.begin() + num_prefixes, + prefix_compare); + min_cutoff = prefixes[num_prefixes - 1]->score + + std::log(prob[blank_id]) - + std::max(0.0, ext_scorer->beta); + full_beam = (num_prefixes == beam_size); + } + + std::vector> 
log_prob_idx = + get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n); + // loop over chars + for (size_t index = 0; index < log_prob_idx.size(); index++) { + auto c = log_prob_idx[index].first; + auto log_prob_c = log_prob_idx[index].second; + + for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) { + auto prefix = prefixes[i]; + if (full_beam && log_prob_c + prefix->score < min_cutoff) { + break; + } + // blank + if (c == blank_id) { + prefix->log_prob_b_cur = log_sum_exp( + prefix->log_prob_b_cur, log_prob_c + prefix->score); + continue; + } + // repeated character + if (c == prefix->character) { + prefix->log_prob_nb_cur = + log_sum_exp(prefix->log_prob_nb_cur, + log_prob_c + prefix->log_prob_nb_prev); + } + // get new prefix + auto prefix_new = prefix->get_path_trie(c); + + if (prefix_new != nullptr) { + float log_p = -NUM_FLT_INF; + + if (c == prefix->character && + prefix->log_prob_b_prev > -NUM_FLT_INF) { + log_p = log_prob_c + prefix->log_prob_b_prev; + } else if (c != prefix->character) { + log_p = log_prob_c + prefix->score; + } + + // language model scoring + if (ext_scorer != nullptr && + (c == space_id || ext_scorer->is_character_based())) { + PathTrie *prefix_to_score = nullptr; + // skip scoring the space + if (ext_scorer->is_character_based()) { + prefix_to_score = prefix_new; + } else { + prefix_to_score = prefix; + } + + float score = 0.0; + std::vector ngram; + ngram = ext_scorer->make_ngram(prefix_to_score); + score = ext_scorer->get_log_cond_prob(ngram) * + ext_scorer->alpha; + log_p += score; + log_p += ext_scorer->beta; + } + prefix_new->log_prob_nb_cur = + log_sum_exp(prefix_new->log_prob_nb_cur, log_p); + } + } // end of loop over prefix + } // end of loop over vocabulary + + + prefixes.clear(); + // update log probs + root.iterate_to_vec(prefixes); + + // only preserve top beam_size prefixes + if (prefixes.size() >= beam_size) { + std::nth_element(prefixes.begin(), + prefixes.begin() + beam_size, + prefixes.end(), + 
prefix_compare); + for (size_t i = beam_size; i < prefixes.size(); ++i) { + prefixes[i]->remove(); + } + } + } // end of loop over time + + // score the last word of each prefix that doesn't end with space + if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + auto prefix = prefixes[i]; + if (!prefix->is_empty() && prefix->character != space_id) { + float score = 0.0; + std::vector ngram = ext_scorer->make_ngram(prefix); + score = + ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha; + score += ext_scorer->beta; + prefix->score += score; + } + } + } + + size_t num_prefixes = std::min(prefixes.size(), beam_size); + std::sort( + prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare); + + // compute approximate ctc score as the return score, without affecting the + // return order of decoding result. To delete when decoder gets stable. + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + double approx_ctc = prefixes[i]->score; + if (ext_scorer != nullptr) { + std::vector output; + prefixes[i]->get_path_vec(output); + auto prefix_length = output.size(); + auto words = ext_scorer->split_labels(output); + // remove word insert + approx_ctc = approx_ctc - prefix_length * ext_scorer->beta; + // remove language model weight: + approx_ctc -= + (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha; + } + prefixes[i]->approx_ctc = approx_ctc; + } + + return get_beam_search_result(prefixes, vocabulary, beam_size); +} + + +std::vector>> +ctc_beam_search_decoding_batch( + const std::vector>> &probs_split, + const std::vector &vocabulary, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id) { + VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!"); + // thread pool + ThreadPool pool(num_processes); + // number of samples + size_t batch_size = probs_split.size(); + + // 
enqueue the tasks of decoding + std::vector>>> res; + for (size_t i = 0; i < batch_size; ++i) { + res.emplace_back(pool.enqueue(ctc_beam_search_decoding, + probs_split[i], + vocabulary, + beam_size, + cutoff_prob, + cutoff_top_n, + ext_scorer, + blank_id)); + } + + // get decoding results + std::vector>> batch_results; + for (size_t i = 0; i < batch_size; ++i) { + batch_results.emplace_back(res[i].get()); + } + return batch_results; +} + +void ctc_beam_search_decode_chunk_begin(PathTrie *root, Scorer *ext_scorer) { + if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { + auto fst_dict = + static_cast(ext_scorer->dictionary); + fst::StdVectorFst *dict_ptr = fst_dict->Copy(true); + root->set_dictionary(dict_ptr); + auto matcher = std::make_shared(*dict_ptr, fst::MATCH_INPUT); + root->set_matcher(matcher); + } +} + +void ctc_beam_search_decode_chunk( + PathTrie *root, + std::vector &prefixes, + const std::vector> &probs_seq, + const std::vector &vocabulary, + size_t beam_size, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id) { + // dimension check + size_t num_time_steps = probs_seq.size(); + for (size_t i = 0; i < num_time_steps; ++i) { + VALID_CHECK_EQ(probs_seq[i].size(), + // vocabulary.size() + 1, + vocabulary.size(), + "The shape of probs_seq does not match with " + "the shape of the vocabulary"); + } + + // assign space id + auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE); + int space_id = it - vocabulary.begin(); + // if no space in vocabulary + if ((size_t)space_id >= vocabulary.size()) { + space_id = -2; + } + // init prefixes' root + // + // prefix search over time + for (size_t time_step = 0; time_step < num_time_steps; ++time_step) { + auto &prob = probs_seq[time_step]; + + float min_cutoff = -NUM_FLT_INF; + bool full_beam = false; + if (ext_scorer != nullptr) { + size_t num_prefixes = std::min(prefixes.size(), beam_size); + std::sort(prefixes.begin(), + prefixes.begin() + num_prefixes, + 
prefix_compare); + min_cutoff = prefixes[num_prefixes - 1]->score + + std::log(prob[blank_id]) - + std::max(0.0, ext_scorer->beta); + full_beam = (num_prefixes == beam_size); + } + + std::vector> log_prob_idx = + get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n); + // loop over chars + for (size_t index = 0; index < log_prob_idx.size(); index++) { + auto c = log_prob_idx[index].first; + auto log_prob_c = log_prob_idx[index].second; + + for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) { + auto prefix = prefixes[i]; + if (full_beam && log_prob_c + prefix->score < min_cutoff) { + break; + } + // blank + if (c == blank_id) { + prefix->log_prob_b_cur = log_sum_exp( + prefix->log_prob_b_cur, log_prob_c + prefix->score); + continue; + } + // repeated character + if (c == prefix->character) { + prefix->log_prob_nb_cur = + log_sum_exp(prefix->log_prob_nb_cur, + log_prob_c + prefix->log_prob_nb_prev); + } + // get new prefix + auto prefix_new = prefix->get_path_trie(c); + + if (prefix_new != nullptr) { + float log_p = -NUM_FLT_INF; + + if (c == prefix->character && + prefix->log_prob_b_prev > -NUM_FLT_INF) { + log_p = log_prob_c + prefix->log_prob_b_prev; + } else if (c != prefix->character) { + log_p = log_prob_c + prefix->score; + } + + // language model scoring + if (ext_scorer != nullptr && + (c == space_id || ext_scorer->is_character_based())) { + PathTrie *prefix_to_score = nullptr; + // skip scoring the space + if (ext_scorer->is_character_based()) { + prefix_to_score = prefix_new; + } else { + prefix_to_score = prefix; + } + + float score = 0.0; + std::vector ngram; + ngram = ext_scorer->make_ngram(prefix_to_score); + score = ext_scorer->get_log_cond_prob(ngram) * + ext_scorer->alpha; + log_p += score; + log_p += ext_scorer->beta; + } + prefix_new->log_prob_nb_cur = + log_sum_exp(prefix_new->log_prob_nb_cur, log_p); + } + } // end of loop over prefix + } // end of loop over vocabulary + + prefixes.clear(); + // update log probs + + 
root->iterate_to_vec(prefixes); + + // only preserve top beam_size prefixes + if (prefixes.size() >= beam_size) { + std::nth_element(prefixes.begin(), + prefixes.begin() + beam_size, + prefixes.end(), + prefix_compare); + for (size_t i = beam_size; i < prefixes.size(); ++i) { + prefixes[i]->remove(); + } + } + } // end of loop over time + + return; +} + + +std::vector> get_decode_result( + std::vector &prefixes, + const std::vector &vocabulary, + size_t beam_size, + Scorer *ext_scorer) { + auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE); + int space_id = it - vocabulary.begin(); + // if no space in vocabulary + if ((size_t)space_id >= vocabulary.size()) { + space_id = -2; + } + // score the last word of each prefix that doesn't end with space + if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + auto prefix = prefixes[i]; + if (!prefix->is_empty() && prefix->character != space_id) { + float score = 0.0; + std::vector ngram = ext_scorer->make_ngram(prefix); + score = + ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha; + score += ext_scorer->beta; + prefix->score += score; + } + } + } + + size_t num_prefixes = std::min(prefixes.size(), beam_size); + std::sort( + prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare); + + // compute aproximate ctc score as the return score, without affecting the + // return order of decoding result. To delete when decoder gets stable. 
+ for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + double approx_ctc = prefixes[i]->score; + if (ext_scorer != nullptr) { + std::vector output; + prefixes[i]->get_path_vec(output); + auto prefix_length = output.size(); + auto words = ext_scorer->split_labels(output); + // remove word insert + approx_ctc = approx_ctc - prefix_length * ext_scorer->beta; + // remove language model weight: + approx_ctc -= + (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha; + } + prefixes[i]->approx_ctc = approx_ctc; + } + + std::vector> res = + get_beam_search_result(prefixes, vocabulary, beam_size); + + // pay back the last word of each prefix that doesn't end with space (for + // decoding by chunk) + if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + auto prefix = prefixes[i]; + if (!prefix->is_empty() && prefix->character != space_id) { + float score = 0.0; + std::vector ngram = ext_scorer->make_ngram(prefix); + score = + ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha; + score += ext_scorer->beta; + prefix->score -= score; + } + } + } + return res; +} + + +void free_storage(std::unique_ptr &storage) { + storage = nullptr; +} + + +CtcBeamSearchDecoderBatch::~CtcBeamSearchDecoderBatch() {} + +CtcBeamSearchDecoderBatch::CtcBeamSearchDecoderBatch( + const std::vector &vocabulary, + size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id) + : batch_size(batch_size), + beam_size(beam_size), + num_processes(num_processes), + cutoff_prob(cutoff_prob), + cutoff_top_n(cutoff_top_n), + ext_scorer(ext_scorer), + blank_id(blank_id) { + VALID_CHECK_GT(this->beam_size, 0, "beam_size must be greater than 0!"); + VALID_CHECK_GT( + this->num_processes, 0, "num_processes must be nonnegative!"); + this->vocabulary = vocabulary; + for (size_t i = 0; i < batch_size; i++) { + 
this->decoder_storage_vector.push_back( + std::unique_ptr( + new CtcBeamSearchDecoderStorage())); + ctc_beam_search_decode_chunk_begin( + this->decoder_storage_vector[i]->root, ext_scorer); + } +}; + +/** + * Input + * probs_split: shape [B, T, D] + */ +void CtcBeamSearchDecoderBatch::next( + const std::vector>> &probs_split, + const std::vector &has_value) { + VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!"); + // thread pool + size_t num_has_value = 0; + for (int i = 0; i < has_value.size(); i++) + if (has_value[i] == "true") num_has_value += 1; + ThreadPool pool(std::min(num_processes, num_has_value)); + // number of samples + size_t probs_num = probs_split.size(); + VALID_CHECK_EQ(this->batch_size, + probs_num, + "The batch size of the current input data should be same " + "with the input data before"); + + // enqueue the tasks of decoding + std::vector> res; + for (size_t i = 0; i < batch_size; ++i) { + if (has_value[i] == "true") { + res.emplace_back(pool.enqueue( + ctc_beam_search_decode_chunk, + std::ref(this->decoder_storage_vector[i]->root), + std::ref(this->decoder_storage_vector[i]->prefixes), + probs_split[i], + this->vocabulary, + this->beam_size, + this->cutoff_prob, + this->cutoff_top_n, + this->ext_scorer, + this->blank_id)); + } + } + + for (size_t i = 0; i < batch_size; ++i) { + res[i].get(); + } + return; +}; + +/** + * Return + * batch_result: shape[B, beam_size,(-approx_ctc score, string)] + */ +std::vector>> +CtcBeamSearchDecoderBatch::decode() { + VALID_CHECK_GT( + this->num_processes, 0, "num_processes must be nonnegative!"); + // thread pool + ThreadPool pool(this->num_processes); + // number of samples + // enqueue the tasks of decoding + std::vector>>> res; + for (size_t i = 0; i < this->batch_size; ++i) { + res.emplace_back( + pool.enqueue(get_decode_result, + std::ref(this->decoder_storage_vector[i]->prefixes), + this->vocabulary, + this->beam_size, + this->ext_scorer)); + } + // get decoding results + 
std::vector>> batch_results; + for (size_t i = 0; i < this->batch_size; ++i) { + batch_results.emplace_back(res[i].get()); + } + return batch_results; +} + + +/** + * reset the state of ctcBeamSearchDecoderBatch + */ +void CtcBeamSearchDecoderBatch::reset_state(size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n) { + this->batch_size = batch_size; + this->beam_size = beam_size; + this->num_processes = num_processes; + this->cutoff_prob = cutoff_prob; + this->cutoff_top_n = cutoff_top_n; + + VALID_CHECK_GT(this->beam_size, 0, "beam_size must be greater than 0!"); + VALID_CHECK_GT( + this->num_processes, 0, "num_processes must be nonnegative!"); + // thread pool + ThreadPool pool(this->num_processes); + // number of samples + // enqueue the tasks of decoding + std::vector> res; + size_t storage_size = decoder_storage_vector.size(); + for (size_t i = 0; i < storage_size; i++) { + res.emplace_back(pool.enqueue( + free_storage, std::ref(this->decoder_storage_vector[i]))); + } + for (size_t i = 0; i < storage_size; ++i) { + res[i].get(); + } + std::vector>().swap( + decoder_storage_vector); + for (size_t i = 0; i < this->batch_size; i++) { + this->decoder_storage_vector.push_back( + std::unique_ptr( + new CtcBeamSearchDecoderStorage())); + ctc_beam_search_decode_chunk_begin( + this->decoder_storage_vector[i]->root, this->ext_scorer); + } +} \ No newline at end of file diff --git a/speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.h b/speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.h new file mode 100644 index 00000000000..92d2b855fc3 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/ctc_beam_search_decoder.h @@ -0,0 +1,175 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CTC_BEAM_SEARCH_DECODER_H_ +#define CTC_BEAM_SEARCH_DECODER_H_ + +#include +#include +#include + +#include "scorer.h" + +/* CTC Beam Search Decoder + + * Parameters: + * probs_seq: 2-D vector that each element is a vector of probabilities + * over vocabulary of one time step. + * vocabulary: A vector of vocabulary. + * beam_size: The width of beam search. + * cutoff_prob: Cutoff probability for pruning. + * cutoff_top_n: Cutoff number for pruning. + * ext_scorer: External scorer to evaluate a prefix, which consists of + * n-gram language model scoring and word insertion term. + * Default null, decoding the input sample without scorer. + * Return: + * A vector that each element is a pair of score and decoding result, + * in descending order. +*/ +std::vector> ctc_beam_search_decoding( + const std::vector> &probs_seq, + const std::vector &vocabulary, + size_t beam_size, + double cutoff_prob = 1.0, + size_t cutoff_top_n = 40, + Scorer *ext_scorer = nullptr, + size_t blank_id = 0); + + +/* CTC Beam Search Decoder for batch data + + * Parameters: + * probs_seq: 3-D vector that each element is a 2-D vector that can be used + * by ctc_beam_search_decoder(). + * vocabulary: A vector of vocabulary. + * beam_size: The width of beam search. + * num_processes: Number of threads for beam search. + * cutoff_prob: Cutoff probability for pruning. + * cutoff_top_n: Cutoff number for pruning. + * ext_scorer: External scorer to evaluate a prefix, which consists of + * n-gram language model scoring and word insertion term. 
+ * Default null, decoding the input sample without scorer. + * Return: + * A 2-D vector that each element is a vector of beam search decoding + * result for one audio sample. +*/ +std::vector>> +ctc_beam_search_decoding_batch( + const std::vector>> &probs_split, + const std::vector &vocabulary, + size_t beam_size, + size_t num_processes, + double cutoff_prob = 1.0, + size_t cutoff_top_n = 40, + Scorer *ext_scorer = nullptr, + size_t blank_id = 0); + +/** + * Store the root and prefixes for decoder + */ + +class CtcBeamSearchDecoderStorage { + public: + PathTrie *root = nullptr; + std::vector prefixes; + + CtcBeamSearchDecoderStorage() { + // init prefixes' root + this->root = new PathTrie(); + this->root->log_prob_b_prev = 0.0; + // The score of root is in log scale.Since the prob=1.0, the prob score + // in log scale is 0.0 + this->root->score = root->log_prob_b_prev; + // std::vector prefixes; + this->prefixes.push_back(root); + }; + + ~CtcBeamSearchDecoderStorage() { + if (root != nullptr) { + delete root; + root = nullptr; + } + }; +}; + +/** + * The ctc beam search decoder, support batchsize >= 1 + */ +class CtcBeamSearchDecoderBatch { + public: + CtcBeamSearchDecoderBatch(const std::vector &vocabulary, + size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id); + + ~CtcBeamSearchDecoderBatch(); + void next(const std::vector>> &probs_split, + const std::vector &has_value); + + std::vector>> decode(); + + void reset_state(size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n); + + private: + std::vector vocabulary; + size_t batch_size; + size_t beam_size; + size_t num_processes; + double cutoff_prob; + size_t cutoff_top_n; + Scorer *ext_scorer; + size_t blank_id; + std::vector> + decoder_storage_vector; +}; + +/** + * function for chunk decoding + */ +void ctc_beam_search_decode_chunk( + PathTrie *root, + std::vector 
&prefixes, + const std::vector> &probs_seq, + const std::vector &vocabulary, + size_t beam_size, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id); + +std::vector> get_decode_result( + std::vector &prefixes, + const std::vector &vocabulary, + size_t beam_size, + Scorer *ext_scorer); + +/** + * free the CtcBeamSearchDecoderStorage + */ +void free_storage(std::unique_ptr &storage); + +/** + * initialize the root + */ +void ctc_beam_search_decode_chunk_begin(PathTrie *root, Scorer *ext_scorer); + +#endif // CTC_BEAM_SEARCH_DECODER_H_ diff --git a/speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.cpp b/speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.cpp new file mode 100644 index 00000000000..6aa3c99647d --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.cpp @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ctc_greedy_decoder.h" +#include "decoder_utils.h" + +std::string ctc_greedy_decoding( + const std::vector> &probs_seq, + const std::vector &vocabulary, + size_t blank_id) { + // dimension check + size_t num_time_steps = probs_seq.size(); + for (size_t i = 0; i < num_time_steps; ++i) { + VALID_CHECK_EQ(probs_seq[i].size(), + vocabulary.size(), + "The shape of probs_seq does not match with " + "the shape of the vocabulary"); + } + + // size_t blank_id = vocabulary.size(); + + std::vector max_idx_vec(num_time_steps, 0); + std::vector idx_vec; + for (size_t i = 0; i < num_time_steps; ++i) { + double max_prob = 0.0; + size_t max_idx = 0; + const std::vector &probs_step = probs_seq[i]; + for (size_t j = 0; j < probs_step.size(); ++j) { + if (max_prob < probs_step[j]) { + max_idx = j; + max_prob = probs_step[j]; + } + } + // id with maximum probability in current time step + max_idx_vec[i] = max_idx; + // deduplicate + if ((i == 0) || ((i > 0) && max_idx_vec[i] != max_idx_vec[i - 1])) { + idx_vec.push_back(max_idx_vec[i]); + } + } + + std::string best_path_result; + for (size_t i = 0; i < idx_vec.size(); ++i) { + if (idx_vec[i] != blank_id) { + std::string ch = vocabulary[idx_vec[i]]; + best_path_result += (ch == kSPACE) ? tSPACE : ch; + } + } + return best_path_result; +} diff --git a/speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.h b/speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.h new file mode 100644 index 00000000000..4451600d629 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/ctc_greedy_decoder.h @@ -0,0 +1,35 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CTC_GREEDY_DECODER_H +#define CTC_GREEDY_DECODER_H + +#include +#include + +/* CTC Greedy (Best Path) Decoder + * + * Parameters: + * probs_seq: 2-D vector that each element is a vector of probabilities + * over vocabulary of one time step. + * vocabulary: A vector of vocabulary. + * Return: + * The decoding result in string + */ +std::string ctc_greedy_decoding( + const std::vector>& probs_seq, + const std::vector& vocabulary, + size_t blank_id); + +#endif // CTC_GREEDY_DECODER_H diff --git a/speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.cpp b/speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.cpp new file mode 100644 index 00000000000..c7ef65428e1 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.cpp @@ -0,0 +1,193 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "decoder_utils.h" + +#include +#include +#include + +std::vector> get_pruned_log_probs( + const std::vector &prob_step, + double cutoff_prob, + size_t cutoff_top_n) { + std::vector> prob_idx; + for (size_t i = 0; i < prob_step.size(); ++i) { + prob_idx.push_back(std::pair(i, prob_step[i])); + } + // pruning of vocabulary + size_t cutoff_len = prob_step.size(); + if (cutoff_prob < 1.0 || cutoff_top_n < cutoff_len) { + std::sort(prob_idx.begin(), + prob_idx.end(), + pair_comp_second_rev); + if (cutoff_prob < 1.0) { + double cum_prob = 0.0; + cutoff_len = 0; + for (size_t i = 0; i < prob_idx.size(); ++i) { + cum_prob += prob_idx[i].second; + cutoff_len += 1; + if (cum_prob >= cutoff_prob || cutoff_len >= cutoff_top_n) + break; + } + } + prob_idx = std::vector>( + prob_idx.begin(), prob_idx.begin() + cutoff_len); + } + std::vector> log_prob_idx; + for (size_t i = 0; i < cutoff_len; ++i) { + log_prob_idx.push_back(std::pair( + prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN))); + } + return log_prob_idx; +} + + +std::vector> get_beam_search_result( + const std::vector &prefixes, + const std::vector &vocabulary, + size_t beam_size) { + // allow for the post processing + std::vector space_prefixes; + if (space_prefixes.empty()) { + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + space_prefixes.push_back(prefixes[i]); + } + } + + std::sort(space_prefixes.begin(), space_prefixes.end(), prefix_compare); + std::vector> output_vecs; + for (size_t i = 0; i < beam_size && i < space_prefixes.size(); ++i) { + std::vector output; + space_prefixes[i]->get_path_vec(output); + // convert index to string + std::string output_str; + for (size_t j = 0; j < output.size(); j++) { + std::string ch = vocabulary[output[j]]; + output_str += (ch == kSPACE) ? 
tSPACE : ch; + } + std::pair output_pair( + -space_prefixes[i]->approx_ctc, output_str); + output_vecs.emplace_back(output_pair); + } + + return output_vecs; +} + +size_t get_utf8_str_len(const std::string &str) { + size_t str_len = 0; + for (char c : str) { + str_len += ((c & 0xc0) != 0x80); + } + return str_len; +} + +std::vector split_utf8_str(const std::string &str) { + std::vector result; + std::string out_str; + + for (char c : str) { + if ((c & 0xc0) != 0x80) // new UTF-8 character + { + if (!out_str.empty()) { + result.push_back(out_str); + out_str.clear(); + } + } + + out_str.append(1, c); + } + result.push_back(out_str); + return result; +} + +std::vector split_str(const std::string &s, + const std::string &delim) { + std::vector result; + std::size_t start = 0, delim_len = delim.size(); + while (true) { + std::size_t end = s.find(delim, start); + if (end == std::string::npos) { + if (start < s.size()) { + result.push_back(s.substr(start)); + } + break; + } + if (end > start) { + result.push_back(s.substr(start, end - start)); + } + start = end + delim_len; + } + return result; +} + +bool prefix_compare(const PathTrie *x, const PathTrie *y) { + if (x->score == y->score) { + if (x->character == y->character) { + return false; + } else { + return (x->character < y->character); + } + } else { + return x->score > y->score; + } +} + +void add_word_to_fst(const std::vector &word, + fst::StdVectorFst *dictionary) { + if (dictionary->NumStates() == 0) { + fst::StdVectorFst::StateId start = dictionary->AddState(); + assert(start == 0); + dictionary->SetStart(start); + } + fst::StdVectorFst::StateId src = dictionary->Start(); + fst::StdVectorFst::StateId dst; + for (auto c : word) { + dst = dictionary->AddState(); + dictionary->AddArc(src, fst::StdArc(c, c, 0, dst)); + src = dst; + } + dictionary->SetFinal(dst, fst::StdArc::Weight::One()); +} + +bool add_word_to_dictionary( + const std::string &word, + const std::unordered_map &char_map, + bool add_space, + int 
SPACE_ID, + fst::StdVectorFst *dictionary) { + auto characters = split_utf8_str(word); + + std::vector int_word; + + for (auto &c : characters) { + if (c == " ") { + int_word.push_back(SPACE_ID); + } else { + auto int_c = char_map.find(c); + if (int_c != char_map.end()) { + int_word.push_back(int_c->second); + } else { + return false; // return without adding + } + } + } + + if (add_space) { + int_word.push_back(SPACE_ID); + } + + add_word_to_fst(int_word, dictionary); + return true; // return with successful adding +} diff --git a/speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.h b/speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.h new file mode 100644 index 00000000000..0987415529a --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/decoder_utils.h @@ -0,0 +1,111 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef DECODER_UTILS_H_ +#define DECODER_UTILS_H_ + +#include +#include +#include "fst/log.h" +#include "path_trie.h" + +const std::string kSPACE = ""; +const std::string tSPACE = " "; +const float NUM_FLT_INF = std::numeric_limits::max(); +const float NUM_FLT_MIN = std::numeric_limits::min(); + +// inline function for validation check +inline void check( + bool x, const char *expr, const char *file, int line, const char *err) { + if (!x) { + std::cout << "[" << file << ":" << line << "] "; + LOG(FATAL) << "\"" << expr << "\" check failed. 
" << err; + } +} + +#define VALID_CHECK(x, info) \ + check(static_cast(x), #x, __FILE__, __LINE__, info) +#define VALID_CHECK_EQ(x, y, info) VALID_CHECK((x) == (y), info) +#define VALID_CHECK_GT(x, y, info) VALID_CHECK((x) > (y), info) +#define VALID_CHECK_LT(x, y, info) VALID_CHECK((x) < (y), info) + + +// Function template for comparing two pairs +template +bool pair_comp_first_rev(const std::pair &a, + const std::pair &b) { + return a.first > b.first; +} + +// Function template for comparing two pairs +template +bool pair_comp_second_rev(const std::pair &a, + const std::pair &b) { + return a.second > b.second; +} + +// Return the sum of two probabilities in log scale +template +T log_sum_exp(const T &x, const T &y) { + static T num_min = -std::numeric_limits::max(); + if (x <= num_min) return y; + if (y <= num_min) return x; + T xmax = std::max(x, y); + return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; +} + +// Get pruned probability vector for each time step's beam search +std::vector> get_pruned_log_probs( + const std::vector &prob_step, + double cutoff_prob, + size_t cutoff_top_n); + +// Get beam search result from prefixes in trie tree +std::vector> get_beam_search_result( + const std::vector &prefixes, + const std::vector &vocabulary, + size_t beam_size); + +// Functor for prefix comparsion +bool prefix_compare(const PathTrie *x, const PathTrie *y); + +/* Get length of utf8 encoding string + * See: http://stackoverflow.com/a/4063229 + */ +size_t get_utf8_str_len(const std::string &str); + +/* Split a string into a list of strings on a given string + * delimiter. NB: delimiters on beginning / end of string are + * trimmed. Eg, "FooBarFoo" split on "Foo" returns ["Bar"]. 
+ */ +std::vector split_str(const std::string &s, + const std::string &delim); + +/* Splits string into vector of strings representing + * UTF-8 characters (not same as chars) + */ +std::vector split_utf8_str(const std::string &str); + +// Add a word in index to the dicionary of fst +void add_word_to_fst(const std::vector &word, + fst::StdVectorFst *dictionary); + +// Add a word in string to dictionary +bool add_word_to_dictionary( + const std::string &word, + const std::unordered_map &char_map, + bool add_space, + int SPACE_ID, + fst::StdVectorFst *dictionary); +#endif // DECODER_UTILS_H diff --git a/speechx/speechx/asr/decoder/ctc_decoders/decoders.i b/speechx/speechx/asr/decoder/ctc_decoders/decoders.i new file mode 100644 index 00000000000..8fe3b279f59 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/decoders.i @@ -0,0 +1,33 @@ +%module paddlespeech_ctcdecoders +%{ +#include "scorer.h" +#include "ctc_greedy_decoder.h" +#include "ctc_beam_search_decoder.h" +#include "decoder_utils.h" +%} + +%include "std_vector.i" +%include "std_pair.i" +%include "std_string.i" +%import "decoder_utils.h" + +namespace std { + %template(DoubleVector) std::vector; + %template(IntVector) std::vector; + %template(StringVector) std::vector; + %template(VectorOfStructVector) std::vector >; + %template(FloatVector) std::vector; + %template(Pair) std::pair; + %template(PairFloatStringVector) std::vector >; + %template(PairDoubleStringVector) std::vector >; + %template(PairDoubleStringVector2) std::vector > >; + %template(DoubleVector3) std::vector > >; +} + +%template(IntDoublePairCompSecondRev) pair_comp_second_rev; +%template(StringDoublePairCompSecondRev) pair_comp_second_rev; +%template(DoubleStringPairCompFirstRev) pair_comp_first_rev; + +%include "scorer.h" +%include "ctc_greedy_decoder.h" +%include "ctc_beam_search_decoder.h" diff --git a/speechx/speechx/asr/decoder/ctc_decoders/path_trie.cpp b/speechx/speechx/asr/decoder/ctc_decoders/path_trie.cpp new file mode 100644 
index 00000000000..777ca05201d --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/path_trie.cpp @@ -0,0 +1,164 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "path_trie.h" + +#include +#include +#include +#include +#include + +#include "decoder_utils.h" + +PathTrie::PathTrie() { + log_prob_b_prev = -NUM_FLT_INF; + log_prob_nb_prev = -NUM_FLT_INF; + log_prob_b_cur = -NUM_FLT_INF; + log_prob_nb_cur = -NUM_FLT_INF; + score = -NUM_FLT_INF; + + ROOT_ = -1; + character = ROOT_; + exists_ = true; + parent = nullptr; + + dictionary_ = nullptr; + dictionary_state_ = 0; + has_dictionary_ = false; + + matcher_ = nullptr; +} + +PathTrie::~PathTrie() { + for (auto child : children_) { + delete child.second; + child.second = nullptr; + } +} + +PathTrie* PathTrie::get_path_trie(int new_char, bool reset) { + auto child = children_.begin(); + for (child = children_.begin(); child != children_.end(); ++child) { + if (child->first == new_char) { + break; + } + } + if (child != children_.end()) { + if (!child->second->exists_) { + child->second->exists_ = true; + child->second->log_prob_b_prev = -NUM_FLT_INF; + child->second->log_prob_nb_prev = -NUM_FLT_INF; + child->second->log_prob_b_cur = -NUM_FLT_INF; + child->second->log_prob_nb_cur = -NUM_FLT_INF; + } + return (child->second); + } else { + if (has_dictionary_) { + matcher_->SetState(dictionary_state_); + bool found = 
matcher_->Find(new_char + 1); + if (!found) { + // Adding this character causes word outside dictionary + auto FSTZERO = fst::TropicalWeight::Zero(); + auto final_weight = dictionary_->Final(dictionary_state_); + bool is_final = (final_weight != FSTZERO); + if (is_final && reset) { + dictionary_state_ = dictionary_->Start(); + } + return nullptr; + } else { + PathTrie* new_path = new PathTrie; + new_path->character = new_char; + new_path->parent = this; + new_path->dictionary_ = dictionary_; + new_path->dictionary_state_ = matcher_->Value().nextstate; + new_path->has_dictionary_ = true; + new_path->matcher_ = matcher_; + children_.push_back(std::make_pair(new_char, new_path)); + return new_path; + } + } else { + PathTrie* new_path = new PathTrie; + new_path->character = new_char; + new_path->parent = this; + children_.push_back(std::make_pair(new_char, new_path)); + return new_path; + } + } +} + +PathTrie* PathTrie::get_path_vec(std::vector& output) { + return get_path_vec(output, ROOT_); +} + +PathTrie* PathTrie::get_path_vec(std::vector& output, + int stop, + size_t max_steps) { + if (character == stop || character == ROOT_ || output.size() == max_steps) { + std::reverse(output.begin(), output.end()); + return this; + } else { + output.push_back(character); + return parent->get_path_vec(output, stop, max_steps); + } +} + +void PathTrie::iterate_to_vec(std::vector& output) { + if (exists_) { + log_prob_b_prev = log_prob_b_cur; + log_prob_nb_prev = log_prob_nb_cur; + + log_prob_b_cur = -NUM_FLT_INF; + log_prob_nb_cur = -NUM_FLT_INF; + + score = log_sum_exp(log_prob_b_prev, log_prob_nb_prev); + output.push_back(this); + } + for (auto child : children_) { + child.second->iterate_to_vec(output); + } +} + +void PathTrie::remove() { + exists_ = false; + if (children_.size() == 0) { + if (parent != nullptr) { + auto child = parent->children_.begin(); + for (child = parent->children_.begin(); + child != parent->children_.end(); + ++child) { + if (child->first == 
character) { + parent->children_.erase(child); + break; + } + } + if (parent->children_.size() == 0 && !parent->exists_) { + parent->remove(); + } + } + delete this; + } +} + + +void PathTrie::set_dictionary(fst::StdVectorFst* dictionary) { + dictionary_ = dictionary; + dictionary_state_ = dictionary->Start(); + has_dictionary_ = true; +} + +using FSTMATCH = fst::SortedMatcher; +void PathTrie::set_matcher(std::shared_ptr matcher) { + matcher_ = matcher; +} diff --git a/speechx/speechx/asr/decoder/ctc_decoders/path_trie.h b/speechx/speechx/asr/decoder/ctc_decoders/path_trie.h new file mode 100644 index 00000000000..5193e0a47e6 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/path_trie.h @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PATH_TRIE_H +#define PATH_TRIE_H + +#include +#include +#include +#include +#include + +#include "fst/fstlib.h" + +/* Trie tree for prefix storing and manipulating, with a dictionary in + * finite-state transducer for spelling correction. 
+ */ +class PathTrie { + public: + PathTrie(); + ~PathTrie(); + + // get new prefix after appending new char + PathTrie* get_path_trie(int new_char, bool reset = true); + + // get the prefix in index from root to current node + PathTrie* get_path_vec(std::vector& output); + + // get the prefix in index from some stop node to current nodel + PathTrie* get_path_vec( + std::vector& output, + int stop, + size_t max_steps = std::numeric_limits::max()); + + // update log probs + void iterate_to_vec(std::vector& output); + + // set dictionary for FST + void set_dictionary(fst::StdVectorFst* dictionary); + + void set_matcher(std::shared_ptr>); + + bool is_empty() { return ROOT_ == character; } + + // remove current path from root + void remove(); + + float log_prob_b_prev; + float log_prob_nb_prev; + float log_prob_b_cur; + float log_prob_nb_cur; + float score; + float approx_ctc; + int character; + PathTrie* parent; + + private: + int ROOT_; + bool exists_; + bool has_dictionary_; + + std::vector> children_; + + // pointer to dictionary of FST + fst::StdVectorFst* dictionary_; + fst::StdVectorFst::StateId dictionary_state_; + // true if finding ars in FST + std::shared_ptr> matcher_; +}; + +#endif // PATH_TRIE_H diff --git a/speechx/speechx/asr/decoder/ctc_decoders/scorer.cpp b/speechx/speechx/asr/decoder/ctc_decoders/scorer.cpp new file mode 100644 index 00000000000..6e7f68cf6ba --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/scorer.cpp @@ -0,0 +1,232 @@ +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the +// "COPYING.LESSER.3"); + +#include "scorer.h" + +#include +#include + +#include "lm/config.hh" +#include "lm/model.hh" +#include "lm/state.hh" + +#include "decoder_utils.h" + +using namespace lm::ngram; +// if your platform is windows ,you need add the define +#define F_OK 0 +Scorer::Scorer(double alpha, + double beta, + const std::string& lm_path, + const std::vector& vocab_list) { + this->alpha = alpha; + this->beta = beta; + 
+ dictionary = nullptr; + is_character_based_ = true; + language_model_ = nullptr; + + max_order_ = 0; + dict_size_ = 0; + SPACE_ID_ = -1; + + setup(lm_path, vocab_list); +} + +Scorer::~Scorer() { + if (language_model_ != nullptr) { + delete static_cast(language_model_); + } + if (dictionary != nullptr) { + delete static_cast(dictionary); + } +} + +void Scorer::setup(const std::string& lm_path, + const std::vector& vocab_list) { + // load language model + load_lm(lm_path); + // set char map for scorer + set_char_map(vocab_list); + // fill the dictionary for FST + if (!is_character_based()) { + fill_dictionary(true); + } +} + +void Scorer::load_lm(const std::string& lm_path) { + const char* filename = lm_path.c_str(); + VALID_CHECK_EQ(access(filename, F_OK), 0, "Invalid language model path"); + + RetriveStrEnumerateVocab enumerate; + lm::ngram::Config config; + config.enumerate_vocab = &enumerate; + language_model_ = lm::ngram::LoadVirtual(filename, config); + max_order_ = static_cast(language_model_)->Order(); + vocabulary_ = enumerate.vocabulary; + for (size_t i = 0; i < vocabulary_.size(); ++i) { + if (is_character_based_ && vocabulary_[i] != UNK_TOKEN && + vocabulary_[i] != START_TOKEN && vocabulary_[i] != END_TOKEN && + get_utf8_str_len(enumerate.vocabulary[i]) > 1) { + is_character_based_ = false; + } + } +} + +double Scorer::get_log_cond_prob(const std::vector& words) { + lm::base::Model* model = static_cast(language_model_); + double cond_prob; + lm::ngram::State state, tmp_state, out_state; + // avoid to inserting in begin + model->NullContextWrite(&state); + for (size_t i = 0; i < words.size(); ++i) { + lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]); + // encounter OOV + if (word_index == 0) { + return OOV_SCORE; + } + cond_prob = model->BaseScore(&state, word_index, &out_state); + tmp_state = state; + state = out_state; + out_state = tmp_state; + } + // return log10 prob + return cond_prob; +} + +double Scorer::get_sent_log_prob(const 
std::vector& words) { + std::vector sentence; + if (words.size() == 0) { + for (size_t i = 0; i < max_order_; ++i) { + sentence.push_back(START_TOKEN); + } + } else { + for (size_t i = 0; i < max_order_ - 1; ++i) { + sentence.push_back(START_TOKEN); + } + sentence.insert(sentence.end(), words.begin(), words.end()); + } + sentence.push_back(END_TOKEN); + return get_log_prob(sentence); +} + +double Scorer::get_log_prob(const std::vector& words) { + assert(words.size() > max_order_); + double score = 0.0; + for (size_t i = 0; i < words.size() - max_order_ + 1; ++i) { + std::vector ngram(words.begin() + i, + words.begin() + i + max_order_); + score += get_log_cond_prob(ngram); + } + return score; +} + +void Scorer::reset_params(float alpha, float beta) { + this->alpha = alpha; + this->beta = beta; +} + +std::string Scorer::vec2str(const std::vector& input) { + std::string word; + for (auto ind : input) { + word += char_list_[ind]; + } + return word; +} + +std::vector Scorer::split_labels(const std::vector& labels) { + if (labels.empty()) return {}; + + std::string s = vec2str(labels); + std::vector words; + if (is_character_based_) { + words = split_utf8_str(s); + } else { + words = split_str(s, " "); + } + return words; +} + +void Scorer::set_char_map(const std::vector& char_list) { + char_list_ = char_list; + char_map_.clear(); + + // Set the char map for the FST for spelling correction + for (size_t i = 0; i < char_list_.size(); i++) { + if (char_list_[i] == kSPACE) { + SPACE_ID_ = i; + } + // The initial state of FST is state 0, hence the index of chars in + // the FST should start from 1 to avoid the conflict with the initial + // state, otherwise wrong decoding results would be given. 
+ char_map_[char_list_[i]] = i + 1; + } +} + +std::vector Scorer::make_ngram(PathTrie* prefix) { + std::vector ngram; + PathTrie* current_node = prefix; + PathTrie* new_node = nullptr; + + for (int order = 0; order < max_order_; order++) { + std::vector prefix_vec; + + if (is_character_based_) { + new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_, 1); + current_node = new_node; + } else { + new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_); + current_node = new_node->parent; // Skipping spaces + } + + // reconstruct word + std::string word = vec2str(prefix_vec); + ngram.push_back(word); + + if (new_node->character == -1) { + // No more spaces, but still need order + for (int i = 0; i < max_order_ - order - 1; i++) { + ngram.push_back(START_TOKEN); + } + break; + } + } + std::reverse(ngram.begin(), ngram.end()); + return ngram; +} + +void Scorer::fill_dictionary(bool add_space) { + fst::StdVectorFst dictionary; + // For each unigram convert to ints and put in trie + int dict_size = 0; + for (const auto& word : vocabulary_) { + bool added = add_word_to_dictionary( + word, char_map_, add_space, SPACE_ID_ + 1, &dictionary); + dict_size += added ? 1 : 0; + } + + dict_size_ = dict_size; + + /* Simplify FST + + * This gets rid of "epsilon" transitions in the FST. + * These are transitions that don't require a string input to be taken. + * Getting rid of them is necessary to make the FST deterministic, but + * can greatly increase the size of the FST + */ + fst::RmEpsilon(&dictionary); + fst::StdVectorFst* new_dict = new fst::StdVectorFst; + + /* This makes the FST deterministic, meaning for any string input there's + * only one possible state the FST could be in. It is assumed our + * dictionary is deterministic when using it. + * (lest we'd have to check for multiple transitions at each state) + */ + fst::Determinize(dictionary, new_dict); + + /* Finds the simplest equivalent fst. 
This is unnecessary but decreases + * memory usage of the dictionary + */ + fst::Minimize(new_dict); + this->dictionary = new_dict; +} diff --git a/speechx/speechx/asr/decoder/ctc_decoders/scorer.h b/speechx/speechx/asr/decoder/ctc_decoders/scorer.h new file mode 100644 index 00000000000..08e109b78e3 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/scorer.h @@ -0,0 +1,114 @@ +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the +// "COPYING.LESSER.3"); + +#ifndef SCORER_H_ +#define SCORER_H_ + +#include +#include +#include +#include + +#include "lm/enumerate_vocab.hh" +#include "lm/virtual_interface.hh" +#include "lm/word_index.hh" + +#include "path_trie.h" + +const double OOV_SCORE = -1000.0; +const std::string START_TOKEN = ""; +const std::string UNK_TOKEN = ""; +const std::string END_TOKEN = ""; + +// Implement a callback to retrive the dictionary of language model. +class RetriveStrEnumerateVocab : public lm::EnumerateVocab { + public: + RetriveStrEnumerateVocab() {} + + void Add(lm::WordIndex index, const StringPiece &str) { + vocabulary.push_back(std::string(str.data(), str.length())); + } + + std::vector vocabulary; +}; + +/* External scorer to query score for n-gram or sentence, including language + * model scoring and word insertion. 
+ * + * Example: + * Scorer scorer(alpha, beta, "path_of_language_model"); + * scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" }); + * scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" }); + */ +class Scorer { + public: + Scorer(double alpha, + double beta, + const std::string &lm_path, + const std::vector &vocabulary); + ~Scorer(); + + double get_log_cond_prob(const std::vector &words); + + double get_sent_log_prob(const std::vector &words); + + // return the max order + size_t get_max_order() const { return max_order_; } + + // return the dictionary size of language model + size_t get_dict_size() const { return dict_size_; } + + // retrun true if the language model is character based + bool is_character_based() const { return is_character_based_; } + + // reset params alpha & beta + void reset_params(float alpha, float beta); + + // make ngram for a given prefix + std::vector make_ngram(PathTrie *prefix); + + // trransform the labels in index to the vector of words (word based lm) or + // the vector of characters (character based lm) + std::vector split_labels(const std::vector &labels); + + // language model weight + double alpha; + // word insertion weight + double beta; + + // pointer to the dictionary of FST + void *dictionary; + + protected: + // necessary setup: load language model, set char map, fill FST's dictionary + void setup(const std::string &lm_path, + const std::vector &vocab_list); + + // load language model from given path + void load_lm(const std::string &lm_path); + + // fill dictionary for FST + void fill_dictionary(bool add_space); + + // set char map + void set_char_map(const std::vector &char_list); + + double get_log_prob(const std::vector &words); + + // translate the vector in index to string + std::string vec2str(const std::vector &input); + + private: + void *language_model_; + bool is_character_based_; + size_t max_order_; + size_t dict_size_; + + int SPACE_ID_; + std::vector char_list_; + std::unordered_map char_map_; + + 
std::vector vocabulary_; +}; + +#endif // SCORER_H_ diff --git a/speechx/speechx/asr/decoder/ctc_decoders/setup.py b/speechx/speechx/asr/decoder/ctc_decoders/setup.py new file mode 100644 index 00000000000..9a8b292a07b --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/setup.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Script to build and install decoder package.""" +import argparse +import glob +import multiprocessing.pool +import os +import platform +import sys + +from setuptools import distutils +from setuptools import Extension +from setuptools import setup + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--num_processes", + default=1, + type=int, + help="Number of cpu processes to build package. 
(default: %(default)d)") +args = parser.parse_known_args() + +# reconstruct sys.argv to pass to setup below +sys.argv = [sys.argv[0]] + args[1] + + +# monkey-patch for parallel compilation +# See: https://stackoverflow.com/a/13176803 +def parallelCCompile(self, + sources, + output_dir=None, + macros=None, + include_dirs=None, + debug=0, + extra_preargs=None, + extra_postargs=None, + depends=None): + # those lines are copied from distutils.ccompiler.CCompiler directly + macros, objects, extra_postargs, pp_opts, build = self._setup_compile( + output_dir, macros, include_dirs, sources, depends, extra_postargs) + cc_args = self._get_cc_args(pp_opts, debug, extra_preargs) + + # parallel code + def _single_compile(obj): + try: + src, ext = build[obj] + except KeyError: + return + self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts) + + # convert to list, imap is evaluated on-demand + thread_pool = multiprocessing.pool.ThreadPool(args[0].num_processes) + list(thread_pool.imap(_single_compile, objects)) + return objects + + +def compile_test(header, library): + dummy_path = os.path.join(os.path.dirname(__file__), "dummy") + command = "bash -c \"g++ -include " + header \ + + " -l" + library + " -x c++ - <<<'int main() {}' -o " \ + + dummy_path + " >/dev/null 2>/dev/null && rm " \ + + dummy_path + " 2>/dev/null\"" + return os.system(command) == 0 + + +# hack compile to support parallel compiling +distutils.ccompiler.CCompiler.compile = parallelCCompile + +FILES = glob.glob('kenlm/util/*.cc') \ + + glob.glob('kenlm/lm/*.cc') \ + + glob.glob('kenlm/util/double-conversion/*.cc') + +FILES += glob.glob('openfst-1.6.3/src/lib/*.cc') + +# yapf: disable +FILES = [ + fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc') + or fn.endswith('unittest.cc')) +] +# yapf: enable +LIBS = ['stdc++'] +if platform.system() != 'Darwin': + LIBS.append('rt') +if platform.system() == 'Windows': + LIBS = ['-static-libstdc++'] + +ARGS = ['-O3', '-DNDEBUG', 
'-DKENLM_MAX_ORDER=6', '-std=c++11'] + +if compile_test('zlib.h', 'z'): + ARGS.append('-DHAVE_ZLIB') + LIBS.append('z') + +if compile_test('bzlib.h', 'bz2'): + ARGS.append('-DHAVE_BZLIB') + LIBS.append('bz2') + +if compile_test('lzma.h', 'lzma'): + ARGS.append('-DHAVE_XZLIB') + LIBS.append('lzma') + +os.system('swig -python -c++ ./decoders.i') + +decoders_module = [ + Extension( + name='_paddlespeech_ctcdecoders', + sources=FILES + glob.glob('*.cxx') + glob.glob('*.cpp'), + language='c++', + include_dirs=[ + '.', + 'kenlm', + 'openfst-1.6.3/src/include', + 'ThreadPool', + ], + libraries=LIBS, + extra_compile_args=ARGS) +] + +setup( + name='paddlespeech_ctcdecoders', + version='0.2.0', + description="CTC decoders in paddlespeech", + author="PaddlePaddle Speech and Language Team", + author_email="paddlesl@baidu.com", + url="https://github.com/PaddlePaddle/PaddleSpeech", + license='Apache 2.0, GNU Lesser General Public License v3 (LGPLv3) (LGPL-3)', + ext_modules=decoders_module, + py_modules=['paddlespeech_ctcdecoders']) diff --git a/speechx/speechx/asr/decoder/ctc_decoders/setup.sh b/speechx/speechx/asr/decoder/ctc_decoders/setup.sh new file mode 100755 index 00000000000..302c5550250 --- /dev/null +++ b/speechx/speechx/asr/decoder/ctc_decoders/setup.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +if [ ! -d kenlm ]; then + git clone https://github.com/kpu/kenlm.git + cd kenlm/ + git checkout df2d717e95183f79a90b2fa6e4307083a351ca6a + cd .. + echo -e "\n" +fi + +if [ ! -d openfst-1.6.3 ]; then + echo "Download and extract openfst ..." + wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.3.tar.gz --no-check-certificate + tar -xzvf openfst-1.6.3.tar.gz + echo -e "\n" +fi + +if [ ! -d ThreadPool ]; then + git clone https://github.com/progschj/ThreadPool.git + echo -e "\n" +fi + +echo "Install decoders ..." 
+python3 setup.py install --num_processes 4 diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.cc similarity index 99% rename from speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc rename to speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.cc index 07e8e5608b9..15dbd7e91d3 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.cc @@ -84,7 +84,7 @@ void CTCPrefixBeamSearch::AdvanceDecode( timer.Reset(); std::vector> likelihood; - likelihood.push_back(frame_prob); + likelihood.push_back(std::move(frame_prob)); AdvanceDecoding(likelihood); search_cost += timer.Elapsed(); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.h similarity index 100% rename from speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h rename to speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.h diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc similarity index 100% rename from speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc rename to speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_score.h similarity index 100% rename from speechx/speechx/decoder/ctc_prefix_beam_search_score.h rename to speechx/speechx/asr/decoder/ctc_prefix_beam_search_score.h diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/asr/decoder/ctc_tlg_decoder.cc similarity index 100% rename from speechx/speechx/decoder/ctc_tlg_decoder.cc rename to speechx/speechx/asr/decoder/ctc_tlg_decoder.cc diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/asr/decoder/ctc_tlg_decoder.h 
similarity index 100% rename from speechx/speechx/decoder/ctc_tlg_decoder.h rename to speechx/speechx/asr/decoder/ctc_tlg_decoder.h diff --git a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc b/speechx/speechx/asr/decoder/ctc_tlg_decoder_main.cc similarity index 100% rename from speechx/speechx/decoder/ctc_tlg_decoder_main.cc rename to speechx/speechx/asr/decoder/ctc_tlg_decoder_main.cc diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/asr/decoder/decoder_itf.h similarity index 100% rename from speechx/speechx/decoder/decoder_itf.h rename to speechx/speechx/asr/decoder/decoder_itf.h diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc b/speechx/speechx/asr/decoder/nnet_logprob_decoder_main.cc similarity index 100% rename from speechx/speechx/decoder/nnet_logprob_decoder_main.cc rename to speechx/speechx/asr/decoder/nnet_logprob_decoder_main.cc diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/asr/decoder/param.h similarity index 100% rename from speechx/speechx/decoder/param.h rename to speechx/speechx/asr/decoder/param.h diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/asr/nnet/CMakeLists.txt similarity index 100% rename from speechx/speechx/nnet/CMakeLists.txt rename to speechx/speechx/asr/nnet/CMakeLists.txt diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/asr/nnet/decodable.cc similarity index 100% rename from speechx/speechx/nnet/decodable.cc rename to speechx/speechx/asr/nnet/decodable.cc diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/asr/nnet/decodable.h similarity index 100% rename from speechx/speechx/nnet/decodable.h rename to speechx/speechx/asr/nnet/decodable.h diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/asr/nnet/ds2_nnet.cc similarity index 100% rename from speechx/speechx/nnet/ds2_nnet.cc rename to speechx/speechx/asr/nnet/ds2_nnet.cc diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/asr/nnet/ds2_nnet.h similarity index 
100% rename from speechx/speechx/nnet/ds2_nnet.h rename to speechx/speechx/asr/nnet/ds2_nnet.h diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/asr/nnet/ds2_nnet_main.cc similarity index 100% rename from speechx/speechx/nnet/ds2_nnet_main.cc rename to speechx/speechx/asr/nnet/ds2_nnet_main.cc diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/asr/nnet/nnet_itf.h similarity index 100% rename from speechx/speechx/nnet/nnet_itf.h rename to speechx/speechx/asr/nnet/nnet_itf.h diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/asr/nnet/u2_nnet.cc similarity index 100% rename from speechx/speechx/nnet/u2_nnet.cc rename to speechx/speechx/asr/nnet/u2_nnet.cc diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/asr/nnet/u2_nnet.h similarity index 100% rename from speechx/speechx/nnet/u2_nnet.h rename to speechx/speechx/asr/nnet/u2_nnet.h diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/asr/nnet/u2_nnet_main.cc similarity index 100% rename from speechx/speechx/nnet/u2_nnet_main.cc rename to speechx/speechx/asr/nnet/u2_nnet_main.cc diff --git a/speechx/speechx/recognizer/CMakeLists.txt b/speechx/speechx/asr/recognizer/CMakeLists.txt similarity index 100% rename from speechx/speechx/recognizer/CMakeLists.txt rename to speechx/speechx/asr/recognizer/CMakeLists.txt diff --git a/speechx/speechx/recognizer/recognizer.cc b/speechx/speechx/asr/recognizer/recognizer.cc similarity index 100% rename from speechx/speechx/recognizer/recognizer.cc rename to speechx/speechx/asr/recognizer/recognizer.cc diff --git a/speechx/speechx/recognizer/recognizer.h b/speechx/speechx/asr/recognizer/recognizer.h similarity index 100% rename from speechx/speechx/recognizer/recognizer.h rename to speechx/speechx/asr/recognizer/recognizer.h diff --git a/speechx/speechx/recognizer/recognizer_main.cc b/speechx/speechx/asr/recognizer/recognizer_main.cc similarity index 100% rename from speechx/speechx/recognizer/recognizer_main.cc 
rename to speechx/speechx/asr/recognizer/recognizer_main.cc diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/asr/recognizer/u2_recognizer.cc similarity index 100% rename from speechx/speechx/recognizer/u2_recognizer.cc rename to speechx/speechx/asr/recognizer/u2_recognizer.cc diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/asr/recognizer/u2_recognizer.h similarity index 100% rename from speechx/speechx/recognizer/u2_recognizer.h rename to speechx/speechx/asr/recognizer/u2_recognizer.h diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/asr/recognizer/u2_recognizer_main.cc similarity index 100% rename from speechx/speechx/recognizer/u2_recognizer_main.cc rename to speechx/speechx/asr/recognizer/u2_recognizer_main.cc diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/asr/server/CMakeLists.txt similarity index 100% rename from speechx/speechx/protocol/CMakeLists.txt rename to speechx/speechx/asr/server/CMakeLists.txt diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/asr/server/websocket/CMakeLists.txt similarity index 100% rename from speechx/speechx/protocol/websocket/CMakeLists.txt rename to speechx/speechx/asr/server/websocket/CMakeLists.txt diff --git a/speechx/speechx/protocol/websocket/websocket_client.cc b/speechx/speechx/asr/server/websocket/websocket_client.cc similarity index 100% rename from speechx/speechx/protocol/websocket/websocket_client.cc rename to speechx/speechx/asr/server/websocket/websocket_client.cc diff --git a/speechx/speechx/protocol/websocket/websocket_client.h b/speechx/speechx/asr/server/websocket/websocket_client.h similarity index 100% rename from speechx/speechx/protocol/websocket/websocket_client.h rename to speechx/speechx/asr/server/websocket/websocket_client.h diff --git a/speechx/speechx/protocol/websocket/websocket_client_main.cc b/speechx/speechx/asr/server/websocket/websocket_client_main.cc 
similarity index 100% rename from speechx/speechx/protocol/websocket/websocket_client_main.cc rename to speechx/speechx/asr/server/websocket/websocket_client_main.cc diff --git a/speechx/speechx/protocol/websocket/websocket_server.cc b/speechx/speechx/asr/server/websocket/websocket_server.cc similarity index 100% rename from speechx/speechx/protocol/websocket/websocket_server.cc rename to speechx/speechx/asr/server/websocket/websocket_server.cc diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/asr/server/websocket/websocket_server.h similarity index 100% rename from speechx/speechx/protocol/websocket/websocket_server.h rename to speechx/speechx/asr/server/websocket/websocket_server.h diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/asr/server/websocket/websocket_server_main.cc similarity index 100% rename from speechx/speechx/protocol/websocket/websocket_server_main.cc rename to speechx/speechx/asr/server/websocket/websocket_server_main.cc diff --git a/speechx/speechx/common/CMakeLists.txt b/speechx/speechx/common/CMakeLists.txt new file mode 100644 index 00000000000..dea9eb05df9 --- /dev/null +++ b/speechx/speechx/common/CMakeLists.txt @@ -0,0 +1,16 @@ +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/base +) + +include_directories( +${CMAKE_CURRENT_SOURCE_DIR}/../ +${CMAKE_CURRENT_SOURCE_DIR}/utils +) +add_subdirectory(utils) + +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/frontend +) +add_subdirectory(frontend) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/common/base/basic_types.h similarity index 100% rename from speechx/speechx/base/basic_types.h rename to speechx/speechx/common/base/basic_types.h diff --git a/speechx/speechx/base/common.h b/speechx/speechx/common/base/common.h similarity index 100% rename from speechx/speechx/base/common.h rename to speechx/speechx/common/base/common.h diff --git 
a/speechx/speechx/base/flags.h b/speechx/speechx/common/base/flags.h similarity index 100% rename from speechx/speechx/base/flags.h rename to speechx/speechx/common/base/flags.h diff --git a/speechx/speechx/base/log.h b/speechx/speechx/common/base/log.h similarity index 100% rename from speechx/speechx/base/log.h rename to speechx/speechx/common/base/log.h diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/common/base/macros.h similarity index 100% rename from speechx/speechx/base/macros.h rename to speechx/speechx/common/base/macros.h diff --git a/speechx/speechx/base/thread_pool.h b/speechx/speechx/common/base/thread_pool.h similarity index 100% rename from speechx/speechx/base/thread_pool.h rename to speechx/speechx/common/base/thread_pool.h diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/common/frontend/CMakeLists.txt similarity index 100% rename from speechx/speechx/frontend/CMakeLists.txt rename to speechx/speechx/common/frontend/CMakeLists.txt diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/common/frontend/audio/CMakeLists.txt similarity index 100% rename from speechx/speechx/frontend/audio/CMakeLists.txt rename to speechx/speechx/common/frontend/audio/CMakeLists.txt diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/common/frontend/audio/assembler.cc similarity index 100% rename from speechx/speechx/frontend/audio/assembler.cc rename to speechx/speechx/common/frontend/audio/assembler.cc diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/common/frontend/audio/assembler.h similarity index 100% rename from speechx/speechx/frontend/audio/assembler.h rename to speechx/speechx/common/frontend/audio/assembler.h diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/common/frontend/audio/audio_cache.cc similarity index 100% rename from speechx/speechx/frontend/audio/audio_cache.cc rename to 
speechx/speechx/common/frontend/audio/audio_cache.cc diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/common/frontend/audio/audio_cache.h similarity index 100% rename from speechx/speechx/frontend/audio/audio_cache.h rename to speechx/speechx/common/frontend/audio/audio_cache.h diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/common/frontend/audio/cmvn.cc similarity index 100% rename from speechx/speechx/frontend/audio/cmvn.cc rename to speechx/speechx/common/frontend/audio/cmvn.cc diff --git a/speechx/speechx/frontend/audio/cmvn.h b/speechx/speechx/common/frontend/audio/cmvn.h similarity index 100% rename from speechx/speechx/frontend/audio/cmvn.h rename to speechx/speechx/common/frontend/audio/cmvn.h diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/common/frontend/audio/cmvn_json2kaldi_main.cc similarity index 100% rename from speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc rename to speechx/speechx/common/frontend/audio/cmvn_json2kaldi_main.cc diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/common/frontend/audio/compute_fbank_main.cc similarity index 100% rename from speechx/speechx/frontend/audio/compute_fbank_main.cc rename to speechx/speechx/common/frontend/audio/compute_fbank_main.cc diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/common/frontend/audio/compute_linear_spectrogram_main.cc similarity index 100% rename from speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc rename to speechx/speechx/common/frontend/audio/compute_linear_spectrogram_main.cc diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/common/frontend/audio/data_cache.h similarity index 100% rename from speechx/speechx/frontend/audio/data_cache.h rename to speechx/speechx/common/frontend/audio/data_cache.h diff --git a/speechx/speechx/frontend/audio/db_norm.cc 
b/speechx/speechx/common/frontend/audio/db_norm.cc similarity index 100% rename from speechx/speechx/frontend/audio/db_norm.cc rename to speechx/speechx/common/frontend/audio/db_norm.cc diff --git a/speechx/speechx/frontend/audio/db_norm.h b/speechx/speechx/common/frontend/audio/db_norm.h similarity index 100% rename from speechx/speechx/frontend/audio/db_norm.h rename to speechx/speechx/common/frontend/audio/db_norm.h diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/common/frontend/audio/fbank.cc similarity index 100% rename from speechx/speechx/frontend/audio/fbank.cc rename to speechx/speechx/common/frontend/audio/fbank.cc diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/common/frontend/audio/fbank.h similarity index 100% rename from speechx/speechx/frontend/audio/fbank.h rename to speechx/speechx/common/frontend/audio/fbank.h diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/common/frontend/audio/feature_cache.cc similarity index 100% rename from speechx/speechx/frontend/audio/feature_cache.cc rename to speechx/speechx/common/frontend/audio/feature_cache.cc diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/common/frontend/audio/feature_cache.h similarity index 100% rename from speechx/speechx/frontend/audio/feature_cache.h rename to speechx/speechx/common/frontend/audio/feature_cache.h diff --git a/speechx/speechx/frontend/audio/feature_common.h b/speechx/speechx/common/frontend/audio/feature_common.h similarity index 100% rename from speechx/speechx/frontend/audio/feature_common.h rename to speechx/speechx/common/frontend/audio/feature_common.h diff --git a/speechx/speechx/frontend/audio/feature_common_inl.h b/speechx/speechx/common/frontend/audio/feature_common_inl.h similarity index 100% rename from speechx/speechx/frontend/audio/feature_common_inl.h rename to speechx/speechx/common/frontend/audio/feature_common_inl.h diff --git 
a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/common/frontend/audio/feature_pipeline.cc similarity index 100% rename from speechx/speechx/frontend/audio/feature_pipeline.cc rename to speechx/speechx/common/frontend/audio/feature_pipeline.cc diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/common/frontend/audio/feature_pipeline.h similarity index 100% rename from speechx/speechx/frontend/audio/feature_pipeline.h rename to speechx/speechx/common/frontend/audio/feature_pipeline.h diff --git a/speechx/speechx/frontend/audio/frontend_itf.h b/speechx/speechx/common/frontend/audio/frontend_itf.h similarity index 100% rename from speechx/speechx/frontend/audio/frontend_itf.h rename to speechx/speechx/common/frontend/audio/frontend_itf.h diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/common/frontend/audio/linear_spectrogram.cc similarity index 100% rename from speechx/speechx/frontend/audio/linear_spectrogram.cc rename to speechx/speechx/common/frontend/audio/linear_spectrogram.cc diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.h b/speechx/speechx/common/frontend/audio/linear_spectrogram.h similarity index 100% rename from speechx/speechx/frontend/audio/linear_spectrogram.h rename to speechx/speechx/common/frontend/audio/linear_spectrogram.h diff --git a/speechx/speechx/frontend/audio/mfcc.cc b/speechx/speechx/common/frontend/audio/mfcc.cc similarity index 100% rename from speechx/speechx/frontend/audio/mfcc.cc rename to speechx/speechx/common/frontend/audio/mfcc.cc diff --git a/speechx/speechx/frontend/audio/mfcc.h b/speechx/speechx/common/frontend/audio/mfcc.h similarity index 100% rename from speechx/speechx/frontend/audio/mfcc.h rename to speechx/speechx/common/frontend/audio/mfcc.h diff --git a/speechx/speechx/frontend/audio/normalizer.h b/speechx/speechx/common/frontend/audio/normalizer.h similarity index 100% rename from 
speechx/speechx/frontend/audio/normalizer.h rename to speechx/speechx/common/frontend/audio/normalizer.h diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/common/utils/CMakeLists.txt similarity index 100% rename from speechx/speechx/utils/CMakeLists.txt rename to speechx/speechx/common/utils/CMakeLists.txt diff --git a/speechx/speechx/utils/file_utils.cc b/speechx/speechx/common/utils/file_utils.cc similarity index 100% rename from speechx/speechx/utils/file_utils.cc rename to speechx/speechx/common/utils/file_utils.cc diff --git a/speechx/speechx/utils/file_utils.h b/speechx/speechx/common/utils/file_utils.h similarity index 100% rename from speechx/speechx/utils/file_utils.h rename to speechx/speechx/common/utils/file_utils.h diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/common/utils/math.cc similarity index 100% rename from speechx/speechx/utils/math.cc rename to speechx/speechx/common/utils/math.cc diff --git a/speechx/speechx/utils/math.h b/speechx/speechx/common/utils/math.h similarity index 100% rename from speechx/speechx/utils/math.h rename to speechx/speechx/common/utils/math.h diff --git a/speechx/speechx/decoder/ctc_decoders b/speechx/speechx/decoder/ctc_decoders deleted file mode 120000 index b280de09681..00000000000 --- a/speechx/speechx/decoder/ctc_decoders +++ /dev/null @@ -1 +0,0 @@ -../../../third_party/ctc_decoders \ No newline at end of file diff --git a/speechx/speechx/frontend/text/CMakeLists.txt b/speechx/speechx/frontend/text/CMakeLists.txt deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/speechx/speechx/kaldi/CMakeLists.txt b/speechx/speechx/kaldi/CMakeLists.txt index ce6b43f632a..d27668fccd3 100644 --- a/speechx/speechx/kaldi/CMakeLists.txt +++ b/speechx/speechx/kaldi/CMakeLists.txt @@ -1,4 +1,7 @@ project(kaldi) +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +) add_subdirectory(base) add_subdirectory(util) @@ -10,4 +13,4 @@ add_subdirectory(decoder) add_subdirectory(lm) 
add_subdirectory(fstbin) -add_subdirectory(lmbin) \ No newline at end of file +add_subdirectory(lmbin) diff --git a/speechx/speechx/third_party/CMakeLists.txt b/speechx/speechx/third_party/CMakeLists.txt deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/speechx/speechx/third_party/README.md b/speechx/speechx/third_party/README.md deleted file mode 100644 index 2d620335b9c..00000000000 --- a/speechx/speechx/third_party/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# third party - -Those libs copied and developed from third pary opensource software projects. -For all of these things, the official websites are the best place to go. From 8cc56717929cd6f4b98870d8ede6a481186a39dd Mon Sep 17 00:00:00 2001 From: YangZhou Date: Fri, 16 Dec 2022 11:30:50 +0800 Subject: [PATCH 4/6] clean ctc_decoders dir --- .../decoder/ctc_decoders/COPYING.APACHE2.0 | 201 ------------------ .../asr/decoder/ctc_decoders/COPYING.LESSER.3 | 165 -------------- .../speechx/asr/decoder/ctc_decoders/LICENSE | 8 - .../asr/decoder/ctc_decoders/__init__.py | 13 -- .../asr/decoder/ctc_decoders/decoders.i | 33 --- .../speechx/asr/decoder/ctc_decoders/setup.py | 138 ------------ .../speechx/asr/decoder/ctc_decoders/setup.sh | 24 --- 7 files changed, 582 deletions(-) delete mode 100644 speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 delete mode 100644 speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 delete mode 100644 speechx/speechx/asr/decoder/ctc_decoders/LICENSE delete mode 100644 speechx/speechx/asr/decoder/ctc_decoders/__init__.py delete mode 100644 speechx/speechx/asr/decoder/ctc_decoders/decoders.i delete mode 100644 speechx/speechx/asr/decoder/ctc_decoders/setup.py delete mode 100755 speechx/speechx/asr/decoder/ctc_decoders/setup.sh diff --git a/speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 b/speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 deleted file mode 100644 index 261eeb9e9f8..00000000000 --- 
a/speechx/speechx/asr/decoder/ctc_decoders/COPYING.APACHE2.0 +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. 
Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative 
Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 b/speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 deleted file mode 100644 index cca7fc278f5..00000000000 --- a/speechx/speechx/asr/decoder/ctc_decoders/COPYING.LESSER.3 +++ /dev/null @@ -1,165 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. 
- - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. 
- - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. 
- - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. 
- - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. 
diff --git a/speechx/speechx/asr/decoder/ctc_decoders/LICENSE b/speechx/speechx/asr/decoder/ctc_decoders/LICENSE deleted file mode 100644 index ad947f8d756..00000000000 --- a/speechx/speechx/asr/decoder/ctc_decoders/LICENSE +++ /dev/null @@ -1,8 +0,0 @@ -Most of the code here is licensed under the Apache License 2.0. -There are exceptions that have their own licenses, listed below. - -score.h and score.cpp is under the LGPL license. -The two files include the header files from KenLM project. - -For the rest: -The default license of paddlespeech-ctcdecoders is Apache License 2.0. diff --git a/speechx/speechx/asr/decoder/ctc_decoders/__init__.py b/speechx/speechx/asr/decoder/ctc_decoders/__init__.py deleted file mode 100644 index 185a92b8d94..00000000000 --- a/speechx/speechx/asr/decoder/ctc_decoders/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/speechx/speechx/asr/decoder/ctc_decoders/decoders.i b/speechx/speechx/asr/decoder/ctc_decoders/decoders.i deleted file mode 100644 index 8fe3b279f59..00000000000 --- a/speechx/speechx/asr/decoder/ctc_decoders/decoders.i +++ /dev/null @@ -1,33 +0,0 @@ -%module paddlespeech_ctcdecoders -%{ -#include "scorer.h" -#include "ctc_greedy_decoder.h" -#include "ctc_beam_search_decoder.h" -#include "decoder_utils.h" -%} - -%include "std_vector.i" -%include "std_pair.i" -%include "std_string.i" -%import "decoder_utils.h" - -namespace std { - %template(DoubleVector) std::vector; - %template(IntVector) std::vector; - %template(StringVector) std::vector; - %template(VectorOfStructVector) std::vector >; - %template(FloatVector) std::vector; - %template(Pair) std::pair; - %template(PairFloatStringVector) std::vector >; - %template(PairDoubleStringVector) std::vector >; - %template(PairDoubleStringVector2) std::vector > >; - %template(DoubleVector3) std::vector > >; -} - -%template(IntDoublePairCompSecondRev) pair_comp_second_rev; -%template(StringDoublePairCompSecondRev) pair_comp_second_rev; -%template(DoubleStringPairCompFirstRev) pair_comp_first_rev; - -%include "scorer.h" -%include "ctc_greedy_decoder.h" -%include "ctc_beam_search_decoder.h" diff --git a/speechx/speechx/asr/decoder/ctc_decoders/setup.py b/speechx/speechx/asr/decoder/ctc_decoders/setup.py deleted file mode 100644 index 9a8b292a07b..00000000000 --- a/speechx/speechx/asr/decoder/ctc_decoders/setup.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Script to build and install decoder package.""" -import argparse -import glob -import multiprocessing.pool -import os -import platform -import sys - -from setuptools import distutils -from setuptools import Extension -from setuptools import setup - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_processes", - default=1, - type=int, - help="Number of cpu processes to build package. (default: %(default)d)") -args = parser.parse_known_args() - -# reconstruct sys.argv to pass to setup below -sys.argv = [sys.argv[0]] + args[1] - - -# monkey-patch for parallel compilation -# See: https://stackoverflow.com/a/13176803 -def parallelCCompile(self, - sources, - output_dir=None, - macros=None, - include_dirs=None, - debug=0, - extra_preargs=None, - extra_postargs=None, - depends=None): - # those lines are copied from distutils.ccompiler.CCompiler directly - macros, objects, extra_postargs, pp_opts, build = self._setup_compile( - output_dir, macros, include_dirs, sources, depends, extra_postargs) - cc_args = self._get_cc_args(pp_opts, debug, extra_preargs) - - # parallel code - def _single_compile(obj): - try: - src, ext = build[obj] - except KeyError: - return - self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts) - - # convert to list, imap is evaluated on-demand - thread_pool = multiprocessing.pool.ThreadPool(args[0].num_processes) - list(thread_pool.imap(_single_compile, objects)) - return objects - - -def compile_test(header, library): - dummy_path = os.path.join(os.path.dirname(__file__), "dummy") - command = 
"bash -c \"g++ -include " + header \ - + " -l" + library + " -x c++ - <<<'int main() {}' -o " \ - + dummy_path + " >/dev/null 2>/dev/null && rm " \ - + dummy_path + " 2>/dev/null\"" - return os.system(command) == 0 - - -# hack compile to support parallel compiling -distutils.ccompiler.CCompiler.compile = parallelCCompile - -FILES = glob.glob('kenlm/util/*.cc') \ - + glob.glob('kenlm/lm/*.cc') \ - + glob.glob('kenlm/util/double-conversion/*.cc') - -FILES += glob.glob('openfst-1.6.3/src/lib/*.cc') - -# yapf: disable -FILES = [ - fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc') - or fn.endswith('unittest.cc')) -] -# yapf: enable -LIBS = ['stdc++'] -if platform.system() != 'Darwin': - LIBS.append('rt') -if platform.system() == 'Windows': - LIBS = ['-static-libstdc++'] - -ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11'] - -if compile_test('zlib.h', 'z'): - ARGS.append('-DHAVE_ZLIB') - LIBS.append('z') - -if compile_test('bzlib.h', 'bz2'): - ARGS.append('-DHAVE_BZLIB') - LIBS.append('bz2') - -if compile_test('lzma.h', 'lzma'): - ARGS.append('-DHAVE_XZLIB') - LIBS.append('lzma') - -os.system('swig -python -c++ ./decoders.i') - -decoders_module = [ - Extension( - name='_paddlespeech_ctcdecoders', - sources=FILES + glob.glob('*.cxx') + glob.glob('*.cpp'), - language='c++', - include_dirs=[ - '.', - 'kenlm', - 'openfst-1.6.3/src/include', - 'ThreadPool', - ], - libraries=LIBS, - extra_compile_args=ARGS) -] - -setup( - name='paddlespeech_ctcdecoders', - version='0.2.0', - description="CTC decoders in paddlespeech", - author="PaddlePaddle Speech and Language Team", - author_email="paddlesl@baidu.com", - url="https://github.com/PaddlePaddle/PaddleSpeech", - license='Apache 2.0, GNU Lesser General Public License v3 (LGPLv3) (LGPL-3)', - ext_modules=decoders_module, - py_modules=['paddlespeech_ctcdecoders']) diff --git a/speechx/speechx/asr/decoder/ctc_decoders/setup.sh b/speechx/speechx/asr/decoder/ctc_decoders/setup.sh deleted file mode 
100755 index 302c5550250..00000000000 --- a/speechx/speechx/asr/decoder/ctc_decoders/setup.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -if [ ! -d kenlm ]; then - git clone https://github.com/kpu/kenlm.git - cd kenlm/ - git checkout df2d717e95183f79a90b2fa6e4307083a351ca6a - cd .. - echo -e "\n" -fi - -if [ ! -d openfst-1.6.3 ]; then - echo "Download and extract openfst ..." - wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.3.tar.gz --no-check-certificate - tar -xzvf openfst-1.6.3.tar.gz - echo -e "\n" -fi - -if [ ! -d ThreadPool ]; then - git clone https://github.com/progschj/ThreadPool.git - echo -e "\n" -fi - -echo "Install decoders ..." -python3 setup.py install --num_processes 4 From cd49b31a18dd6295f110cb8566ee60a0425fd46e Mon Sep 17 00:00:00 2001 From: YangZhou Date: Tue, 27 Dec 2022 16:03:22 +0800 Subject: [PATCH 5/6] add nnet cache && make 2 thread work --- speechx/CMakeLists.txt | 2 +- .../ctc_prefix_beam_search_decoder_main.cc | 15 ++- speechx/speechx/asr/nnet/CMakeLists.txt | 22 ++-- speechx/speechx/asr/nnet/decodable.cc | 88 ++++--------- speechx/speechx/asr/nnet/decodable.h | 16 +-- speechx/speechx/asr/nnet/nnet_producer.cc | 84 ++++++++++++ speechx/speechx/asr/nnet/nnet_producer.h | 73 +++++++++++ speechx/speechx/asr/recognizer/CMakeLists.txt | 1 + .../speechx/asr/recognizer/u2_recognizer.cc | 15 ++- .../speechx/asr/recognizer/u2_recognizer.h | 10 +- .../recognizer/u2_recognizer_thread_main.cc | 123 ++++++++++++++++++ speechx/speechx/asr/server/CMakeLists.txt | 2 +- speechx/speechx/common/base/common.h | 2 +- speechx/speechx/common/base/safe_queue.h | 71 ++++++++++ 14 files changed, 416 insertions(+), 108 deletions(-) create mode 100644 speechx/speechx/asr/nnet/nnet_producer.cc create mode 100644 speechx/speechx/asr/nnet/nnet_producer.h create mode 100644 speechx/speechx/asr/recognizer/u2_recognizer_thread_main.cc create mode 100644 speechx/speechx/common/base/safe_queue.h diff --git a/speechx/CMakeLists.txt 
b/speechx/CMakeLists.txt index 6b957160eec..2068b51ac71 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -35,7 +35,7 @@ option(TEST_DEBUG "option for debug" OFF) option(USE_PROFILING "enable c++ profling" OFF) option(USING_U2 "compile u2 model." ON) -option(USING_DS2 "compile with ds2 model." ON) +option(USING_DS2 "compile with ds2 model." OFF) option(USING_GPU "u2 compute on GPU." OFF) diff --git a/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc index c59b1f2e742..9baa836b2a6 100644 --- a/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "decoder/ctc_prefix_beam_search_decoder.h" #include "absl/strings/str_split.h" #include "base/common.h" -#include "decoder/ctc_prefix_beam_search_decoder.h" #include "frontend/audio/data_cache.h" #include "fst/symbol-table.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" +#include "nnet/nnet_producer.h" #include "nnet/u2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); @@ -40,7 +41,7 @@ using kaldi::BaseFloat; using kaldi::Matrix; using std::vector; -// test ds2 online decoder by feeding speech feature +// test u2 online decoder by feeding speech feature int main(int argc, char* argv[]) { gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -70,8 +71,10 @@ int main(int argc, char* argv[]) { // decodeable std::shared_ptr raw_data = std::make_shared(); + std::shared_ptr nnet_producer = + std::make_shared(nnet, raw_data); std::shared_ptr decodable = - std::make_shared(nnet, raw_data); + std::make_shared(nnet_producer); // decoder ppspeech::CTCBeamSearchOptions opts; @@ -115,9 +118,9 @@ int main(int argc, char* argv[]) { 
ori_feature_len - chunk_idx * chunk_stride, chunk_size); } if (this_chunk_size < receptive_field_length) { - LOG(WARNING) - << "utt: " << utt << " skip last " << this_chunk_size - << " frames, expect is " << receptive_field_length; + LOG(WARNING) << "utt: " << utt << " skip last " + << this_chunk_size << " frames, expect is " + << receptive_field_length; break; } diff --git a/speechx/speechx/asr/nnet/CMakeLists.txt b/speechx/speechx/asr/nnet/CMakeLists.txt index 435666163a9..750c77521d8 100644 --- a/speechx/speechx/asr/nnet/CMakeLists.txt +++ b/speechx/speechx/asr/nnet/CMakeLists.txt @@ -1,4 +1,4 @@ -set(srcs decodable.cc) +set(srcs decodable.cc nnet_producer.cc) if(USING_DS2) list(APPEND srcs ds2_nnet.cc) @@ -27,13 +27,13 @@ if(USING_DS2) endif() # test bin -if(USING_U2) - set(bin_name u2_nnet_main) - add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) - target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) - target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) - - target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) - target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) - target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) -endif() +#if(USING_U2) +# set(bin_name u2_nnet_main) +# add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +# target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +# target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + +# target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) +# target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) +# target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +#endif() diff --git a/speechx/speechx/asr/nnet/decodable.cc b/speechx/speechx/asr/nnet/decodable.cc index 5fe2b984230..f01e9049324 100644 --- 
a/speechx/speechx/asr/nnet/decodable.cc +++ b/speechx/speechx/asr/nnet/decodable.cc @@ -21,19 +21,16 @@ using kaldi::Matrix; using kaldi::Vector; using std::vector; -Decodable::Decodable(const std::shared_ptr& nnet, - const std::shared_ptr& frontend, +Decodable::Decodable(const std::shared_ptr& nnet_producer, kaldi::BaseFloat acoustic_scale) - : frontend_(frontend), - nnet_(nnet), + : nnet_producer_(nnet_producer), frame_offset_(0), frames_ready_(0), acoustic_scale_(acoustic_scale) {} // for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { - nnet_out_cache_ = likelihood; - frames_ready_ += likelihood.NumRows(); + nnet_producer_->Acceptlikelihood(likelihood); } @@ -43,7 +40,7 @@ int32 Decodable::NumFramesReady() const { return frames_ready_; } // frame idx is from 0 to frame_ready_ -1; bool Decodable::IsLastFrame(int32 frame) { - bool flag = EnsureFrameHaveComputed(frame); + EnsureFrameHaveComputed(frame); return frame >= frames_ready_; } @@ -64,32 +61,10 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { bool Decodable::AdvanceChunk() { kaldi::Timer timer; - // read feats - Vector features; - if (frontend_ == NULL || frontend_->Read(&features) == false) { - // no feat or frontend_ not init. - VLOG(3) << "decodable exit;"; - return false; - } - CHECK_GE(frontend_->Dim(), 0); - VLOG(1) << "AdvanceChunk feat cost: " << timer.Elapsed() << " sec."; - VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats."; - - // forward feats - NnetOut out; - nnet_->FeedForward(features, frontend_->Dim(), &out); - int32& vocab_dim = out.vocab_dim; - Vector& logprobs = out.logprobs; - - VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim - << " decoder frames."; - // cache nnet outupts - nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); - nnet_out_cache_.CopyRowsFromVec(logprobs); - - // update state, decoding frame. 
+ bool flag = nnet_producer_->Read(&framelikelihood_); + if (flag == false) return false; frame_offset_ = frames_ready_; - frames_ready_ += nnet_out_cache_.NumRows(); + frames_ready_ += 1; VLOG(1) << "AdvanceChunk feat + forward cost: " << timer.Elapsed() << " sec."; return true; @@ -101,17 +76,17 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, return false; } - int nrows = nnet_out_cache_.NumRows(); - CHECK(nrows == (frames_ready_ - frame_offset_)); - if (nrows <= 0) { + if (framelikelihood_.empty()) { LOG(WARNING) << "No new nnet out in cache."; return false; } - logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols()); - logprobs->CopyRowsFromMat(nnet_out_cache_); - - *vocab_dim = nnet_out_cache_.NumCols(); + size_t dim = framelikelihood_.size(); + logprobs->Resize(framelikelihood_.size()); + std::memcpy(logprobs->Data(), + framelikelihood_.data(), + dim * sizeof(kaldi::BaseFloat)); + *vocab_dim = framelikelihood_.size(); return true; } @@ -122,19 +97,8 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { return false; } - int nrows = nnet_out_cache_.NumRows(); - CHECK(nrows == (frames_ready_ - frame_offset_)); - int vocab_size = nnet_out_cache_.NumCols(); - likelihood->resize(vocab_size); - - for (int32 idx = 0; idx < vocab_size; ++idx) { - (*likelihood)[idx] = - nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; - - VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " - << nnet_out_cache_.NumRows() - << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); - } + CHECK_EQ(1, (frames_ready_ - frame_offset_)); + *likelihood = framelikelihood_; return true; } @@ -143,37 +107,31 @@ BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { return false; } - CHECK_LE(index, nnet_out_cache_.NumCols()); + CHECK_LE(index, framelikelihood_.size()); CHECK_LE(frame, frames_ready_); // the nnet output is prob ranther than log prob // the index - 1, because the ilabel BaseFloat logprob = 0.0; 
int32 frame_idx = frame - frame_offset_; - BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index)); - if (nnet_->IsLogProb()) { - logprob = nnet_out; - } else { - logprob = std::log(nnet_out + std::numeric_limits::epsilon()); - } - CHECK(!std::isnan(logprob) && !std::isinf(logprob)); + CHECK_EQ(frame_idx, 0); + logprob = framelikelihood_[TokenId2NnetId(index)]; return acoustic_scale_ * logprob; } void Decodable::Reset() { - if (frontend_ != nullptr) frontend_->Reset(); - if (nnet_ != nullptr) nnet_->Reset(); + if (nnet_producer_ != nullptr) nnet_producer_->Reset(); frame_offset_ = 0; frames_ready_ = 0; - nnet_out_cache_.Resize(0, 0); + framelikelihood_.clear(); } void Decodable::AttentionRescoring(const std::vector>& hyps, float reverse_weight, std::vector* rescoring_score) { kaldi::Timer timer; - nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); + nnet_producer_->AttentionRescoring(hyps, reverse_weight, rescoring_score); VLOG(1) << "Attention Rescoring cost: " << timer.Elapsed() << " sec."; } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/asr/nnet/decodable.h b/speechx/speechx/asr/nnet/decodable.h index dd7b329e581..cd498e42db2 100644 --- a/speechx/speechx/asr/nnet/decodable.h +++ b/speechx/speechx/asr/nnet/decodable.h @@ -13,10 +13,10 @@ // limitations under the License. 
#include "base/common.h" -#include "frontend/audio/frontend_itf.h" #include "kaldi/decoder/decodable-itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/nnet_itf.h" +#include "nnet/nnet_producer.h" namespace ppspeech { @@ -24,8 +24,7 @@ struct DecodableOpts; class Decodable : public kaldi::DecodableInterface { public: - explicit Decodable(const std::shared_ptr& nnet, - const std::shared_ptr& frontend, + explicit Decodable(const std::shared_ptr& nnet_producer, kaldi::BaseFloat acoustic_scale = 1.0); // void Init(DecodableOpts config); @@ -57,23 +56,17 @@ class Decodable : public kaldi::DecodableInterface { void Reset(); - bool IsInputFinished() const { return frontend_->IsFinished(); } + bool IsInputFinished() const { return nnet_producer_->IsFinished(); } bool EnsureFrameHaveComputed(int32 frame); int32 TokenId2NnetId(int32 token_id); - std::shared_ptr Nnet() { return nnet_; } - // for offline test void Acceptlikelihood(const kaldi::Matrix& likelihood); private: - std::shared_ptr frontend_; - std::shared_ptr nnet_; - - // nnet outputs' cache - kaldi::Matrix nnet_out_cache_; + std::shared_ptr nnet_producer_; // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame @@ -85,6 +78,7 @@ class Decodable : public kaldi::DecodableInterface { // so use subsampled_frame int32 current_log_post_subsampled_offset_; int32 num_chunk_computed_; + std::vector framelikelihood_; kaldi::BaseFloat acoustic_scale_; }; diff --git a/speechx/speechx/asr/nnet/nnet_producer.cc b/speechx/speechx/asr/nnet/nnet_producer.cc new file mode 100644 index 00000000000..3a0c4f18814 --- /dev/null +++ b/speechx/speechx/asr/nnet/nnet_producer.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet/nnet_producer.h"
+
+namespace ppspeech {
+
+using kaldi::Vector;
+using kaldi::BaseFloat;
+
+NnetProducer::NnetProducer(std::shared_ptr nnet,
+                           std::shared_ptr frontend)
+    : frontend_(frontend), nnet_(nnet) {}  // same order as the declarations
+
+// Feeds `inputs` to the frontend, then runs Compute() until it returns false.
+void NnetProducer::Accept(const kaldi::VectorBase& inputs) {
+    frontend_->Accept(inputs);
+    while (Compute()) {
+    }
+}
+
+// For offline tests: enqueue each row of `likelihood` as one frame.
+void NnetProducer::Acceptlikelihood(
+    const kaldi::Matrix& likelihood) {
+    std::vector prob(likelihood.NumCols());
+    for (size_t idx = 0; idx < likelihood.NumRows(); ++idx) {
+        for (size_t col = 0; col < likelihood.NumCols(); ++col) {
+            prob[col] = likelihood(idx, col);
+        }
+        cache_.push_back(prob);  // once per row, after the row is fully copied
+    }
+}
+
+// Pops one cached frame into `nnet_prob`; returns what cache_.pop() returns.
+bool NnetProducer::Read(std::vector* nnet_prob) {
+    return cache_.pop(nnet_prob);
+}
+
+// Forwards one frontend chunk through the nnet and caches each output frame.
+bool NnetProducer::Compute() {
+    Vector features;
+    if (frontend_ == NULL || frontend_->Read(&features) == false) {
+        // no feat or frontend_ not init.
+        VLOG(3) << "no feat avalible";
+        return false;
+    }
+    CHECK_GE(frontend_->Dim(), 0);
+    VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats.";
+
+    NnetOut out;
+    nnet_->FeedForward(features, frontend_->Dim(), &out);
+    int32& vocab_dim = out.vocab_dim;
+    Vector& logprobs = out.logprobs;
+    size_t nframes = logprobs.Dim() / vocab_dim;
+    VLOG(2) << "Forward out " << nframes << " decoder frames.";
+    std::vector logprob(vocab_dim);
+    // remove later.
+    for (size_t idx = 0; idx < nframes; ++idx) {
+        for (size_t prob_idx = 0; prob_idx < vocab_dim; ++prob_idx) {
+            logprob[prob_idx] = logprobs(idx * vocab_dim + prob_idx);
+        }
+        cache_.push_back(logprob);
+    }
+    return true;
+}
+
+void NnetProducer::AttentionRescoring(const std::vector>& hyps,
+                                      float reverse_weight,
+                                      std::vector* rescoring_score) {
+    nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
+}
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/asr/nnet/nnet_producer.h b/speechx/speechx/asr/nnet/nnet_producer.h
new file mode 100644
index 00000000000..65e9116fff6
--- /dev/null
+++ b/speechx/speechx/asr/nnet/nnet_producer.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +#include "base/common.h" +#include "base/safe_queue.h" +#include "frontend/audio/frontend_itf.h" +#include "nnet/nnet_itf.h" + +namespace ppspeech { + +class NnetProducer { + public: + explicit NnetProducer(std::shared_ptr nnet, + std::shared_ptr frontend = NULL); + + // Feed feats or waves + void Accept(const kaldi::VectorBase& inputs); + + void Acceptlikelihood(const kaldi::Matrix& likelihood); + + // nnet + bool Read(std::vector* nnet_prob); + + bool Empty() const { return cache_.empty(); } + + void SetFinished() { + LOG(INFO) << "set finished"; + // std::unique_lock lock(mutex_); + frontend_->SetFinished(); + + // read the last chunk data + Compute(); + // ready_feed_condition_.notify_one(); + LOG(INFO) << "compute last feats done."; + } + + bool IsFinished() const { return frontend_->IsFinished(); } + + void Reset() { + frontend_->Reset(); + nnet_->Reset(); + VLOG(3) << "feature cache reset: cache size: " << cache_.size(); + cache_.clear(); + } + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score); + + private: + bool Compute(); + + std::shared_ptr frontend_; + std::shared_ptr nnet_; + SafeQueue> cache_; + + DISALLOW_COPY_AND_ASSIGN(NnetProducer); +}; + +} // namespace ppspeech diff --git a/speechx/speechx/asr/recognizer/CMakeLists.txt b/speechx/speechx/asr/recognizer/CMakeLists.txt index 05078873952..53e2e58db68 100644 --- a/speechx/speechx/asr/recognizer/CMakeLists.txt +++ b/speechx/speechx/asr/recognizer/CMakeLists.txt @@ -30,6 +30,7 @@ endif() if (USING_U2) set(TEST_BINS u2_recognizer_main + u2_recognizer_thread_main ) foreach(bin_name IN LISTS TEST_BINS) diff --git a/speechx/speechx/asr/recognizer/u2_recognizer.cc b/speechx/speechx/asr/recognizer/u2_recognizer.cc index d1d308ebd4b..ea62ae1a1f1 100644 --- a/speechx/speechx/asr/recognizer/u2_recognizer.cc +++ b/speechx/speechx/asr/recognizer/u2_recognizer.cc @@ -27,13 +27,13 @@ using std::vector; U2Recognizer::U2Recognizer(const 
U2RecognizerResource& resource) : opts_(resource) { + BaseFloat am_scale = resource.acoustic_scale; const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; - feature_pipeline_.reset(new FeaturePipeline(feature_opts)); - + std::shared_ptr feature_pipeline( + new FeaturePipeline(feature_opts)); std::shared_ptr nnet(new U2Nnet(resource.model_opts)); - - BaseFloat am_scale = resource.acoustic_scale; - decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + nnet_producer_.reset(new NnetProducer(nnet, feature_pipeline)); + decodable_.reset(new Decodable(nnet_producer_, am_scale)); CHECK_NE(resource.vocab_path, ""); decoder_.reset(new CTCPrefixBeamSearch( @@ -49,6 +49,7 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) void U2Recognizer::Reset() { global_frame_offset_ = 0; + input_finished_ = false; num_frames_ = 0; result_.clear(); @@ -68,7 +69,7 @@ void U2Recognizer::ResetContinuousDecoding() { void U2Recognizer::Accept(const VectorBase& waves) { kaldi::Timer timer; - feature_pipeline_->Accept(waves); + nnet_producer_->Accept(waves); VLOG(1) << "feed waves cost: " << timer.Elapsed() << " sec. 
" << waves.Dim() << " samples."; } @@ -210,7 +211,7 @@ std::string U2Recognizer::GetFinalResult() { return result_[0].sentence; } std::string U2Recognizer::GetPartialResult() { return result_[0].sentence; } void U2Recognizer::SetFinished() { - feature_pipeline_->SetFinished(); + nnet_producer_->SetFinished(); input_finished_ = true; } diff --git a/speechx/speechx/asr/recognizer/u2_recognizer.h b/speechx/speechx/asr/recognizer/u2_recognizer.h index 25850863370..855d161a045 100644 --- a/speechx/speechx/asr/recognizer/u2_recognizer.h +++ b/speechx/speechx/asr/recognizer/u2_recognizer.h @@ -130,11 +130,11 @@ class U2Recognizer { return !result_.empty() && !result_[0].sentence.empty(); } - int FrameShiftInMs() const { - // one decoder frame length in ms - return decodable_->Nnet()->SubsamplingRate() * - feature_pipeline_->FrameShift(); + // one decoder frame length in ms, todo + return 1; + // return decodable_->Nnet()->SubsamplingRate() * + // feature_pipeline_->FrameShift(); } @@ -149,7 +149,7 @@ class U2Recognizer { // std::shared_ptr resource_; // U2RecognizerResource resource_; - std::shared_ptr feature_pipeline_; + std::shared_ptr nnet_producer_; std::shared_ptr decodable_; std::unique_ptr decoder_; diff --git a/speechx/speechx/asr/recognizer/u2_recognizer_thread_main.cc b/speechx/speechx/asr/recognizer/u2_recognizer_thread_main.cc new file mode 100644 index 00000000000..e73efef11c7 --- /dev/null +++ b/speechx/speechx/asr/recognizer/u2_recognizer_thread_main.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "recognizer/u2_recognizer.h" +#include "decoder/param.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/table-types.h" + +DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); +DEFINE_int32(sample_rate, 16000, "sample rate"); + +void decode_func(std::shared_ptr recognizer) { + while (!recognizer->IsFinished()) { + recognizer->Decode(); + usleep(100); + } + recognizer->Decode(); + recognizer->Rescoring(); +} + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + double tot_wav_duration = 0.0; + double tot_decode_time = 0.0; + + kaldi::SequentialTableReader wav_reader( + FLAGS_wav_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + int sample_rate = FLAGS_sample_rate; + float streaming_chunk = FLAGS_streaming_chunk; + int chunk_sample_size = streaming_chunk * sample_rate; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; + + ppspeech::U2RecognizerResource resource = + ppspeech::U2RecognizerResource::InitFromFlags(); + std::shared_ptr recognizer_ptr( + new ppspeech::U2Recognizer(resource)); + + for (; !wav_reader.Done(); wav_reader.Next()) { + 
std::thread recognizer_thread(decode_func, recognizer_ptr); + std::string utt = wav_reader.Key(); + const kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec."; + double dur = wave_data.Duration(); + tot_wav_duration += dur; + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), + this_channel); + int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + + int sample_offset = 0; + kaldi::Timer timer; + kaldi::Timer local_timer; + + while (sample_offset < tot_samples) { + int cur_chunk_size = + std::min(chunk_sample_size, tot_samples - sample_offset); + + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + // wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size); + + recognizer_ptr->Accept(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + recognizer_ptr->SetFinished(); + } + + // no overlap + sample_offset += cur_chunk_size; + } + CHECK(sample_offset == tot_samples); + + recognizer_thread.join(); + std::string result = recognizer_ptr->GetFinalResult(); + recognizer_ptr->Reset(); + if (result.empty()) { + // the TokenWriter can not write empty string. 
+ ++num_err; + LOG(INFO) << " the result of " << utt << " is empty"; + continue; + } + + LOG(INFO) << utt << " " << result; + LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur + << " cost: " << local_timer.Elapsed(); + + result_writer.Write(utt, result); + + ++num_done; + } + + LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); + LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; + LOG(INFO) << "total decode cost:" << tot_decode_time << " sec"; + LOG(INFO) << "RTF is: " << tot_decode_time / tot_wav_duration; +} diff --git a/speechx/speechx/asr/server/CMakeLists.txt b/speechx/speechx/asr/server/CMakeLists.txt index 71b33daa929..566b42eefe3 100644 --- a/speechx/speechx/asr/server/CMakeLists.txt +++ b/speechx/speechx/asr/server/CMakeLists.txt @@ -1 +1 @@ -add_subdirectory(websocket) +#add_subdirectory(websocket) diff --git a/speechx/speechx/common/base/common.h b/speechx/speechx/common/base/common.h index 97bff96620e..2a066ee68bc 100644 --- a/speechx/speechx/common/base/common.h +++ b/speechx/speechx/common/base/common.h @@ -48,4 +48,4 @@ #include "base/log.h" #include "base/macros.h" #include "utils/file_utils.h" -#include "utils/math.h" \ No newline at end of file +#include "utils/math.h" diff --git a/speechx/speechx/common/base/safe_queue.h b/speechx/speechx/common/base/safe_queue.h new file mode 100644 index 00000000000..25a012afb2d --- /dev/null +++ b/speechx/speechx/common/base/safe_queue.h @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

// The queue depends only on the standard library, so it includes exactly
// what it uses and stays usable on its own.
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>
#include <utility>

namespace ppspeech {

// A mutex-guarded FIFO queue that can be shared between threads.
//
// `capacity` bounds the number of buffered elements; 0 (the default) means
// the queue is unbounded. push_back() blocks while a bounded queue is full.
// pop() never blocks: it reports an empty queue through its return value.
// Every access to the underlying deque happens with the mutex held.
template <typename T>
class SafeQueue {
  public:
    explicit SafeQueue(size_t capacity = 0);

    // Appends a copy of `in`. If the queue is bounded and full, waits until
    // pop() or clear() makes room.
    void push_back(const T& in);

    // Moves the oldest element into `*out` and returns true. Returns false
    // and leaves `*out` untouched when the queue is empty.
    bool pop(T* out);

    // Snapshot queries: the answer may be stale as soon as the lock is
    // released if other threads are pushing or popping.
    bool empty() const {
        std::lock_guard<std::mutex> lock(mutex_);
        return buffer_.empty();
    }
    size_t size() const {
        std::lock_guard<std::mutex> lock(mutex_);
        return buffer_.size();
    }

    // Drops every buffered element and wakes all blocked producers.
    void clear();


  private:
    // mutable so that the const observers above can lock it.
    mutable std::mutex mutex_;
    // Signalled whenever space is freed; only producers wait on it.
    std::condition_variable not_full_;
    std::deque<T> buffer_;
    size_t capacity_;
};

template <typename T>
SafeQueue<T>::SafeQueue(size_t capacity) : capacity_(capacity) {}

template <typename T>
void SafeQueue<T>::push_back(const T& in) {
    std::unique_lock<std::mutex> lock(mutex_);
    if (capacity_ > 0) {
        // Wait for strictly free space; a predicate that is already true
        // when the queue is full would never block the producer.
        not_full_.wait(lock, [this] { return buffer_.size() < capacity_; });
    }

    buffer_.push_back(in);
}

template <typename T>
bool SafeQueue<T>::pop(T* out) {
    // The emptiness check must happen under the lock: reading the deque
    // while another thread modifies it is a data race.
    std::lock_guard<std::mutex> lock(mutex_);
    if (buffer_.empty()) {
        return false;
    }

    *out = std::move(buffer_.front());
    buffer_.pop_front();
    // Exactly one slot was freed, so waking one producer is enough.
    not_full_.notify_one();
    return true;
}

template <typename T>
void SafeQueue<T>::clear() {
    std::lock_guard<std::mutex> lock(mutex_);
    buffer_.clear();
    // Several slots may have been freed at once: wake every producer.
    not_full_.notify_all();
}
}  // namespace ppspeech

// Patch-series metadata that followed this file in the original text,
// kept verbatim for reference:
// From 28fc05bb0e04ea0716948ee6d09f297353e21657 Mon Sep 17 00:00:00 2001
// From: YangZhou
// Date: Tue, 27 Dec 2022 16:27:54 +0800
// Subject: [PATCH 6/6] do not compile websocket
// ---
// speechx/speechx/asr/server/CMakeLists.txt | 2 +-
// 1 file changed, 1 insertion(+), 1 deletion(-)
// diff --git a/speechx/speechx/asr/server/CMakeLists.txt
b/speechx/speechx/asr/server/CMakeLists.txt index 71b33daa929..566b42eefe3 100644 --- a/speechx/speechx/asr/server/CMakeLists.txt +++ b/speechx/speechx/asr/server/CMakeLists.txt @@ -1 +1 @@ -add_subdirectory(websocket) +#add_subdirectory(websocket)