From 537e163dd012aec3b250af77697908d37759c057 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 24 Aug 2024 13:24:52 +0800 Subject: [PATCH] WebAssembly example for VAD + Non-streaming ASR (#1284) --- .../workflows/wasm-simd-hf-space-de-tts.yaml | 4 + .../wasm-simd-hf-space-en-asr-zipformer.yaml | 3 + .../workflows/wasm-simd-hf-space-en-tts.yaml | 4 + .../wasm-simd-hf-space-silero-vad.yaml | 1 + .../workflows/wasm-simd-hf-space-vad-asr.yaml | 93 +++++ ...-space-zh-cantonese-en-asr-paraformer.yaml | 4 + ...sm-simd-hf-space-zh-en-asr-paraformer.yaml | 4 + ...asm-simd-hf-space-zh-en-asr-zipformer.yaml | 4 + CMakeLists.txt | 15 +- README.md | 239 ++++++++--- build-wasm-simd-vad-asr.sh | 68 +++ scripts/wasm/generate-vad-asr.py | 229 +++++++++++ scripts/wasm/run-vad-asr.sh.in | 92 +++++ sherpa-onnx/c-api/c-api.cc | 5 + sherpa-onnx/c-api/c-api.h | 3 + wasm/CMakeLists.txt | 4 + wasm/asr/assets/README.md | 7 + wasm/asr/index.html | 2 +- wasm/tts/assets/README.md | 5 + wasm/vad-asr/CMakeLists.txt | 83 ++++ wasm/vad-asr/app-vad-asr.js | 389 ++++++++++++++++++ wasm/vad-asr/assets/README.md | 23 ++ wasm/vad-asr/index.html | 43 ++ wasm/vad-asr/sherpa-onnx-asr.js | 1 + wasm/vad-asr/sherpa-onnx-vad.js | 1 + wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc | 19 + wasm/vad/assets/README.md | 3 + wasm/vad/index.html | 2 +- wasm/vad/sherpa-onnx-vad.js | 1 - 29 files changed, 1281 insertions(+), 70 deletions(-) create mode 100644 .github/workflows/wasm-simd-hf-space-vad-asr.yaml create mode 100755 build-wasm-simd-vad-asr.sh create mode 100755 scripts/wasm/generate-vad-asr.py create mode 100644 scripts/wasm/run-vad-asr.sh.in create mode 100644 wasm/vad-asr/CMakeLists.txt create mode 100644 wasm/vad-asr/app-vad-asr.js create mode 100644 wasm/vad-asr/assets/README.md create mode 100644 wasm/vad-asr/index.html create mode 120000 wasm/vad-asr/sherpa-onnx-asr.js create mode 120000 wasm/vad-asr/sherpa-onnx-vad.js create mode 100644 wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc diff --git 
a/.github/workflows/wasm-simd-hf-space-de-tts.yaml b/.github/workflows/wasm-simd-hf-space-de-tts.yaml index f51535379..cbd3b1fce 100644 --- a/.github/workflows/wasm-simd-hf-space-de-tts.yaml +++ b/.github/workflows/wasm-simd-hf-space-de-tts.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml index 975266917..510a003c7 100644 --- a/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml @@ -27,6 +27,9 @@ jobs: fetch-depth: 0 - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-en-tts.yaml b/.github/workflows/wasm-simd-hf-space-en-tts.yaml index f5f950c3c..9c5c1d446 100644 --- a/.github/workflows/wasm-simd-hf-space-en-tts.yaml +++ b/.github/workflows/wasm-simd-hf-space-en-tts.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-silero-vad.yaml b/.github/workflows/wasm-simd-hf-space-silero-vad.yaml index e384af3fb..dc8bada70 100644 --- a/.github/workflows/wasm-simd-hf-space-silero-vad.yaml +++ b/.github/workflows/wasm-simd-hf-space-silero-vad.yaml @@ -25,6 +25,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 with: diff --git a/.github/workflows/wasm-simd-hf-space-vad-asr.yaml b/.github/workflows/wasm-simd-hf-space-vad-asr.yaml new 
file mode 100644 index 000000000..726b69826 --- /dev/null +++ b/.github/workflows/wasm-simd-hf-space-vad-asr.yaml @@ -0,0 +1,93 @@ +name: wasm-simd-hf-space-vad-asr + +on: + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' + + workflow_dispatch: + +concurrency: + group: wasm-simd-hf-space-vad-asr${{ github.ref }} + cancel-in-progress: true + +jobs: + wasm-simd-hf-space-vad-asr: + name: ${{ matrix.index }}/${{ matrix.total }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + total: ["8"] + index: ["0", "1", "2", "3", "4", "5", "6", "7"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install Python dependencies + shell: bash + run: | + python3 -m pip install --upgrade pip jinja2 + + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' + + - name: View emsdk version + shell: bash + run: | + emcc -v + echo "--------------------" + emcc --check + + - name: Generate build script + shell: bash + run: | + cd scripts/wasm + + total=${{ matrix.total }} + index=${{ matrix.index }} + + ./generate-vad-asr.py --total $total --index $index + + chmod +x run-vad-asr.sh + mv -v ./run-vad-asr.sh ../.. 
+ + - name: Show build scripts + shell: bash + run: | + cat ./run-vad-asr.sh + + - uses: actions/upload-artifact@v4 + with: + name: run-vad-asr-${{ matrix.index }} + path: ./run-vad-asr.sh + + - name: Build sherpa-onnx for WebAssembly + shell: bash + env: + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + ./run-vad-asr.sh + + - name: Release wasm files + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + + - name: Upload wasm files + uses: actions/upload-artifact@v4 + with: + name: sherpa-onnx-wasm-simd-vad-asr-${{ matrix.index }} + path: ./sherpa-onnx-wasm-simd-*.tar.bz2 diff --git a/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml index e0c665737..c72e0cef2 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml index 500305420..b76f912b4 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git 
a/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml index dfa0e1614..9bdd90ee2 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index 7408f8d69..b71bb133d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF) +option(SHERPA_ONNX_ENABLE_WASM_VAD_ASR "Whether to enable WASM for VAD+ASR" OFF) option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON) @@ -137,6 +138,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}") +message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD_ASR ${SHERPA_ONNX_ENABLE_WASM_VAD_ASR}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}") message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}") @@ -211,11 +213,22 @@ if(SHERPA_ONNX_ENABLE_WASM) endif() if(SHERPA_ONNX_ENABLE_WASM_KWS) + if(NOT 
SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for KWS") + endif() add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) endif() if(SHERPA_ONNX_ENABLE_WASM_VAD) - add_definitions(-DSHERPA_ONNX_ENABLE_WASM_VAD=1) + if(NOT SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD") + endif() +endif() + +if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR) + if(NOT SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD+ASR") + endif() endif() if(NOT CMAKE_CXX_STANDARD) diff --git a/README.md b/README.md index dcdaec2f2..cc9acb2b1 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,13 @@ ### Supported platforms -|Architecture| Android | iOS | Windows | macOS | linux | -|------------|------------------|---------------|------------|-------|-------| -| x64 | ✔️ | | ✔️ | ✔️ | ✔️ | -| x86 | ✔️ | | ✔️ | | | -| arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | -| arm32 | ✔️ | | | | ✔️ | -| riscv64 | | | | | ✔️ | +|Architecture| Android | iOS | Windows | macOS | linux | +|------------|---------|---------|------------|-------|-------| +| x64 | ✔️ | | ✔️ | ✔️ | ✔️ | +| x86 | ✔️ | | ✔️ | | | +| arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | +| arm32 | ✔️ | | | | ✔️ | +| riscv64 | | | | | ✔️ | ### Supported programming languages @@ -37,7 +37,7 @@ |-------|----------|----------|------------| | ✔️ | ✔️ | ✔️ | ✔️ | -For Rust support, please see https://github.com/thewh1teagle/sherpa-rs +For Rust support, please see [sherpa-rs][sherpa-rs] It also supports WebAssembly. 
@@ -51,7 +51,7 @@ This repository supports running the following functions **locally** - Speaker verification - Spoken language identification - Audio tagging - - VAD (e.g., [silero-vad](https://github.com/snakers4/silero-vad)) + - VAD (e.g., [silero-vad][silero-vad]) - Keyword spotting on the following platforms and operating systems: @@ -62,11 +62,12 @@ on the following platforms and operating systems: - iOS - NodeJS - WebAssembly - - [Raspberry Pi](https://www.raspberrypi.com/) - - [RV1126](https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf) - - [LicheePi4A](https://sipeed.com/licheepi4a) - - [VisionFive 2](https://www.starfivetech.com/en/site/boards) - - [旭日X3派](https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html) + - [Raspberry Pi][Raspberry Pi] + - [RV1126][RV1126] + - [LicheePi4A][LicheePi4A] + - [VisionFive 2][VisionFive 2] + - [旭日X3派][旭日X3派] + - [爱芯派][爱芯派] - etc with the following APIs @@ -81,59 +82,68 @@ with the following APIs You can visit the following Huggingface spaces to try `sherpa-onnx` without installing anything. All you need is a browser. 
-| Description | URL | -|---|---| -| Speech recognition | [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition)| -| Speech recognition with [Whisper](https://github.com/openai/whisper)| [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper)| -| Speech synthesis | [Click me](https://huggingface.co/spaces/k2-fsa/text-to-speech)| -| Generate subtitles| [Click me](https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos)| -|Audio tagging| [Click me](https://huggingface.co/spaces/k2-fsa/audio-tagging)| -|Spoken language identification with [Whisper](https://github.com/openai/whisper)|[Click me](https://huggingface.co/spaces/k2-fsa/spoken-language-identification)| +| Description | URL | +|-------------------------------------------------------|------------------------------------| +| Speech recognition | [Click me][hf-space-asr] | +| Speech recognition with [Whisper][Whisper] | [Click me][hf-space-asr-whisper] | +| Speech synthesis | [Click me][hf-space-tts] | +| Generate subtitles | [Click me][hf-space-subtitle] | +| Audio tagging | [Click me][hf-space-audio-tagging] | +| Spoken language identification with [Whisper][Whisper]| [Click me][hf-space-slid-whisper] | We also have spaces built using WebAssembly. 
The are listed below: -| Description | URL| Chinese users| -|---|---|---| -|Voice activity detection with [silero-vad](https://github.com/snakers4/silero-vad)| [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx)|[地址](https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx)| -|Real-time speech recognition (Chinese + English) with Zipformer | [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)| -|Real-time speech recognition (Chinese + English) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| -|Real-time speech recognition (Chinese + English + Cantonese) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| -|Real-time speech recognition (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en)| -|Speech synthesis (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en)| -|Speech synthesis (German)|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de)| +| Description | Huggingface space| ModelScope space| +|------------------------------------------------------------------------------------------|------------------|-----------------| +|Voice activity detection with [silero-vad][silero-vad] | [Click me][wasm-hf-vad]|[地址][wasm-ms-vad]| +|Real-time speech recognition (Chinese + English) with 
Zipformer | [Click me][wasm-hf-streaming-asr-zh-en-zipformer]|[地址][wasm-ms-streaming-asr-zh-en-zipformer]| +|Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]| +|Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]| +|Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]| +|VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]| +|VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]| +|VAD + speech recognition (English) with Zipformer trained with [GigaSpeech][GigaSpeech] |[Click me][wasm-hf-vad-asr-en-zipformer-gigaspeech]| [地址][wasm-ms-vad-asr-en-zipformer-gigaspeech]| +|VAD + speech recognition (Chinese) with Zipformer trained with [WenetSpeech][WenetSpeech] |[Click me][wasm-hf-vad-asr-zh-zipformer-wenetspeech]| [地址][wasm-ms-vad-asr-zh-zipformer-wenetspeech]| +|VAD + speech recognition (Japanese) with Zipformer trained with [ReazonSpeech][ReazonSpeech]|[Click me][wasm-hf-vad-asr-ja-zipformer-reazonspeech]| [地址][wasm-ms-vad-asr-ja-zipformer-reazonspeech]| +|VAD + speech recognition (Thai) with Zipformer trained with [GigaSpeech2][GigaSpeech2] |[Click me][wasm-hf-vad-asr-th-zipformer-gigaspeech2]| [地址][wasm-ms-vad-asr-th-zipformer-gigaspeech2]| +|VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]| +|VAD + speech recognition (English + Chinese, 及多种中文方言) 
with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]| +|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]| +|Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]| +|Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]| ### Links for pre-built Android APKs -| Description | URL | 中国用户 | -|--------------------------------|-----------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html) | -| Text-to-speech | [Address](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html) | -|Voice activity detection (VAD) | [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html)| -|VAD + non-streaming speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html)| -|Two-pass speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html)| -| Audio tagging | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html) | -| Audio tagging (WearOS) | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html) | -| Speaker identification | 
[Address](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html) | -| Spoken language identification | [Address](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html) | -|Keyword spotting| [Address](https://k2-fsa.github.io/sherpa/onnx/kws/apk.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html)| +| Description | URL | 中国用户 | +|----------------------------------------|------------------------------|-----------------------------| +| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn]| +| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | +| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | +| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | +| Two-pass speech recognition | [Address][apk-2pass] | [点此][apk-2pass-cn] | +| Audio tagging | [Address][apk-at] | [点此][apk-at-cn] | +| Audio tagging (WearOS) | [Address][apk-at-wearos] | [点此][apk-at-wearos-cn] | +| Speaker identification | [Address][apk-sid] | [点此][apk-sid-cn] | +| Spoken language identification | [Address][apk-slid] | [点此][apk-slid-cn] | +| Keyword spotting | [Address][apk-kws] | [点此][apk-kws-cn] | ### Links for pre-built Flutter APPs #### Real-time speech recognition -| Description | URL | 中国用户 | -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------| -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html)| +| Description | URL | 中国用户 | +|--------------------------------|-------------------------------------|-------------------------------------| +| Streaming 
speech recognition | [Address][apk-flutter-streaming-asr]| [点此][apk-flutter-streaming-asr-cn]| #### Text-to-speech -| Description | URL | 中国用户 | -|--------------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------| -| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html)| -| Linux (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html) | -| macOS (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html) | -| macOS (arm64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html)| -| Windows (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html) | +| Description | URL | 中国用户 | +|------------------------------------------|------------------------------------|------------------------------------| +| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address][flutter-tts-android] | [点此][flutter-tts-android-cn] | +| Linux (x64) | [Address][flutter-tts-linux] | [点此][flutter-tts-linux-cn] | +| macOS (x64) | [Address][flutter-tts-macos-x64] | [点此][flutter-tts-macos-arm64-cn] | +| macOS (arm64) | [Address][flutter-tts-macos-arm64] | [点此][flutter-tts-macos-x64-cn] | +| Windows (x64) | [Address][flutter-tts-win-x64] | [点此][flutter-tts-win-x64-cn] | > Note: You need to build from source for iOS. @@ -141,23 +151,23 @@ We also have spaces built using WebAssembly. 
The are listed below: #### Generating subtitles -| Description | URL | 中国用户 | -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------| -| Generate subtitles (生成字幕) | [Address](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html)| +| Description | URL | 中国用户 | +|--------------------------------|----------------------------|----------------------------| +| Generate subtitles (生成字幕) | [Address][lazarus-subtitle]| [点此][lazarus-subtitle-cn]| ### Links for pre-trained models -| Description | URL | -|--------------------------------|--------------------------------------------------------------------------------------------------------------------------------| -| Speech recognition (speech to text, ASR) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | -| Text-to-speech (TTS) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) | -| VAD | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx)| -| Keyword spotting |[Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models)| -| Audio tagging | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models)| -| Speaker identification (Speaker ID) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models)| -| Spoken language identification (Language ID) | See multi-lingual [Whisper](https://github.com/openai/whisper) ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | -| Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)| +| Description | URL | 
+|---------------------------------------------|---------------------------------------------------------------------------------------| +| Speech recognition (speech to text, ASR) | [Address][asr-models] | +| Text-to-speech (TTS) | [Address][tts-models] | +| VAD | [Address][vad-models] | +| Keyword spotting | [Address][kws-models] | +| Audio tagging | [Address][at-models] | +| Speaker identification (Speaker ID) | [Address][sid-models] | +| Spoken language identification (Language ID)| See multi-lingual [Whisper][Whisper] ASR models from [Speech recognition][asr-models]| +| Punctuation | [Address][punct-models] | ### Useful links @@ -169,3 +179,100 @@ We also have spaces built using WebAssembly. The are listed below: Please see https://k2-fsa.github.io/sherpa/social-groups.html for 新一代 Kaldi **微信交流群** and **QQ 交流群**. + +[sherpa-rs]: https://github.com/thewh1teagle/sherpa-rs +[silero-vad]: https://github.com/snakers4/silero-vad +[Raspberry Pi]: https://www.raspberrypi.com/ +[RV1126]: https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf +[LicheePi4A]: https://sipeed.com/licheepi4a +[VisionFive 2]: https://www.starfivetech.com/en/site/boards +[旭日X3派]: https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html +[爱芯派]: https://wiki.sipeed.com/hardware/zh/maixIII/ax-pi/axpi.html +[hf-space-asr]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition +[Whisper]: https://github.com/openai/whisper +[hf-space-asr-whisper]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper +[hf-space-tts]: https://huggingface.co/spaces/k2-fsa/text-to-speech +[hf-space-subtitle]: https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos +[hf-space-audio-tagging]: https://huggingface.co/spaces/k2-fsa/audio-tagging +[hf-space-slid-whisper]: https://huggingface.co/spaces/k2-fsa/spoken-language-identification +[wasm-hf-vad]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx +[wasm-ms-vad]: 
https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx +[wasm-hf-streaming-asr-zh-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en +[wasm-ms-streaming-asr-zh-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en +[wasm-hf-streaming-asr-zh-en-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer +[wasm-ms-streaming-asr-zh-en-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer +[Paraformer-large]: https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary +[wasm-hf-streaming-asr-zh-en-yue-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer +[wasm-ms-streaming-asr-zh-en-yue-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer +[wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en +[wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en +[SenseVoice]: https://github.com/FunAudioLLM/SenseVoice +[wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice +[wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice +[wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny +[wasm-ms-vad-asr-en-whisper-tiny-en]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny +[wasm-hf-vad-asr-en-zipformer-gigaspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech +[wasm-ms-vad-asr-en-zipformer-gigaspeech]: 
https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech +[wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech +[wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech +[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf +[wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer +[wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer +[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2 +[wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer +[wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer +[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR +[wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech +[wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech +[wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer +[wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer +[wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small +[wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small +[wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en 
+[wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en +[wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de +[wasm-ms-tts-piper-de]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de +[apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html +[apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html +[apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html +[apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html +[apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html +[apk-vad-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html +[apk-vad-asr]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html +[apk-vad-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html +[apk-2pass]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html +[apk-2pass-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html +[apk-at]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html +[apk-at-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html +[apk-at-wearos]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html +[apk-at-wearos-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html +[apk-sid]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html +[apk-sid-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html +[apk-slid]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html +[apk-slid-cn]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html +[apk-kws]: https://k2-fsa.github.io/sherpa/onnx/kws/apk.html +[apk-kws-cn]: https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html +[apk-flutter-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html +[apk-flutter-streaming-asr-cn]: 
https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html +[flutter-tts-android]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html +[flutter-tts-android-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html +[flutter-tts-linux]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html +[flutter-tts-linux-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html +[flutter-tts-macos-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html +[flutter-tts-macos-arm64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html +[flutter-tts-macos-arm64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html +[flutter-tts-macos-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html +[flutter-tts-win-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html +[flutter-tts-win-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html +[lazarus-subtitle]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html +[lazarus-subtitle-cn]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html +[asr-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +[tts-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +[vad-models]: https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +[kws-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models +[at-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models +[sid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +[slid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +[punct-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models +[GigaSpeech]: https://github.com/SpeechColab/GigaSpeech +[WenetSpeech]: https://github.com/wenet-e2e/WenetSpeech diff --git 
a/build-wasm-simd-vad-asr.sh b/build-wasm-simd-vad-asr.sh new file mode 100755 index 000000000..5d15cf651 --- /dev/null +++ b/build-wasm-simd-vad-asr.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Copyright (c) 2024 Xiaomi Corporation +# +# This script is to build sherpa-onnx for WebAssembly (VAD+ASR) +# Note: ASR here means non-streaming ASR + +set -ex + +if [ x"$EMSCRIPTEN" == x"" ]; then + if ! command -v emcc &> /dev/null; then + echo "Please install emscripten first" + echo "" + echo "You can use the following commands to install it:" + echo "" + echo "git clone https://github.com/emscripten-core/emsdk.git" + echo "cd emsdk" + echo "git pull" + echo "./emsdk install latest" + echo "./emsdk activate latest" + echo "source ./emsdk_env.sh" + exit 1 + else + EMSCRIPTEN=$(dirname $(realpath $(which emcc))) + fi +fi + +export EMSCRIPTEN=$EMSCRIPTEN +echo "EMSCRIPTEN: $EMSCRIPTEN" +if [ ! -f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" + echo "Please make sure you have installed emsdk correctly" + exit 1 +fi + +mkdir -p build-wasm-simd-vad-asr +pushd build-wasm-simd-vad-asr + +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON + +cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \ + \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=OFF \ + -DSHERPA_ONNX_ENABLE_TTS=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + -DSHERPA_ONNX_ENABLE_GPU=OFF \ + -DSHERPA_ONNX_ENABLE_WASM=ON \ + -DSHERPA_ONNX_ENABLE_WASM_VAD_ASR=ON \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \ + .. 
+make -j2 +make install + +echo "pwd: $PWD" + +cp -fv ../wasm/vad/sherpa-onnx-vad.js ./install/bin/wasm/vad-asr/ +cp -fv ../wasm/asr/sherpa-onnx-asr.js ./install/bin/wasm/vad-asr/ + +ls -lh install/bin/wasm/vad-asr diff --git a/scripts/wasm/generate-vad-asr.py b/scripts/wasm/generate-vad-asr.py new file mode 100755 index 000000000..4c0099af8 --- /dev/null +++ b/scripts/wasm/generate-vad-asr.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 + +import argparse +from dataclasses import dataclass +from typing import List, Optional + +import jinja2 + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--total", + type=int, + default=1, + help="Number of runners", + ) + parser.add_argument( + "--index", + type=int, + default=0, + help="Index of the current runner", + ) + return parser.parse_args() + + +@dataclass +class Model: + model_name: str + hf: str # huggingface space name + ms: str # modelscope space name + short_name: str + cmd: str = "" + + +def get_models(): + models = [ + Model( + model_name="sherpa-onnx-whisper-tiny.en", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny", + short_name="vad-asr-en-whisper_tiny", + cmd=""" + pushd $model_name + mv -v tiny.en-encoder.int8.onnx ../whisper-encoder.onnx + mv -v tiny.en-decoder.int8.onnx ../whisper-decoder.onnx + mv -v tiny.en-tokens.txt ../tokens.txt + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Whisper tiny.en supporting English 英文/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice", + short_name="vad-asr-zh_en_ja_ko_cantonese-sense_voice_small", + cmd=""" + pushd $model_name + mv -v model.int8.onnx ../sense-voice.onnx + mv -v tokens.txt ../ + popd + rm -rf $model_name + sed 
-i.bak 's/Zipformer/SenseVoice Small supporting English, Chinese, Japanese, Korean, Cantonese 中英日韩粤/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-paraformer-zh-2023-09-14", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer", + short_name="vad-asr-zh_en-paraformer_large", + cmd=""" + pushd $model_name + mv -v model.int8.onnx ../paraformer.onnx + mv -v tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Paraformer supporting Chinese, English 中英/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-paraformer-zh-small-2024-03-09", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small", + short_name="vad-asr-zh_en-paraformer_small", + cmd=""" + pushd $model_name + mv -v model.int8.onnx ../paraformer.onnx + mv -v tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Paraformer-small supporting Chinese, English 中英文/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-zipformer-gigaspeech-2023-12-12", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech", + short_name="vad-asr-en-zipformer_gigaspeech", + cmd=""" + pushd $model_name + mv encoder-epoch-30-avg-1.int8.onnx ../transducer-encoder.onnx + mv decoder-epoch-30-avg-1.onnx ../transducer-decoder.onnx + mv joiner-epoch-30-avg-1.int8.onnx ../transducer-joiner.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting English 英语/g' ../index.html + git diff + """, + ), + Model( + model_name="icefall-asr-zipformer-wenetspeech-20230615", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech", + short_name="vad-asr-zh-zipformer_wenetspeech", + 
cmd=""" + pushd $model_name + mv -v data/lang_char/tokens.txt ../ + mv -v exp/encoder-epoch-12-avg-4.int8.onnx ../transducer-encoder.onnx + mv -v exp/decoder-epoch-12-avg-4.onnx ../transducer-decoder.onnx + mv -v exp/joiner-epoch-12-avg-4.int8.onnx ../transducer-joiner.onnx + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting Chinese 中文/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer", + short_name="vad-asr-ja-zipformer_reazonspeech", + cmd=""" + pushd $model_name + mv encoder-epoch-99-avg-1.int8.onnx ../transducer-encoder.onnx + mv decoder-epoch-99-avg-1.onnx ../transducer-decoder.onnx + mv joiner-epoch-99-avg-1.int8.onnx ../transducer-joiner.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting Japanese 日语/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-zipformer-thai-2024-06-20", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer", + short_name="vad-asr-th-zipformer_gigaspeech2", + cmd=""" + pushd $model_name + mv encoder-epoch-12-avg-5.int8.onnx ../transducer-encoder.onnx + mv decoder-epoch-12-avg-5.onnx ../transducer-decoder.onnx + mv joiner-epoch-12-avg-5.int8.onnx ../transducer-joiner.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting Thai 泰语/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech", + short_name="vad-asr-zh-telespeech", + cmd=""" + pushd $model_name + mv model.int8.onnx ../telespeech.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 
's/Zipformer/TeleSpeech-ASR supporting Chinese 多种中文方言/g' ../index.html + git diff + """, + ), + ] + return models + + +def main(): + args = get_args() + index = args.index + total = args.total + assert 0 <= index < total, (index, total) + + all_model_list = get_models() + + num_models = len(all_model_list) + + num_per_runner = num_models // total + if num_per_runner <= 0: + raise ValueError(f"num_models: {num_models}, num_runners: {total}") + + start = index * num_per_runner + end = start + num_per_runner + + remaining = num_models - args.total * num_per_runner + + print(f"{index}/{total}: {start}-{end}/{num_models}") + + d = dict() + d["model_list"] = all_model_list[start:end] + if index < remaining: + s = args.total * num_per_runner + index + d["model_list"].append(all_model_list[s]) + print(f"{s}/{num_models}") + + filename_list = [ + "./run-vad-asr.sh", + ] + for filename in filename_list: + environment = jinja2.Environment() + with open(f"{filename}.in") as f: + s = f.read() + template = environment.from_string(s) + + s = template.render(**d) + with open(filename, "w") as f: + print(s, file=f) + + +if __name__ == "__main__": + main() diff --git a/scripts/wasm/run-vad-asr.sh.in b/scripts/wasm/run-vad-asr.sh.in new file mode 100644 index 000000000..8d5e1d206 --- /dev/null +++ b/scripts/wasm/run-vad-asr.sh.in @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# +# Build WebAssembly APPs for huggingface spaces and modelscope spaces + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + +{% for model in model_list %} +model_name={{ model.model_name }} +short_name={{ model.short_name }} +hf_name={{ model.hf }} +ms_name={{ model.ms }} + +pushd wasm/vad-asr +git checkout . 
+rm -rf assets +mkdir assets +cd assets +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${model_name}.tar.bz2 +tar xvf ${model_name}.tar.bz2 +rm ${model_name}.tar.bz2 + +{{ model.cmd }} + +popd + +ls -lh wasm/vad-asr/assets + +rm -rf build-wasm-simd-vad-asr/install +rm -rf build-wasm-simd-vad-asr/wasm + +./build-wasm-simd-vad-asr.sh + +dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-${short_name} +mv build-wasm-simd-vad-asr/install/bin/wasm/vad-asr $dst +ls -lh $dst +tar cjfv $dst.tar.bz2 ./$dst +ls -lh *.tar.bz2 + +git config --global user.email "csukuangfj@gmail.com" +git config --global user.name "Fangjun Kuang" + +export GIT_LFS_SKIP_SMUDGE=1 +export GIT_CLONE_PROTECTION_ACTIVE=false + +rm -rf ms +git clone https://www.modelscope.cn/studios/$ms_name.git ms + +cd ms +cp -v ../$dst/* . + +git status +git lfs track "*.data" +git lfs track "*.wasm" +ls -lh + +git add . +git commit -m "update model" +git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/$ms_name.git +cd .. +rm -rf ms + +rm -rf huggingface + +git clone https://huggingface.co/spaces/$hf_name huggingface +cd huggingface +cp -v ../$dst/* . + +git status +git lfs track "*.data" +git lfs track "*.wasm" +ls -lh + +git add . +git commit -m "update model" +git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/$hf_name main +cd .. 
+rm -rf huggingface +rm -rf $dst + +ls -lh *.tar.bz2 + +{% endfor %} diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index e01ae0478..f2bbf9d76 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -13,6 +13,7 @@ #include "sherpa-onnx/csrc/audio-tagging.h" #include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/display.h" +#include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/keyword-spotter.h" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-punctuation.h" @@ -1638,3 +1639,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { p->impl->Reset(); } + +int32_t SherpaOnnxFileExists(const char *filename) { + return sherpa_onnx::FileExists(filename); +} diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 97b8d8081..d4844aed1 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -1361,6 +1361,9 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( const SherpaOnnxLinearResampler *p); +// Return 1 if the file exists; return 0 if the file does not exist. 
+SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); + #if defined(__GNUC__) #pragma GCC diagnostic pop #endif diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt index 075dfbf8d..b143e57b8 100644 --- a/wasm/CMakeLists.txt +++ b/wasm/CMakeLists.txt @@ -14,6 +14,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD) add_subdirectory(vad) endif() +if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR) + add_subdirectory(vad-asr) +endif() + if(SHERPA_ONNX_ENABLE_WASM_NODEJS) add_subdirectory(nodejs) endif() diff --git a/wasm/asr/assets/README.md b/wasm/asr/assets/README.md index d37c431a7..983347f78 100644 --- a/wasm/asr/assets/README.md +++ b/wasm/asr/assets/README.md @@ -80,3 +80,10 @@ assets fangjun$ tree -L 1 0 directories, 4 files ``` + +You can find example build scripts at: + + - Streaming Zipformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/ wasm-simd-hf-space-zh-en-asr-zipformer.yaml + - Streaming Zipformer (English): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml + - Streaming Paraformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml + - Streaming Paraformer (English + Chinese + Cantonese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml diff --git a/wasm/asr/index.html b/wasm/asr/index.html index 3156321c6..53ee43d8f 100644 --- a/wasm/asr/index.html +++ b/wasm/asr/index.html @@ -3,7 +3,7 @@ - Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech + Next-gen Kaldi WebAssembly with sherpa-onnx for ASR + + + +

+ Next-gen Kaldi + WebAssembly
+ VAD+ASR Demo with sherpa-onnx
+ (with Zipformer) +

+ +
+ Loading model ... ... +
+
+ + + +
+
+ +
+ +
+
+ + + + + + diff --git a/wasm/vad-asr/sherpa-onnx-asr.js b/wasm/vad-asr/sherpa-onnx-asr.js new file mode 120000 index 000000000..fada5db1d --- /dev/null +++ b/wasm/vad-asr/sherpa-onnx-asr.js @@ -0,0 +1 @@ +../asr/sherpa-onnx-asr.js \ No newline at end of file diff --git a/wasm/vad-asr/sherpa-onnx-vad.js b/wasm/vad-asr/sherpa-onnx-vad.js new file mode 120000 index 000000000..47b3c8d0b --- /dev/null +++ b/wasm/vad-asr/sherpa-onnx-vad.js @@ -0,0 +1 @@ +../vad/sherpa-onnx-vad.js \ No newline at end of file diff --git a/wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc b/wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc new file mode 100644 index 000000000..1e2fc00b2 --- /dev/null +++ b/wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc @@ -0,0 +1,19 @@ +// wasm/sherpa-onnx-wasm-main-vad-asr.cc +// +// Copyright (c) 2024 Xiaomi Corporation +#include + +#include +#include + +#include "sherpa-onnx/c-api/c-api.h" + +// see also +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html + +extern "C" { + +void CopyHeap(const char *src, int32_t num_bytes, char *dst) { + std::copy(src, src + num_bytes, dst); +} +} diff --git a/wasm/vad/assets/README.md b/wasm/vad/assets/README.md index 99510982a..3d5a76210 100644 --- a/wasm/vad/assets/README.md +++ b/wasm/vad/assets/README.md @@ -3,3 +3,6 @@ Please download https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`. + +You can find example build script at +https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-silero-vad.yaml diff --git a/wasm/vad/index.html b/wasm/vad/index.html index 5d8e0372c..7ae2a76e6 100644 --- a/wasm/vad/index.html +++ b/wasm/vad/index.html @@ -3,7 +3,7 @@ - Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech + Next-gen Kaldi WebAssembly with sherpa-onnx for VAD