Add initial WIT API support
Signed-off-by: yshalsager <[email protected]>
yshalsager committed Mar 28, 2022
1 parent f914db4 commit 1a30849
Showing 6 changed files with 206 additions and 3 deletions.
17 changes: 16 additions & 1 deletion README.md
@@ -284,6 +284,11 @@ Supported formats below:
- Supported
- 16bit/16000Hz/mono PCM

[Wit.ai Speech-to-Text API](https://wit.ai/docs/http/20200513#post__speech_link)

- Supported
- 16bit/8000Hz/mono PCM

Also, you can use the built-in audio pre-processing function, though Google [doesn't recommend](https://cloud.google.com/speech-to-text/docs/best-practices) doing this. That said, if your audio volume has not been standardized and is too loud or too quiet, it's recommended to use some external tool or the built-in function to standardize it. The default [pre-processing commands](https://github.com/agermanidis/autosub/issues/40#issuecomment-509928060) depend on ffmpeg-normalize and ffmpeg and consist of three steps. The [first](https://trac.ffmpeg.org/wiki/AudioChannelManipulation) converts stereo to mono. The [second](https://superuser.com/questions/733061/reduce-background-noise-and-optimize-the-speech-from-an-audio-clip-using-ffmpeg) filters out sound outside the speech frequency range. The third normalizes the audio so it is neither too loud nor too quiet. If you are not satisfied with the default commands, you can modify them yourself via the `-apc` option (see the sketch below). Still, it currently only supports the 24bit/44100Hz/mono FLAC format.
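
Roughly, those three steps look like the sketch below when driven from Python. The filter settings and the `ffmpeg-normalize` arguments here are illustrative assumptions, not autosub's exact internal defaults; use `-apc` to override the real commands with your own.

```
# A minimal sketch of the three default pre-processing steps.
# The ffmpeg filter values below are assumptions for illustration only.
import subprocess

def preprocess(src, dst):
    # 1. Downmix stereo to mono.
    subprocess.run(["ffmpeg", "-y", "-i", src, "-ac", "1", "mono.wav"], check=True)
    # 2. Keep roughly the speech band, dropping low rumble and high-frequency hiss.
    subprocess.run(["ffmpeg", "-y", "-i", "mono.wav",
                    "-af", "highpass=f=200,lowpass=f=3000", "speech.wav"], check=True)
    # 3. Normalize loudness so the result is neither too loud nor too quiet.
    subprocess.run(["ffmpeg-normalize", "speech.wav", "-o", dst, "-f"], check=True)

preprocess("input.wav", "preprocessed.wav")
```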

If the input is a subtitles file and you give the proper arguments, it is only translated by py-googletrans.
@@ -648,6 +653,14 @@ command:
autosub -sapi baidu -i input_file -sconf baidu_speech_config ...(other options)
```

##### Wit.ai Speech-to-Text

Use a Wit.ai Speech-to-Text API key to transcribe. You can pass the key with `-skey`/`--speech-key` or set it in the `WIT_AI_API_KEY` environment variable; the `-skey` value takes precedence if both are given.

```
autosub -i input_file -sapi witai -S lang_code -skey API_key ...(other options)
```

<escape><a href = "#TOC">&nbsp;&nbsp;</a></escape>

##### Translate Subtitles
@@ -795,6 +808,8 @@ Speech Options:
xfyun.cn/doc/asr/voicedictation/API.html). baidu:
Baidu Automatic Speech Recognition API
(https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily)
witai: Wit.ai Speech Recognition API
(https://wit.ai/docs/http/20200513#post__speech_link)
(arg_num = 1) (default: gsv2)
-skey key, --speech-key key
The API key for Google Speech-to-Text API. (arg_num =
@@ -1061,7 +1076,7 @@ I won't add any new features unless I'm less busy in the future. However, pull r

[issue #13](https://github.com/BingLingGroup/autosub/issues/13)

Same as above. Currently won't add it. You can use batch/powershell/bash to implement it.
Same as above. Currently, won't add it. You can use batch/powershell/bash to implement it.

Example for batch:(working at current directory)

81 changes: 81 additions & 0 deletions autosub/api_wit_ai.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Defines WIT AI API used by autosub.
"""

# Import built-in modules
import json
import gettext
import os

# Import third-party modules
import requests

# Any changes to the path and your own modules
from autosub import constants

API_WIT_AI_TEXT = gettext.translation(domain=__name__,
localedir=constants.LOCALE_PATH,
languages=[constants.CURRENT_LOCALE],
fallback=True)

_ = API_WIT_AI_TEXT.gettext


def get_wit_ai_transcript(result_dict):
"""
Function for getting transcript from WIT AI Speech-to-Text json format string result.
"""
return result_dict['_text'] if '_text' in result_dict else result_dict['text']


class WITAiAPI: # pylint: disable=too-few-public-methods
"""
Class for performing Speech-to-Text using Wit.ai Speech-to-Text API.
"""

def __init__(self,
api_url,
api_key,
retries=3,
is_keep=False,
is_full_result=False):
# pylint: disable=too-many-arguments
self.retries = retries
self.api_url = api_url
self.api_key = api_key
self.is_keep = is_keep
self.is_full_result = is_full_result
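# Raw 16-bit little-endian PCM at 8000 Hz, matching the fragments autosub prepares for this API.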
self.headers = {
'authorization': f'Bearer {self.api_key}',
'accept': 'application/vnd.wit.20200513+json',
'content-type': 'audio/raw;encoding=signed-integer;bits=16;rate=8000;endian=little',
}

def __call__(self, filename):
try: # pylint: disable=too-many-nested-blocks
with open(filename, mode='rb') as audio_file:
audio_data = audio_file.read()
if not self.is_keep:
os.remove(filename)
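# Retry the request a few times, skipping transient connection errors and unparsable responses.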
for _ in range(self.retries):
try:
requests_result = requests.post(self.api_url, data=audio_data, headers=self.headers)
except requests.exceptions.ConnectionError:
continue
requests_result_json = requests_result.content.decode("utf-8")
try:
result_dict = json.loads(requests_result_json)
except ValueError:
# no result
continue

if not self.is_full_result:
return get_wit_ai_transcript(result_dict)
return result_dict

except KeyboardInterrupt:
return None

return None
36 changes: 35 additions & 1 deletion autosub/cmdline_utils.py
@@ -299,6 +299,10 @@ def validate_speech_config(args): # pylint: disable=too-many-branches, too-many
elif "languageCode" in config_dict and config_dict["languageCode"]:
args.speech_language = config_dict["languageCode"]

elif args.speech_api == "witai":
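# Fragments for Wit.ai are sent as raw 16-bit/8000 Hz/mono PCM, so force the suffix and sample rate.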
args.api_suffix = ".pcm"
args.api_sample_rate = 8000

else:
args.api_suffix = ".pcm"
args.api_sample_rate = 16000
@@ -642,7 +646,10 @@ def fix_args(args):
"Now reset to {dmxcs}.").format(mxcs=args.max_continuous_silence,
dmxcs=constants.DEFAULT_CONTINUOUS_SILENCE))
args.max_continuous_silence = constants.DEFAULT_CONTINUOUS_SILENCE

# Make sure the witai conversion settings are correct (raw 16-bit/8000 Hz/mono PCM)
if args.speech_api == "witai":
args.api_suffix = ".pcm"
args.api_sample_rate = 8000

def get_timed_text(
is_empty_dropped,
@@ -1541,6 +1548,33 @@ def audio_or_video_prcs( # pylint: disable=too-many-branches, too-many-statemen
concurrency=args.speech_concurrency,
is_keep=False,
result_list=result_list)
elif args.speech_api == "witai":
# WIT AI API
if args.http_speech_api:
wit_api_url = f"http://{constants.WIT_AI_API_URL}"
else:
wit_api_url = f"https://{constants.WIT_AI_API_URL}"
wit_api_key = None
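# A key given with "-skey"/"--speech-key" overrides the WIT_AI_API_KEY environment variable.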
if 'WIT_AI_API_KEY' in os.environ:
print(_("Using the WIT_AI_API_KEY "
"in the environment variables."))
wit_api_key = os.environ.get('WIT_AI_API_KEY')
if args.speech_key:
print(_("Use the API key "
"given in the option \"-skey\"/\"--speech-key\"."))
wit_api_key = args.speech_key
if wit_api_key:
text_list = core.wit_ai_to_text(
audio_fragments=audio_fragments,
api_url=wit_api_url,
api_key=wit_api_key,
concurrency=args.speech_concurrency,
is_keep=args.keep,
result_list=result_list)
else:
print(_("No available WIT_AI_API_KEY. "
"Use \"-skey\"/\"--speech-key\" to set one."))
text_list = None
else:
text_list = None

1 change: 1 addition & 0 deletions autosub/constants.py
@@ -71,6 +71,7 @@
BAIDU_ASR_URL = "http://vop.baidu.com/server_api"
BAIDU_PRO_ASR_URL = "http://vop.baidu.com/pro_api"
BAIDU_TOKEN_URL = "http://openapi.baidu.com/oauth/2.0/token"
WIT_AI_API_URL = "api.wit.ai/speech"

if multiprocessing.cpu_count() > 3:
DEFAULT_CONCURRENCY = multiprocessing.cpu_count() >> 1
70 changes: 70 additions & 0 deletions autosub/core.py
@@ -24,6 +24,7 @@
# Any changes to the path and your own modules
from autosub import api_baidu
from autosub import api_google
from autosub import api_wit_ai
from autosub import api_xfyun
from autosub import auditok_utils
from autosub import sub_utils
@@ -697,6 +698,75 @@ def baidu_to_text( # pylint: disable=too-many-locals, too-many-arguments,
return text_list


def wit_ai_to_text( # pylint: disable=too-many-locals,too-many-arguments,too-many-branches,too-many-statements
audio_fragments,
api_url,
api_key,
concurrency=constants.DEFAULT_CONCURRENCY,
is_keep=False,
result_list=None):
"""
Give a list of short-term audio fragment files
and generate text_list from WIT AI speech-to-text api.
"""
text_list = []
pool = multiprocessing.Pool(concurrency)

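# Ask the API wrapper for full JSON results only when the caller collects them in result_list.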
recognizer = api_wit_ai.WITAiAPI(
api_url=api_url,
api_key=api_key,
is_keep=is_keep,
is_full_result=result_list is not None)

print(_("\nSending short-term fragments to WIT AI API and getting result."))
widgets = [_("Speech-to-Text: "),
progressbar.Percentage(), ' ',
progressbar.Bar(), ' ',
progressbar.ETA()]
pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(audio_fragments)).start()
try:
# get transcript
if result_list is None:
for i, transcript in enumerate(pool.imap(recognizer, audio_fragments)):
if transcript:
text_list.append(transcript)
else:
text_list.append("")
gc.collect(0)
pbar.update(i)
# get full result and transcript
else:
for i, result in enumerate(pool.imap(recognizer, audio_fragments)):
if result:
result_list.append(result)
transcript = \
api_wit_ai.get_wit_ai_transcript(result)
if transcript:
text_list.append(transcript)
continue
else:
result_list.append("")
text_list.append("")
gc.collect(0)
pbar.update(i)
pbar.finish()
pool.terminate()
pool.join()

except (KeyboardInterrupt, AttributeError) as error:
pbar.finish()
pool.terminate()
pool.join()

if isinstance(error, AttributeError):
print(
_("Error: Connection error happened too many times.\nAll work done."))

return None

return text_list


def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments, too-many-branches, too-many-statements
text_list,
translator,
4 changes: 3 additions & 1 deletion autosub/options.py
@@ -246,7 +246,7 @@ def get_cmd_parser(): # pylint: disable=too-many-statements
'-sapi', '--speech-api',
metavar=_('API_code'),
default='gsv2',
choices=["gsv2", "gcsv1", "xfyun", "baidu"],
choices=["gsv2", "gcsv1", "xfyun", "baidu", "witai"],
help=_("Choose which Speech-to-Text API to use. "
"Currently support: "
"gsv2: Google Speech V2 (https://github.com/gillesdemey/google-speech-v2). "
@@ -256,6 +256,8 @@
"(https://www.xfyun.cn/doc/asr/voicedictation/API.html). "
"baidu: Baidu Automatic Speech Recognition API "
"(https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily) "
"witai: Wit.ai Speech Recognition API "
"(https://wit.ai/docs/http/20200513#post__speech_link) "
"(arg_num = 1) (default: %(default)s)"))

speech_group.add_argument(
