Add initial WIT API support
Signed-off-by: yshalsager <[email protected]>
yshalsager committed Mar 28, 2022
1 parent f914db4 commit 1a30849
Showing 6 changed files with 206 additions and 3 deletions.
17 changes: 16 additions & 1 deletion README.md
@@ -284,6 +284,11 @@ Supported formats below:
- Supported
- 16bit/16000Hz/mono PCM

[Wit.ai Speech-to-Text API](https://wit.ai/docs/http/20200513#post__speech_link)

- Supported
- 16bit/8000Hz/mono PCM

Also, you can use the built-in audio pre-processing function, though Google [doesn't recommend](https://cloud.google.com/speech-to-text/docs/best-practices) doing this. That said, if your audio volume has not been standardized and is too loud or too quiet, it's recommended to use some external tool or the built-in function to standardize it. The default [pre-processing commands](https://github.com/agermanidis/autosub/issues/40#issuecomment-509928060) depend on ffmpeg-normalize and ffmpeg and consist of three steps. The [first](https://trac.ffmpeg.org/wiki/AudioChannelManipulation) converts stereo to mono. The [second](https://superuser.com/questions/733061/reduce-background-noise-and-optimize-the-speech-from-an-audio-clip-using-ffmpeg) filters out sound outside the speech frequency range. The third normalizes the audio so it is neither too loud nor too quiet. If you are not satisfied with the default commands, you can modify them yourself via the `-apc` option (see the sketch below). Still, it currently only supports the 24bit/44100Hz/mono FLAC format.
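
Roughly, those three steps look like the sketch below when driven from Python. The filter settings and the `ffmpeg-normalize` arguments here are illustrative assumptions, not autosub's exact internal defaults; use `-apc` to override the real commands with your own.

```
# A minimal sketch of the three default pre-processing steps.
# The ffmpeg filter values below are assumptions for illustration only.
import subprocess

def preprocess(src, dst):
    # 1. Downmix stereo to mono.
    subprocess.run(["ffmpeg", "-y", "-i", src, "-ac", "1", "mono.wav"], check=True)
    # 2. Keep roughly the speech band, dropping low rumble and high-frequency hiss.
    subprocess.run(["ffmpeg", "-y", "-i", "mono.wav",
                    "-af", "highpass=f=200,lowpass=f=3000", "speech.wav"], check=True)
    # 3. Normalize loudness so the result is neither too loud nor too quiet.
    subprocess.run(["ffmpeg-normalize", "speech.wav", "-o", dst, "-f"], check=True)

preprocess("input.wav", "preprocessed.wav")
```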

If the input is a subtitles file and you give the proper arguments, it is only translated by py-googletrans.
@@ -648,6 +653,14 @@ command:
autosub -sapi baidu -i input_file -sconf baidu_speech_config ...(other options)
```

##### Wit.ai Speech-to-Text

Use a Wit.ai Speech-to-Text API key to transcribe. You can pass the key with `-skey`/`--speech-key` or set it in the `WIT_AI_API_KEY` environment variable; the `-skey` value takes precedence if both are given.

```
autosub -i input_file -sapi witai -S lang_code -skey API_key ...(other options)
```

<escape><a href = "#TOC">&nbsp;&nbsp;</a></escape>

##### Translate Subtitles
@@ -795,6 +808,8 @@ Speech Options:
xfyun.cn/doc/asr/voicedictation/API.html). baidu:
Baidu Automatic Speech Recognition API
(https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily)
witai: Wit.ai Speech Recognition API
(https://wit.ai/docs/http/20200513#post__speech_link)
(arg_num = 1) (default: gsv2)
-skey key, --speech-key key
The API key for Google Speech-to-Text API. (arg_num =
@@ -1061,7 +1076,7 @@ I won't add any new features unless I'm less busy in the future. However, pull r

[issue #13](https://github.com/BingLingGroup/autosub/issues/13)

Same as above. Currently won't add it. You can use batch/powershell/bash to implement it.
Same as above. Currently, won't add it. You can use batch/powershell/bash to implement it.

Example for batch:(working at current directory)

81 changes: 81 additions & 0 deletions autosub/api_wit_ai.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Defines WIT AI API used by autosub.
"""

# Import built-in modules
import json
import gettext
import os

# Import third-party modules
import requests

# Any changes to the path and your own modules
from autosub import constants

API_WIT_AI_TEXT = gettext.translation(domain=__name__,
localedir=constants.LOCALE_PATH,
languages=[constants.CURRENT_LOCALE],
fallback=True)

_ = API_WIT_AI_TEXT.gettext


def get_wit_ai_transcript(result_dict):
"""
Function for getting transcript from WIT AI Speech-to-Text json format string result.
"""
return result_dict['_text'] if '_text' in result_dict else result_dict['text']


class WITAiAPI: # pylint: disable=too-few-public-methods
"""
Class for performing Speech-to-Text using Wit.ai Speech-to-Text API.
"""

def __init__(self,
api_url,
api_key,
retries=3,
is_keep=False,
is_full_result=False):
# pylint: disable=too-many-arguments
self.retries = retries
self.api_url = api_url
self.api_key = api_key
self.is_keep = is_keep
self.is_full_result = is_full_result
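# Raw 16-bit little-endian PCM at 8000 Hz, matching the fragments autosub prepares for this API.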
self.headers = {
'authorization': f'Bearer {self.api_key}',
'accept': 'application/vnd.wit.20200513+json',
'content-type': 'audio/raw;encoding=signed-integer;bits=16;rate=8000;endian=little',
}

def __call__(self, filename):
try: # pylint: disable=too-many-nested-blocks
with open(filename, mode='rb') as audio_file:
audio_data = audio_file.read()
if not self.is_keep:
os.remove(filename)
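# Retry the request a few times, skipping transient connection errors and unparsable responses.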
for _ in range(self.retries):
try:
requests_result = requests.post(self.api_url, data=audio_data, headers=self.headers)
except requests.exceptions.ConnectionError:
continue
requests_result_json = requests_result.content.decode("utf-8")
try:
result_dict = json.loads(requests_result_json)
except ValueError:
# no result
continue

if not self.is_full_result:
return get_wit_ai_transcript(result_dict)
return result_dict

except KeyboardInterrupt:
return None

return None
36 changes: 35 additions & 1 deletion autosub/cmdline_utils.py
@@ -299,6 +299,10 @@ def validate_speech_config(args): # pylint: disable=too-many-branches, too-many
elif "languageCode" in config_dict and config_dict["languageCode"]:
args.speech_language = config_dict["languageCode"]

elif args.speech_api == "witai":
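# Fragments for Wit.ai are sent as raw 16-bit/8000 Hz/mono PCM, so force the suffix and sample rate.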
args.api_suffix = ".pcm"
args.api_sample_rate = 8000

else:
args.api_suffix = ".pcm"
args.api_sample_rate = 16000
@@ -642,7 +646,10 @@ def fix_args(args):
"Now reset to {dmxcs}.").format(mxcs=args.max_continuous_silence,
dmxcs=constants.DEFAULT_CONTINUOUS_SILENCE))
args.max_continuous_silence = constants.DEFAULT_CONTINUOUS_SILENCE

# Make sure the witai conversion settings are correct (raw 16-bit/8000 Hz/mono PCM)
if args.speech_api == "witai":
args.api_suffix = ".pcm"
args.api_sample_rate = 8000

def get_timed_text(
is_empty_dropped,
@@ -1541,6 +1548,33 @@ def audio_or_video_prcs( # pylint: disable=too-many-branches, too-many-statemen
concurrency=args.speech_concurrency,
is_keep=False,
result_list=result_list)
elif args.speech_api == "witai":
# WIT AI API
if args.http_speech_api:
wit_api_url = f"http://{constants.WIT_AI_API_URL}"
else:
wit_api_url = f"https://{constants.WIT_AI_API_URL}"
wit_api_key = None
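# A key given with "-skey"/"--speech-key" overrides the WIT_AI_API_KEY environment variable.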
if 'WIT_AI_API_KEY' in os.environ:
print(_("Using the WIT_AI_API_KEY "
"in the environment variables."))
wit_api_key = os.environ.get('WIT_AI_API_KEY')
if args.speech_key:
print(_("Use the API key "
"given in the option \"-skey\"/\"--speech-key\"."))
wit_api_key = args.speech_key
if wit_api_key:
text_list = core.wit_ai_to_text(
audio_fragments=audio_fragments,
api_url=wit_api_url,
api_key=wit_api_key,
concurrency=args.speech_concurrency,
is_keep=args.keep,
result_list=result_list)
else:
print(_("No available WIT_AI_API_KEY. "
"Use \"-skey\"/\"--speech-key\" to set one."))
text_list = None
else:
text_list = None

1 change: 1 addition & 0 deletions autosub/constants.py
@@ -71,6 +71,7 @@
BAIDU_ASR_URL = "http://vop.baidu.com/server_api"
BAIDU_PRO_ASR_URL = "http://vop.baidu.com/pro_api"
BAIDU_TOKEN_URL = "http://openapi.baidu.com/oauth/2.0/token"
WIT_AI_API_URL = "api.wit.ai/speech"

if multiprocessing.cpu_count() > 3:
DEFAULT_CONCURRENCY = multiprocessing.cpu_count() >> 1
70 changes: 70 additions & 0 deletions autosub/core.py
@@ -24,6 +24,7 @@
# Any changes to the path and your own modules
from autosub import api_baidu
from autosub import api_google
from autosub import api_wit_ai
from autosub import api_xfyun
from autosub import auditok_utils
from autosub import sub_utils
@@ -697,6 +698,75 @@ def baidu_to_text( # pylint: disable=too-many-locals, too-many-arguments,
return text_list


def wit_ai_to_text( # pylint: disable=too-many-locals,too-many-arguments,too-many-branches,too-many-statements
audio_fragments,
api_url,
api_key,
concurrency=constants.DEFAULT_CONCURRENCY,
is_keep=False,
result_list=None):
"""
Give a list of short-term audio fragment files
and generate text_list from WIT AI speech-to-text api.
"""
text_list = []
pool = multiprocessing.Pool(concurrency)

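# Ask the API wrapper for full JSON results only when the caller collects them in result_list.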
recognizer = api_wit_ai.WITAiAPI(
api_url=api_url,
api_key=api_key,
is_keep=is_keep,
is_full_result=result_list is not None)

print(_("\nSending short-term fragments to WIT AI API and getting result."))
widgets = [_("Speech-to-Text: "),
progressbar.Percentage(), ' ',
progressbar.Bar(), ' ',
progressbar.ETA()]
pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(audio_fragments)).start()
try:
# get transcript
if result_list is None:
for i, transcript in enumerate(pool.imap(recognizer, audio_fragments)):
if transcript:
text_list.append(transcript)
else:
text_list.append("")
gc.collect(0)
pbar.update(i)
# get full result and transcript
else:
for i, result in enumerate(pool.imap(recognizer, audio_fragments)):
if result:
result_list.append(result)
transcript = \
api_wit_ai.get_wit_ai_transcript(result)
if transcript:
text_list.append(transcript)
continue
else:
result_list.append("")
text_list.append("")
gc.collect(0)
pbar.update(i)
pbar.finish()
pool.terminate()
pool.join()

except (KeyboardInterrupt, AttributeError) as error:
pbar.finish()
pool.terminate()
pool.join()

if isinstance(error, AttributeError):
print(
_("Error: Connection error happened too many times.\nAll work done."))

return None

return text_list


def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments, too-many-branches, too-many-statements
text_list,
translator,
4 changes: 3 additions & 1 deletion autosub/options.py
@@ -246,7 +246,7 @@ def get_cmd_parser(): # pylint: disable=too-many-statements
'-sapi', '--speech-api',
metavar=_('API_code'),
default='gsv2',
choices=["gsv2", "gcsv1", "xfyun", "baidu"],
choices=["gsv2", "gcsv1", "xfyun", "baidu", "witai"],
help=_("Choose which Speech-to-Text API to use. "
"Currently support: "
"gsv2: Google Speech V2 (https://github.com/gillesdemey/google-speech-v2). "
@@ -256,6 +256,8 @@
"(https://www.xfyun.cn/doc/asr/voicedictation/API.html). "
"baidu: Baidu Automatic Speech Recognition API "
"(https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily) "
"witai: Wit.ai Speech Recognition API "
"(https://wit.ai/docs/http/20200513#post__speech_link) "
"(arg_num = 1) (default: %(default)s)"))

speech_group.add_argument(
