From 1a30849b551e08dbf1cc83a8c4bfb0e3be7ceb16 Mon Sep 17 00:00:00 2001
From: yshalsager
Date: Wed, 14 Apr 2021 18:11:04 +0200
Subject: [PATCH] Add initial Wit.ai API support

Signed-off-by: yshalsager
---
 README.md                | 17 ++++++++-
 autosub/api_wit_ai.py    | 81 ++++++++++++++++++++++++++++++++++++++++
 autosub/cmdline_utils.py | 36 +++++++++++++++++-
 autosub/constants.py     |  1 +
 autosub/core.py          | 70 ++++++++++++++++++++++++++++++++++
 autosub/options.py       |  4 +-
 6 files changed, 206 insertions(+), 3 deletions(-)
 create mode 100644 autosub/api_wit_ai.py

diff --git a/README.md b/README.md
index e0b8cc43..179a9c76 100644
--- a/README.md
+++ b/README.md
@@ -284,6 +284,11 @@ Supported formats below:
 - Supported
   - 16bit/16000Hz/mono PCM
 
+[Wit.ai Speech-to-Text API](https://wit.ai/docs/http/20200513#post__speech_link)
+
+- Supported
+  - 16bit/8000Hz/mono PCM
+
 Also, you can use the built-in audio pre-processing function, though Google [doesn't recommend](https://cloud.google.com/speech-to-text/docs/best-practices) doing this. Honestly speaking, if your audio volume hasn't been standardized (it's too loud or too quiet), it's recommended to use some external tool or the built-in function to standardize it. The default [pre-processing commands](https://github.com/agermanidis/autosub/issues/40#issuecomment-509928060) depend on ffmpeg-normalize and ffmpeg. There are three commands. The [first](https://trac.ffmpeg.org/wiki/AudioChannelManipulation) converts stereo to mono. The [second](https://superuser.com/questions/733061/reduce-background-noise-and-optimize-the-speech-from-an-audio-clip-using-ffmpeg) filters out sound outside the frequency range of speech. The third normalizes the audio to make sure it is neither too loud nor too quiet. If you are not satisfied with the default commands, you can also modify them yourself with the `-apc` option. Still, it currently only supports the 24bit/44100Hz/mono FLAC format.
 
 If the input is a subtitles file and you give the proper arguments, it is only translated by py-googletrans.
@@ -648,6 +653,14 @@ command:
 
 ```
 autosub -sapi baidu -i input_file -sconf baidu_speech_config ...(other options)
 ```
 
+##### Wit.ai Speech-to-Text
+
+Use a Wit.ai Speech-to-Text API key to transcribe.
+
+```
+autosub -i input_file -sapi witai -S lang_code -skey API_key ...(other options)
+```
+
 ##### Translate Subtitles
@@ -795,6 +808,8 @@ Speech Options:
                         xfyun.cn/doc/asr/voicedictation/API.html).
                         baidu: Baidu Automatic Speech Recognition API
                         (https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily)
+                        witai: Wit.ai Speech Recognition API
+                        (https://wit.ai/docs/http/20200513#post__speech_link)
                         (arg_num = 1) (default: gsv2)
   -skey key, --speech-key key
                         The API key for Google Speech-to-Text API. (arg_num =
@@ -1061,7 +1076,7 @@
 I won't add any new features unless I'm less busy in the future. However, pull requests are welcome.
 
 [issue #13](https://github.com/BingLingGroup/autosub/issues/13)
 
-Same as above. Currently won't add it. You can use batch/powershell/bash to implement it.
+Same as above. I currently won't add it. You can use batch/powershell/bash to implement it.
 
 Example for batch: (working in the current directory)
+""" + +# Import built-in modules +import json +import gettext +import os + +# Import third-party modules +import requests + +# Any changes to the path and your own modules +from autosub import constants + +API_WIT_AI_TEXT = gettext.translation(domain=__name__, + localedir=constants.LOCALE_PATH, + languages=[constants.CURRENT_LOCALE], + fallback=True) + +_ = API_WIT_AI_TEXT.gettext + + +def get_wit_ai_transcript(result_dict): + """ + Function for getting transcript from WIT AI Speech-to-Text json format string result. + """ + return result_dict['_text'] if '_text' in result_dict else result_dict['text'] + + +class WITAiAPI: # pylint: disable=too-few-public-methods + """ + Class for performing Speech-to-Text using Baidu ASR API. + """ + + def __init__(self, + api_url, + api_key, + retries=3, + is_keep=False, + is_full_result=False): + # pylint: disable=too-many-arguments + self.retries = retries + self.api_url = api_url + self.api_key = api_key + self.is_keep = is_keep + self.is_full_result = is_full_result + self.headers = { + 'authorization': f'Bearer {self.api_key}', + 'accept': 'application/vnd.wit.20200513+json', + 'content-type': 'audio/raw;encoding=signed-integer;bits=16;rate=8000;endian=little', + } + + def __call__(self, filename): + try: # pylint: disable=too-many-nested-blocks + with open(filename, mode='rb') as audio_file: + audio_data = audio_file.read() + if not self.is_keep: + os.remove(filename) + for _ in range(self.retries): + try: + requests_result = requests.post(self.api_url, data=audio_data, headers=self.headers) + except requests.exceptions.ConnectionError: + continue + requests_result_json = requests_result.content.decode("utf-8") + try: + result_dict = json.loads(requests_result_json) + except ValueError: + # no result + continue + + if not self.is_full_result: + return get_wit_ai_transcript(result_dict) + return result_dict + + except KeyboardInterrupt: + return None + + return None diff --git a/autosub/cmdline_utils.py b/autosub/cmdline_utils.py index 6d5f3a3e..38ca458d 100644 --- a/autosub/cmdline_utils.py +++ b/autosub/cmdline_utils.py @@ -299,6 +299,10 @@ def validate_speech_config(args): # pylint: disable=too-many-branches, too-many elif "languageCode" in config_dict and config_dict["languageCode"]: args.speech_language = config_dict["languageCode"] + elif args.speech_api == "witai": + args.api_suffix = ".pcm" + args.api_sample_rate = 8000 + else: args.api_suffix = ".pcm" args.api_sample_rate = 16000 @@ -642,7 +646,10 @@ def fix_args(args): "Now reset to {dmxcs}.").format(mxcs=args.max_continuous_silence, dmxcs=constants.DEFAULT_CONTINUOUS_SILENCE)) args.max_continuous_silence = constants.DEFAULT_CONTINUOUS_SILENCE - + # Make sure witai convert command is correct + if args.speech_api == "witai": + args.api_suffix = ".pcm" + args.api_sample_rate = 8000 def get_timed_text( is_empty_dropped, @@ -1541,6 +1548,33 @@ def audio_or_video_prcs( # pylint: disable=too-many-branches, too-many-statemen concurrency=args.speech_concurrency, is_keep=False, result_list=result_list) + elif args.speech_api == "witai": + # WIT AI API + if args.http_speech_api: + wit_api_url = f"http://{constants.WIT_AI_API_URL}" + else: + wit_api_url = f"https://{constants.WIT_AI_API_URL}" + wit_api_key = None + if 'WIT_AI_API_KEY' in os.environ: + print(_("Using the WIT_AI_API_KEY " + "in the environment variables.")) + wit_api_key = os.environ.get('WIT_AI_API_KEY') + if args.speech_key: + print(_("Use the API key " + "given in the option \"-skey\"/\"--speech-key\".")) + wit_api_key = 
diff --git a/autosub/cmdline_utils.py b/autosub/cmdline_utils.py
index 6d5f3a3e..38ca458d 100644
--- a/autosub/cmdline_utils.py
+++ b/autosub/cmdline_utils.py
@@ -299,6 +299,10 @@ def validate_speech_config(args):  # pylint: disable=too-many-branches, too-many
         elif "languageCode" in config_dict and config_dict["languageCode"]:
             args.speech_language = config_dict["languageCode"]
 
+    elif args.speech_api == "witai":
+        args.api_suffix = ".pcm"
+        args.api_sample_rate = 8000
+
     else:
         args.api_suffix = ".pcm"
         args.api_sample_rate = 16000
@@ -642,7 +646,10 @@ def fix_args(args):
                     "Now reset to {dmxcs}.").format(mxcs=args.max_continuous_silence,
                                                     dmxcs=constants.DEFAULT_CONTINUOUS_SILENCE))
             args.max_continuous_silence = constants.DEFAULT_CONTINUOUS_SILENCE
-
+    # Make sure the witai conversion parameters are correct
+    if args.speech_api == "witai":
+        args.api_suffix = ".pcm"
+        args.api_sample_rate = 8000
 
 def get_timed_text(
         is_empty_dropped,
@@ -1541,6 +1548,33 @@ def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statemen
                 concurrency=args.speech_concurrency,
                 is_keep=False,
                 result_list=result_list)
+        elif args.speech_api == "witai":
+            # Wit.ai API
+            if args.http_speech_api:
+                wit_api_url = f"http://{constants.WIT_AI_API_URL}"
+            else:
+                wit_api_url = f"https://{constants.WIT_AI_API_URL}"
+            wit_api_key = None
+            if 'WIT_AI_API_KEY' in os.environ:
+                print(_("Using the WIT_AI_API_KEY "
+                        "in the environment variables."))
+                wit_api_key = os.environ.get('WIT_AI_API_KEY')
+            if args.speech_key:
+                print(_("Using the API key "
+                        "given in the option \"-skey\"/\"--speech-key\"."))
+                wit_api_key = args.speech_key
+            if wit_api_key:
+                text_list = core.wit_ai_to_text(
+                    audio_fragments=audio_fragments,
+                    api_url=wit_api_url,
+                    api_key=wit_api_key,
+                    concurrency=args.speech_concurrency,
+                    is_keep=args.keep,
+                    result_list=result_list)
+            else:
+                print(_("No available WIT_AI_API_KEY. "
+                        "Use \"-skey\"/\"--speech-key\" to set one."))
+                text_list = None
         else:
             text_list = None
diff --git a/autosub/constants.py b/autosub/constants.py
index 513ca48d..91513a5f 100644
--- a/autosub/constants.py
+++ b/autosub/constants.py
@@ -71,6 +71,7 @@
 BAIDU_ASR_URL = "http://vop.baidu.com/server_api"
 BAIDU_PRO_ASR_URL = "http://vop.baidu.com/pro_api"
 BAIDU_TOKEN_URL = "http://openapi.baidu.com/oauth/2.0/token"
+WIT_AI_API_URL = "api.wit.ai/speech"
 
 if multiprocessing.cpu_count() > 3:
     DEFAULT_CONCURRENCY = multiprocessing.cpu_count() >> 1
diff --git a/autosub/core.py b/autosub/core.py
index 52f4b7a0..7d1ad8c3 100644
--- a/autosub/core.py
+++ b/autosub/core.py
@@ -24,6 +24,7 @@
 # Any changes to the path and your own modules
 from autosub import api_baidu
 from autosub import api_google
+from autosub import api_wit_ai
 from autosub import api_xfyun
 from autosub import auditok_utils
 from autosub import sub_utils
@@ -697,6 +698,75 @@ def baidu_to_text(  # pylint: disable=too-many-locals, too-many-arguments,
     return text_list
 
 
+def wit_ai_to_text(  # pylint: disable=too-many-locals, too-many-arguments, too-many-branches, too-many-statements
+        audio_fragments,
+        api_url,
+        api_key,
+        concurrency=constants.DEFAULT_CONCURRENCY,
+        is_keep=False,
+        result_list=None):
+    """
+    Give a list of short-term audio fragment files
+    and generate text_list from the Wit.ai Speech-to-Text API.
+    """
+    text_list = []
+    pool = multiprocessing.Pool(concurrency)
+
+    recognizer = api_wit_ai.WITAiAPI(
+        api_url=api_url,
+        api_key=api_key,
+        is_keep=is_keep,
+        is_full_result=result_list is not None)
+
+    print(_("\nSending short-term fragments to the Wit.ai API and getting result."))
+    widgets = [_("Speech-to-Text: "),
+               progressbar.Percentage(), ' ',
+               progressbar.Bar(), ' ',
+               progressbar.ETA()]
+    pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(audio_fragments)).start()
+    try:
+        # get transcript
+        if result_list is None:
+            for i, transcript in enumerate(pool.imap(recognizer, audio_fragments)):
+                if transcript:
+                    text_list.append(transcript)
+                else:
+                    text_list.append("")
+                gc.collect(0)
+                pbar.update(i)
+        # get full result and transcript
+        else:
+            for i, result in enumerate(pool.imap(recognizer, audio_fragments)):
+                if result:
+                    result_list.append(result)
+                    transcript = \
+                        api_wit_ai.get_wit_ai_transcript(result)
+                    if transcript:
+                        text_list.append(transcript)
+                    else:
+                        text_list.append("")
+                else:
+                    result_list.append("")
+                    text_list.append("")
+                gc.collect(0)
+                pbar.update(i)
+        pbar.finish()
+        pool.terminate()
+        pool.join()
+
+    except (KeyboardInterrupt, AttributeError) as error:
+        pbar.finish()
+        pool.terminate()
+        pool.join()
+
+        if isinstance(error, AttributeError):
+            print(
+                _("Error: Connection error happened too many times.\nAll work done."))
+
+        return None
+
+    return text_list
+
+
 def list_to_googletrans(  # pylint: disable=too-many-locals, too-many-arguments, too-many-branches, too-many-statements
         text_list,
         translator,
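Like the `baidu_to_text` helper it mirrors, `wit_ai_to_text` maps the callable recognizer over the fragment list with a multiprocessing pool, and passing a `result_list` switches the recognizer into full-JSON mode. A minimal usage sketch, assuming hypothetical fragment files and key:

```
# Sketch of driving the new helper directly; fragment files and key are hypothetical.
# Run under "if __name__ == '__main__':" on platforms where multiprocessing requires it.
from autosub import constants, core

fragments = ["fragment_0.pcm", "fragment_1.pcm"]  # 16bit/8000Hz/mono PCM pieces
full_results = []  # passing this makes the recognizer return full JSON results

text_list = core.wit_ai_to_text(
    audio_fragments=fragments,
    api_url="https://" + constants.WIT_AI_API_URL,
    api_key="YOUR_WIT_AI_SERVER_TOKEN",
    concurrency=constants.DEFAULT_CONCURRENCY,
    is_keep=True,  # keep the fragment files instead of deleting them
    result_list=full_results)
print(text_list)
```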
"xfyun", "baidu", "witai"], help=_("Choose which Speech-to-Text API to use. " "Currently support: " "gsv2: Google Speech V2 (https://github.com/gillesdemey/google-speech-v2). " @@ -256,6 +256,8 @@ def get_cmd_parser(): # pylint: disable=too-many-statements "(https://www.xfyun.cn/doc/asr/voicedictation/API.html). " "baidu: Baidu Automatic Speech Recognition API " "(https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily) " + "witai: Wit.ai Speech Recognition API " + "(https://wit.ai/docs/http/20200513#post__speech_link) " "(arg_num = 1) (default: %(default)s)")) speech_group.add_argument(