Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace Bing Speech API with Azure Speech API #389

Merged
merged 7 commits into from
Apr 3, 2019
3 changes: 2 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ Speech recognition engine/API support:
* Google Speech Recognition
* `Google Cloud Speech API <https://cloud.google.com/speech/>`__
* `Wit.ai <https://wit.ai/>`__
* `Microsoft Bing Voice Recognition <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__
* `Microsoft Azure Speech <https://azure.microsoft.com/en-us/services/cognitive-services/speech/>`__
* `Microsoft Bing Voice Recognition (Deprecated) <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__
* `Houndify API <https://houndify.com/>`__
* `IBM Speech to Text <http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/speech-to-text.html>`__
* `Snowboy Hotword Detection <https://snowboy.kitt.ai/>`__ (works offline)
Expand Down
9 changes: 9 additions & 0 deletions examples/audio_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@
except sr.RequestError as e:
print("Could not request results from Wit.ai service; {0}".format(e))

# recognize speech using Microsoft Azure Speech
# Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings
AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE"
try:
    azure_transcript = r.recognize_azure(audio, key=AZURE_SPEECH_KEY)
except sr.UnknownValueError:
    # the service answered, but could not make sense of the audio
    print("Microsoft Azure Speech could not understand audio")
except sr.RequestError as err:
    # bad key, failed request, or no connectivity
    print("Could not request results from Microsoft Azure Speech service; {0}".format(err))
else:
    print("Microsoft Azure Speech thinks you said " + azure_transcript)

# recognize speech using Microsoft Bing Voice Recognition
BING_KEY = "INSERT BING API KEY HERE" # Microsoft Bing Voice Recognition API keys 32-character lowercase hexadecimal strings
try:
Expand Down
9 changes: 9 additions & 0 deletions examples/microphone_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@
except sr.RequestError as e:
print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e))

# recognize speech using Microsoft Azure Speech
# Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings
AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE"
try:
    azure_transcript = r.recognize_azure(audio, key=AZURE_SPEECH_KEY)
except sr.UnknownValueError:
    # the service answered, but could not make sense of the audio
    print("Microsoft Azure Speech could not understand audio")
except sr.RequestError as err:
    # bad key, failed request, or no connectivity
    print("Could not request results from Microsoft Azure Speech service; {0}".format(err))
else:
    print("Microsoft Azure Speech thinks you said " + azure_transcript)

# recognize speech using Houndify
HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE" # Houndify client IDs are Base64-encoded strings
HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE" # Houndify client keys are Base64-encoded strings
Expand Down
94 changes: 94 additions & 0 deletions speech_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,100 @@ def recognize_wit(self, audio_data, key, show_all=False):
if "_text" not in result or result["_text"] is None: raise UnknownValueError()
return result["_text"]

def recognize_azure(self, audio_data, key, language="en-US", result_format="simple", profanity="masked", location="westus", show_all=False):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Azure Speech API.

    The Microsoft Azure Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://azure.microsoft.com/en-ca/pricing/details/cognitive-services/speech-api/>`__ with Microsoft Azure.

    To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Speech" > "Create", and fill in the form to make a "Speech" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings.

    The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. Supported language values are listed in the `Azure Speech service documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support>`__.

    ``result_format`` and ``profanity`` are passed through unchanged as the ``format`` and ``profanity`` query parameters of the recognition request. According to the `REST API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis>`__, ``format`` accepts ``"simple"`` or ``"detailed"``, and ``profanity`` accepts ``"masked"``, ``"removed"``, or ``"raw"``.

    ``location`` is the Azure region of the Speech resource (for example ``"westus"``); it is used as the host name prefix of both the token endpoint and the recognition endpoint, so it must match the region the key was issued for.

    Returns the most likely transcription if ``show_all`` is false (the default): the top-level ``DisplayText`` of the response or, when the response only carries an ``NBest`` list (as the detailed format can), the ``Display`` text of its first entry. Otherwise, returns the raw API response as a JSON dictionary.

    The access token obtained from the service is cached on this instance (when a monotonic clock is available) and reused for ten minutes, but only for calls that use the same ``key`` and ``location``.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
    """
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    assert isinstance(key, str), "``key`` must be a string"
    assert isinstance(result_format, str), "``result_format`` must be a string"
    assert isinstance(language, str), "``language`` must be a string"

    access_token = getattr(self, "azure_cached_access_token", None)
    expire_time = getattr(self, "azure_cached_access_token_expiry", None)

    # a token is only valid for the key/region it was issued for, so a cached token must not be reused for different credentials
    token_scope = (key, location)
    if access_token is None or getattr(self, "azure_cached_access_token_scope", None) != token_scope:
        expire_time = None

    allow_caching = True
    try:
        from time import monotonic  # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
    except ImportError:
        try:
            from monotonic import monotonic  # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic)
        except (ImportError, RuntimeError):
            expire_time = None  # monotonic time not available, don't cache access tokens
            allow_caching = False  # don't allow caching, since monotonic time isn't available

    # ``monotonic`` is only evaluated when ``expire_time`` is set, which implies the import above succeeded
    if expire_time is None or monotonic() > expire_time:  # caching not enabled, first credential request, different credentials, or the previous access token expired
        # get an access token using OAuth
        credential_url = "https://" + location + ".api.cognitive.microsoft.com/sts/v1.0/issueToken"
        credential_request = Request(credential_url, data=b"", headers={
            "Content-type": "application/x-www-form-urlencoded",
            "Content-Length": "0",
            "Ocp-Apim-Subscription-Key": key,
        })

        if allow_caching:
            start_time = monotonic()  # measured before the request, so the cached expiry errs on the early side

        try:
            credential_response = urlopen(credential_request, timeout=60)  # credential response can take longer, use longer timeout instead of default one
        except HTTPError as e:
            raise RequestError("credential request failed: {}".format(e.reason))
        except URLError as e:
            raise RequestError("credential connection failed: {}".format(e.reason))
        access_token = credential_response.read().decode("utf-8")

        if allow_caching:
            # save the token for the duration it is valid for
            self.azure_cached_access_token = access_token
            self.azure_cached_access_token_expiry = start_time + 600  # according to https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis#authentication, the token expires in exactly 10 minutes
            self.azure_cached_access_token_scope = token_scope

    wav_data = audio_data.get_wav_data(
        convert_rate=16000,  # audio samples must be 8kHz or 16 kHz
        convert_width=2  # audio samples should be 16-bit
    )

    url = "https://" + location + ".stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?{}".format(urlencode({
        "language": language,
        "format": result_format,
        "profanity": profanity
    }))

    request_headers = {
        "Authorization": "Bearer {}".format(access_token),
        "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
        "Transfer-Encoding": "chunked",
    }
    if sys.version_info >= (3, 6):  # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible
        request_body = io.BytesIO(wav_data)
    else:  # fall back on manually formatting the POST body as a chunked request
        ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8")
        request_body = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n"
    request = Request(url, data=request_body, headers=request_headers)

    try:
        response = urlopen(request, timeout=self.operation_timeout)
    except HTTPError as e:
        raise RequestError("recognition request failed: {}".format(e.reason))
    except URLError as e:
        raise RequestError("recognition connection failed: {}".format(e.reason))
    response_text = response.read().decode("utf-8")
    result = json.loads(response_text)

    # return results
    if show_all:
        return result
    if not isinstance(result, dict) or result.get("RecognitionStatus") != "Success":
        raise UnknownValueError()
    if "DisplayText" in result:
        return result["DisplayText"]

    # a successful response without ``DisplayText`` may still carry the transcription in its ``NBest`` list
    candidates = result.get("NBest")
    if isinstance(candidates, list) and candidates and isinstance(candidates[0], dict) and "Display" in candidates[0]:
        return candidates[0]["Display"]
    raise UnknownValueError()

def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
"""
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API.
Expand Down