Uberi · Uberi · Apr 3, 2019 · Oct 24, 2018 · Oct 25, 2018 · Oct 25, 2018
diff --git a/README.rst b/README.rst
@@ -29,7 +29,8 @@ Speech recognition engine/API support:
 * Google Speech Recognition
 * `Google Cloud Speech API <https://cloud.google.com/speech/>`__
 * `Wit.ai <https://wit.ai/>`__
-* `Microsoft Bing Voice Recognition <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__
+* `Microsoft Azure Speech <https://azure.microsoft.com/en-us/services/cognitive-services/speech/>`__
+* `Microsoft Bing Voice Recognition (Deprecated) <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__
 * `Houndify API <https://houndify.com/>`__
 * `IBM Speech to Text <http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/speech-to-text.html>`__
 * `Snowboy Hotword Detection <https://snowboy.kitt.ai/>`__ (works offline)

diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py
@@ -50,6 +50,15 @@
 except sr.RequestError as e:
     print("Could not request results from Wit.ai service; {0}".format(e))
 
+# recognize speech using Microsoft Azure Speech
+AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE"  # Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings
+try:
+    print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY))
+except sr.UnknownValueError:
+    print("Microsoft Azure Speech could not understand audio")
+except sr.RequestError as e:
+    print("Could not request results from Microsoft Azure Speech service; {0}".format(e))
+
 # recognize speech using Microsoft Bing Voice Recognition
 BING_KEY = "INSERT BING API KEY HERE"  # Microsoft Bing Voice Recognition API keys 32-character lowercase hexadecimal strings
 try:

diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py
@@ -56,6 +56,15 @@
 except sr.RequestError as e:
     print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e))
 
+# recognize speech using Microsoft Azure Speech
+AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE"  # Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings
+try:
+    print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY))
+except sr.UnknownValueError:
+    print("Microsoft Azure Speech could not understand audio")
+except sr.RequestError as e:
+    print("Could not request results from Microsoft Azure Speech service; {0}".format(e))
+
 # recognize speech using Houndify
 HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE"  # Houndify client IDs are Base64-encoded strings
 HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE"  # Houndify client keys are Base64-encoded strings

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -1018,6 +1018,100 @@ def recognize_wit(self, audio_data, key, show_all=False):
         if "_text" not in result or result["_text"] is None: raise UnknownValueError()
         return result["_text"]
 
+    def recognize_azure(self, audio_data, key, language="en-US", result_format="simple", profanity="masked", location="westus", show_all=False):
+        """
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Azure Speech API.
+
+        The Microsoft Azure Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://azure.microsoft.com/en-ca/pricing/details/cognitive-services/speech-api/>`__ with Microsoft Azure.
+
+        To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > search for "Speech" > "Create", and fill in the form to make a "Speech" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the ``key`` parameter. Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings.
+
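+        The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode". The ``result_format`` and ``profanity`` arguments (defaulting to ``"simple"`` and ``"masked"``) are sent to the service as the ``format`` and ``profanity`` query parameters, and ``location`` (defaulting to ``"westus"``) is the subdomain prefix of the endpoints used for both the access token request and the recognition request.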
+
+        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.
+
+        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
+        """
+        assert isinstance(audio_data, AudioData), "Data must be audio data"
+        assert isinstance(key, str), "``key`` must be a string"
+        assert isinstance(result_format, str), "``format`` must be a string"
+        assert isinstance(language, str), "``language`` must be a string"
+
+        access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None)
+        allow_caching = True
+        try:
+            from time import monotonic  # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
+        except ImportError:
+            try:
+                from monotonic import monotonic  # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic)
+            except (ImportError, RuntimeError):
+                expire_time = None  # monotonic time not available, don't cache access tokens
+                allow_caching = False  # don't allow caching, since monotonic time isn't available
+        if expire_time is None or monotonic() > expire_time:  # caching not enabled, first credential request, or the access token from the previous one expired
+            # get an access token using OAuth
+            credential_url = "https://" + location + ".api.cognitive.microsoft.com/sts/v1.0/issueToken"
+            credential_request = Request(credential_url, data=b"", headers={
+                "Content-type": "application/x-www-form-urlencoded",
+                "Content-Length": "0",
+                "Ocp-Apim-Subscription-Key": key,
+            })
+
+            if allow_caching:
+                start_time = monotonic()
+
+            try:
+                credential_response = urlopen(credential_request, timeout=60)  # credential response can take longer, use longer timeout instead of default one
+            except HTTPError as e:
+                raise RequestError("credential request failed: {}".format(e.reason))
+            except URLError as e:
+                raise RequestError("credential connection failed: {}".format(e.reason))
+            access_token = credential_response.read().decode("utf-8")
+
+            if allow_caching:
+                # save the token for the duration it is valid for
+                self.azure_cached_access_token = access_token
+                self.azure_cached_access_token_expiry = start_time + 600  # according to https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis#authentication, the token expires in exactly 10 minutes
+
+        wav_data = audio_data.get_wav_data(
+            convert_rate=16000,  # audio samples must be 8 kHz or 16 kHz
+            convert_width=2  # audio samples should be 16-bit
+        )
+
+        url = "https://" + location + ".stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?{}".format(urlencode({
+            "language": language,
+            "format": result_format,
+            "profanity": profanity
+        }))
+
+        if sys.version_info >= (3, 6):  # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible
+            request = Request(url, data=io.BytesIO(wav_data), headers={
+                "Authorization": "Bearer {}".format(access_token),
+                "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
+                "Transfer-Encoding": "chunked",
+            })
+        else:  # fall back on manually formatting the POST body as a chunked request
+            ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8")
+            chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n"
+            request = Request(url, data=chunked_transfer_encoding_data, headers={
+                "Authorization": "Bearer {}".format(access_token),
+                "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
+                "Transfer-Encoding": "chunked",
+            })
+
+        try:
+            response = urlopen(request, timeout=self.operation_timeout)
+        except HTTPError as e:
+            raise RequestError("recognition request failed: {}".format(e.reason))
+        except URLError as e:
+            raise RequestError("recognition connection failed: {}".format(e.reason))
+        response_text = response.read().decode("utf-8")
+        result = json.loads(response_text)
+
+        # return results
+        if show_all: return result
+        if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "DisplayText" not in result: raise UnknownValueError()
+        return result["DisplayText"]
+
     def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
         """
         Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API.