Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Speech recognition engine/API support:
* `Tensorflow <https://www.tensorflow.org/>`__
* `Vosk API <https://github.com/alphacep/vosk-api/>`__ (works offline)
* `OpenAI whisper <https://github.com/openai/whisper>`__ (works offline)
* `Speechmatics ASR API <https://portal.speechmatics.com/>`__

**Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.

Expand Down Expand Up @@ -95,6 +96,7 @@ To use all of the functionality of the library, you should have:
* **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
* **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
* **Speechmatics** (required only if you need to use Speechmatics ``recognizer_instance.recognize_speechmatics``)

The following requirements are optional, but can improve or extend functionality in some situations:

Expand Down Expand Up @@ -169,6 +171,12 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins

You can install it with ``python3 -m pip install git+https://github.com/openai/whisper.git soundfile``.

Speechmatics (for Speechmatics users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Speechmatics is **required if and only if you want to use speechmatics** (``recognizer_instance.recognize_speechmatics``).

You can install it with ``python3 -m pip install speechmatics-python``. You will also need an API key from `<https://portal.speechmatics.com/manage-access/>`__.

Troubleshooting
---------------

Expand Down
10 changes: 10 additions & 0 deletions examples/audio_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@
with sr.AudioFile(AUDIO_FILE) as source:
audio = r.record(source) # read the entire audio file

# recognize speech using Speechmatics
SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE"
try:
print("Speechmatics thinks you said " + r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY))
except sr.UnknownValueError:
print("Speechmatics could not understand audio")
except sr.RequestError as e:
print("Could not request results from the Speechmatics service; {0}".format(e))


# recognize speech using Sphinx
try:
print("Sphinx thinks you said " + r.recognize_sphinx(audio))
Expand Down
10 changes: 10 additions & 0 deletions examples/extended_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@
with sr.AudioFile(AUDIO_FILE) as source:
audio = r.record(source) # read the entire audio file

# recognize speech using Speechmatics
SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE"
try:
print("Speechmatics results:")
pprint(r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY, transcript_format="json-v2"))
except sr.UnknownValueError:
print("Speechmatics could not understand audio")
except sr.RequestError as e:
print("Speechmatics error; {0}".format(e))

# recognize speech using Sphinx
try:
print("Sphinx thinks you said " + r.recognize_sphinx(audio))
Expand Down
9 changes: 9 additions & 0 deletions examples/microphone_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@
print("Say something!")
audio = r.listen(source)

# recognize speech using Speechmatics
SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE"
try:
print("Speechmatics thinks you said " + r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY))
except sr.UnknownValueError:
print("Speechmatics could not understand audio")
except sr.RequestError as e:
print("Could not request results from Speechmatics service; {0}".format(e))

# recognize speech using Sphinx
try:
print("Sphinx thinks you said " + r.recognize_sphinx(audio))
Expand Down
41 changes: 41 additions & 0 deletions speech_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,6 +1702,47 @@ def recognize_vosk(self, audio_data, language='en'):

return finalRecognition

def recognize_speechmatics(self, audio_data, key=None, language="en", transcript_format="txt"):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Speechmatics ASR API.

    The ``key`` value is your Speechmatics API key. You can get an API key by creating an account and signing into the portal at https://portal.speechmatics.com/manage-access/.

    The recognition language is determined by ``language``, an RFC5646 language tag like "en" or "es". The full list of supported languages can be found at https://docs.speechmatics.com/introduction/supported-languages.

    Returns a text representation of the transcript by default. You can also get a JSON representation of the transcript by setting ``transcript_format="json-v2"``, which comes with a range of meta-data about each word in the transcript. The full transcript schema is documented here: https://docs.speechmatics.com/features. You can also request SRT output by setting ``transcript_format="srt"``.

    Raises a ``speech_recognition.RequestError`` exception if the ``speechmatics`` Python module is not installed.

    NOTE(review): errors raised by the Speechmatics client itself (invalid key, failed job, no internet connection) are not translated in this method and appear to propagate unchanged - confirm against the client library before relying on ``RequestError`` or ``UnknownValueError`` for those cases.
    """
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    assert isinstance(key, str), "``key`` must be a string"

    # The Speechmatics client is an optional dependency, so it is imported lazily.
    # Only a failed import means "module missing"; any other error must surface as-is.
    try:
        from speechmatics.models import ConnectionSettings, BatchTranscriptionConfig
        from speechmatics.batch_client import BatchClient
        from speechmatics.constants import BATCH_SELF_SERVICE_URL
    except ImportError:
        raise RequestError("missing speechmatics python module: install using `pip install speechmatics-python`")

    # Audio below 16 kHz is upsampled to 16 kHz; anything at or above that is sent unchanged.
    # NOTE(review): the 16 kHz floor is not a documented Speechmatics requirement. Reviewers
    # suggested dropping the conversion and letting the service report unsupported rates -
    # confirm before changing.
    wav_data = audio_data.get_wav_data(
        convert_rate=None if audio_data.sample_rate >= 16000 else 16000
    )

    # The batch client takes the audio as a (filename, bytes) pair.
    audio_input = ("audio_file.wav", wav_data)
    settings = ConnectionSettings(
        url=BATCH_SELF_SERVICE_URL,
        auth_token=key,
    )
    conf = BatchTranscriptionConfig(
        language=language,
    )

    # Submit the job and block until the transcript is available in the requested format.
    with BatchClient(settings) as client:
        job_id = client.submit_job(
            audio=audio_input,
            transcription_config=conf,
        )
        return client.wait_for_completion(job_id, transcription_format=transcript_format)

def get_flac_converter():
"""Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
flac_converter = shutil_which("flac") # check for installed version first
Expand Down
18 changes: 18 additions & 0 deletions tests/test_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,24 @@ def test_google_chinese(self):
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚")

@unittest.skipUnless(
    "SPEECHMATICS_KEY" in os.environ,
    "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable",
)
def test_speechmatics_english(self):
    # Transcribe the English fixture through Speechmatics and compare with the expected text.
    recognizer = sr.Recognizer()
    with sr.AudioFile(self.AUDIO_FILE_EN) as source:
        audio = recognizer.record(source)
    api_key = os.environ["SPEECHMATICS_KEY"]
    transcript = recognizer.recognize_speechmatics(audio, key=api_key)
    self.assertEqual(transcript, "One, two, three.")

@unittest.skipUnless(
    "SPEECHMATICS_KEY" in os.environ,
    "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable",
)
def test_speechmatics_french(self):
    # Transcribe the French fixture with language="fr" and compare with the expected text.
    recognizer = sr.Recognizer()
    with sr.AudioFile(self.AUDIO_FILE_FR) as source:
        audio = recognizer.record(source)
    api_key = os.environ["SPEECHMATICS_KEY"]
    transcript = recognizer.recognize_speechmatics(audio, key=api_key, language="fr")
    self.assertEqual(transcript, u"C'est la dictée numéro un.")

@unittest.skipUnless(
    "SPEECHMATICS_KEY" in os.environ,
    "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable",
)
def test_speechmatics_mandarin(self):
    # Transcribe the Mandarin fixture with language="cmn" and compare with the expected text.
    recognizer = sr.Recognizer()
    with sr.AudioFile(self.AUDIO_FILE_ZH) as source:
        audio = recognizer.record(source)
    api_key = os.environ["SPEECHMATICS_KEY"]
    transcript = recognizer.recognize_speechmatics(audio, key=api_key, language="cmn")
    self.assertEqual(transcript, u"砸自己的脚。")

@unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable")
def test_wit_english(self):
r = sr.Recognizer()
Expand Down