Uberi · ftnext · May 18, 2025 · May 18, 2025 · May 18, 2025 · May 18, 2025
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
@@ -280,8 +280,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot
 
 Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
 
-``recognizer_instance.recognize_vosk(audio_data: AudioData)``
---------------------------------------------------------------
+``recognizer_instance.recognize_vosk(audio_data: AudioData, *, verbose: bool = False) -> Union[str, Dict[str, Any]]``
+------------------------------------------------------------------------------------------------------------------------------
 
 .. autofunction:: speech_recognition.recognizers.vosk.recognize
 

diff --git a/speech_recognition/recognizers/vosk.py b/speech_recognition/recognizers/vosk.py
@@ -1,17 +1,39 @@
 from __future__ import annotations
 
+import json
 import os
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal, TypedDict, Union, cast, overload
 
 if TYPE_CHECKING:
     from speech_recognition.audio import AudioData
 
 
-def recognize(recognizer, audio_data: AudioData) -> str:
+class VoskResponse(TypedDict):
+    text: str
+
+
+@overload
+def recognize(
+    recognizer, audio_data: AudioData, *, verbose: Literal[False]
+) -> str: ...
+
+
+@overload
+def recognize(
+    recognizer, audio_data: AudioData, *, verbose: Literal[True]
+) -> VoskResponse: ...
+
+
+def recognize(
+    recognizer, audio_data: AudioData, *, verbose: bool = False
-    recognizer, audio_data: AudioData, *, verbose: Literal[False]
-) -> str: ...
-
-
-@overload
-def recognize(
-    recognizer, audio_data: AudioData, *, verbose: Literal[True]
-) -> VoskResponse: ...
-
-
-def recognize(
-    recognizer, audio_data: AudioData, *, verbose: bool = False
+    _recognizer, audio_data: AudioData, *, verbose: Literal[False]
+) -> str: ...
+
+
+@overload
+def recognize(
+    _recognizer, audio_data: AudioData, *, verbose: Literal[True]
+) -> VoskResponse: ...
+
+
+def recognize(
+    _recognizer, audio_data: AudioData, *, verbose: bool = False
-    recognizer, audio_data: AudioData, *, verbose: Literal[False]
-) -> str: ...
-
-
-@overload
-def recognize(
-    recognizer, audio_data: AudioData, *, verbose: Literal[True]
-) -> VoskResponse: ...
-
-
-def recognize(
-    recognizer, audio_data: AudioData, *, verbose: bool = False
+    _recognizer, audio_data: AudioData, *, verbose: Literal[False]
+) -> str: ...
+
+
+@overload
+def recognize(
+    _recognizer, audio_data: AudioData, *, verbose: Literal[True]
+) -> VoskResponse: ...
+
+
+def recognize(
+    _recognizer, audio_data: AudioData, *, verbose: bool = False
+) -> Union[str, VoskResponse]:
     """
     Perform speech recognition on ``audio_data`` using Vosk.
 
     Requires the Vosk model to be downloaded and unpacked in a folder named 'model' (``$PWD/model``).
+
+    If ``verbose`` is ``False`` (default), only the recognized text is returned.
+    If ``verbose`` is ``True``, the parsed result dictionary from Vosk is returned.
-    If ``verbose`` is ``False`` (default), only the recognized text is returned.
-    If ``verbose`` is ``True``, the parsed result dictionary from Vosk is returned.
+    :param _recognizer: The recognizer instance performing the recognition.
+    :param audio_data: The audio data to recognize, provided as an instance of ``AudioData``.
+    :param verbose: If ``False`` (default), only the recognized text is returned. If ``True``, the parsed result dictionary from Vosk is returned.
+    :return: The recognized text as a string if ``verbose`` is ``False``, or the parsed result dictionary if ``verbose`` is ``True``.
-    If ``verbose`` is ``False`` (default), only the recognized text is returned.
-    If ``verbose`` is ``True``, the parsed result dictionary from Vosk is returned.
+    :param _recognizer: The recognizer instance performing the recognition.
+    :param audio_data: The audio data to recognize, provided as an instance of ``AudioData``.
+    :param verbose: If ``False`` (default), only the recognized text is returned. If ``True``, the parsed result dictionary from Vosk is returned.
+    :return: The recognized text as a string if ``verbose`` is ``False``, or the parsed result dictionary if ``verbose`` is ``True``.
     """
 
     from vosk import KaldiRecognizer, Model
@@ -25,6 +47,10 @@ def recognize(recognizer, audio_data: AudioData) -> str:
     rec.AcceptWaveform(
         audio_data.get_raw_data(convert_rate=SAMPLE_RATE, convert_width=2)
     )
-    finalRecognition = rec.FinalResult()
+    final_recognition: str = rec.FinalResult()
+
+    result = cast(VoskResponse, json.loads(final_recognition))
+    if verbose:
+        return result
 
-    return finalRecognition
+    return result["text"]
diff --git a/tests/recognizers/test_vosk.py b/tests/recognizers/test_vosk.py
@@ -1,18 +1,25 @@
 from pathlib import Path
 
+import pytest
+
 from speech_recognition import AudioData, Recognizer
 
 
-def test_recognize_vosk():
+@pytest.fixture
+def audio_data() -> AudioData:
     audio_file = str(Path(__file__).parent.parent / "english.wav")
-    audio_data = AudioData.from_file(audio_file)
-    sut = Recognizer()
+    return AudioData.from_file(audio_file)
+
+
+def test_recognize_vosk(audio_data):
+    recognizer = Recognizer()
+    actual = recognizer.recognize_vosk(audio_data)
+
+    assert actual == "one two three"
+
 
-    actual = sut.recognize_vosk(audio_data)
+def test_recognize_vosk_verbose(audio_data):
+    recognizer = Recognizer()
+    actual = recognizer.recognize_vosk(audio_data, verbose=True)
 
-    expected = """\
-{
-  "text" : "one two three"
-}\
-"""
-    assert actual == expected
+    assert actual == {"text": "one two three"}