Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions reference/library-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot

Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.

``recognizer_instance.recognize_vosk(audio_data: AudioData)``
--------------------------------------------------------------
``recognizer_instance.recognize_vosk(audio_data: AudioData, *, verbose: bool = False) -> Union[str, Dict[str, Any]]``
Copy link

Copilot AI May 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The docs specify a generic Dict[str, Any] return type, but the code returns a specific VoskResponse TypedDict. Updating this to the precise type (or at least Dict[str, str]) would improve clarity.

Copilot uses AI. Check for mistakes.
------------------------------------------------------------------------------------------------------------------------------

.. autofunction:: speech_recognition.recognizers.vosk.recognize

Expand Down
34 changes: 30 additions & 4 deletions speech_recognition/recognizers/vosk.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,39 @@
from __future__ import annotations

import json
import os
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal, TypedDict, Union, cast, overload

if TYPE_CHECKING:
from speech_recognition.audio import AudioData


def recognize(recognizer, audio_data: AudioData) -> str:
class VoskResponse(TypedDict):
text: str


@overload
def recognize(
recognizer, audio_data: AudioData, *, verbose: Literal[False]
) -> str: ...


@overload
def recognize(
recognizer, audio_data: AudioData, *, verbose: Literal[True]
) -> VoskResponse: ...


def recognize(
recognizer, audio_data: AudioData, *, verbose: bool = False
Copy link

Copilot AI May 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The recognizer parameter is declared but unused in the function body, which could confuse readers. Consider renaming it to _recognizer or adding a brief comment explaining it’s required for interface consistency.

Suggested change
recognizer, audio_data: AudioData, *, verbose: Literal[False]
) -> str: ...
@overload
def recognize(
recognizer, audio_data: AudioData, *, verbose: Literal[True]
) -> VoskResponse: ...
def recognize(
recognizer, audio_data: AudioData, *, verbose: bool = False
_recognizer, audio_data: AudioData, *, verbose: Literal[False]
) -> str: ...
@overload
def recognize(
_recognizer, audio_data: AudioData, *, verbose: Literal[True]
) -> VoskResponse: ...
def recognize(
_recognizer, audio_data: AudioData, *, verbose: bool = False

Copilot uses AI. Check for mistakes.
) -> Union[str, VoskResponse]:
"""
Perform speech recognition on ``audio_data`` using Vosk.

Requires the Vosk model to be downloaded and unpacked into a folder named ``model`` in the current working directory (``$PWD/model``).

If ``verbose`` is ``False`` (default), only the recognized text is returned.
If ``verbose`` is ``True``, the parsed result dictionary from Vosk is returned.
Comment on lines +35 to +36
Copy link

Copilot AI May 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The docstring describes the verbose flag but doesn't use a structured :param verbose: block. Adding a proper parameter section could make the docs more consistent and easier to parse.

Suggested change
If ``verbose`` is ``False`` (default), only the recognized text is returned.
If ``verbose`` is ``True``, the parsed result dictionary from Vosk is returned.
:param recognizer: The recognizer instance performing the recognition.
:param audio_data: The audio data to recognize, provided as an instance of ``AudioData``.
:param verbose: If ``False`` (default), only the recognized text is returned. If ``True``, the parsed result dictionary from Vosk is returned.
:return: The recognized text as a string if ``verbose`` is ``False``, or the parsed result dictionary if ``verbose`` is ``True``.

Copilot uses AI. Check for mistakes.
"""

from vosk import KaldiRecognizer, Model
Expand All @@ -25,6 +47,10 @@ def recognize(recognizer, audio_data: AudioData) -> str:
rec.AcceptWaveform(
audio_data.get_raw_data(convert_rate=SAMPLE_RATE, convert_width=2)
)
finalRecognition = rec.FinalResult()
final_recognition: str = rec.FinalResult()

result = cast(VoskResponse, json.loads(final_recognition))
if verbose:
return result

return finalRecognition
return result["text"]
27 changes: 17 additions & 10 deletions tests/recognizers/test_vosk.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
from pathlib import Path

import pytest

from speech_recognition import AudioData, Recognizer


def test_recognize_vosk():
@pytest.fixture
def audio_data() -> AudioData:
audio_file = str(Path(__file__).parent.parent / "english.wav")
audio_data = AudioData.from_file(audio_file)
sut = Recognizer()
return AudioData.from_file(audio_file)


def test_recognize_vosk(audio_data):
recognizer = Recognizer()
actual = recognizer.recognize_vosk(audio_data)

assert actual == "one two three"


actual = sut.recognize_vosk(audio_data)
def test_recognize_vosk_verbose(audio_data):
recognizer = Recognizer()
actual = recognizer.recognize_vosk(audio_data, verbose=True)

expected = """\
{
"text" : "one two three"
}\
"""
assert actual == expected
assert actual == {"text": "one two three"}
Loading