Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"medical_conversation",
"chirp",
"chirp_2",
"chirp_3",
"latest_long",
"latest_short",
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -618,17 +618,28 @@ def _streaming_recognize_response_to_speech_data(
) -> stt.SpeechData | None:
text = ""
confidence = 0.0
final_result = None
for result in resp.results:
if len(result.alternatives) == 0:
continue
text += result.alternatives[0].transcript
confidence += result.alternatives[0].confidence

confidence /= len(resp.results)
lg = resp.results[0].language_code
else:
if result.is_final:
final_result = result
break
else:
text += result.alternatives[0].transcript
confidence += result.alternatives[0].confidence

if final_result is not None:
text = final_result.alternatives[0].transcript
confidence = final_result.alternatives[0].confidence
lg = final_result.language_code
else:
confidence /= len(resp.results)
if confidence < min_confidence_threshold:
return None
lg = resp.results[0].language_code

if confidence < min_confidence_threshold:
return None
if text == "":
return None

Expand Down
106 changes: 106 additions & 0 deletions tests/test_plugin_google_stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from google.cloud.speech_v2.types import cloud_speech

from livekit.agents.stt import SpeechData
from livekit.plugins.google.stt import (
_streaming_recognize_response_to_speech_data, # pyright: ignore[reportPrivateUsage]
)


async def test_streaming_recognize_response_to_speech_data_01():
    """A response whose only result has no alternatives converts to ``None``."""
    # Arrange: a default-constructed result carries no alternatives.
    empty_result = cloud_speech.StreamingRecognitionResult()
    response = cloud_speech.StreamingRecognizeResponse(results=[empty_result])

    # Act
    speech_data = _streaming_recognize_response_to_speech_data(
        response, min_confidence_threshold=1.0
    )

    # Assert
    assert speech_data is None


async def test_streaming_recognize_response_to_speech_data_02():
    """A lone final result is returned even though its confidence is below the threshold."""
    # Arrange: one final result whose confidence (0.0) is under the 0.5 threshold.
    alternative = cloud_speech.SpeechRecognitionAlternative(confidence=0.0, transcript="test")
    final_result = cloud_speech.StreamingRecognitionResult(
        alternatives=[alternative],
        is_final=True,
        language_code="te-ST",
    )
    response = cloud_speech.StreamingRecognizeResponse(results=[final_result])

    # Act
    speech_data = _streaming_recognize_response_to_speech_data(
        response, min_confidence_threshold=0.5
    )

    # Assert: the final transcript, language and confidence are passed through.
    assert type(speech_data) is SpeechData
    assert speech_data.text == "test"
    assert speech_data.language == "te-ST"
    assert speech_data.confidence == 0.0


async def test_streaming_recognize_response_to_speech_data_03():
    """A lone non-final result with confidence under the threshold converts to ``None``."""
    # Arrange: one interim result whose confidence (0.0) is under the 0.5 threshold.
    alternative = cloud_speech.SpeechRecognitionAlternative(confidence=0.0, transcript="test")
    interim_result = cloud_speech.StreamingRecognitionResult(
        alternatives=[alternative],
        is_final=False,
    )
    response = cloud_speech.StreamingRecognizeResponse(results=[interim_result])

    # Act
    speech_data = _streaming_recognize_response_to_speech_data(
        response, min_confidence_threshold=0.5
    )

    # Assert
    assert speech_data is None


async def test_streaming_recognize_response_to_speech_data_04():
    """Two non-final results are merged: transcripts are concatenated in order."""
    # Arrange: two interim results that differ only in their transcript.
    interim_results = [
        cloud_speech.StreamingRecognitionResult(
            alternatives=[
                cloud_speech.SpeechRecognitionAlternative(confidence=1.0, transcript=transcript)
            ],
            is_final=False,
            language_code="te-ST",
        )
        for transcript in ("test01", "test02")
    ]
    response = cloud_speech.StreamingRecognizeResponse(results=interim_results)

    # Act
    speech_data = _streaming_recognize_response_to_speech_data(
        response, min_confidence_threshold=0.5
    )

    # Assert
    assert type(speech_data) is SpeechData
    assert speech_data.text == "test01test02"
    assert speech_data.language == "te-ST"
    assert speech_data.confidence == 1.0


async def test_streaming_recognize_response_to_speech_data_05():
    """When a final result follows interim ones, only the final transcript is reported."""
    # Arrange: two interim results followed by one final result.
    interim_results = [
        cloud_speech.StreamingRecognitionResult(
            alternatives=[
                cloud_speech.SpeechRecognitionAlternative(confidence=1.0, transcript=transcript)
            ],
            is_final=False,
            language_code="te-ST",
        )
        for transcript in ("test01", "test02")
    ]
    final_result = cloud_speech.StreamingRecognitionResult(
        alternatives=[cloud_speech.SpeechRecognitionAlternative(confidence=1.0, transcript="best")],
        is_final=True,
        language_code="te-ST",
    )
    response = cloud_speech.StreamingRecognizeResponse(results=[*interim_results, final_result])

    # Act
    speech_data = _streaming_recognize_response_to_speech_data(
        response, min_confidence_threshold=0.5
    )

    # Assert: the interim transcripts do not appear in the output text.
    assert type(speech_data) is SpeechData
    assert speech_data.text == "best"
    assert speech_data.language == "te-ST"
    assert speech_data.confidence == 1.0