diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/models.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/models.py index 1b5ef43f83..86f8bf0cda 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/models.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/models.py @@ -10,6 +10,7 @@ "medical_conversation", "chirp", "chirp_2", + "chirp_3", "latest_long", "latest_short", ] diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py index d495aee307..e2a6491ac9 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py @@ -618,17 +618,28 @@ def _streaming_recognize_response_to_speech_data( ) -> stt.SpeechData | None: text = "" confidence = 0.0 + final_result = None for result in resp.results: if len(result.alternatives) == 0: continue - text += result.alternatives[0].transcript - confidence += result.alternatives[0].confidence - - confidence /= len(resp.results) - lg = resp.results[0].language_code + else: + if result.is_final: + final_result = result + break + else: + text += result.alternatives[0].transcript + confidence += result.alternatives[0].confidence + + if final_result is not None: + text = final_result.alternatives[0].transcript + confidence = final_result.alternatives[0].confidence + lg = final_result.language_code + else: + confidence /= len(resp.results) + if confidence < min_confidence_threshold: + return None + lg = resp.results[0].language_code - if confidence < min_confidence_threshold: - return None if text == "": return None diff --git a/tests/test_plugin_google_stt.py b/tests/test_plugin_google_stt.py new file mode 100644 index 0000000000..e176ffb00a --- /dev/null +++ b/tests/test_plugin_google_stt.py @@ -0,0 +1,106 @@ +from google.cloud.speech_v2.types import cloud_speech + 
from livekit.agents.stt import SpeechData
from livekit.plugins.google.stt import (
    _streaming_recognize_response_to_speech_data,  # pyright: ignore[reportPrivateUsage]
)


def _recognition_result(
    transcript: str, confidence: float, **fields: object
) -> cloud_speech.StreamingRecognitionResult:
    """Build a streaming result holding exactly one alternative.

    Args:
        transcript: Transcript text of the single alternative.
        confidence: Confidence value of the single alternative.
        **fields: Extra fields forwarded untouched to
            ``StreamingRecognitionResult`` (the tests use ``is_final`` and
            ``language_code``). Fields that are not passed stay unset.
    """
    alternative = cloud_speech.SpeechRecognitionAlternative(
        confidence=confidence, transcript=transcript
    )
    return cloud_speech.StreamingRecognitionResult(alternatives=[alternative], **fields)


def _convert(
    results: "list[cloud_speech.StreamingRecognitionResult]",
    min_confidence_threshold: float,
) -> "SpeechData | None":
    """Wrap ``results`` in a streaming response and run the converter under test."""
    response = cloud_speech.StreamingRecognizeResponse(results=results)
    return _streaming_recognize_response_to_speech_data(
        response, min_confidence_threshold=min_confidence_threshold
    )


async def test_streaming_recognize_response_to_speech_data_01():
    """A lone result without alternatives produces no speech data (threshold 1.0)."""
    empty_result = cloud_speech.StreamingRecognitionResult()
    assert _convert([empty_result], 1.0) is None


async def test_streaming_recognize_response_to_speech_data_02():
    """A final result is returned although its confidence (0.0) is under the 0.5 threshold."""
    final = _recognition_result("test", 0.0, is_final=True, language_code="te-ST")

    result = _convert([final], 0.5)

    assert type(result) is SpeechData
    assert result.text == "test"
    assert result.language == "te-ST"
    assert result.confidence == 0.0


async def test_streaming_recognize_response_to_speech_data_03():
    """A single non-final result with confidence 0.0 yields None at a 0.5 threshold."""
    interim = _recognition_result("test", 0.0, is_final=False)

    result = _convert([interim], 0.5)

    assert result is None


async def test_streaming_recognize_response_to_speech_data_04():
    """Two non-final results: transcripts are concatenated, confidence stays 1.0."""
    interims = [
        _recognition_result(transcript, 1.0, is_final=False, language_code="te-ST")
        for transcript in ("test01", "test02")
    ]

    result = _convert(interims, 0.5)

    assert type(result) is SpeechData
    assert result.text == "test01test02"
    assert result.language == "te-ST"
    assert result.confidence == 1.0


async def test_streaming_recognize_response_to_speech_data_05():
    """When a final result follows non-final ones, only the final transcript is reported."""
    results = [
        _recognition_result("test01", 1.0, is_final=False, language_code="te-ST"),
        _recognition_result("test02", 1.0, is_final=False, language_code="te-ST"),
        _recognition_result("best", 1.0, is_final=True, language_code="te-ST"),
    ]

    result = _convert(results, 0.5)

    assert type(result) is SpeechData
    assert result.text == "best"
    assert result.language == "te-ST"
    assert result.confidence == 1.0