From 89ff77a8e2b89de0aee0a28e089813d125c41043 Mon Sep 17 00:00:00 2001 From: blechdom Date: Tue, 14 May 2019 15:54:39 -0700 Subject: [PATCH 1/6] adding result end time and color-coded result output --- .../speech/InfiniteStreamRecognize.java | 147 +++++++++++++++--- .../java/com/example/speech/Recognize.java | 2 +- 2 files changed, 128 insertions(+), 21 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java index 7844e23af5e..a326643f970 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java @@ -1,5 +1,5 @@ /* - * Copyright 2018 Google LLC + * Copyright 2019 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,17 +20,20 @@ import com.google.api.gax.rpc.ClientStream; import com.google.api.gax.rpc.ResponseObserver; import com.google.api.gax.rpc.StreamController; -import com.google.cloud.speech.v1.RecognitionConfig; -import com.google.cloud.speech.v1.SpeechClient; -import com.google.cloud.speech.v1.SpeechRecognitionAlternative; -import com.google.cloud.speech.v1.StreamingRecognitionConfig; -import com.google.cloud.speech.v1.StreamingRecognitionResult; -import com.google.cloud.speech.v1.StreamingRecognizeRequest; -import com.google.cloud.speech.v1.StreamingRecognizeResponse; +import com.google.cloud.speech.v1p1beta1.RecognitionConfig; +import com.google.cloud.speech.v1p1beta1.SpeechClient; +import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative; +import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig; +import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult; +import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest; +import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse; +import com.google.protobuf.Duration; import com.google.protobuf.ByteString; -import java.util.ArrayList; +import java.lang.Math; +import java.util.*; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import java.text.DecimalFormat; import javax.sound.sampled.AudioFormat; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.DataLine; @@ -39,11 +42,29 @@ public class InfiniteStreamRecognize { + private static final int STREAMING_LIMIT = 10000; // 10 seconds + + public static final String RED = "\033[0;31m"; + public static final String GREEN = "\033[0;32m"; + public static final String YELLOW = "\033[0;33m"; + // Creating shared object private static volatile BlockingQueue sharedQueue = new LinkedBlockingQueue(); private static TargetDataLine targetDataLine; private static int BYTES_PER_BUFFER = 6400; // buffer size in bytes + private static int restartCounter = 0; + private static ArrayList audioInput = new ArrayList(); + private static ArrayList lastAudioInput = new ArrayList(); + private static int resultEndTimeInMS = 0; + private static int isFinalEndTime = 0; + private static int finalRequestEndTime = 0; + private static boolean newStream = true; + private static double bridgingOffset = 0; + private static boolean lastTranscriptWasFinal = false; + private static StreamController referenceToStreamController; + private static ByteString tempByteString; + public static void main(String... args) { try { infiniteStreamingRecognize(); @@ -60,6 +81,7 @@ class MicBuffer implements Runnable { @Override public void run() { + System.out.println(YELLOW); System.out.println("Start speaking...Press Ctrl-C to stop"); targetDataLine.start(); byte[] data = new byte[BYTES_PER_BUFFER]; @@ -88,23 +110,51 @@ public void run() { ArrayList responses = new ArrayList<>(); - public void onStart(StreamController controller) {} + public void onStart(StreamController controller) { + referenceToStreamController = controller; + } public void onResponse(StreamingRecognizeResponse response) { + responses.add(response); + StreamingRecognitionResult result = response.getResultsList().get(0); - // There can be several alternative transcripts for a given chunk of speech. Just - // use the first (most likely) one here. + + Duration resultEndTime = result.getResultEndTime(); + + resultEndTimeInMS = (int) ((resultEndTime.getSeconds() * 1000) + + (resultEndTime.getNanos() / 1000000)); + + double correctedTime = resultEndTimeInMS - bridgingOffset + + (STREAMING_LIMIT * restartCounter); + DecimalFormat format = new DecimalFormat("0.#"); + + System.out.printf("\nresultEndTime: %d bridgingOffset: %.2f correctedTime: %.2f restartCounter: %d\n", + resultEndTimeInMS, bridgingOffset, correctedTime, restartCounter); + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcript : %s\n", alternative.getTranscript()); - } + if(result.getIsFinal()){ + System.out.print(GREEN); + System.out.print("\033[2K\r"); + System.out.printf("%s: %s\n", format.format(correctedTime), alternative.getTranscript()); + + isFinalEndTime = resultEndTimeInMS; + lastTranscriptWasFinal = true; + } + else { + System.out.print(RED); + System.out.print("\033[2K\r"); + System.out.printf("%s: %s", format.format(correctedTime), alternative.getTranscript()); + lastTranscriptWasFinal = false; + } + + } public void onComplete() { - System.out.println("Done"); + // Should be cancrestartStream(); } - public void onError(Throwable t) { - System.out.println(t); + // Error indicates canceled observer, do nothing. } }; @@ -116,8 +166,12 @@ public void onError(Throwable t) { .setLanguageCode("en-US") .setSampleRateHertz(16000) .build(); + StreamingRecognitionConfig streamingRecognitionConfig = - StreamingRecognitionConfig.newBuilder().setConfig(recognitionConfig).build(); + StreamingRecognitionConfig.newBuilder() + .setConfig(recognitionConfig) + .setInterimResults(true) + .build(); StreamingRecognizeRequest request = StreamingRecognizeRequest.newBuilder() @@ -151,9 +205,28 @@ public void onError(Throwable t) { long estimatedTime = System.currentTimeMillis() - startTime; - if (estimatedTime >= 55000) { + if (estimatedTime >= STREAMING_LIMIT) { clientStream.closeSend(); + referenceToStreamController.cancel(); // remove Observer + + if (resultEndTimeInMS > 0) { + finalRequestEndTime = isFinalEndTime; + } + resultEndTimeInMS = 0; + + lastAudioInput = null; + lastAudioInput = audioInput; + audioInput = new ArrayList(); + + restartCounter++; + + if (!lastTranscriptWasFinal) { + System.out.print('\n'); + } + + newStream = true; + clientStream = client.streamingRecognizeCallable().splitCall(responseObserver); request = @@ -161,13 +234,47 @@ public void onError(Throwable t) { .setStreamingConfig(streamingRecognitionConfig) .build(); + System.out.println(YELLOW); + System.out.printf("%d: RESTARTING REQUEST\n", restartCounter * STREAMING_LIMIT); + startTime = System.currentTimeMillis(); } else { + + if((newStream) && (lastAudioInput.size() > 0)){ + System.out.println("gonna send that stuff"); + double chunkTime = STREAMING_LIMIT / lastAudioInput.size(); // in ms + if(chunkTime != 0){ + if(bridgingOffset < 0){ + bridgingOffset = 0; + } + if(bridgingOffset > finalRequestEndTime){ + bridgingOffset = finalRequestEndTime; + } + int chunksFromMS = (int) Math.floor((finalRequestEndTime - bridgingOffset) / chunkTime); + bridgingOffset = (int) Math.floor((lastAudioInput.size() - chunksFromMS) * chunkTime); + System.out.printf("finalEndTime: %d chunkTime: %.2f chunksFromMS: %d bridgingOffset: %.2f\n", finalRequestEndTime, chunkTime, chunksFromMS, bridgingOffset); + for(int i = chunksFromMS; i< lastAudioInput.size(); i++){ + //System.out.println(i); + request = + StreamingRecognizeRequest.newBuilder() + .setAudioContent(lastAudioInput.get(i)) + .build(); + + } + } + newStream = false; + } + + tempByteString = ByteString.copyFrom(sharedQueue.take()); + request = StreamingRecognizeRequest.newBuilder() - .setAudioContent(ByteString.copyFrom(sharedQueue.take())) + .setAudioContent(tempByteString) .build(); + + audioInput.add(tempByteString); + } clientStream.send(request); diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index d8845d499be..0a8f720a284 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -699,7 +699,7 @@ public void onError(Throwable t) { System.out.println("Stop speaking."); targetDataLine.stop(); targetDataLine.close(); - break; + // break; } request = StreamingRecognizeRequest.newBuilder() From 556375992a7c2d03f072379109de9e95c738875b Mon Sep 17 00:00:00 2001 From: blechdom Date: Wed, 15 May 2019 12:22:21 -0700 Subject: [PATCH 2/6] updated infinited streaming sample to use result_end_time --- .../java/com/example/speech/InfiniteStreamRecognize.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java index a326643f970..9f6619e2e46 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java @@ -129,9 +129,6 @@ public void onResponse(StreamingRecognizeResponse response) { (STREAMING_LIMIT * restartCounter); DecimalFormat format = new DecimalFormat("0.#"); - System.out.printf("\nresultEndTime: %d bridgingOffset: %.2f correctedTime: %.2f restartCounter: %d\n", - resultEndTimeInMS, bridgingOffset, correctedTime, restartCounter); - SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); if(result.getIsFinal()){ System.out.print(GREEN); @@ -242,7 +239,6 @@ public void onError(Throwable t) { } else { if((newStream) && (lastAudioInput.size() > 0)){ - System.out.println("gonna send that stuff"); double chunkTime = STREAMING_LIMIT / lastAudioInput.size(); // in ms if(chunkTime != 0){ if(bridgingOffset < 0){ @@ -253,14 +249,13 @@ public void onError(Throwable t) { } int chunksFromMS = (int) Math.floor((finalRequestEndTime - bridgingOffset) / chunkTime); bridgingOffset = (int) Math.floor((lastAudioInput.size() - chunksFromMS) * chunkTime); - System.out.printf("finalEndTime: %d chunkTime: %.2f chunksFromMS: %d bridgingOffset: %.2f\n", finalRequestEndTime, chunkTime, chunksFromMS, bridgingOffset); for(int i = chunksFromMS; i< lastAudioInput.size(); i++){ - //System.out.println(i); + request = StreamingRecognizeRequest.newBuilder() .setAudioContent(lastAudioInput.get(i)) .build(); - + clientStream.send(request); } } newStream = false; From 0aa1b2eaba26ec4a052c3cbc2a6c1983497425d7 Mon Sep 17 00:00:00 2001 From: blechdom Date: Wed, 15 May 2019 12:49:53 -0700 Subject: [PATCH 3/6] lint fixes --- .../speech/InfiniteStreamRecognize.java | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java index 9f6619e2e46..717b297ea5f 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java @@ -27,13 +27,13 @@ import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult; import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest; import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse; -import com.google.protobuf.Duration; import com.google.protobuf.ByteString; +import com.google.protobuf.Duration; import java.lang.Math; -import java.util.*; +import java.text.DecimalFormat; +import java.util.ArrayList; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; -import java.text.DecimalFormat; import javax.sound.sampled.AudioFormat; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.DataLine; @@ -122,37 +122,36 @@ public void onResponse(StreamingRecognizeResponse response) { Duration resultEndTime = result.getResultEndTime(); - resultEndTimeInMS = (int) ((resultEndTime.getSeconds() * 1000) + - (resultEndTime.getNanos() / 1000000)); + resultEndTimeInMS = (int) ((resultEndTime.getSeconds() * 1000) + + (resultEndTime.getNanos() / 1000000)); - double correctedTime = resultEndTimeInMS - bridgingOffset + - (STREAMING_LIMIT * restartCounter); + double correctedTime = resultEndTimeInMS - bridgingOffset + + (STREAMING_LIMIT * restartCounter); DecimalFormat format = new DecimalFormat("0.#"); SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - if(result.getIsFinal()){ + if (result.getIsFinal()) { System.out.print(GREEN); System.out.print("\033[2K\r"); - System.out.printf("%s: %s\n", format.format(correctedTime), alternative.getTranscript()); + System.out.printf("%s: %s\n", format.format(correctedTime), + alternative.getTranscript()); isFinalEndTime = resultEndTimeInMS; lastTranscriptWasFinal = true; - } - else { + } else { System.out.print(RED); System.out.print("\033[2K\r"); - System.out.printf("%s: %s", format.format(correctedTime), alternative.getTranscript()); + System.out.printf("%s: %s", format.format(correctedTime), + alternative.getTranscript()); lastTranscriptWasFinal = false; } - - } - public void onComplete() { - // Should be cancrestartStream(); - } - public void onError(Throwable t) { - // Error indicates canceled observer, do nothing. } + + public void onComplete() {} + + public void onError(Throwable t) {} + }; clientStream = client.streamingRecognizeCallable().splitCall(responseObserver); @@ -238,18 +237,20 @@ public void onError(Throwable t) { } else { - if((newStream) && (lastAudioInput.size() > 0)){ - double chunkTime = STREAMING_LIMIT / lastAudioInput.size(); // in ms - if(chunkTime != 0){ - if(bridgingOffset < 0){ + if ((newStream) && (lastAudioInput.size() > 0)) { + double chunkTime = STREAMING_LIMIT / lastAudioInput.size(); // ms + if (chunkTime != 0) { + if (bridgingOffset < 0) { bridgingOffset = 0; } - if(bridgingOffset > finalRequestEndTime){ + if (bridgingOffset > finalRequestEndTime) { bridgingOffset = finalRequestEndTime; } - int chunksFromMS = (int) Math.floor((finalRequestEndTime - bridgingOffset) / chunkTime); - bridgingOffset = (int) Math.floor((lastAudioInput.size() - chunksFromMS) * chunkTime); - for(int i = chunksFromMS; i< lastAudioInput.size(); i++){ + int chunksFromMS = (int) Math.floor((finalRequestEndTime + - bridgingOffset) / chunkTime); + bridgingOffset = (int) Math.floor((lastAudioInput.size() + - chunksFromMS) * chunkTime); + for (int i = chunksFromMS; i < lastAudioInput.size(); i++) { request = StreamingRecognizeRequest.newBuilder() From 55dd1c9657bd2ff451b2ec47e7ab6c564a7badb3 Mon Sep 17 00:00:00 2001 From: blechdom Date: Wed, 15 May 2019 15:36:30 -0700 Subject: [PATCH 4/6] more comments and formatting updates --- .../com/example/speech/InfiniteStreamRecognize.java | 12 ++++++++++-- .../src/main/java/com/example/speech/Recognize.java | 1 - 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java index 717b297ea5f..c281c298da9 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/InfiniteStreamRecognize.java @@ -1,5 +1,5 @@ /* - * Copyright 2019 Google LLC + * Copyright 2018 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -238,9 +238,15 @@ public void onError(Throwable t) {} } else { if ((newStream) && (lastAudioInput.size() > 0)) { - double chunkTime = STREAMING_LIMIT / lastAudioInput.size(); // ms + // if this is the first audio from a new request + // calculate amount of unfinalized audio from last request + // resend the audio to the speech client before incoming audio + double chunkTime = STREAMING_LIMIT / lastAudioInput.size(); + // ms length of each chunk in previous request audio arrayList if (chunkTime != 0) { if (bridgingOffset < 0) { + // bridging Offset accounts for time of resent audio + // calculated from last request bridgingOffset = 0; } if (bridgingOffset > finalRequestEndTime) { @@ -248,8 +254,10 @@ public void onError(Throwable t) {} } int chunksFromMS = (int) Math.floor((finalRequestEndTime - bridgingOffset) / chunkTime); + // chunks from MS is number of chunks to resend bridgingOffset = (int) Math.floor((lastAudioInput.size() - chunksFromMS) * chunkTime); + // set bridging offset for next request for (int i = chunksFromMS; i < lastAudioInput.size(); i++) { request = diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 0a8f720a284..0797e8211d0 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -699,7 +699,6 @@ public void onError(Throwable t) { System.out.println("Stop speaking."); targetDataLine.stop(); targetDataLine.close(); - // break; } request = StreamingRecognizeRequest.newBuilder() From a86baa5f7350f50088976764eaf0ad7017b013c6 Mon Sep 17 00:00:00 2001 From: Kurtis Van Gent Date: Thu, 16 May 2019 11:16:57 -0700 Subject: [PATCH 5/6] Fix speech test. --- .../src/test/java/com/example/speech/RecognizeIT.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java index 2eef6f05807..dfe93b31e5b 100644 --- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java +++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java @@ -114,21 +114,21 @@ public void testStreamRecognize() throws Exception { public void testAutoPunctuation() throws Exception { Recognize.transcribeFileWithAutomaticPunctuation(audioFileName); String got = bout.toString(); - assertThat(got).contains("How old is the Brooklyn Bridge?"); + assertThat(got).contains("how old is the Brooklyn Bridge"); } @Test public void testGcsAutoPunctuation() throws Exception { Recognize.transcribeGcsWithAutomaticPunctuation(gcsAudioPath); String got = bout.toString(); - assertThat(got).contains("How old is the Brooklyn Bridge?"); + assertThat(got).contains("how old is the Brooklyn Bridge"); } @Test public void testStreamAutoPunctuation() throws Exception { Recognize.streamingTranscribeWithAutomaticPunctuation(audioFileName); String got = bout.toString(); - assertThat(got).contains("How old is the Brooklyn Bridge?"); + assertThat(got).contains("how old is the Brooklyn Bridge"); } @Test From 401c83e74c3e5c686a4c639ef8e182e906107620 Mon Sep 17 00:00:00 2001 From: Kurtis Van Gent Date: Thu, 16 May 2019 11:28:23 -0700 Subject: [PATCH 6/6] More test adjustments. --- .../test/java/com/example/speech/RecognizeIT.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java index dfe93b31e5b..0aa8db5d2ca 100644 --- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java +++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java @@ -113,22 +113,22 @@ public void testStreamRecognize() throws Exception { @Test public void testAutoPunctuation() throws Exception { Recognize.transcribeFileWithAutomaticPunctuation(audioFileName); - String got = bout.toString(); - assertThat(got).contains("how old is the Brooklyn Bridge"); + String got = bout.toString().toLowerCase(); + assertThat(got).contains("how old is the brooklyn bridge"); } @Test public void testGcsAutoPunctuation() throws Exception { Recognize.transcribeGcsWithAutomaticPunctuation(gcsAudioPath); - String got = bout.toString(); - assertThat(got).contains("how old is the Brooklyn Bridge"); + String got = bout.toString().toLowerCase(); + assertThat(got).contains("how old is the brooklyn bridge"); } @Test public void testStreamAutoPunctuation() throws Exception { Recognize.streamingTranscribeWithAutomaticPunctuation(audioFileName); - String got = bout.toString(); - assertThat(got).contains("how old is the Brooklyn Bridge"); + String got = bout.toString().toLowerCase(); + assertThat(got).contains("how old is the brooklyn bridge"); } @Test