diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md index aa07100c8dd..d697b735655 100644 --- a/speech/cloud-client/README.md +++ b/speech/cloud-client/README.md @@ -57,3 +57,28 @@ Build your project with: java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ com.example.speech.Recognize wordoffsets gs://cloud-samples-tests/speech/vr.flac ``` + +### Synchronously transcribe and punctuate a local audio file +``` + java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ + com.example.speech.Recognize punctuation ./resources/audio.raw +``` + +### Asynchronously transcribe and punctuate an audio file hosted on GCS +``` + java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ + com.example.speech.Recognize punctuation gs://cloud-samples-tests/speech/brooklyn.flac +``` + +### Synchronously transcribe a video file +``` + java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ + com.example.speech.Recognize video ./resources/Google_Gnome.wav +``` + +### Asynchronously transcribe a video file hosted on GCS +``` + java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ + com.example.speech.Recognize video gs://cloud-samples-tests/speech/Google_Gnome.wav +``` + diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml index 9d097a566d1..f8442a57b99 100644 --- a/speech/cloud-client/pom.xml +++ b/speech/cloud-client/pom.xml @@ -34,11 +34,12 @@ + com.google.cloud google-cloud-speech - 0.21.1-alpha + 0.22.1-alpha-SNAPSHOT diff --git a/speech/cloud-client/resources/Google_Gnome.wav b/speech/cloud-client/resources/Google_Gnome.wav new file mode 100644 index 00000000000..2f497b7fbe7 Binary files /dev/null and b/speech/cloud-client/resources/Google_Gnome.wav differ diff --git a/speech/cloud-client/src/main/java/com/example/speech/QuickstartSample.java b/speech/cloud-client/src/main/java/com/example/speech/QuickstartSample.java index 
e84dc11e7ba..d361015bb7a 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/QuickstartSample.java +++ b/speech/cloud-client/src/main/java/com/example/speech/QuickstartSample.java @@ -18,13 +18,13 @@ // [START speech_quickstart] // Imports the Google Cloud client library -import com.google.cloud.speech.v1.RecognitionAudio; -import com.google.cloud.speech.v1.RecognitionConfig; -import com.google.cloud.speech.v1.RecognitionConfig.AudioEncoding; -import com.google.cloud.speech.v1.RecognizeResponse; -import com.google.cloud.speech.v1.SpeechClient; -import com.google.cloud.speech.v1.SpeechRecognitionAlternative; -import com.google.cloud.speech.v1.SpeechRecognitionResult; +import com.google.cloud.speech.v1p1beta1.RecognitionAudio; +import com.google.cloud.speech.v1p1beta1.RecognitionConfig; +import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding; +import com.google.cloud.speech.v1p1beta1.RecognizeResponse; +import com.google.cloud.speech.v1p1beta1.SpeechClient; +import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative; +import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult; import com.google.protobuf.ByteString; import java.nio.file.Files; @@ -33,39 +33,43 @@ import java.util.List; public class QuickstartSample { + + /** + * Demonstrates using the Speech API to transcribe an audio file. + */ public static void main(String... 
args) throws Exception { // Instantiates a client - SpeechClient speech = SpeechClient.create(); + try (SpeechClient speechClient = SpeechClient.create()) { - // The path to the audio file to transcribe - String fileName = "./resources/audio.raw"; + // The path to the audio file to transcribe + String fileName = "./resources/audio.raw"; - // Reads the audio file into memory - Path path = Paths.get(fileName); - byte[] data = Files.readAllBytes(path); - ByteString audioBytes = ByteString.copyFrom(data); + // Reads the audio file into memory + Path path = Paths.get(fileName); + byte[] data = Files.readAllBytes(path); + ByteString audioBytes = ByteString.copyFrom(data); - // Builds the sync recognize request - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setSampleRateHertz(16000) - .setLanguageCode("en-US") - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setContent(audioBytes) - .build(); + // Builds the sync recognize request + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setSampleRateHertz(16000) + .setLanguageCode("en-US") + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setContent(audioBytes) + .build(); - // Performs speech recognition on the audio file - RecognizeResponse response = speech.recognize(config, audio); - List results = response.getResultsList(); + // Performs speech recognition on the audio file + RecognizeResponse response = speechClient.recognize(config, audio); + List results = response.getResultsList(); - for (SpeechRecognitionResult result: results) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. 
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcription: %s%n", alternative.getTranscript()); + for (SpeechRecognitionResult result : results) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcription: %s%n", alternative.getTranscript()); + } } - speech.close(); } } // [END speech_quickstart] diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 1d959263064..f54e8234e41 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -17,22 +17,24 @@ package com.example.speech; import com.google.api.gax.rpc.ApiStreamObserver; +import com.google.api.gax.rpc.BidiStreamingCallable; import com.google.api.gax.rpc.OperationFuture; -import com.google.api.gax.rpc.StreamingCallable; -import com.google.cloud.speech.v1.LongRunningRecognizeMetadata; -import com.google.cloud.speech.v1.LongRunningRecognizeResponse; -import com.google.cloud.speech.v1.RecognitionAudio; -import com.google.cloud.speech.v1.RecognitionConfig; -import com.google.cloud.speech.v1.RecognitionConfig.AudioEncoding; -import com.google.cloud.speech.v1.RecognizeResponse; -import com.google.cloud.speech.v1.SpeechClient; -import com.google.cloud.speech.v1.SpeechRecognitionAlternative; -import com.google.cloud.speech.v1.SpeechRecognitionResult; -import com.google.cloud.speech.v1.StreamingRecognitionConfig; -import com.google.cloud.speech.v1.StreamingRecognitionResult; -import com.google.cloud.speech.v1.StreamingRecognizeRequest; -import com.google.cloud.speech.v1.StreamingRecognizeResponse; -import com.google.cloud.speech.v1.WordInfo; +import 
com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata; +import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse; +import com.google.cloud.speech.v1p1beta1.RecognitionAudio; +import com.google.cloud.speech.v1p1beta1.RecognitionConfig; +import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding; +import com.google.cloud.speech.v1p1beta1.RecognitionMetadata; +import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.OriginalMediaType; +import com.google.cloud.speech.v1p1beta1.RecognizeResponse; +import com.google.cloud.speech.v1p1beta1.SpeechClient; +import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative; +import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult; +import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig; +import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult; +import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest; +import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse; +import com.google.cloud.speech.v1p1beta1.WordInfo; import com.google.common.util.concurrent.SettableFuture; import com.google.longrunning.Operation; import com.google.protobuf.ByteString; @@ -44,6 +46,10 @@ import java.util.List; public class Recognize { + + /** + * Run speech recognition tasks. + */ public static void main(String... args) throws Exception { if (args.length < 1) { System.out.println("Usage:"); @@ -80,8 +86,19 @@ public static void main(String... args) throws Exception { } } else if (command.equals("streamrecognize")) { streamingRecognizeFile(path); + } else if (command.equals("punctuation")) { + if (path.startsWith("gs://")) { + transcribeGcsWithAutomaticPunctuation(path); + } else { + transcribeFileWithAutomaticPunctuation(path); + } + } else if (command.equals("video")) { + if (path.startsWith("gs://")) { + transcribeGcsVideoFile(path); + } else { + transcribeVideoFile(path); + } } - } /** @@ -89,34 +106,33 @@ public static void main(String... 
args) throws Exception { * * @param fileName the path to a PCM audio file to transcribe. */ - public static void syncRecognizeFile(String fileName) throws Exception, IOException { - SpeechClient speech = SpeechClient.create(); - - Path path = Paths.get(fileName); - byte[] data = Files.readAllBytes(path); - ByteString audioBytes = ByteString.copyFrom(data); - - // Configure request with local raw PCM audio - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setContent(audioBytes) - .build(); - - // Use blocking call to get audio transcript - RecognizeResponse response = speech.recognize(config, audio); - List results = response.getResultsList(); - - for (SpeechRecognitionResult result: results) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. 
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcription: %s%n", alternative.getTranscript()); + public static void syncRecognizeFile(String fileName) throws Exception { + try (SpeechClient speech = SpeechClient.create()) { + Path path = Paths.get(fileName); + byte[] data = Files.readAllBytes(path); + ByteString audioBytes = ByteString.copyFrom(data); + + // Configure request with local raw PCM audio + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setContent(audioBytes) + .build(); + + // Use blocking call to get audio transcript + RecognizeResponse response = speech.recognize(config, audio); + List results = response.getResultsList(); + + for (SpeechRecognitionResult result : results) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcription: %s%n", alternative.getTranscript()); + } } - speech.close(); } /** @@ -124,122 +140,118 @@ public static void syncRecognizeFile(String fileName) throws Exception, IOExcept * * @param fileName the path to a PCM audio file to transcribe get offsets on. 
*/ - public static void syncRecognizeWords(String fileName) throws Exception, IOException { - SpeechClient speech = SpeechClient.create(); - - Path path = Paths.get(fileName); - byte[] data = Files.readAllBytes(path); - ByteString audioBytes = ByteString.copyFrom(data); - - // Configure request with local raw PCM audio - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .setEnableWordTimeOffsets(true) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setContent(audioBytes) - .build(); - - // Use blocking call to get audio transcript - RecognizeResponse response = speech.recognize(config, audio); - List results = response.getResultsList(); - - for (SpeechRecognitionResult result: results) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcription: %s%n", alternative.getTranscript()); - for (WordInfo wordInfo: alternative.getWordsList()) { - System.out.println(wordInfo.getWord()); - System.out.printf("\t%s.%s sec - %s.%s sec\n", - wordInfo.getStartTime().getSeconds(), - wordInfo.getStartTime().getNanos() / 100000000, - wordInfo.getEndTime().getSeconds(), - wordInfo.getEndTime().getNanos() / 100000000); + public static void syncRecognizeWords(String fileName) throws Exception { + try (SpeechClient speech = SpeechClient.create()) { + Path path = Paths.get(fileName); + byte[] data = Files.readAllBytes(path); + ByteString audioBytes = ByteString.copyFrom(data); + + // Configure request with local raw PCM audio + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .setEnableWordTimeOffsets(true) + .build(); + RecognitionAudio audio = 
RecognitionAudio.newBuilder() + .setContent(audioBytes) + .build(); + + // Use blocking call to get audio transcript + RecognizeResponse response = speech.recognize(config, audio); + List results = response.getResultsList(); + + for (SpeechRecognitionResult result : results) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcription: %s%n", alternative.getTranscript()); + for (WordInfo wordInfo : alternative.getWordsList()) { + System.out.println(wordInfo.getWord()); + System.out.printf("\t%s.%s sec - %s.%s sec\n", + wordInfo.getStartTime().getSeconds(), + wordInfo.getStartTime().getNanos() / 100000000, + wordInfo.getEndTime().getSeconds(), + wordInfo.getEndTime().getNanos() / 100000000); + } } } - speech.close(); } - /** * Performs speech recognition on remote FLAC file and prints the transcription. * * @param gcsUri the path to the remote FLAC audio file to transcribe. */ - public static void syncRecognizeGcs(String gcsUri) throws Exception, IOException { + public static void syncRecognizeGcs(String gcsUri) throws Exception { // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS - SpeechClient speech = SpeechClient.create(); - - // Builds the request for remote FLAC file - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.FLAC) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setUri(gcsUri) - .build(); - - // Use blocking call for getting audio transcript - RecognizeResponse response = speech.recognize(config, audio); - List results = response.getResultsList(); - - for (SpeechRecognitionResult result: results) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. 
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcription: %s%n", alternative.getTranscript()); + try (SpeechClient speech = SpeechClient.create()) { + // Builds the request for remote FLAC file + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setUri(gcsUri) + .build(); + + // Use blocking call for getting audio transcript + RecognizeResponse response = speech.recognize(config, audio); + List results = response.getResultsList(); + + for (SpeechRecognitionResult result : results) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcription: %s%n", alternative.getTranscript()); + } } - speech.close(); } - /* + /** * Performs non-blocking speech recognition on raw PCM audio and prints * the transcription. Note that transcription is limited to 60 seconds audio. * * @param fileName the path to a PCM audio file to transcribe. 
*/ - public static void asyncRecognizeFile(String fileName) throws Exception, IOException { + public static void asyncRecognizeFile(String fileName) throws Exception { // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS - SpeechClient speech = SpeechClient.create(); - - Path path = Paths.get(fileName); - byte[] data = Files.readAllBytes(path); - ByteString audioBytes = ByteString.copyFrom(data); - - // Configure request with local raw PCM audio - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setContent(audioBytes) - .build(); - - // Use non-blocking call for getting file transcription - OperationFuture response = - speech.longRunningRecognizeAsync(config, audio); - - while (!response.isDone()) { - System.out.println("Waiting for response..."); - Thread.sleep(10000); - } + try (SpeechClient speech = SpeechClient.create()) { + + Path path = Paths.get(fileName); + byte[] data = Files.readAllBytes(path); + ByteString audioBytes = ByteString.copyFrom(data); + + // Configure request with local raw PCM audio + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setContent(audioBytes) + .build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speech.longRunningRecognizeAsync(config, audio); + + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } - List results = response.get().getResultsList(); + List results = response.get().getResultsList(); - for (SpeechRecognitionResult result: results) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. 
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcription: %s%n", alternative.getTranscript()); + for (SpeechRecognitionResult result : results) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcription: %s%n", alternative.getTranscript()); + } } - speech.close(); } /** @@ -248,47 +260,47 @@ public static void asyncRecognizeFile(String fileName) throws Exception, IOExcep * * @param gcsUri the path to the remote LINEAR16 audio file to transcribe. */ - public static void asyncRecognizeWords(String gcsUri) throws Exception, IOException { + public static void asyncRecognizeWords(String gcsUri) throws Exception { // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS - SpeechClient speech = SpeechClient.create(); - - // Configure remote file request for Linear16 - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.FLAC) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .setEnableWordTimeOffsets(true) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setUri(gcsUri) - .build(); - - // Use non-blocking call for getting file transcription - OperationFuture response = - speech.longRunningRecognizeAsync(config, audio); - while (!response.isDone()) { - System.out.println("Waiting for response..."); - Thread.sleep(10000); - } - - List results = response.get().getResultsList(); + try (SpeechClient speech = SpeechClient.create()) { + + // Configure remote file request for Linear16 + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .setEnableWordTimeOffsets(true) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setUri(gcsUri) + .build(); + + 
// Use non-blocking call for getting file transcription + OperationFuture response = + speech.longRunningRecognizeAsync(config, audio); + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } - for (SpeechRecognitionResult result: results) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcription: %s\n",alternative.getTranscript()); - for (WordInfo wordInfo: alternative.getWordsList()) { - System.out.println(wordInfo.getWord()); - System.out.printf("\t%s.%s sec - %s.%s sec\n", - wordInfo.getStartTime().getSeconds(), - wordInfo.getStartTime().getNanos() / 100000000, - wordInfo.getEndTime().getSeconds(), - wordInfo.getEndTime().getNanos() / 100000000); + List results = response.get().getResultsList(); + + for (SpeechRecognitionResult result : results) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcription: %s\n", alternative.getTranscript()); + for (WordInfo wordInfo : alternative.getWordsList()) { + System.out.println(wordInfo.getWord()); + System.out.printf("\t%s.%s sec - %s.%s sec\n", + wordInfo.getStartTime().getSeconds(), + wordInfo.getStartTime().getNanos() / 100000000, + wordInfo.getEndTime().getSeconds(), + wordInfo.getEndTime().getNanos() / 100000000); + } } } - speech.close(); } /** @@ -297,38 +309,37 @@ public static void asyncRecognizeWords(String gcsUri) throws Exception, IOExcept * * @param gcsUri the path to the remote LINEAR16 audio file to transcribe. 
*/ - public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException { + public static void asyncRecognizeGcs(String gcsUri) throws Exception { // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS - SpeechClient speech = SpeechClient.create(); - - // Configure remote file request for Linear16 - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.FLAC) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setUri(gcsUri) - .build(); - - // Use non-blocking call for getting file transcription - OperationFuture response = - speech.longRunningRecognizeAsync(config, audio); - while (!response.isDone()) { - System.out.println("Waiting for response..."); - Thread.sleep(10000); - } + try (SpeechClient speech = SpeechClient.create()) { + + // Configure remote file request for Linear16 + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setUri(gcsUri) + .build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speech.longRunningRecognizeAsync(config, audio); + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } - List results = response.get().getResultsList(); + List results = response.get().getResultsList(); - for (SpeechRecognitionResult result: results) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcription: %s\n",alternative.getTranscript()); + for (SpeechRecognitionResult result : results) { + // There can be several alternative transcripts for a given chunk of speech. 
Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcription: %s\n", alternative.getTranscript()); + } } - speech.close(); } @@ -380,9 +391,9 @@ public SettableFuture> future() { } ResponseApiStreamingObserver responseObserver = - new ResponseApiStreamingObserver(); + new ResponseApiStreamingObserver<>(); - StreamingCallable callable = + BidiStreamingCallable callable = speech.streamingRecognizeCallable(); ApiStreamObserver requestObserver = @@ -411,9 +422,172 @@ public SettableFuture> future() { // There can be several alternative transcripts for a given chunk of speech. Just use the // first (most likely) one here. SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.println(alternative.getTranscript()); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); } speech.close(); } + + /** + * Performs transcription with automatic punctuation on raw PCM audio data. + * + * @param fileName the path to a PCM audio file to transcribe. + */ + public static void transcribeFileWithAutomaticPunctuation(String fileName) throws Exception { + Path path = Paths.get(fileName); + byte[] content = Files.readAllBytes(path); + + // [START transcribe_file_with_automatic_punctuation] + try (SpeechClient speechClient = SpeechClient.create()) { + // Configure request with local raw PCM audio + RecognitionConfig recConfig = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .setEnableAutomaticPunctuation(true) + .build(); + + RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder() + .setContent(ByteString.copyFrom(content)) + .build(); + + RecognizeResponse recognizeResponse = speechClient.recognize(recConfig, recognitionAudio); + // Just print the first result here. 
+ SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0); + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + } + // [END transcribe_file_with_automatic_punctuation] + } + + /** + * Performs transcription on remote FLAC file and prints the transcription. + * + * @param gcsUri the path to the remote FLAC audio file to transcribe. + */ + public static void transcribeGcsWithAutomaticPunctuation(String gcsUri) throws Exception { + // [START transcribe_gcs_with_automatic_punctuation] + try (SpeechClient speechClient = SpeechClient.create()) { + + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .setEnableAutomaticPunctuation(true) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setUri(gcsUri) + .build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speechClient.longRunningRecognizeAsync(config, audio); + + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } + + // Just print the first result here. + SpeechRecognitionResult result = response.get().getResultsList().get(0); + + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + } + // [END transcribe_gcs_with_automatic_punctuation] + } + + /** + * Performs transcription of the given audio file synchronously with + * video as the original media type. 
+ * @param fileName the path to a video file to transcribe + */ + public static void transcribeVideoFile(String fileName) throws Exception { + Path path = Paths.get(fileName); + byte[] content = Files.readAllBytes(path); + + // [START transcribe_video_file] + try (SpeechClient speech = SpeechClient.create()) { + + RecognitionMetadata recognitionMetadata = RecognitionMetadata.newBuilder() + .setOriginalMediaType(OriginalMediaType.VIDEO) + .build(); + + // Configure request with video media type + RecognitionConfig recConfig = RecognitionConfig.newBuilder() + // encoding may either be omitted or must match the value in the file header + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + // sample rate hertz may be either be omitted or must match the value in the file header + .setSampleRateHertz(16000) + .setMetadata(recognitionMetadata) + .build(); + + RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder() + .setContent(ByteString.copyFrom(content)) + .build(); + + + RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio); + // Just print the first result here. + SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0); + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + } + // [END transcribe_video_file] + } + + /** + * Performs transcription on remote video file and prints the transcription. + * + * @param gcsUri the path to the remote video file to transcribe. 
+ */ + public static void transcribeGcsVideoFile(String gcsUri) throws Exception { + // [START transcribe_video_gcs] + try (SpeechClient speech = SpeechClient.create()) { + + RecognitionMetadata recognitionMetadata = RecognitionMetadata.newBuilder() + .setOriginalMediaType(OriginalMediaType.VIDEO) + .build(); + + // Configure request with video media type + RecognitionConfig config = RecognitionConfig.newBuilder() + // encoding may either be omitted or must match the value in the file header + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + // sample rate hertz may be either be omitted or must match the value in the file header + .setSampleRateHertz(16000) + .setMetadata(recognitionMetadata) + .build(); + + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setUri(gcsUri) + .build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speech.longRunningRecognizeAsync(config, audio); + + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } + + List results = response.get().getResultsList(); + + // Just print the first result here. + SpeechRecognitionResult result = results.get(0); + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + } + // [END transcribe_video_gcs] + } } diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java index 7e2c4862fda..b01517e1c4d 100644 --- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java +++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java @@ -40,8 +40,12 @@ public class RecognizeIT { private PrintStream out; // The path to the audio file to transcribe - private String fileName = "./resources/audio.raw"; - private String gcsPath = "gs://" + BUCKET + "/speech/brooklyn.flac"; + private String audioFileName = "./resources/audio.raw"; + private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac"; + + // The path to the video file to transcribe + private String videoFileName = "./resources/Google_Gnome.wav"; + private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav"; @Before public void setUp() { @@ -57,14 +61,14 @@ public void tearDown() { @Test public void testRecognizeFile() throws Exception { - Recognize.syncRecognizeFile(fileName); + Recognize.syncRecognizeFile(audioFileName); String got = bout.toString(); assertThat(got).contains("how old is the Brooklyn Bridge"); } @Test public void testRecognizeWordoffset() throws Exception { - Recognize.syncRecognizeWords(fileName); + Recognize.syncRecognizeWords(audioFileName); String got = bout.toString(); assertThat(got).contains("how old is the Brooklyn Bridge"); assertThat(got).contains("\t0.0 sec -"); @@ -72,28 +76,28 @@ public void testRecognizeWordoffset() throws Exception { @Test public void testRecognizeGcs() throws Exception { - Recognize.syncRecognizeGcs(gcsPath); + Recognize.syncRecognizeGcs(gcsAudioPath); String got = bout.toString(); assertThat(got).contains("how old is the Brooklyn 
Bridge"); } @Test public void testAsyncRecognizeFile() throws Exception { - Recognize.asyncRecognizeFile(fileName); + Recognize.asyncRecognizeFile(audioFileName); String got = bout.toString(); assertThat(got).contains("how old is the Brooklyn Bridge"); } @Test public void testAsyncRecognizeGcs() throws Exception { - Recognize.asyncRecognizeGcs(gcsPath); + Recognize.asyncRecognizeGcs(gcsAudioPath); String got = bout.toString(); assertThat(got).contains("how old is the Brooklyn Bridge"); } @Test public void testAsyncWordoffset() throws Exception { - Recognize.asyncRecognizeWords(gcsPath); + Recognize.asyncRecognizeWords(gcsAudioPath); String got = bout.toString(); assertThat(got).contains("how old is the Brooklyn Bridge"); assertThat(got).contains("\t0.0 sec -"); @@ -101,8 +105,38 @@ public void testAsyncWordoffset() throws Exception { @Test public void testStreamRecognize() throws Exception { - Recognize.streamingRecognizeFile(fileName); + Recognize.streamingRecognizeFile(audioFileName); String got = bout.toString(); assertThat(got).contains("how old is the Brooklyn Bridge"); } + + @Test + public void testAutomaticPunctuationFile() throws Exception { + Recognize.transcribeFileWithAutomaticPunctuation(audioFileName); + String got = bout.toString(); + assertThat(got).contains("How"); + assertThat(got).contains("Bridge?"); + } + + @Test + public void testAutomaticPunctuationGcs() throws Exception { + Recognize.transcribeGcsWithAutomaticPunctuation(gcsAudioPath); + String got = bout.toString(); + assertThat(got).contains("How"); + assertThat(got).contains("Bridge?"); + } + + @Test + public void testVideoTranscription() throws Exception { + Recognize.transcribeVideoFile(videoFileName); + String got = bout.toString(); + assertThat(got).contains("OK Google"); + } + + @Test + public void testGcsVideoTranscription() throws Exception { + Recognize.transcribeGcsVideoFile(gcsVideoPath); + String got = bout.toString(); + assertThat(got).contains("OK Google"); + } }