From 14f31f0d2a83f1289602efef0f5c6de3d7954bc7 Mon Sep 17 00:00:00 2001 From: Gus Class Date: Thu, 3 Aug 2017 16:41:55 -0700 Subject: [PATCH] Updates to highlight word time offsets (#787) --- speech/cloud-client/README.md | 12 +++ speech/cloud-client/pom.xml | 7 +- .../java/com/example/speech/Recognize.java | 98 ++++++++++++++++++- .../java/com/example/speech/RecognizeIT.java | 11 ++- 4 files changed, 121 insertions(+), 7 deletions(-) diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md index 83565a21e04..aa07100c8dd 100644 --- a/speech/cloud-client/README.md +++ b/speech/cloud-client/README.md @@ -45,3 +45,15 @@ Build your project with: java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ com.example.speech.Recognize asyncrecognize gs://cloud-samples-tests/speech/vr.flac ``` + +### Synchronously transcribe an audio file and print word offsets +``` + java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ + com.example.speech.Recognize wordoffsets ./resources/audio.raw +``` + +### Asynchronously transcribe a remote audio file and print word offsets +``` + java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \ + com.example.speech.Recognize wordoffsets gs://cloud-samples-tests/speech/vr.flac +``` diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml index b887baeae18..9d097a566d1 100644 --- a/speech/cloud-client/pom.xml +++ b/speech/cloud-client/pom.xml @@ -21,9 +21,10 @@ - com.google.cloud.samples - shared-configuration - 1.0.5 + doc-samples + com.google.cloud + 1.0.0 + ../.. diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 619c402c49e..c571ad6714e 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -50,7 +50,7 @@ public static void main(String... 
args) throws Exception { System.out.printf( "\tjava %s \"\" \"\"\n" + "Commands:\n" - + "\tsyncrecognize | asyncrecognize | streamrecognize\n" + + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n" + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI " + "for a Cloud Storage resource (gs://...)\n", Recognize.class.getCanonicalName()); @@ -66,6 +66,12 @@ public static void main(String... args) throws Exception { } else { syncRecognizeFile(path); } + } else if (command.equals("wordoffsets")) { + if (path.startsWith("gs://")) { + asyncRecognizeWords(path); + } else { + syncRecognizeWords(path); + } } else if (command.equals("asyncrecognize")) { if (path.startsWith("gs://")) { asyncRecognizeGcs(path); @@ -113,6 +119,51 @@ public static void syncRecognizeFile(String fileName) throws Exception, IOExcept speech.close(); } + /** + * Performs synchronous speech recognition and prints word time offsets. + * + * @param fileName the path to a PCM audio file to transcribe and get word time offsets for. + */ + public static void syncRecognizeWords(String fileName) throws Exception, IOException { + SpeechClient speech = SpeechClient.create(); + + Path path = Paths.get(fileName); + byte[] data = Files.readAllBytes(path); + ByteString audioBytes = ByteString.copyFrom(data); + + // Configure request with local raw PCM audio + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .setEnableWordTimeOffsets(true) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setContent(audioBytes) + .build(); + + // Use blocking call to get audio transcript + RecognizeResponse response = speech.recognize(config, audio); + List results = response.getResultsList(); + + for (SpeechRecognitionResult result: results) { + List alternatives = result.getAlternativesList(); + for (SpeechRecognitionAlternative alternative: alternatives) { + System.out.printf("Transcription: %s%n", 
alternative.getTranscript()); + for (WordInfo wordInfo: alternative.getWordsList()) { + System.out.println(wordInfo.getWord()); + System.out.printf("\t%s.%s sec - %s.%s sec\n", + wordInfo.getStartTime().getSeconds(), + wordInfo.getStartTime().getNanos() / 100000000, + wordInfo.getEndTime().getSeconds(), + wordInfo.getEndTime().getNanos() / 100000000); + } + } + } + speech.close(); + } + + /** * Performs speech recognition on remote FLAC file and prints the transcription. * @@ -193,11 +244,11 @@ public static void asyncRecognizeFile(String fileName) throws Exception, IOExcep /** * Performs non-blocking speech recognition on remote FLAC file and prints - * the transcription. + * the transcription as well as word time offsets. * * @param gcsUri the path to the remote LINEAR16 audio file to transcribe. */ - public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException { + public static void asyncRecognizeWords(String gcsUri) throws Exception, IOException { // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS SpeechClient speech = SpeechClient.create(); @@ -240,6 +291,47 @@ public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOExceptio speech.close(); } + /** + * Performs non-blocking speech recognition on remote FLAC file and prints + * the transcription. + * + * @param gcsUri the path to the remote FLAC audio file to transcribe. 
+ */ + public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException { + // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS + SpeechClient speech = SpeechClient.create(); + + // Configure remote file request for FLAC + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder() + .setUri(gcsUri) + .build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speech.longRunningRecognizeAsync(config, audio); + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } + + List results = response.get().getResultsList(); + + for (SpeechRecognitionResult result: results) { + List alternatives = result.getAlternativesList(); + for (SpeechRecognitionAlternative alternative: alternatives) { + System.out.printf("Transcription: %s\n",alternative.getTranscript()); + } + } + speech.close(); + } + + /** * Performs streaming speech recognition on raw PCM audio data. 
* diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java index c2417fe7cfc..7e2c4862fda 100644 --- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java +++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java @@ -62,6 +62,14 @@ public void testRecognizeFile() throws Exception { assertThat(got).contains("how old is the Brooklyn Bridge"); } + @Test + public void testRecognizeWordoffset() throws Exception { + Recognize.syncRecognizeWords(fileName); + String got = bout.toString(); + assertThat(got).contains("how old is the Brooklyn Bridge"); + assertThat(got).contains("\t0.0 sec -"); + } + @Test public void testRecognizeGcs() throws Exception { Recognize.syncRecognizeGcs(gcsPath); @@ -85,8 +93,9 @@ public void testAsyncRecognizeGcs() throws Exception { @Test public void testAsyncWordoffset() throws Exception { - Recognize.asyncRecognizeGcs(gcsPath); + Recognize.asyncRecognizeWords(gcsPath); String got = bout.toString(); + assertThat(got).contains("how old is the Brooklyn Bridge"); assertThat(got).contains("\t0.0 sec -"); }